In [0]:
%pip install dbldatagen

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# ----------------------------
# 1) Configuration parameters
# ----------------------------
# Inclusive bounds of the calendar covered by the date dimension.
START_DATE = "2024-01-01"
END_DATE = "2099-12-31"

# ----------------------------
# 2) Build continuous date series
# ----------------------------
# Step 1: a one-row frame that carries both bounds as string columns.
bounds_df = spark.createDataFrame([(START_DATE, END_DATE)], ["start", "end"])

# Step 2: an array column holding every day from start to end (both inclusive).
daily_dates = F.sequence(
    F.to_date(F.col("start")),
    F.to_date(F.col("end")),
    F.expr("INTERVAL 1 DAY"),
)

# Step 3: explode the array so that each calendar day becomes its own row.
date_seq_df = bounds_df.select(F.explode(daily_dates).alias("date"))

# ----------------------------
# 3) Derive dimension attributes
# ----------------------------

# No UDF required: every attribute is derived with built-in Spark SQL functions,
# in a single projection rather than a chain of withColumn calls.
date_col = F.col("date")

date_dim = date_seq_df.select(
    date_col,
    # Surrogate key as INT in YYYYMMDD form: date_format(date, "yyyyMMdd")
    # yields the string "YYYYMMDD", which is then cast to INT.
    F.date_format(date_col, "yyyyMMdd").cast(T.IntegerType()).alias("date_sk"),
    F.dayofmonth(date_col).alias("day"),
    # weekofyear follows ISO 8601 (weeks start on Monday; week 1 is the first
    # week with more than 3 days), so dates around New Year can carry a week
    # number that belongs to the adjacent calendar year.
    F.weekofyear(date_col).alias("week"),
    F.month(date_col).alias("month"),
    F.quarter(date_col).alias("quarter"),
    F.year(date_col).alias("year"),
    # dayofweek numbers days 1 = Sunday ... 7 = Saturday; the membership test is
    # already a boolean column, so no when/otherwise wrapper is needed.
    F.dayofweek(date_col).isin(1, 7).alias("is_weekend"),
)

In [0]:
# Persist the date dimension as a managed table, replacing any existing contents.
(
    date_dim.write
    .mode("overwrite")
    .saveAsTable("dev.bronze.dim_date")
)

In [0]:
%sql
-- Sanity check: the row count should equal the number of days in the
-- configured START_DATE..END_DATE range (both bounds inclusive).
SELECT COUNT(*)
FROM dev.bronze.dim_date