In [0]:
# Databricks / PySpark script: Date Dimension Table (2015–2030)
# Fixed Arrow type issue by converting int32 → int64 before Spark conversion

from pyspark.sql import SparkSession
import pandas as pd
from datetime import datetime
import numpy as np

# Initialize Spark
# getOrCreate() hands back the already-active session when there is one,
# otherwise it starts a new session under the app name "DateDimension".
spark = (
    SparkSession.builder
    .appName("DateDimension")
    .getOrCreate()
)

# --- Step 1: Generate date range ---
# One entry per calendar day; both endpoints are included.
start_date = datetime(2015, 1, 1)
end_date = datetime(2030, 12, 31)
dates = pd.date_range(start_date, end_date, freq="D")

# --- Step 2: Build Pandas DataFrame with date attributes ---
# Every attribute is derived from the "date" column in a single assign
# pipeline. Callables passed to assign() are evaluated in order against the
# frame built so far, so later columns may read earlier ones.
# All integer columns are cast to int64 explicitly.
df = (
    pd.DataFrame({"date": dates})
    .assign(
        # Sortable integer key in YYYYMMDD form, e.g. 20150101.
        date_key=lambda f: f["date"].dt.strftime("%Y%m%d").astype(np.int64),
        year=lambda f: f["date"].dt.year.astype(np.int64),
        quarter=lambda f: f["date"].dt.quarter.astype(np.int64),
        month=lambda f: f["date"].dt.month.astype(np.int64),
        month_name=lambda f: f["date"].dt.month_name(),
        day=lambda f: f["date"].dt.day.astype(np.int64),
        day_name=lambda f: f["date"].dt.day_name(),
        # pandas counts weekdays 0-6 starting on Monday; shift to 1-7.
        day_of_week=lambda f: (f["date"].dt.weekday + 1).astype(np.int64),
        # ISO week number: days around New Year can belong to week 52/53
        # of the previous year or week 1 of the next one.
        week_of_year=lambda f: f["date"].dt.isocalendar().week.astype(np.int64),
        day_of_year=lambda f: f["date"].dt.day_of_year.astype(np.int64),
        # Saturday (6) and Sunday (7) are the only values >= 6.
        is_weekend=lambda f: f["day_of_week"] >= 6,
        is_leap_year=lambda f: f["date"].dt.is_leap_year,
    )
    # Fiscal calendar (FY starts in April)
    .assign(
        # April onwards belongs to the fiscal year labelled calendar year + 1.
        fiscal_year=lambda f: f["year"] + (f["month"] >= 4).astype(np.int64),
        # Rotate months so April maps to 0, then bucket into quarters 1-4.
        fiscal_quarter=lambda f: ((f["month"] + 8) % 12 // 3 + 1).astype(np.int64),
    )
)

# --- Step 3: Convert to Spark DataFrame (safe types) ---
# The pandas frame already carries explicit int64 columns, so no schema is
# passed and Spark infers the column types from the pandas dtypes.
# NOTE(review): "date" is a pandas datetime64 column, which presumably becomes
# a Spark timestamp rather than a date column - confirm that is intended.
df_dates = spark.createDataFrame(df)

# --- Step 4: Save as Delta table ---
# "overwrite" replaces the table's existing contents on every run.
(
    df_dates.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("lingokids.silver.dim_date")
)

# --- Step 5: Preview ---
# Print the first 10 rows without truncating wide string values.
df_dates.show(n=10, truncate=False)
