In [1]:
# ============================
# Silver -> Gold Load to Lake with Surrogate Key (date_key)
# ============================

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# ---------------------------------------
# PART 1: Read Silver and select only needed columns
# ---------------------------------------

# Load the Silver Delta table.
silver_df = spark.read.format("delta").load("Tables/game_silver")

# Calendar date of each Silver row: `date_time` is cast to a timestamp and then
# truncated to its date part.
# NOTE(review): the date is derived using the Spark session time zone — confirm
# the session is set to UTC if UTC calendar days are intended.
full_date_expr = F.to_date(F.col("date_time").cast("timestamp"))

# One row per distinct, non-null calendar date.
distinct_dates_df = (
    silver_df
    .select(full_date_expr.alias("full_date"))
    .filter(F.col("full_date").isNotNull())
    .distinct()
)

# Attach the deterministic surrogate key: the date rendered as yyyyMMdd and
# stored as an integer (e.g. 2024-01-31 -> 20240131).
source_df = distinct_dates_df.select(
    F.date_format(F.col("full_date"), "yyyyMMdd").cast("int").alias("date_key"),
    F.col("full_date"),
)

# ---------------------------------------
# PART 2: Define Gold target path
# ---------------------------------------

target_path = (
    "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/"
    "2dd626a9-1762-4265-b676-2ccd6b809184/Tables/dim_dates"
)

# ---------------------------------------
# PART 3: Write to Gold (initial or incremental)
# ---------------------------------------

# Load `source_df` into the Gold dim_dates table at `target_path`.
#   - Target already is a Delta table -> append only rows whose `date_key` is
#     not present yet (incremental load).
#   - Otherwise                       -> create the table from `source_df`
#     (initial load).
if DeltaTable.isDeltaTable(spark, target_path):
    # Incremental: append only new dates
    existing_df = DeltaTable.forPath(spark, target_path).toDF()
    existing_keys = existing_df.select("date_key").distinct()

    # Rows of source_df with no matching date_key in the target.
    # Persisted so the anti-join is evaluated once and the rows that are counted
    # are the same rows that get written.
    to_append = (
        source_df
        .join(existing_keys, on="date_key", how="left_anti")
        .persist()
    )

    try:
        # Count BEFORE writing. DataFrames are lazy: counting after the append
        # can re-evaluate the anti-join against the updated target, which then
        # already contains these rows, and the message would report 0.
        new_row_count = to_append.count()

        if new_row_count == 0:
            print("No new rows to append. dim_dates is already up to date.")
        else:
            (to_append
             .orderBy("date_key")
             .write
             .format("delta")
             .mode("append")
             .save(target_path))
            print(f"✅ Incremental load complete: {new_row_count} new rows appended to dim_dates in Lakehouse_Gold.")
    finally:
        # Always release the cached rows, even if the count or write failed.
        to_append.unpersist()
else:
    # Initial: create the table
    (source_df
     .orderBy("date_key")
     .write
     .format("delta")
     .mode("overwrite")
     .save(target_path))

    # Report what was actually written instead of recomputing source_df from
    # Silver a second time.
    written_row_count = spark.read.format("delta").load(target_path).count()
    print(f"✅ Initial load complete: dim_dates created with {written_row_count} rows.")

# ---------------------------------------
# PART 4: Verify
# ---------------------------------------

# Re-read the Gold table from storage so the check reflects what was persisted.
result_df = spark.read.format("delta").load(target_path)

# Total row count after the load step.
gold_row_count = result_df.count()
print(f"Rows in Gold after write: {gold_row_count}")

# Preview the ten earliest dates (smallest date_key first).
earliest_dates_df = result_df.orderBy(F.col("date_key").asc()).limit(10)
display(earliest_dates_df)


StatementMeta(, d4298f93-6b36-48a9-93af-7063177c734f, 3, Finished, Available, Finished)

No new rows to append. dim_dates is already up to date.
Rows in Gold after write: 3728


SynapseWidget(Synapse.DataFrame, 06468464-0bdb-42f3-8f60-a1bd268b1d19)