In [2]:
# ============================
# Silver -> Gold Load to Lake with Surrogate Key (game_key)
# ============================

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# ---------------------------------------
# PART 1: Read Silver and Select Only Needed Columns
# ---------------------------------------

# Load the Silver game table (Delta format) from the relative path Tables/game_silver.
df = spark.read.format("delta").load("Tables/game_silver")

# Keep just the four business attributes, each cast to string; the surrogate
# key is attached later. Rows that are identical across all four columns are
# collapsed into one.
business_columns = ["game_id", "game_season", "game_type", "game_venue"]
string_columns = [F.col(name).cast("string") for name in business_columns]
source_df = df.select(*string_columns).dropDuplicates()

# ---------------------------------------
# PART 2: Define Gold Target Path
# ---------------------------------------

# Full abfss URI of the Gold dim_games Delta table. Adjacent string literals
# are concatenated by Python, so the value is a single unbroken path.
target_path = (
    "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com"
    "/2dd626a9-1762-4265-b676-2ccd6b809184"
    "/Tables/dim_games"
)

# Helper: add surrogate key column game_key
def attach_surrogate_keys(input_df, start_offset):
    """
    Add a sequential surrogate key column named game_key to input_df.

    Rows are numbered 1..N in ascending game_id order and start_offset is
    added to each number, so the first key issued is start_offset + 1.

    Ordering on game_id alone leaves the numbering of rows that share a
    game_id up to Spark, which can differ between evaluations of the same
    DataFrame. To keep the assignment reproducible, the other business
    columns (game_season, game_type, game_venue) are used as tie-breakers
    whenever they exist on input_df. When game_id is unique the result is
    the same as ordering by game_id only.

    Args:
        input_df: DataFrame that contains at least a game_id column.
        start_offset: Integer added to every row number; pass the current
            maximum game_key for incremental loads, or 0 for an initial load.

    Returns:
        input_df with an extra game_key column appended.
    """
    tie_breakers = [
        name
        for name in ("game_season", "game_type", "game_venue")
        if name in input_df.columns
    ]
    ordering = [F.col(name).asc() for name in ["game_id", *tie_breakers]]

    # NOTE: a window with no partitionBy makes Spark evaluate row_number on a
    # single partition. That is what yields one gap-free global sequence, but
    # it does not scale out; acceptable only while the dimension stays small.
    global_window = Window.orderBy(*ordering)
    return input_df.withColumn(
        "game_key",
        F.row_number().over(global_window) + F.lit(start_offset),
    )

# ---------------------------------------
# PART 3: Write to Gold
# ---------------------------------------

# Final column order of the Gold dimension: surrogate key first.
gold_columns = ["game_key", "game_id", "game_season", "game_type", "game_venue"]

if DeltaTable.isDeltaTable(spark, target_path):
    # Table exists: incremental load that appends only unseen game_id values.
    existing_delta = DeltaTable.forPath(spark, target_path)
    existing_df = existing_delta.toDF()

    # Business keys already present, and the highest surrogate key issued so far.
    existing_keys_df = existing_df.select("game_id").distinct()
    current_max_sk = existing_df.agg(F.max("game_key").alias("max_key")).first()["max_key"]
    if current_max_sk is None:
        # max() over an empty table (or an all-null column) yields None.
        current_max_sk = 0

    # Source rows whose game_id is not in Gold yet.
    new_rows_df = source_df.join(existing_keys_df, on="game_id", how="left_anti")

    # Count BEFORE writing. DataFrames are lazy, so counting after the append
    # re-runs the anti-join; if that re-run reads the table with the freshly
    # appended rows in it, the reported number would be 0. Counting once here
    # also replaces the separate emptiness check and the post-write recount.
    new_row_count = new_rows_df.count()

    if new_row_count == 0:
        print("No new rows to append. dim_games is already up to date.")
    else:
        # Continue the key sequence from the current maximum.
        new_rows_with_sk_df = attach_surrogate_keys(new_rows_df, current_max_sk)
        new_rows_with_sk_df = new_rows_with_sk_df.select(*gold_columns)

        (new_rows_with_sk_df
            .write
            .format("delta")
            .mode("append")
            .save(target_path))

        print(f"✅ Incremental load complete: {new_row_count} new rows appended to dim_games in Lakehouse_Gold.")

else:
    # Table does not exist: initial load, keys start at 1.
    initial_rows_with_sk_df = attach_surrogate_keys(source_df, start_offset=0)
    initial_rows_with_sk_df = initial_rows_with_sk_df.select(*gold_columns)

    # Adding the key column does not change the number of rows, so the count
    # can be taken from source_df without evaluating the window.
    initial_row_count = source_df.count()

    (initial_rows_with_sk_df
        .write
        .format("delta")
        .mode("overwrite")
        .save(target_path))

    print(f"✅ Initial load complete: dim_games created with {initial_row_count} rows.")

# ---------------------------------------
# PART 4: Verify
# ---------------------------------------

# Read the Gold table back from target_path and report its size.
result_df = spark.read.format("delta").load(target_path)
gold_row_count = result_df.count()
print(f"Rows in Gold after write: {gold_row_count}")

# Show the ten rows with the lowest surrogate keys.
preview_df = result_df.orderBy(F.asc("game_key")).limit(10)
display(preview_df)


StatementMeta(, d5b09c82-f44e-4fdb-9152-e33517b8b529, 4, Finished, Available, Finished)

✅ Initial load complete: dim_games created with 23735 rows.
Rows in Gold after write: 23735


SynapseWidget(Synapse.DataFrame, 2f7326d2-ce64-4b2e-871d-6c1f693f7b51)