In [2]:
# ============================
# Silver -> Gold Load to Lake with Surrogate Key (team_key)
# ============================

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# ---------------------------------------
# PART 1: Read Silver and Select Only Needed Columns
# ---------------------------------------

# Read the Silver table into a Spark DataFrame
df = spark.read.load("Tables/team_info_silver", format="delta")

# Business columns carried into Gold; the surrogate key is attached later.
business_columns = ["team_id", "team_name", "short_name", "abbreviation"]

# Cast every business column to string, then drop rows that are identical
# across all four columns.
string_typed_columns = [F.col(name).cast("string") for name in business_columns]
source_df = df.select(*string_typed_columns).distinct()

# ---------------------------------------
# PART 2: Define Gold Target Path
# ---------------------------------------

# Location of the Gold dim_teams Delta table in OneLake.
# NOTE(review): the two GUIDs are presumably the workspace id and the
# lakehouse id -- confirm against the Fabric workspace.
target_path = (
    "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com"
    "/2dd626a9-1762-4265-b676-2ccd6b809184"
    "/Tables/dim_teams"
)

# Helper: add surrogate key column team_key
def attach_surrogate_keys(input_df, start_offset):
    """
    Return input_df with an additional team_key column.

    Rows are numbered 1, 2, 3, ... in ascending team_id order over a single
    un-partitioned window, and start_offset is added to each number, so the
    first key handed out is start_offset + 1.
    """
    by_team_id = Window.orderBy(F.asc("team_id"))
    sequence_number = F.row_number().over(by_team_id)
    shifted_key = sequence_number + F.lit(start_offset)
    return input_df.withColumn("team_key", shifted_key)

# ---------------------------------------
# PART 3: Write to Gold
# ---------------------------------------

# Column order of the Gold dimension: surrogate key first, then business columns.
gold_columns = ["team_key", "team_id", "team_name", "short_name", "abbreviation"]

if DeltaTable.isDeltaTable(spark, target_path):
    # Incremental load: the Gold table already exists, so only teams whose
    # team_id is not in it yet are appended.
    existing_delta = DeltaTable.forPath(spark, target_path)
    existing_df = existing_delta.toDF()

    # Business keys already present and the highest surrogate key handed out.
    existing_keys_df = existing_df.select("team_id").distinct()
    # max() over an empty table is NULL, which collect() returns as None.
    current_max_sk = existing_df.agg(F.max("team_key").alias("max_key")).collect()[0]["max_key"]
    if current_max_sk is None:
        current_max_sk = 0

    # Null-safe anti join: a plain equality join never matches NULL keys, so a
    # source row with a NULL team_id would be treated as "new" and appended
    # again (with a fresh surrogate key) on every run.
    new_rows_df = source_df.alias("src").join(
        existing_keys_df.alias("tgt"),
        on=F.col("src.team_id").eqNullSafe(F.col("tgt.team_id")),
        how="left_anti",
    )

    # Count once, BEFORE writing. The DataFrame is lazy: counting after the
    # append re-executes the anti join, which can then see the rows that were
    # just written and report 0. The same count doubles as the emptiness
    # check, avoiding a separate RDD conversion.
    new_row_count = new_rows_df.count()

    if new_row_count == 0:
        print("No new rows to append. dim_teams is already up to date.")
    else:
        # Continue numbering after the current max key; team_key goes first.
        new_rows_with_sk_df = (
            attach_surrogate_keys(new_rows_df, current_max_sk)
            .select(*gold_columns)
        )

        (new_rows_with_sk_df
            .write
            .format("delta")
            .mode("append")
            .save(target_path))

        print(f"✅ Incremental load complete: {new_row_count} new rows appended to dim_teams in Lakehouse_Gold.")

else:
    # Initial load: no Delta table at target_path yet, so create it with keys
    # starting at 1.
    initial_rows_with_sk_df = (
        attach_surrogate_keys(source_df, start_offset=0)
        .select(*gold_columns)
    )

    # Adding the key column does not add or remove rows, so the source count
    # is the number of rows written; taking it here avoids re-running the
    # window after the write just to print a number.
    initial_row_count = source_df.count()

    (initial_rows_with_sk_df
        .write
        .format("delta")
        .mode("overwrite")
        .save(target_path))

    print(f"✅ Initial load complete: dim_teams created with {initial_row_count} rows.")

# ---------------------------------------
# PART 4: Verify
# ---------------------------------------

# Read the Gold table back and report its total row count.
result_df = spark.read.load(target_path, format="delta")
gold_row_count = result_df.count()
print(f"Rows in Gold after write: {gold_row_count}")

# Show the ten rows with the lowest surrogate keys as a quick sanity check.
preview_df = result_df.orderBy(F.asc("team_key")).limit(10)
display(preview_df)


StatementMeta(, 4db052a9-9f8c-495e-afb3-54188cdedf78, 4, Finished, Available, Finished)

✅ Initial load complete: dim_teams created with 33 rows.
Rows in Gold after write: 33


SynapseWidget(Synapse.DataFrame, f2abd523-c2bf-4b5f-9e87-1f13812a26cf)