In [4]:
# ==============================================
# Silver -> Gold : fact_players (no surrogate key on fact)
# Grain: one row per (date_key, game_key, player_key, team_key)
# ==============================================

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# -----------------------------
# Paths
# -----------------------------
# Root URI shared by every Gold table this cell reads or writes.
gold_base = (
    "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/"
    "2dd626a9-1762-4265-b676-2ccd6b809184/Tables"
)


def _gold_table_path(table_name):
    """Return the location of `table_name` directly under `gold_base`."""
    return f"{gold_base}/{table_name}"


# Target of the fact load.
fact_target_path = _gold_table_path("fact_players")

# Dimension tables read further down in this cell.
dim_dates_path = _gold_table_path("dim_dates")
dim_games_path = _gold_table_path("dim_games")
dim_players_path = _gold_table_path("dim_players")
dim_teams_path = _gold_table_path("dim_teams")

# -----------------------------
# 1) Read Silver sources
# -----------------------------
stats_df = spark.read.load("Tables/game_skater_stats_silver", format="delta")
games_df = spark.read.load("Tables/game_silver", format="delta")

# The game datetime lives on the games table under one of several possible
# names. Take the first candidate (in list order) that is present; the match
# is an exact, case-sensitive comparison against games_df.columns.
date_cols = ["date_time", "date_time_gmt", "game_datetime_gmt"]
game_dt_col = None
for candidate in date_cols:
    if candidate in games_df.columns:
        game_dt_col = candidate
        break

# Fail fast with the searched and available names when nothing matched.
if game_dt_col is None:
    raise ValueError(f"Could not find a datetime column in game_silver. Looked for: {date_cols}. "
                     f"Available: {games_df.columns}")

# Lookup of game_id -> date_key (integer yyyyMMdd), exactly one row per game_id.
#
# The one-row-per-game guarantee matters: this frame is inner-joined to the
# skater stats and the measures are summed afterwards, so a game_id that
# appeared twice here would double every measure for that game. Collapsing
# with min() makes the result deterministic even if the repeated rows carry
# different datetimes, and ignores null dates when a non-null one exists.
#
# NOTE(review): to_date() derives the calendar date in the Spark session time
# zone - confirm this matches how dim_dates.date_key was built.
games_dates = (
    games_df
    .select(
        F.col("game_id").cast("string").alias("game_id"),
        F.col(game_dt_col).cast("timestamp").alias("event_ts")
    )
    .withColumn("full_date", F.to_date("event_ts"))
    .withColumn("date_key", F.date_format("full_date", "yyyyMMdd").cast("int"))
    .groupBy("game_id")
    .agg(F.min("date_key").alias("date_key"))
    .select("game_id", "date_key")
)

# Measure columns carried onto the fact table. The order of this list is the
# column order wherever the list is unpacked into a select.
measure_cols = [
    # core box-score counts
    "time_on_ice",
    "assists",
    "goals",
    "shots",
    "hits",
    # power play
    "power_play_goals",
    "power_play_assists",
    # penalties
    "penalty_minutes",
    # faceoffs
    "faceoff_wins",
    "faceoff_taken",
    # puck possession
    "takeaways",
    "giveaways",
    # short-handed
    "short_handed_goals",
    "short_handed_assists",
    # remaining counts
    "blocked",
    "plus_minus",
    # time-on-ice breakdown
    "even_time_on_ice",
    "short_handed_time_on_ice",
    "power_play_time_on_ice",
]

# Natural keys (as strings) plus every measure from the skater stats.
#
# Each measure is cast to int and its nulls are replaced with 0 inside the
# same select, so later aggregations never see a null. Doing this in one
# projection (rather than one withColumn call per measure) keeps the query
# plan flat: every withColumn call adds another projection to the plan.
stats_sel = stats_df.select(
    F.col("game_id").cast("string").alias("game_id"),
    F.col("player_id").cast("string").alias("player_id"),
    F.col("team_id").cast("string").alias("team_id"),
    *[
        F.coalesce(F.col(name).cast("int"), F.lit(0)).alias(name)
        for name in measure_cols
    ]
)

# Attach date_key to every stats row. This is an inner join, so a stats row
# whose game_id has no match in games_dates is dropped here.
stats_with_date = stats_sel.join(games_dates, on="game_id", how="inner")

# Collapse to one row per (game_id, player_id, team_id, date_key) by summing
# each measure across the rows that share those keys.
# NOTE(review): rows that are exact copies of each other are summed as well,
# not discarded - confirm the Silver source cannot emit exact duplicates.
group_cols = ["game_id", "player_id", "team_id", "date_key"]
agg_exprs = [
    F.sum(F.col(measure)).alias(measure)
    for measure in measure_cols
]
stats_agg = (
    stats_with_date
    .groupBy(*group_cols)
    .agg(*agg_exprs)
)

# -----------------------------
# 2) Read Gold dimensions (keys only)
# -----------------------------
def _dim_keys(path, *key_cols):
    """Load the Delta table at `path` and return the distinct rows of `key_cols`."""
    return spark.read.format("delta").load(path).select(*key_cols).distinct()


# Only the columns needed for key lookup are kept from each dimension.
dim_dates = _dim_keys(dim_dates_path, "date_key")
dim_games = _dim_keys(dim_games_path, "game_key", "game_id")
dim_players = _dim_keys(dim_players_path, "player_key", "player_id")
dim_teams = _dim_keys(dim_teams_path, "team_key", "team_id")

# -----------------------------
# 3) Build the fact rows (map natural -> surrogate keys)
# -----------------------------
# Columns that identify one fact row.
fact_keys = ["date_key", "game_key", "player_key", "team_key"]

# Each dimension paired with the column it is joined on, in join order.
dim_lookups = [
    (dim_games, "game_id"),
    (dim_players, "player_id"),
    (dim_teams, "team_id"),
    (dim_dates, "date_key"),
]

# Every join is an inner join: an aggregated stats row without a match in any
# one of the dimensions does not reach the fact table.
fact_df = stats_agg
for dim_df, join_col in dim_lookups:
    fact_df = fact_df.join(dim_df, on=join_col, how="inner")

# Keep only keys + measures, then enforce one row per key combination.
fact_df = (
    fact_df
    .select(*fact_keys, *measure_cols)
    .dropDuplicates(fact_keys)
)

# -----------------------------
# 4) Write to Gold (initial or incremental append)
# -----------------------------
from delta.tables import DeltaTable

# Columns that identify one fact row; used to detect rows already loaded.
fact_key_cols = ["date_key", "game_key", "player_key", "team_key"]

if DeltaTable.isDeltaTable(spark, fact_target_path):
    # Incremental path: append only the rows whose key is not in the target yet.
    existing_keys = (
        DeltaTable.forPath(spark, fact_target_path)
        .toDF()
        .select(*fact_key_cols)
        .distinct()
    )

    # Persist the anti-join so it is evaluated once. Without this the whole
    # pipeline runs again for every action, and a count taken AFTER the append
    # re-evaluates the anti-join against a target that may already contain the
    # new keys, so the reported number would not match what was written.
    to_append = fact_df.join(
        existing_keys,
        on=fact_key_cols,
        how="left_anti"
    ).persist()

    try:
        # Count before writing; this is the number that gets reported.
        new_row_count = to_append.count()

        if new_row_count == 0:
            print("No new rows to append. fact_players is already up to date.")
        else:
            (to_append
             .write
             .format("delta")
             .mode("append")
             .save(fact_target_path))
            print(f"✅ Incremental load complete: {new_row_count} new rows appended to fact_players.")
    finally:
        # Release the cached rows whether or not the write succeeded.
        to_append.unpersist()
else:
    # Initial path: the target does not exist yet, so write everything.
    (fact_df
     .orderBy("date_key", "game_key", "player_key", "team_key")
     .write
     .format("delta")
     .mode("overwrite")
     .save(fact_target_path))

    # Report what actually landed in the table instead of re-running the
    # whole Silver -> Gold pipeline just to count it.
    initial_row_count = spark.read.format("delta").load(fact_target_path).count()
    print(f"✅ Initial load complete: fact_players created with {initial_row_count} rows.")

# -----------------------------
# 5) Verify
# -----------------------------
# Re-read the target from storage so the check reflects what was persisted.
result_df = spark.read.load(fact_target_path, format="delta")
print(f"Rows in fact_players after write: {result_df.count()}")

# Show the first ten rows in key order as a visual spot check.
preview_order = ["date_key", "game_key", "player_key", "team_key"]
preview_df = result_df.orderBy(*preview_order).limit(10)
display(preview_df)


StatementMeta(, 17451436-07f1-4af3-82d8-a96fe0a6f1f0, 6, Finished, Available, Finished)

✅ Initial load complete: fact_players created with 853314 rows.
Rows in fact_players after write: 853314


SynapseWidget(Synapse.DataFrame, a93e733b-bea2-4e11-b2a0-fdb5742e18b0)