In [1]:
# ==============================================
# Silver -> Gold : fact_goalies (no surrogate key on fact)
# Grain: one row per (date_key, game_key, player_key, team_key)
# ==============================================

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# -----------------------------
# Paths
# -----------------------------
# Root of the Gold "Tables" area on OneLake (Fabric). Every table path
# below has the form "<gold_base>/<table name>".
gold_base = (
    "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/"
    "2dd626a9-1762-4265-b676-2ccd6b809184/Tables"
)

# One path per Gold table used by this script: the fact being written and
# the four dimensions it is keyed against.
(
    fact_target_path,
    dim_dates_path,
    dim_games_path,
    dim_players_path,
    dim_teams_path,
) = (
    f"{gold_base}/{table_name}"
    for table_name in (
        "fact_goalies",
        "dim_dates",
        "dim_games",
        "dim_players",
        "dim_teams",
    )
)

# -----------------------------
# 1) Read Silver sources
# -----------------------------
# Load the two Silver Delta tables this fact is derived from.
stats_df = spark.read.format("delta").load("Tables/game_goalie_stats_silver")
games_df = spark.read.format("delta").load("Tables/game_silver")

# The game timestamp column is looked up by name: the first candidate
# (in the priority order listed here) that exists in game_silver wins.
date_cols = ["date_time", "date_time_gmt", "game_datetime_gmt"]
matching_dt_cols = [name for name in date_cols if name in games_df.columns]
game_dt_col = matching_dt_cols[0] if matching_dt_cols else None
if game_dt_col is None:
    raise ValueError(
        f"Could not find a datetime column in game_silver. Looked for: {date_cols}. "
        f"Available: {games_df.columns}"
    )

# game_id -> date_key lookup. date_key is the calendar date of the game
# timestamp rendered as yyyyMMdd and cast to an integer.
event_date = F.to_date(F.col(game_dt_col).cast("timestamp"))
games_dates = games_df.select(
    F.col("game_id").cast("string").alias("game_id"),
    F.date_format(event_date, "yyyyMMdd").cast("int").alias("date_key"),
)

# -----------------------------
# 2) Measures
# -----------------------------
# Ratio measures: cast to double and averaged when duplicates are collapsed.
percent_cols = [
    "save_percentage",
    "power_play_save_percentage",
    "even_strength_save_percentage",
]

# Count-like measures: cast to int and summed when duplicates are collapsed.
int_cols = [
    "time_on_ice",
    "assists",
    "goals",
    "shots",
    "hits",
    "penalty_mins",
    "saves",
    "power_play_saves",
    "short_handed_saves",
    "even_saves",
    "short_handed_shots_against",
    "even_shots_against",
]

# Full measure list in fact-table column order: integer measures first,
# percentage measures last.
measure_cols = int_cols + percent_cols

# -----------------------------
# 3) Select natural keys + measures; cast and fill nulls
# -----------------------------
# Columns actually available in the Silver goalie stats table.
present = set(stats_df.columns)

# Projection: natural keys as strings, followed by whichever measures the
# source provides (integer measures as int, percentage measures as double).
cols_to_select = (
    [F.col(key).cast("string").alias(key) for key in ("game_id", "player_id", "team_id")]
    + [F.col(m).cast("int").alias(m) for m in int_cols if m in present]
    + [F.col(m).cast("double").alias(m) for m in percent_cols if m in present]
)
stats_sel = stats_df.select(*cols_to_select)

# Backfill measures the source lacks with a typed zero so the output schema
# always carries the complete measure list.
for names, spark_type, zero in ((int_cols, "int", 0), (percent_cols, "double", 0.0)):
    for name in names:
        if name not in stats_sel.columns:
            stats_sel = stats_sel.withColumn(name, F.lit(zero).cast(spark_type))

# Replace null measure values with zero.
for names, zero in ((int_cols, 0), (percent_cols, 0.0)):
    for name in names:
        stats_sel = stats_sel.withColumn(name, F.coalesce(F.col(name), F.lit(zero)))

# -----------------------------
# 4) Add date_key and collapse duplicates if any
# -----------------------------
# Attach date_key via game_id. This is an inner join, so stat rows whose
# game_id has no match in games_dates are dropped.
stats_with_date = stats_sel.join(games_dates, "game_id", "inner")

# Collapse to one row per natural-key combination: integer measures are
# summed and percentage measures are averaged across duplicate rows.
group_cols = ["game_id", "player_id", "team_id", "date_key"]
agg_exprs = []
agg_exprs.extend(F.sum(name).alias(name) for name in int_cols)
agg_exprs.extend(F.avg(name).alias(name) for name in percent_cols)

stats_agg = stats_with_date.groupBy(group_cols).agg(*agg_exprs)

# -----------------------------
# 5) Read Gold dimensions (keys only)
# -----------------------------
def _load_dim_keys(path, *key_columns):
    """Return the distinct rows of `key_columns` from the Delta table at `path`."""
    return spark.read.format("delta").load(path).select(*key_columns).distinct()


# Only the key columns are read from each dimension: the surrogate key plus
# the natural id it maps from (dim_dates is keyed by date_key alone).
dim_dates = _load_dim_keys(dim_dates_path, "date_key")
dim_games = _load_dim_keys(dim_games_path, "game_key", "game_id")
dim_players = _load_dim_keys(dim_players_path, "player_key", "player_id")
dim_teams = _load_dim_keys(dim_teams_path, "team_key", "team_id")

# -----------------------------
# 6) Map natural IDs -> surrogate keys and build the fact rows
# -----------------------------
# Columns that define the grain of the fact table.
grain_cols = ["date_key", "game_key", "player_key", "team_key"]

# Each dimension is joined on the column it shares with the aggregated stats.
# All joins are inner, so a stat row missing from any dimension is excluded.
dimension_lookups = [
    (dim_games, "game_id"),
    (dim_players, "player_id"),
    (dim_teams, "team_id"),
    (dim_dates, "date_key"),
]

fact_df = stats_agg
for dim_df, join_col in dimension_lookups:
    fact_df = fact_df.join(dim_df, on=join_col, how="inner")

# Keep only the keys and measures, then enforce one row per grain.
fact_df = fact_df.select(*grain_cols, *measure_cols).dropDuplicates(grain_cols)

# -----------------------------
# 7) Write to Gold (initial or incremental append)
# -----------------------------
# Write fact_df to the Gold fact table.
#   * Target exists  -> append only rows whose grain key is not yet present.
#   * Target missing -> create the table from the full fact_df.
#
# In both branches the rows to write are persisted and counted BEFORE the
# write, and the write then reuses that materialized result. Counting after
# the write would recompute the whole pipeline, and in the incremental case
# the anti-join would be re-evaluated against a target that may already
# contain the just-appended rows, so the reported count could be wrong.
fact_key_cols = ["date_key", "game_key", "player_key", "team_key"]

if DeltaTable.isDeltaTable(spark, fact_target_path):
    # A left-anti join only tests for key existence, so the existing keys
    # do not need to be de-duplicated first.
    existing_keys = (
        DeltaTable.forPath(spark, fact_target_path)
        .toDF()
        .select(*fact_key_cols)
    )
    to_append = fact_df.join(
        existing_keys,
        on=fact_key_cols,
        how="left_anti"
    ).persist()
    try:
        new_row_count = to_append.count()
        if new_row_count == 0:
            print("No new rows to append. fact_goalies is already up to date.")
        else:
            (to_append
             .write
             .format("delta")
             .mode("append")
             .save(fact_target_path))
            print(f"✅ Incremental load complete: {new_row_count} new rows appended to fact_goalies.")
    finally:
        # Release the cached rows whether or not the write succeeded.
        to_append.unpersist()
else:
    initial_rows = fact_df.orderBy(*fact_key_cols).persist()
    try:
        initial_row_count = initial_rows.count()
        (initial_rows
         .write
         .format("delta")
         .mode("overwrite")
         .save(fact_target_path))
        print(f"✅ Initial load complete: fact_goalies created with {initial_row_count} rows.")
    finally:
        # Release the cached rows whether or not the write succeeded.
        initial_rows.unpersist()

# -----------------------------
# 8) Verify
# -----------------------------
# Re-read the table that was just written, report its total row count and
# show the first 10 rows in grain-key order.
result_df = spark.read.format("delta").load(fact_target_path)
total_rows = result_df.count()
print(f"Rows in fact_goalies after write: {total_rows}")

preview_df = result_df.orderBy("date_key", "game_key", "player_key", "team_key").limit(10)
display(preview_df)


StatementMeta(, 7b2e5e05-3236-4fa5-8a04-532c472f9c74, 3, Finished, Available, Finished)

✅ Initial load complete: fact_goalies created with 51143 rows.
Rows in fact_goalies after write: 51143


SynapseWidget(Synapse.DataFrame, d15463b9-810e-474f-92f6-e70a15b11bd1)