##### This notebook loads the latest (by timestamp) raw data drop from raw_partition_zone into Delta tables in Lakehouse_Bronze
##### For each table: create it if missing; otherwise OVERWRITE it, or do an idempotent APPEND that inserts no duplicates

In [2]:
import os
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# SCD Type 1 (overwrite)
# Dimension tables where (1) attribute values (e.g., position) may change over time,
# (2) history is not required, and (3) the dataset stays relatively small over time.
# Entries are CSV base names (file name without extension); a file whose base name
# is listed here is fully rewritten into "<base_name>_bronze" on every run.
OVERWRITE_TABLES = {
    "player_info",
    "team_info",
}

# Idempotent append (insert-only MERGE by key; falls back to hash de-dup if no keys)
# Fact tables where (1) existing rows typically do not change, and (2) data is added
# incrementally and grows large quickly over time.
# Maps CSV base name -> list of columns used as the MERGE key for "<base_name>_bronze".
# NOTE(review): the loader calls dropDuplicates(key_cols) on the source before the
# MERGE, so source rows that share a key are collapsed to a single row. Confirm that
# every key list below uniquely identifies a row in its CSV -- e.g. ["game_id"] for
# game_officials and ["game_id", "player_id"] for game_shifts look coarse if a game
# can have several officials or a player several shifts.
UPSERT_TABLES_KEYS = {
    "game": ["game_id"],
    "game_plays": ["game_id", "play_id"],
    "game_plays_players": ["game_id", "play_id", "player_id", "playerType"],
    "game_goals": ["play_id"],
    "game_penalties": ["play_id"],
    "game_goalie_stats": ["game_id", "player_id"],
    "game_skater_stats": ["game_id", "player_id"],
    "game_teams_stats": ["game_id", "team_id"],
    "game_officials": ["game_id"],
    "game_scratches": ["game_id", "player_id"],
    "game_shifts": ["game_id", "player_id"],
}

# Root path to partitioned raw zone (note the trailing slash).
# The trailing slash matters: the folder path below is built by plain string
# concatenation (files_root + folder name + "/"), not by a path-join helper.
# NOTE(review): the two GUIDs are presumably the workspace and lakehouse IDs -- confirm
# before pointing this notebook at another environment.
files_root = (
    "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/"
    "f8bc9aa5-a717-4b16-a592-33bcfe0202bb/Files/raw_partition_zone/"
)

# Helper: stable row hash across all columns (used when keys are not provided)
def with_row_hash(df):
    """Return ``df`` with an extra ``_row_hash`` column (SHA-256 hex digest).

    Columns are hashed in sorted-name order, so the digest does not depend on
    the column order of ``df``. Every value is cast to string before hashing.

    NULL values are replaced by a placeholder token first. ``concat_ws``
    silently skips NULL inputs, so without the placeholder two rows that
    differ only in *which* column is NULL (e.g. ``(a, NULL, b)`` versus
    ``(a, b, NULL)``) would produce the same digest and one of them would be
    treated as a duplicate of the other.

    The hash is computed on the fly for both sides of a comparison and is
    never persisted by this notebook, so changing its definition does not
    invalidate any stored data.
    """
    # Placeholder for NULL; a symbol that is very unlikely to be a real value.
    null_token = F.lit("\u2400")
    parts = [
        F.coalesce(F.col(name).cast("string"), null_token)
        for name in sorted(df.columns)
    ]
    return df.withColumn("_row_hash", F.sha2(F.concat_ws("||", *parts), 256))

# 1) List timestamped subfolders
entries = mssparkutils.fs.ls(files_root)
subfolders = [e for e in entries if e.isDir]
if not subfolders:
    raise RuntimeError("No subfolders found in raw_partition_zone.")

# 2) Pick the latest folder (yyyy-MM-dd_HH-mm-ss sorts correctly)
# Names in that format order lexicographically the same way they order in time,
# so the maximum string is the most recent drop.
latest_folder = max(f.name.strip("/") for f in subfolders)
latest_path = files_root + latest_folder + "/"
print(f"Latest folder: {latest_folder}\nPath: {latest_path}")

# 3) Find all CSV paths
# List the folder directly instead of loading "*.csv" through a Spark reader:
# a glob that matches nothing makes the reader raise "Path does not exist"
# before the explicit check below can run, and listing needs no Spark job.
# Files starting with "_" or "." are skipped, as Spark readers treat them as hidden.
# Sorted so that the processing order is the same on every run.
csv_files = sorted(
    latest_path + e.name
    for e in mssparkutils.fs.ls(latest_path)
    if not e.isDir
    and e.name.endswith(".csv")
    and not e.name.startswith(("_", "."))
)
if not csv_files:
    raise RuntimeError(f"No CSV files found in: {latest_path}")
print(f"Found {len(csv_files)} CSVs.")

# 4) Process each CSV
# For every CSV in the latest drop, load it into the Delta table "<base_name>_bronze":
#   * table missing, or base name in OVERWRITE_TABLES -> full overwrite (schema included)
#   * base name has keys in UPSERT_TABLES_KEYS       -> insert-only MERGE on those keys
#   * otherwise                                      -> append rows whose row hash is new
for file_path in csv_files:
    file_name = os.path.basename(file_path)
    base_name = os.path.splitext(file_name)[0]
    table_name = base_name + "_bronze"

    print(f"\nProcessing {file_name} → {table_name}")

    df = (
        spark.read
        .format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )

    table_exists = spark.catalog.tableExists(table_name)

    # First load of a table and snapshot-overwrite tables share the same write.
    if not table_exists or base_name in OVERWRITE_TABLES:
        (df.write
            .format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(table_name))
        action = "Overwrote" if table_exists else "Created"
        print(f"{action} {table_name} with {df.count()} rows.")
        continue

    # Table exists and is a fact table: idempotent append.
    key_cols = UPSERT_TABLES_KEYS.get(base_name)
    if key_cols:
        # Insert only new keys via MERGE. Source rows sharing a key are collapsed
        # first, because MERGE requires at most one source row per target match.
        df_dedup = df.dropDuplicates(key_cols)
        target = DeltaTable.forName(spark, table_name)
        # Null-safe equality (<=>): with plain "=", a row whose key contains NULL
        # never matches its already-loaded copy and is re-inserted on every run.
        merge_condition = " AND ".join(f"t.`{k}` <=> s.`{k}`" for k in key_cols)
        insert_map = {c: f"s.`{c}`" for c in df_dedup.columns}
        (target.alias("t")
               .merge(df_dedup.alias("s"), merge_condition)
               .whenNotMatchedInsert(values=insert_map)
               .execute())

        # Read the insert count from the commit that the MERGE just wrote.
        metrics = DeltaTable.forName(spark, table_name).history(1).select("operationMetrics").collect()[0][0]
        inserted = int(metrics.get("numTargetRowsInserted", "0"))
        print(f"Inserted {inserted} new rows into {table_name}.")
        continue

    # Generic, keyless de-dup using row hash
    src_h = with_row_hash(df)
    tgt_h = with_row_hash(spark.table(table_name)).select("_row_hash").distinct()
    # Persist the anti-join result so that the count and the write see the same
    # rows; counting after the append would re-run the join against a table
    # that may already contain them.
    new_rows = (
        src_h.join(tgt_h, on="_row_hash", how="left_anti")
        .drop("_row_hash")
        .persist()
    )
    try:
        new_count = new_rows.count()
        if new_count == 0:
            print("No new rows found (all duplicates).")
        else:
            (new_rows.write
                .format("delta")
                .mode("append")
                .saveAsTable(table_name))
            print(f"Appended {new_count} new rows to {table_name} (hash de-dup).")
    finally:
        new_rows.unpersist()

print("\nAll files processed successfully.")


StatementMeta(, 37103623-7f11-49e8-bcb2-74737cdc2d70, 6, Finished, Available, Finished)

Latest folder: 2025-09-28_03-56-31
Path: abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/f8bc9aa5-a717-4b16-a592-33bcfe0202bb/Files/raw_partition_zone/2025-09-28_03-56-31/


Found 13 CSVs.

Processing game_plays.csv → game_plays_bronze


Inserted 0 new rows into game_plays_bronze.

Processing game_shifts.csv → game_shifts_bronze


Inserted 0 new rows into game_shifts_bronze.

Processing game_plays_players.csv → game_plays_players_bronze


Inserted 0 new rows into game_plays_players_bronze.

Processing game_skater_stats.csv → game_skater_stats_bronze
Inserted 0 new rows into game_skater_stats_bronze.

Processing game_penalties.csv → game_penalties_bronze
Inserted 0 new rows into game_penalties_bronze.

Processing game_goals.csv → game_goals_bronze
Inserted 0 new rows into game_goals_bronze.

Processing game_goalie_stats.csv → game_goalie_stats_bronze
Inserted 0 new rows into game_goalie_stats_bronze.

Processing game_teams_stats.csv → game_teams_stats_bronze
Inserted 0 new rows into game_teams_stats_bronze.

Processing game_officials.csv → game_officials_bronze
Inserted 0 new rows into game_officials_bronze.

Processing game.csv → game_bronze
Inserted 0 new rows into game_bronze.

Processing game_scratches.csv → game_scratches_bronze
Inserted 0 new rows into game_scratches_bronze.

Processing player_info.csv → player_info_bronze
Overwrote player_info_bronze with 3925 rows.

Processing team_info.csv → team_info_bronze
Ove