## Clean/transform game_goalie_stats_bronze -> game_goalie_stats_silver

##### 1. Import and load table

In [1]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Load the bronze goalie-stats Delta table; every later cell builds on `df`
df = (
    spark.read
    .format("delta")
    .load("Tables/game_goalie_stats_bronze")
)

StatementMeta(, 762881bb-b5dc-4b25-a94c-c6de0c818f5a, 3, Finished, Available, Finished)

##### 2. Clean and transform data

In [2]:
# 1. Keep a single copy of rows that are identical across every column
df = df.distinct()

# 2. Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name: str) -> str:
    """Convert a camelCase / PascalCase identifier to snake_case.

    A run of consecutive capitals is treated as one word, so ``gameID``
    becomes ``game_id`` and ``HTTPServer`` becomes ``http_server`` instead of
    being split letter by letter. Names that are already snake_case are
    returned unchanged, and no underscore is inserted directly after an
    existing one.

    Args:
        name: The column name to convert.

    Returns:
        The lower-cased, underscore-separated name with leading underscores
        removed.
    """
    pieces = []
    last_index = len(name) - 1
    for i, ch in enumerate(name):
        if not ch.isupper():
            pieces.append(ch)
            continue

        prev_ch = name[i - 1] if i > 0 else ""
        next_ch = name[i + 1] if i < last_index else ""

        # A capital opens a new word when it follows a non-capital character,
        # or when it is the last capital of an acronym that is followed by a
        # lower-case letter (the "S" in "HTTPServer").
        opens_word = not prev_ch.isupper() or next_ch.islower()
        if prev_ch not in ("", "_") and opens_word:
            pieces.append("_")
        pieces.append(ch.lower())

    # Leading underscores are stripped to stay compatible with the previous
    # implementation, which used lstrip('_') on its result.
    return "".join(pieces).lstrip("_")

# Rename every column to snake_case in a single pass
snake_case_names = [to_snake_case(c) for c in df.columns]
df = df.toDF(*snake_case_names)

# 3: Cast data types — the id columns become strings, the save-percentage columns floats
target_types = {
    "game_id": "string",
    "player_id": "string",
    "team_id": "string",
    "save_percentage": "float",
    "power_play_save_percentage": "float",
    "even_strength_save_percentage": "float",
}
for column_name, spark_type in target_types.items():
    # withColumn on an existing name replaces that column in place
    df = df.withColumn(column_name, F.col(column_name).cast(spark_type))

# 4 Explicit rename for special case
df = df.withColumnRenamed("pim", "penalty_mins")

# The preview below is disabled: it is wrapped in a string literal, so it never runs
"""
# Show results
display(df.limit(10))

rows_final = df.count()
cols_final = len(df.columns)
print("")
print("Number of rows:", rows_final)
print("Number of columns:", cols_final)
"""

StatementMeta(, 762881bb-b5dc-4b25-a94c-c6de0c818f5a, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 4de62f4d-bada-4ee6-ad74-e7ecfca1f193)


Number of rows: 51163
Number of columns: 19


##### 3. Load data to silver table

In [3]:
# Incremental load of the cleaned dataframe into the Silver Delta table.
# - If the target already exists, only rows whose key is absent from it are appended.
# - Otherwise the target is created from the full dataframe.

# Incoming Bronze dataframe (cleaned/transformed by the cell above)
source_df = df
# NOTE(review): rows are matched on game_id alone, so once a game exists in
# Silver no further rows for that game are ever appended — confirm this is the
# intended grain for goalie stats.
key_col = "game_id"

# Path to Silver Lakehouse Delta table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_goalie_stats_silver"

if DeltaTable.isDeltaTable(spark, target_path):
    # Distinct keys already present in the target table
    existing_df = spark.read.format("delta").load(target_path).select(key_col).distinct()

    # Keep only rows whose key is not in the target yet
    new_rows_df = source_df.join(existing_df, on=key_col, how="left_anti")

    # Count once, BEFORE writing. new_rows_df is lazy: calling count() after
    # the append re-runs the anti-join against the target, which can then see
    # the keys that were just written and report 0. Reusing this value also
    # avoids executing the join a third time.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to game_goalie_stats_silver in Lakehouse_Silver.")
    else:
        print("No new rows to append. game_goalie_stats_silver is already up to date.")
else:
    # If first load → create the Silver table
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created game_goalie_stats_silver in Lakehouse_Silver with {source_df.count()} rows.")


StatementMeta(, 762881bb-b5dc-4b25-a94c-c6de0c818f5a, 5, Finished, Available, Finished)

No new rows to append. game_goalie_stats_silver is already up to date.


In [1]:
# Disabled data-quality checks: the whole cell is a string literal, so none of
# it executes. To use it, remove the surrounding triple quotes and run it after
# the cells that define `df` and import `F`; it references both.
# It previews 10 rows, prints row/column counts, counts nulls per column,
# counts groups of exactly duplicated rows, and prints the schema.
"""
# Show first 10 rows
display(df.limit(10))

# Check shape of data
rows = df.count()
cols = len(df.columns)
print("")
print("Number of rows:", rows)
print("Number of columns:", cols)
print("")

# Check for nulls
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df.columns
])
null_counts.show()

# Check for exact duplicate rows
dup_count = df.groupBy(df.columns).count().filter(F.col("count") > 1).count()
print("Number of exact duplicated rows:", dup_count)
print("")

# Check schema data type
df.printSchema()
"""

StatementMeta(, bd2f6fcc-9577-4302-b090-23cb1ed107ad, 3, Finished, Available, Finished)

NameError: name 'df' is not defined

##### 2. Check duplicated rows

In [None]:
# Disabled duplicate inspection: the whole cell is a string literal, so none of
# it executes. To use it, remove the surrounding triple quotes and run it after
# the cells that define `df` and import `F` and `Window`.
# It tags each row with the number of rows identical to it across all columns
# and displays only those that occur more than once.
"""
# Create a window partitioned by all columns
w = Window.partitionBy(df.columns)

# Add a count of how many times each row appears
duplicates_all = (
    df.withColumn("dup_count", F.count("*").over(w))
      .filter(F.col("dup_count") > 1)   # keep only duplicates
)

display(duplicates_all)
"""

StatementMeta(, cca2db7d-db0e-46f5-98e5-69952e8060d2, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 938db8a3-f93b-47c2-b524-95e7a05b4f76)

##### 3. Cleaned/transformed to silver