## Clean/transform game_penalties_bronze -> game_penalties_silver

##### 1. Import and load table

In [1]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Load the bronze-layer Delta table into a Spark DataFrame
df = spark.read.load("Tables/game_penalties_bronze", format="delta")

StatementMeta(, b3a44262-f0fa-426c-b726-9b37a8808803, 3, Finished, Available, Finished)

##### 2. Clean and transform data

In [2]:
# 1. Keep one copy of every row that is identical across all columns.
#    Safe to re-run: a second pass has nothing left to remove.
df = df.distinct()

# 2. Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    An underscore is inserted only at real word boundaries:
      * a lowercase letter or digit followed by an uppercase letter
        ("penaltySeverity" -> "penalty_severity"), and
      * the last capital of an acronym that is followed by a lowercase letter
        ("HTTPResponse" -> "http_response").

    Runs of capitals are therefore kept together ("playID" -> "play_id"
    rather than "play_i_d"), and an underscore that is already present is not
    doubled ("Penalty_Minutes" -> "penalty_minutes").

    Args:
        name: Column name to convert.

    Returns:
        The snake_case version of ``name``. Leading underscores are stripped
        from the result, so "_id" becomes "id".
    """
    pieces = []
    for i, ch in enumerate(name):
        if ch.isupper() and i > 0:
            prev = name[i - 1]
            nxt = name[i + 1] if i + 1 < len(name) else ''
            # Boundary between a lowercase/digit run and a new capitalised word
            starts_word = prev.islower() or prev.isdigit()
            # Boundary between an acronym and the capitalised word that follows it
            ends_acronym = prev.isupper() and nxt.islower()
            if starts_word or ends_acronym:
                pieces.append('_')
        # Only uppercase characters are lowered; everything else is copied as-is
        pieces.append(ch.lower() if ch.isupper() else ch)
    return ''.join(pieces).lstrip('_')

# Rename every column to its snake_case form in a single toDF call
df = df.toDF(*map(to_snake_case, df.columns))

# Optional sanity check, disabled by wrapping it in a string literal so that it
# does not execute: previews 10 rows and prints the row and column counts.
"""
# Show results
display(df.limit(10))

rows_final = df.count()
cols_final = len(df.columns)
print("")
print("Number of rows:", rows_final)
print("Number of columns:", cols_final)
"""

StatementMeta(, b3a44262-f0fa-426c-b726-9b37a8808803, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 35e248cf-6654-4055-94b7-fcc680a27eb6)


Number of rows: 229228
Number of columns: 3


##### 3. Load data to silver table

In [3]:
# Incoming Bronze dataframe (already de-duplicated and renamed to snake_case)
source_df = df
# Column used to decide whether a source row is already present in Silver
key_col = "play_id"

# Path to Silver Lakehouse Delta table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_penalties_silver"

if DeltaTable.isDeltaTable(spark, target_path):
    # Incremental load: only the distinct keys already stored in Silver are needed
    existing_df = spark.read.format("delta").load(target_path).select(key_col).distinct()

    # Keep only source rows whose key is not yet in Silver.
    # NOTE(review): a NULL key never matches in a join, so such rows would be
    # treated as new on every run — confirm play_id is never NULL upstream.
    new_rows_df = source_df.join(existing_df, on=key_col, how="left_anti")

    # Count BEFORE writing. new_rows_df is lazy: evaluating it again after the
    # append re-runs the anti-join against a table that now contains these rows,
    # which can under-report the number appended. Counting once here also
    # replaces the separate "is there anything to write?" probe.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to game_penalties_silver in Lakehouse_Silver.")
    else:
        print("No new rows to append. game_penalties_silver is already up to date.")
else:
    # First load → create the Silver table
    # Row count is taken up front, consistent with the incremental branch
    initial_row_count = source_df.count()
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created game_penalties_silver in Lakehouse_Silver with {initial_row_count} rows.")


StatementMeta(, b3a44262-f0fa-426c-b726-9b37a8808803, 5, Finished, Available, Finished)

No new rows to append. game_penalties_silver is already up to date.


In [1]:
"""
# Show first 10 rows
display(df.limit(10))

# Check shape of data
rows = df.count()
cols = len(df.columns)
print("")
print("Number of rows:", rows)
print("Number of columns:", cols)
print("")

# Check for nulls
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df.columns
])
null_counts.show()

# Check for exact duplicate rows
dup_count = df.groupBy(df.columns).count().filter(F.col("count") > 1).count()
print("Number of exact duplicated rows:", dup_count)
print("")

# Check schema data type
df.printSchema()
"""

StatementMeta(, 12645726-5957-49b0-94f4-dfa5483dcd6a, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 43dbdffd-c1b2-464f-9f8c-0383147c6e46)


Number of rows: 247828
Number of columns: 3

+-------+---------------+--------------+
|play_id|penaltySeverity|penaltyMinutes|
+-------+---------------+--------------+
|      0|              0|             0|
+-------+---------------+--------------+

Number of exact duplicated rows: 18600

root
 |-- play_id: string (nullable = true)
 |-- penaltySeverity: string (nullable = true)
 |-- penaltyMinutes: integer (nullable = true)



##### 2. Check duplicate rows

In [2]:
"""
# Create a window partitioned by all columns
w = Window.partitionBy(df.columns)

# Add a count of how many times each row appears
duplicates_all = (
    df.withColumn("dup_count", F.count("*").over(w))
      .filter(F.col("dup_count") > 1)   # keep only duplicates
)

display(duplicates_all)
"""

StatementMeta(, 12645726-5957-49b0-94f4-dfa5483dcd6a, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, db5322ba-ddbf-41a1-80b3-456e1f54e035)

##### 3. Clean/transform to silver