## Clean/transform game_goals_bronze -> game_goals_silver

##### 1. Import and load table

In [1]:
import re

from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Load the bronze-layer Delta table into a Spark DataFrame
df = spark.read.load("Tables/game_goals_bronze", format="delta")

StatementMeta(, e7feb155-2173-42d8-8c2d-8369565f6a1a, 3, Finished, Available, Finished)

##### 2. Clean and transform data

In [2]:
# 1. Drop rows that are identical across every column
df = df.drop_duplicates()

# 2. Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    Runs of capitals are treated as one word, so ``"playID"`` becomes
    ``"play_id"`` (not ``"play_i_d"``) and ``"HTTPResponse"`` becomes
    ``"http_response"``. Underscores already present in ``name`` are kept
    as-is and never doubled, which makes the function idempotent on names
    that are already snake_case.

    Args:
        name: Column/identifier name to convert.

    Returns:
        The lower-cased snake_case form of ``name``.
    """
    # Separate an acronym from the capitalised word that follows it:
    # "HTTPResponse" -> "HTTP_Response"
    name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", name)
    # Separate every lower-case/digit -> upper-case boundary:
    # "gameWinningGoal" -> "game_Winning_Goal"
    name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name)
    return name.lower()

# Rename every column to its snake_case form in a single pass
df = df.toDF(*map(to_snake_case, df.columns))

# "strength" gets a more specific name than the generic conversion produces
df = df.withColumnRenamed("strength", "game_strength")

# 3: Cast the two flag columns to boolean
df = df.withColumn("game_winning_goal", F.col("game_winning_goal").cast("boolean"))
df = df.withColumn("empty_net", F.col("empty_net").cast("boolean"))

# Disabled sanity check: the code below is wrapped in a bare string literal so
# it does not execute. When enabled it previews 10 rows of df and prints the
# row and column counts.
"""
# Show results
display(df.limit(10))

rows_final = df.count()
cols_final = len(df.columns)
print("")
print("Number of rows:", rows_final)
print("Number of columns:", cols_final)
"""

StatementMeta(, e7feb155-2173-42d8-8c2d-8369565f6a1a, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a3e7cc07-0ee8-4550-b853-a83fd937fb94)


Number of rows: 133345
Number of columns: 4


##### 3. Load data to silver table

In [3]:
# Incoming Bronze dataframe (already cleaned/transformed above)
source_df = df
key_col = "play_id"

# Path to Silver Lakehouse Delta table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_goals_silver"

if DeltaTable.isDeltaTable(spark, target_path):
    # Incremental load: the Silver table exists, so insert only rows whose key
    # is not in it yet.
    existing_df = spark.read.format("delta").load(target_path).select(key_col).distinct()

    # Keep only new keys
    # NOTE(review): upstream de-duplication is on whole rows only; if the source
    # can hold several different rows for one play_id they would all be
    # appended here — confirm play_id is unique in the source.
    new_rows_df = source_df.join(existing_df, on=key_col, how="left_anti")

    # Count BEFORE writing and reuse the number. new_rows_df is lazy: counting
    # it again after the append re-runs the anti-join against a target that may
    # already contain the new rows, so the reported number can come out as 0.
    # Counting once also avoids evaluating the join a third time.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to game_goals_silver in Lakehouse_Silver.")
    else:
        print("No new rows to append. game_goals_silver is already up to date.")
else:
    # First load → create the Silver table
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created game_goals_silver in Lakehouse_Silver with {source_df.count()} rows.")


StatementMeta(, e7feb155-2173-42d8-8c2d-8369565f6a1a, 5, Finished, Available, Finished)

No new rows to append. game_goals_silver is already up to date.


In [4]:
# Disabled data-profiling checks: the code below is wrapped in a bare string
# literal so it does not execute. When enabled it shows a 10-row preview, the
# row/column counts, per-column null counts, the number of exact duplicate
# rows, and the schema of df.
# NOTE(review): the recorded output under this cell lists camelCase column
# names, so it appears to have been produced on the un-cleaned bronze data —
# confirm before relying on those numbers.
"""
# Show first 10 rows
display(df.limit(10))

# Check shape of data
rows = df.count()
cols = len(df.columns)
print("")
print("Number of rows:", rows)
print("Number of columns:", cols)
print("")

# Check for nulls
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df.columns
])
null_counts.show()

# Check for exact duplicate rows
dup_count = df.groupBy(df.columns).count().filter(F.col("count") > 1).count()
print("Number of exact duplicated rows:", dup_count)
print("")

# Check schema data type
df.printSchema()
"""

StatementMeta(, 339211e3-eb3c-4f41-8bdc-d201224f5b79, 6, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 7d9dd4d2-c3b8-408e-bfb8-06b5436b843c)


Number of rows: 148992
Number of columns: 4

+-------+--------+---------------+--------+
|play_id|strength|gameWinningGoal|emptyNet|
+-------+--------+---------------+--------+
|      0|       0|              0|       0|
+-------+--------+---------------+--------+

Number of exact duplicated rows: 15647

root
 |-- play_id: string (nullable = true)
 |-- strength: string (nullable = true)
 |-- gameWinningGoal: string (nullable = true)
 |-- emptyNet: string (nullable = true)



##### 4. Check duplicated rows (exploratory)

##### 5. Cleaned/transformed to silver

In [2]:
# Disabled duplicate inspection: the code below is wrapped in a bare string
# literal so it does not execute. When enabled it counts, via a window
# partitioned by every column, how often each full row occurs and displays
# the rows that occur more than once.
"""
# Create a window partitioned by all columns
w = Window.partitionBy(df.columns)

# Add a count of how many times each row appears
duplicates_all = (
    df.withColumn("dup_count", F.count("*").over(w))
      .filter(F.col("dup_count") > 1)   # keep only duplicates
)

display(duplicates_all)
"""

StatementMeta(, 7a7ee9c3-8abf-4265-ac75-d639745080cf, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 3e333113-dfbb-4ce8-a9bb-a2c119144275)