## Clean/transform game_bronze -> game_silver

##### 1. Load bronze data for preliminary checks

In [7]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Load the bronze-layer game table (Delta format) as the working Spark DataFrame
df = (
    spark.read
    .format("delta")
    .load("Tables/game_bronze")
)

StatementMeta(, 5a34b5ad-8482-4ecc-9667-05c4b680a729, 9, Finished, Available, Finished)

##### 2. Clean and transform data

In [2]:
# 1-2. Discard the "venue_link" column, then collapse rows that are
#      identical across every remaining column
df = df.drop("venue_link").dropDuplicates()

# 3. Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    Runs of capitals are treated as a single acronym word, and existing
    underscores are never doubled:

        "homeTeamId"     -> "home_team_id"
        "homeTeamID"     -> "home_team_id"   (not "home_team_i_d")
        "HTTPServer"     -> "http_server"
        "date_time_GMT"  -> "date_time_gmt"  (not "date_time__g_m_t")

    Names that are already snake_case are returned unchanged.

    Args:
        name: Column name to convert.

    Returns:
        The lower-cased snake_case name with any leading underscores removed.
    """
    import re  # local import so the function is self-contained within this cell

    # Word boundary: a lowercase letter or digit directly followed by a capital
    with_breaks = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', name)
    # Acronym boundary: a capital followed by a capitalised word ("HTTPServer")
    with_breaks = re.sub(r'(?<=[A-Z])(?=[A-Z][a-z])', '_', with_breaks)
    # Leading underscores are stripped, as the previous implementation did
    return with_breaks.lower().lstrip('_')

# Normalise every column name to snake_case; "date_time_GMT" is skipped here
# because it receives an explicit new name just below
normalised_names = [
    col_name if col_name == "date_time_GMT" else to_snake_case(col_name)
    for col_name in df.columns
]
df = df.toDF(*normalised_names)

# Explicit renames (old name -> new name), applied in this order
column_renames = {
    "date_time_GMT": "date_time",
    "type": "game_type",
    "season": "game_season",
    "venue": "game_venue",
    "home_rink_side_start": "game_home_rink_side_start",
    "outcome": "game_outcome",
}
for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

# 4. Cast the id and season columns to string
for string_column in ("game_id", "game_season", "away_team_id", "home_team_id"):
    df = df.withColumn(string_column, F.col(string_column).cast("string"))

# Preview the transformed data
display(df.limit(10))

# Report the final shape
rows_final = df.count()
cols_final = len(df.columns)
print("")
print("Number of rows:", rows_final)
print("Number of columns:", cols_final)



StatementMeta(, 5a34b5ad-8482-4ecc-9667-05c4b680a729, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f85dc835-4ac8-42b9-86e9-10a5d00a0256)


Number of rows: 23735
Number of columns: 14


##### 3. Load data to silver table

In [8]:
# Dataframe to load (the cleaned/transformed bronze data) and the column used
# to decide whether a row already exists in the Silver table
source_df = df
key_col = "game_id"

# Path to Silver Lakehouse Delta table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_silver"

if DeltaTable.isDeltaTable(spark, target_path):
    # Distinct keys already present in the Silver table
    existing_df = spark.read.format("delta").load(target_path).select(key_col).distinct()

    # Keep only source rows whose key is not in Silver yet
    # NOTE(review): source_df is de-duplicated on all columns, not on key_col;
    # confirm game_id is unique in the source, otherwise duplicate keys are appended.
    new_rows_df = source_df.join(existing_df, on=key_col, how="left_anti")

    # Count once, BEFORE writing. new_rows_df is lazy: counting it after the
    # append re-evaluates the anti-join, which can then see the freshly appended
    # rows and report 0. A single count also replaces the separate
    # limit(1).count() emptiness probe.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to game_silver in Lakehouse_Silver.")
    else:
        print("No new rows to append. game_silver is already up to date.")
else:
    # First load → create the Silver table
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created game_silver in Lakehouse_Silver with {source_df.count()} rows.")


StatementMeta(, 5a34b5ad-8482-4ecc-9667-05c4b680a729, 10, Finished, Available, Finished)

No new rows to append. game_silver is already up to date.


In [1]:
"""

# Show first 10 rows
display(df.limit(10))

# Check shape of data
rows = df.count()
cols = len(df.columns)
print("")
print("Number of rows:", rows)
print("Number of columns:", cols)
print("")

# Check for nulls
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df.columns
])
null_counts.show()

# Check for exact duplicate rows
dup_count = df.groupBy(df.columns).count().filter(F.col("count") > 1).count()
print("Number of exact duplicated rows:", dup_count)
print("")

# Check schema data type
df.printSchema()

"""

StatementMeta(, 5a34b5ad-8482-4ecc-9667-05c4b680a729, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, da7bb9a1-eb5e-435e-a392-8d83d3391219)


Number of rows: 26305
Number of columns: 15

+-------+------+----+-------------+------------+------------+----------+----------+-------+--------------------+-----+----------+------------------+----------------------+------------------+
|game_id|season|type|date_time_GMT|away_team_id|home_team_id|away_goals|home_goals|outcome|home_rink_side_start|venue|venue_link|venue_time_zone_id|venue_time_zone_offset|venue_time_zone_tz|
+-------+------+----+-------------+------------+------------+----------+----------+-------+--------------------+-----+----------+------------------+----------------------+------------------+
|      0|     0|   0|            0|           0|           0|         0|         0|      0|                   0|    0|         0|                 0|                     0|                 0|
+-------+------+----+-------------+------------+------------+----------+----------+-------+--------------------+-----+----------+------------------+----------------------+------------------+

##### Check duplicate rows

In [2]:
"""

# Create a window partitioned by all columns
w = Window.partitionBy(df.columns)

# Add a count of how many times each row appears
duplicates_all = (
    df.withColumn("dup_count", F.count("*").over(w))
      .filter(F.col("dup_count") > 1)   # keep only duplicates
)

display(duplicates_all)

"""

StatementMeta(, 0b1934b6-bf6e-4cf5-856a-6123ebe82768, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e0226208-fcdd-4fc0-be42-51f25cd49994)