## Clean/transform game_teams_stats_bronze 

##### 1. Import and load table

In [8]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Read the bronze table by name through the `spark` session handle.
# NOTE(review): `spark` is not created in any cell visible here — presumably it is
# injected by the notebook runtime; confirm.
df = spark.table("game_teams_stats_bronze")

StatementMeta(, 1781a2c5-0cd8-4b26-bb5b-a0b6afcdc2b6, 10, Finished, Available, Finished)

##### 2. Clean and transform data

In [10]:
# 1. Remove rows that are exact duplicates of another row.
#    No column subset is passed, so a row is only dropped when every column matches.
df = df.dropDuplicates()

# 2. Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    An underscore is inserted before an uppercase letter only where a new word
    starts: after a non-uppercase character, or at the last capital of an
    acronym that is followed by a lowercase letter. Runs of capitals therefore
    stay together ("gameID" -> "game_id", "HTTPServer" -> "http_server") and an
    underscore that is already present is never doubled ("Game_Id" -> "game_id").

    Args:
        name: The identifier to convert.

    Returns:
        The snake_case form of ``name``. Leading underscores are stripped and
        names that are already snake_case come back unchanged.
    """
    pieces = []
    for idx, ch in enumerate(name):
        if ch.isupper() and idx > 0:
            prev_ch = name[idx - 1]
            next_ch = name[idx + 1] if idx + 1 < len(name) else ""
            # A capital starts a new word unless it sits inside a run of capitals.
            starts_word = (not prev_ch.isupper()) or next_ch.islower()
            if starts_word and prev_ch != "_":
                pieces.append("_")
        pieces.append(ch.lower() if ch.isupper() else ch)
    return "".join(pieces).lstrip("_")

# Normalise every column name to snake_case in a single pass
df = df.toDF(*map(to_snake_case, df.columns))

# Abbreviated columns get descriptive names (these are the names after snake_casing)
df = (
    df
    .withColumnRenamed("ho_a", "home_or_away")
    .withColumnRenamed("pim", "penalty_mins")
)

# 3: Cast data types
# Target Spark SQL type for each column; columns not listed keep their current type.
column_casts = {
    "game_id": "string",
    "team_id": "string",
    "goals": "int",
    "shots": "int",
    "hits": "int",
    "penalty_mins": "int",
    "power_play_opportunities": "int",
    "power_play_goals": "int",
    "giveaways": "int",
    "takeaways": "int",
    "blocked": "int",
    "face_off_win_percentage": "float",
}

# withColumn on an existing name replaces that column in place, so column order is kept.
for column_name, target_type in column_casts.items():
    df = df.withColumn(column_name, F.col(column_name).cast(target_type))

# Optional sanity check, left disabled. It is kept as `#` comments rather than a bare
# triple-quoted string: a string literal that is the last expression of a cell is
# echoed back as the cell's output.
# # Show results
# display(df.limit(10))
#
# rows_final = df.count()
# cols_final = len(df.columns)
# print("")
# print("Number of rows:", rows_final)
# print("Number of columns:", cols_final)


StatementMeta(, 1781a2c5-0cd8-4b26-bb5b-a0b6afcdc2b6, 12, Finished, Available, Finished)

'\n# Show results\ndisplay(df.limit(10))\n\nrows_final = df.count()\ncols_final = len(df.columns)\nprint("")\nprint("Number of rows:", rows_final)\nprint("Number of columns:", cols_final)\n'

##### 3. Load data to silver table

In [11]:
# Incoming Bronze dataframe
source_df = df

# Columns that identify a row. game_id alone is not enough: the table carries a
# team_id and a home/away flag per row, so one game spans several rows.
# NOTE(review): grain assumed to be one row per (game_id, team_id) — confirm.
key_cols = ["game_id", "team_id"]

# Path to Silver Lakehouse Delta table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_teams_stats_silver"

if DeltaTable.isDeltaTable(spark, target_path):
    # Keys that are already present in the Silver table
    existing_keys_df = (
        spark.read.format("delta")
        .load(target_path)
        .select(*key_cols)
        .distinct()
    )

    # Keep only rows whose key is not in Silver yet
    new_rows_df = source_df.join(existing_keys_df, on=key_cols, how="left_anti")

    # Count once, before writing. new_rows_df is lazy, so counting it after the
    # append would re-run the anti-join against a table that may already contain
    # the rows just written and report too few.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to game_teams_stats in Lakehouse_Silver.")
    else:
        print("No new rows to append. game_teams_stats is already up to date.")
else:
    # First load → create the Silver table
    initial_row_count = source_df.count()
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created game_teams_stats in Lakehouse_Silver with {initial_row_count} rows.")


StatementMeta(, 1781a2c5-0cd8-4b26-bb5b-a0b6afcdc2b6, 13, Finished, Available, Finished)

No new rows to append. game_teams_stats is already up to date.


##### 1. Preliminary Validation Check
Check data type and row count.

In [2]:
# NOTE(review): this cell is disabled — everything between the triple quotes is a plain
# string literal, so none of the statements inside it run.
# `df_game_teams_stats` is not defined in any cell visible here; it would have to be
# loaded before this check is re-enabled.
"""

# Get summary stats
df_stats = df_game_teams_stats.describe()
display(df_stats)

# Check for schema data type 
df_game_teams_stats.printSchema()

display(df_game_teams_stats.limit(5))

#Total rows and columns
total_rows = df_game_teams_stats.count()
total_cols = len(df_game_teams_stats.columns)

print(f"Total Rows: {total_rows}")
print(f"Total Columns: {total_cols}")

"""

StatementMeta(, f9c834a1-d388-4dc5-abb3-86f1ae2a603f, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 30028ff9-efc8-4e73-952e-68dab1776abc)

root
 |-- game_id: integer (nullable = true)
 |-- team_id: integer (nullable = true)
 |-- HoA: string (nullable = true)
 |-- won: boolean (nullable = true)
 |-- settled_in: string (nullable = true)
 |-- head_coach: string (nullable = true)
 |-- goals: string (nullable = true)
 |-- shots: string (nullable = true)
 |-- hits: string (nullable = true)
 |-- pim: string (nullable = true)
 |-- powerPlayOpportunities: string (nullable = true)
 |-- powerPlayGoals: string (nullable = true)
 |-- faceOffWinPercentage: string (nullable = true)
 |-- giveaways: string (nullable = true)
 |-- takeaways: string (nullable = true)
 |-- blocked: string (nullable = true)
 |-- startRinkSide: string (nullable = true)



SynapseWidget(Synapse.DataFrame, 12922957-20af-4b90-85a1-16ef54c8a45f)

Total Rows: 52610
Total Columns: 17


##### 1. Preliminary Validation Check
Check for null values.

In [3]:
# NOTE(review): this cell is disabled — the body below is a plain string literal and
# does not execute.
# Before re-enabling: `sum`, `when` and `col` are used unqualified, but the import cell
# visible in this notebook only brings in `F` and `Window`. Presumably these should be
# F.sum / F.when / F.col — confirm.
"""
df_game_teams_stats.toPandas().info()

# Check for any null values
df_null_counts = df_game_teams_stats.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_game_teams_stats.columns
])

display(df_null_counts.toPandas())
"""

StatementMeta(, f9c834a1-d388-4dc5-abb3-86f1ae2a603f, 5, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52610 entries, 0 to 52609
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   game_id                 52610 non-null  int32 
 1   team_id                 52610 non-null  int32 
 2   HoA                     52610 non-null  object
 3   won                     52610 non-null  bool  
 4   settled_in              52610 non-null  object
 5   head_coach              52610 non-null  object
 6   goals                   52610 non-null  object
 7   shots                   52610 non-null  object
 8   hits                    52610 non-null  object
 9   pim                     52610 non-null  object
 10  powerPlayOpportunities  52610 non-null  object
 11  powerPlayGoals          52610 non-null  object
 12  faceOffWinPercentage    52610 non-null  object
 13  giveaways               52610 non-null  object
 14  takeaways               52610 non-null  object
 15  bl

SynapseWidget(Synapse.DataFrame, ef4feb49-f7e6-4a2d-af1d-81808ac6879c)

##### 1. Preliminary Validation Check
Check for exact duplicate rows.

In [4]:
# NOTE(review): this cell is disabled — the body below is a plain string literal and
# does not execute.
# What the archived code does: it partitions by every column, keeps rows whose partition
# holds more than one row, and counts them. Every copy of a duplicated row is counted,
# not just the surplus copies.
"""
all_columns = df_game_teams_stats.columns

# Partition by every column and mark rows that occur more than once
w = Window.partitionBy(*all_columns)
df_duplicates = (
    df_game_teams_stats
    .withColumn("duplicate_count", F.count("*").over(w))
    .filter(F.col("duplicate_count") > 1)
    .drop("duplicate_count")
)

total_duplicate_rows = df_duplicates.count()                

print(f"Total duplicate rows: {total_duplicate_rows}")

# Show the duplicated rows 
display(df_duplicates.orderBy(all_columns))
"""

StatementMeta(, f9c834a1-d388-4dc5-abb3-86f1ae2a603f, 6, Finished, Available, Finished)

Total duplicate rows: 10280


SynapseWidget(Synapse.DataFrame, 35834966-5579-48d5-84ee-8f85753d8912)

##### 2. Clean / Transform: Remove Duplicate Rows

In [5]:
# NOTE(review): this cell is disabled — the body below is a plain string literal and
# does not execute.
# The archived code drops duplicates over the full column list, i.e. only rows that
# match on every column are collapsed.
"""
all_columns = df_game_teams_stats.columns

df_game_teams_stats = df_game_teams_stats.dropDuplicates(all_columns)

print(f"Total rows after removing duplicates: {df_game_teams_stats.count()}")

display(df_game_teams_stats.limit(10))
"""

StatementMeta(, f9c834a1-d388-4dc5-abb3-86f1ae2a603f, 7, Finished, Available, Finished)

Total rows after removing duplicates: 47470


SynapseWidget(Synapse.DataFrame, 44d6f5d8-1341-4c69-af2c-7ddc875573c8)

In [6]:
#df_game_teams_stats.toPandas().info()

StatementMeta(, f9c834a1-d388-4dc5-abb3-86f1ae2a603f, 8, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47470 entries, 0 to 47469
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   game_id                 47470 non-null  int32 
 1   team_id                 47470 non-null  int32 
 2   HoA                     47470 non-null  object
 3   won                     47470 non-null  bool  
 4   settled_in              47470 non-null  object
 5   head_coach              47470 non-null  object
 6   goals                   47470 non-null  object
 7   shots                   47470 non-null  object
 8   hits                    47470 non-null  object
 9   pim                     47470 non-null  object
 10  powerPlayOpportunities  47470 non-null  object
 11  powerPlayGoals          47470 non-null  object
 12  faceOffWinPercentage    47470 non-null  object
 13  giveaways               47470 non-null  object
 14  takeaways               47470 non-null  object
 15  bl

##### 2. Clean / Transform: Convert Data Types

Convert the following identifier columns from **integer** to **string**:
- `game_id`
- `team_id`

Convert the following columns from **string** to **integer** (or to **float** where noted) to ensure proper numeric analysis:

- `goals`
- `shots`
- `hits`
- `pim`
- `powerPlayOpportunities`
- `powerPlayGoals`
- `giveaways`
- `takeaways`
- `blocked`
- `faceOffWinPercentage` - float

In [7]:
# NOTE(review): this cell is disabled — the body below is a plain string literal and
# does not execute.
# Before re-enabling:
#   * StringType / IntegerType / FloatType are not imported by the import cell visible
#     in this notebook (presumably they come from pyspark.sql.types — confirm).
#   * The first loop names its variable `col`, while the other loops use `col_name`.
"""
# Columns to cast to String
columns_to_string = ["game_id", "team_id"]

for col in columns_to_string:
    df_game_teams_stats = df_game_teams_stats.withColumn(col, F.col(col).cast(StringType()))

# Columns to cast to Integer
columns_to_int = [
    "goals", "shots", "hits", "pim", "powerPlayOpportunities", "powerPlayGoals", 
    "giveaways", "takeaways", "blocked"
]

for col_name in columns_to_int:
    df_game_teams_stats = df_game_teams_stats.withColumn(col_name, F.col(col_name).cast(IntegerType()))

# Columns to cast to Float
columns_to_float = ["faceOffWinPercentage"]

for col_name in columns_to_float:
    df_game_teams_stats = df_game_teams_stats.withColumn(col_name, F.col(col_name).cast(FloatType()))

# Verify schema
df_game_teams_stats.printSchema()
"""


StatementMeta(, f9c834a1-d388-4dc5-abb3-86f1ae2a603f, 9, Finished, Available, Finished)

root
 |-- game_id: string (nullable = true)
 |-- team_id: string (nullable = true)
 |-- HoA: string (nullable = true)
 |-- won: boolean (nullable = true)
 |-- settled_in: string (nullable = true)
 |-- head_coach: string (nullable = true)
 |-- goals: integer (nullable = true)
 |-- shots: integer (nullable = true)
 |-- hits: integer (nullable = true)
 |-- pim: integer (nullable = true)
 |-- powerPlayOpportunities: integer (nullable = true)
 |-- powerPlayGoals: integer (nullable = true)
 |-- faceOffWinPercentage: float (nullable = true)
 |-- giveaways: integer (nullable = true)
 |-- takeaways: integer (nullable = true)
 |-- blocked: integer (nullable = true)
 |-- startRinkSide: string (nullable = true)



##### 2. Clean / Transform: Standardize column name
Rename `HoA` and `pim` to descriptive names, then convert every column name to snake_case.

In [8]:
# NOTE(review): this cell is disabled — the body below is a plain string literal and
# does not execute, so the `to_snake_case` written inside it is never defined by this cell.
# Order in the archived code: the explicit renames run first (on the original names
# "HoA" / "pim"), then every column name is passed through to_snake_case.
"""
# Rename column names

# Dictionary of columns to rename: old_name -> new_name
columns_to_rename = {
    "HoA": "home_or_away",
    "pim": "penalty_mins"
}

# Apply renaming
for old_name, new_name in columns_to_rename.items():
    df_game_teams_stats = df_game_teams_stats.withColumnRenamed(old_name, new_name)

# Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    return ''.join(['_' + c.lower() if c.isupper() else c for c in name]).lstrip('_')

# Apply to all columns in one line
df_game_teams_stats = df_game_teams_stats.toDF(*[to_snake_case(c) for c in df_game_teams_stats.columns])

display(df_game_teams_stats.limit(20))
df_game_teams_stats.printSchema()
"""

StatementMeta(, f9c834a1-d388-4dc5-abb3-86f1ae2a603f, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, cbfc48b0-2ecc-4de4-b38d-5b4028707092)

root
 |-- game_id: string (nullable = true)
 |-- team_id: string (nullable = true)
 |-- home_or_away: string (nullable = true)
 |-- won: boolean (nullable = true)
 |-- settled_in: string (nullable = true)
 |-- head_coach: string (nullable = true)
 |-- goals: integer (nullable = true)
 |-- shots: integer (nullable = true)
 |-- hits: integer (nullable = true)
 |-- penalty_mins: integer (nullable = true)
 |-- power_play_opportunities: integer (nullable = true)
 |-- power_play_goals: integer (nullable = true)
 |-- face_off_win_percentage: float (nullable = true)
 |-- giveaways: integer (nullable = true)
 |-- takeaways: integer (nullable = true)
 |-- blocked: integer (nullable = true)
 |-- start_rink_side: string (nullable = true)



##### 3. Load table to Lakehouse_Silver 

In [9]:
# NOTE(review): this cell is disabled — the body below is a plain string literal and
# does not execute.
# The archived loader identifies existing rows by the pair (game_id, team_id).
# Hazard if it is ever re-enabled: the `except Exception` branch handles every error as
# "table does not exist" and writes with mode("overwrite"), so an unrelated failure in
# the try block would replace the Silver table with the full source dataframe.
"""
from delta.tables import DeltaTable

# Source dataframe from Lakehouse_Bronze
source_df = df_game_teams_stats  

# Path to Lakehouse_Silver table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_teams_stats_silver"

try:
    # Load existing Delta table if it exists
    delta_tbl = DeltaTable.forPath(spark, target_path)

    # Get distinct keys from existing table
    existing_keys = (
        delta_tbl.toDF()
        .select("game_id", "team_id")
        .distinct()
    )

    # Keep only new rows that don’t already exist
    new_rows_df = (
        source_df
        .join(existing_keys, on=["game_id", "team_id"], how="left_anti")
    )

    if new_rows_df.rdd.isEmpty():
        print("No new rows to append. game_teams_stats_silver is already up to date.")
    else:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Incremental load complete: {new_rows_df.count()} new rows appended to game_teams_stats_silver in Lakehouse_Silver.")

except Exception as e:
    # Table does not exist, create it
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: new table created in Lakehouse_Silver with {source_df.count()} rows.")
    
"""


StatementMeta(, f9c834a1-d388-4dc5-abb3-86f1ae2a603f, 11, Finished, Available, Finished)

✅ Initial load complete: new table created in Lakehouse_Silver with 47470 rows.
