## Clean/transform game_skater_stats_bronze 

##### 1. Import and load table

In [6]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Load the bronze table by name into the DataFrame that the cleaning steps operate on.
df = spark.table("game_skater_stats_bronze")

StatementMeta(, c68551c8-38f1-4bb8-90e2-29259bf665b2, 8, Finished, Available, Finished)

##### 2. Clean and transform data

In [7]:
# 1. Keep a single copy of every row that is identical across all columns
df = df.distinct()

# 2. Helper: camelCase / PascalCase -> snake_case
def to_snake_case(name):
    """Convert a camelCase or PascalCase identifier to snake_case.

    Every uppercase character is replaced by an underscore followed by its
    lowercase form; all other characters are kept as-is. Leading underscores
    in the result are stripped, so a PascalCase name does not start with ``_``.

    Note that each capital is handled independently, so a run of capitals
    is split letter by letter (``"playerID"`` -> ``"player_i_d"``).

    Args:
        name: The identifier to convert.

    Returns:
        The snake_case version of ``name``.
    """
    pieces = []
    for ch in name:
        if ch.isupper():
            pieces.append('_' + ch.lower())
        else:
            pieces.append(ch)
    return ''.join(pieces).lstrip('_')

df = (
    df
    # Rename all columns positionally to their snake_case equivalents
    .toDF(*map(to_snake_case, df.columns))
    # Special case: the automatic conversion yields "face_off_wins"
    .withColumnRenamed("face_off_wins", "faceoff_wins")
)

# 3. Cast data types
# Identifier columns are cast to string; the four counting stats are cast to int.
# Every column is cast from ITSELF. The previous version derived takeaways,
# giveaways and blocked from F.col("hits"), which replaced their real values
# with a copy of hits.
# NOTE(review): under Spark's default (non-ANSI) cast, values that cannot be
# parsed as integers become null — confirm that is the intended handling.
df = (
    df
    .withColumn("game_id", F.col("game_id").cast("string"))
    .withColumn("player_id", F.col("player_id").cast("string"))
    .withColumn("team_id", F.col("team_id").cast("string"))
    .withColumn("hits", F.col("hits").cast("int"))
    .withColumn("takeaways", F.col("takeaways").cast("int"))
    .withColumn("giveaways", F.col("giveaways").cast("int"))
    .withColumn("blocked", F.col("blocked").cast("int"))
)
"""

# Show results
display(df.limit(10))

rows_final = df.count()
cols_final = len(df.columns)
print("")
print("Number of rows:", rows_final)
print("Number of columns:", cols_final)
"""


StatementMeta(, c68551c8-38f1-4bb8-90e2-29259bf665b2, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 267c831a-5245-4ffc-8c36-923e627f0dba)


Number of rows: 853404
Number of columns: 22


##### 3. Load data to silver table

In [8]:
# Source dataframe from Lakehouse_Bronze
source_df = df

# Path to Lakehouse_Silver table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_skater_stats_silver"

# Choose between incremental and initial load with an explicit existence check.
# A blanket `except Exception` must not be used for this: any failure in the
# incremental path (join error, schema mismatch on append, transient storage
# error) would fall through to an overwrite of the whole silver table.
if DeltaTable.isDeltaTable(spark, target_path):
    # Load the existing Delta table
    delta_tbl = DeltaTable.forPath(spark, target_path)

    # Distinct (game_id, player_id) keys already present in silver
    existing_keys = (
        delta_tbl.toDF()
        .select("game_id", "player_id")
        .distinct()
    )

    # Keep only source rows whose key is not yet in silver
    new_rows_df = (
        source_df
        .join(existing_keys, on=["game_id", "player_id"], how="left_anti")
    )

    # Count once, BEFORE writing: the DataFrame is lazy, so counting after the
    # append would re-run the anti-join and may no longer see these rows as new.
    new_row_count = new_rows_df.count()

    if new_row_count == 0:
        print("No new rows to append. game_skater_stats_silver is already up to date.")
    else:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Incremental load complete: {new_row_count} new rows appended to game_skater_stats_silver in Lakehouse_Silver.")
else:
    # No Delta table at target_path yet: create it from the full source
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: new table created in Lakehouse_Silver with {source_df.count()} rows.")


StatementMeta(, c68551c8-38f1-4bb8-90e2-29259bf665b2, 10, Finished, Available, Finished)

No new rows to append. game_skater_stats_silver is already up to date.


##### 1. Preliminary Validation Check
Check data type and row count.

In [2]:
"""

# Get summary stats
# df_stats = df_game_skater_stats.describe()
# display(df_stats)

# Check for schema data type 
df_game_skater_stats.printSchema()

display(df_game_skater_stats.limit(5))

#Total rows and columns
total_rows = df_game_skater_stats.count()
total_cols = len(df_game_skater_stats.columns)

print(f"Total Rows: {total_rows}")
print(f"Total Columns: {total_cols}")

"""

StatementMeta(, 4464d84f-50bc-4f17-ae3e-d641614265f8, 10, Finished, Available, Finished)

root
 |-- game_id: integer (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- team_id: integer (nullable = true)
 |-- timeOnIce: integer (nullable = true)
 |-- assists: integer (nullable = true)
 |-- goals: integer (nullable = true)
 |-- shots: integer (nullable = true)
 |-- hits: string (nullable = true)
 |-- powerPlayGoals: integer (nullable = true)
 |-- powerPlayAssists: integer (nullable = true)
 |-- penaltyMinutes: integer (nullable = true)
 |-- faceOffWins: integer (nullable = true)
 |-- faceoffTaken: integer (nullable = true)
 |-- takeaways: string (nullable = true)
 |-- giveaways: string (nullable = true)
 |-- shortHandedGoals: integer (nullable = true)
 |-- shortHandedAssists: integer (nullable = true)
 |-- blocked: string (nullable = true)
 |-- plusMinus: integer (nullable = true)
 |-- evenTimeOnIce: integer (nullable = true)
 |-- shortHandedTimeOnIce: integer (nullable = true)
 |-- powerPlayTimeOnIce: integer (nullable = true)



SynapseWidget(Synapse.DataFrame, 1a510471-2e58-403f-bb50-fda2a06edc84)

Total Rows: 1891660
Total Columns: 22


##### 1. Preliminary Validation Check
Check for null values.

In [3]:
"""

# df_game_skater_stats.toPandas().info()

# Check for any null values
df_null_counts = df_game_skater_stats.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_game_skater_stats.columns
])

display(df_null_counts)

"""

StatementMeta(, 4464d84f-50bc-4f17-ae3e-d641614265f8, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 3e08fefc-9a3b-47f1-9686-39b75ab5ab89)

##### 1. Preliminary Validation Check
Check for exact duplicate rows.



#### Detecting Duplicate Rows in Spark – Window Function Method
- **Approach**: Partition by every column and mark rows that occur more than once.
- **Drawback**: Heavy workload since Spark has to materialize counts for **every row**.

```python
w = Window.partitionBy(*all_columns)

df_duplicates = (
    df_game_skater_stats
    .withColumn("duplicate_count", F.count("*").over(w))
    .filter(F.col("duplicate_count") > 1)
    .drop("duplicate_count")
)

total_duplicate_rows = df_duplicates.count()                

print(f"Total duplicate rows: {total_duplicate_rows}")
```

In [4]:
"""

all_columns = df_game_skater_stats.columns

# Find duplicates by grouping and counting
df_duplicates = (
    df_game_skater_stats
    .groupBy(all_columns)
    .count()
    .filter(F.col("count") > 1)
)

total_duplicate_rows = df_duplicates.select(F.sum("count")).collect()[0][0]         

print(f"Total duplicate rows: {total_duplicate_rows}")

# Show the duplicated rows 
#display(df_duplicates.orderBy(all_columns))

"""


StatementMeta(, 4464d84f-50bc-4f17-ae3e-d641614265f8, 13, Finished, Available, Finished)

Total duplicate rows: 1891660


##### 2. Clean / Transform: Remove Duplicate Rows


In [3]:
"""

all_columns = df_game_skater_stats.columns

df_game_skater_stats = df_game_skater_stats.dropDuplicates(all_columns)


print(f"Total rows after removing duplicates: {df_game_skater_stats.count()}")

display(df_game_skater_stats.limit(10))

"""

StatementMeta(, a1f55be0-da64-4502-bdb9-c43971ed6027, 5, Finished, Available, Finished)

Total rows after removing duplicates: 853404


SynapseWidget(Synapse.DataFrame, 7a5c8d21-c59a-4f6b-80d1-47fb89e9554b)

In [6]:
#df_game_skater_stats.toPandas().info()

StatementMeta(, 8dcaf8eb-138d-4026-b24e-238ec6f0243e, 8, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853404 entries, 0 to 853403
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   game_id               853404 non-null  int32 
 1   player_id             853404 non-null  int32 
 2   team_id               853404 non-null  int32 
 3   timeOnIce             853404 non-null  int32 
 4   assists               853404 non-null  int32 
 5   goals                 853404 non-null  int32 
 6   shots                 853404 non-null  int32 
 7   hits                  853404 non-null  object
 8   powerPlayGoals        853404 non-null  int32 
 9   powerPlayAssists      853404 non-null  int32 
 10  penaltyMinutes        853404 non-null  int32 
 11  faceOffWins           853404 non-null  int32 
 12  faceoffTaken          853404 non-null  int32 
 13  takeaways             853404 non-null  object
 14  giveaways             853404 non-null  object
 15  shortHandedGoals 

##### 2. Clean / Transform: Convert Data Types

Convert the following columns from **integer** to **string**:
- `game_id`
- `player_id`
- `team_id`

Convert the following columns from **string** to **integer** to ensure proper numeric analysis:

- `hits`
- `takeaways`
- `giveaways`
- `blocked`

In [5]:
"""

# Define casting rules
cast_map = {
    "hits": IntegerType(),
    "takeaways": IntegerType(),
    "giveaways": IntegerType(),
    "blocked": IntegerType(),
    "game_id": StringType(),
    "player_id": StringType(),
    "team_id": StringType()
}

# Apply casts in one loop
for col, dtype in cast_map.items():
    df_game_skater_stats = df_game_skater_stats.withColumn(col, F.col(col).cast(dtype))

# df_game_skater_stats.printSchema()

"""

StatementMeta(, a1f55be0-da64-4502-bdb9-c43971ed6027, 7, Finished, Available, Finished)

root
 |-- game_id: string (nullable = true)
 |-- player_id: string (nullable = true)
 |-- team_id: string (nullable = true)
 |-- timeOnIce: integer (nullable = true)
 |-- assists: integer (nullable = true)
 |-- goals: integer (nullable = true)
 |-- shots: integer (nullable = true)
 |-- hits: integer (nullable = true)
 |-- powerPlayGoals: integer (nullable = true)
 |-- powerPlayAssists: integer (nullable = true)
 |-- penaltyMinutes: integer (nullable = true)
 |-- faceOffWins: integer (nullable = true)
 |-- faceoffTaken: integer (nullable = true)
 |-- takeaways: integer (nullable = true)
 |-- giveaways: integer (nullable = true)
 |-- shortHandedGoals: integer (nullable = true)
 |-- shortHandedAssists: integer (nullable = true)
 |-- blocked: integer (nullable = true)
 |-- plusMinus: integer (nullable = true)
 |-- evenTimeOnIce: integer (nullable = true)
 |-- shortHandedTimeOnIce: integer (nullable = true)
 |-- powerPlayTimeOnIce: integer (nullable = true)



##### 2. Clean / Transform: Standardize column names
Standardize column names to snake_case format.
Note: `faceOffWins` is renamed to `faceoff_wins` (rather than the automatic `face_off_wins`) so that it matches `faceoff_taken`.

In [6]:
"""

# Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    return ''.join(['_' + c.lower() if c.isupper() else c for c in name]).lstrip('_')

# Apply to all columns in one line
df_game_skater_stats = df_game_skater_stats.toDF(*[to_snake_case(c) for c in df_game_skater_stats.columns])

# Rename column - faceoff_wins
df_game_skater_stats = df_game_skater_stats.withColumnRenamed("face_off_wins", "faceoff_wins")


#df_game_skater_stats.printSchema()
#display(df_game_skater_stats.limit(5))

"""


StatementMeta(, a1f55be0-da64-4502-bdb9-c43971ed6027, 8, Finished, Available, Finished)

root
 |-- game_id: string (nullable = true)
 |-- player_id: string (nullable = true)
 |-- team_id: string (nullable = true)
 |-- time_on_ice: integer (nullable = true)
 |-- assists: integer (nullable = true)
 |-- goals: integer (nullable = true)
 |-- shots: integer (nullable = true)
 |-- hits: integer (nullable = true)
 |-- power_play_goals: integer (nullable = true)
 |-- power_play_assists: integer (nullable = true)
 |-- penalty_minutes: integer (nullable = true)
 |-- faceoff_wins: integer (nullable = true)
 |-- faceoff_taken: integer (nullable = true)
 |-- takeaways: integer (nullable = true)
 |-- giveaways: integer (nullable = true)
 |-- short_handed_goals: integer (nullable = true)
 |-- short_handed_assists: integer (nullable = true)
 |-- blocked: integer (nullable = true)
 |-- plus_minus: integer (nullable = true)
 |-- even_time_on_ice: integer (nullable = true)
 |-- short_handed_time_on_ice: integer (nullable = true)
 |-- power_play_time_on_ice: integer (nullable = true)



##### 3. Load table to Lakehouse_Silver 

In [7]:
"""
# Source dataframe from Lakehouse_Bronze
source_df = df_game_skater_stats  

# Path to Lakehouse_Silver table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_skater_stats_silver"

try:
    # Load existing Delta table if it exists
    delta_tbl = DeltaTable.forPath(spark, target_path)

    # Get distinct keys from existing table
    existing_keys = (
        delta_tbl.toDF()
        .select("game_id", "player_id")
        .distinct()
    )

    # Filter out rows already existing
    new_rows_df = (
        source_df
        .join(existing_keys, on=["game_id", "player_id"], how="left_anti")
    )

    if new_rows_df.rdd.isEmpty():
        print("No new rows to append. game_skater_stats_silver is already up to date.")
    else:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Incremental load complete: {new_rows_df.count()} new rows appended to game_skater_stats_silver in Lakehouse_Silver.")

except Exception as e:
    # Table does not exist, create it
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: new table created in Lakehouse_Silver with {source_df.count()} rows.")

"""


StatementMeta(, a1f55be0-da64-4502-bdb9-c43971ed6027, 9, Finished, Available, Finished)

No new rows to append. game_skater_stats_silver is already up to date.
