## Clean/transform game_plays_players_bronze -> game_plays_players_silver

##### 1. Import and load table

In [1]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Load the bronze-layer Delta table that the rest of this notebook cleans.
bronze_reader = spark.read.format("delta")
df = bronze_reader.load("Tables/game_plays_players_bronze")

StatementMeta(, 917c1b9f-cd02-4757-91bb-cb7c63caa5ce, 3, Finished, Available, Finished)

##### 2. Clean and transform data

In [3]:
# 2. Normalise playerType: strip surrounding whitespace, then apply initcap
#    (first letter of each word upper-cased, the rest lower-cased).
player_type_clean = F.initcap(F.trim(F.col("playerType")))

# 3-4. Remove exact duplicate rows, then rows with a null in any key column.
required_cols = ["play_id", "game_id", "player_id", "playerType"]
df = (
    df
    .withColumn("playerType", player_type_clean)
    .dropDuplicates()
    .dropna(subset=required_cols)
)

# 5. Store the three identifier columns as strings.
for id_col in ("play_id", "game_id", "player_id"):
    df = df.withColumn(id_col, F.col(id_col).cast("string"))

# 6. Report the resulting shape and preview the first rows.
rows_final = df.count()
cols_final = len(df.columns)
print("Number of rows:", rows_final)
print("Number of columns:", cols_final)
df.show()

# 7. Rename all columns to snake_case
def to_snake_case(name):
    """Convert a camelCase / PascalCase column name to snake_case.

    An underscore is inserted before an uppercase letter only where a new
    word starts:
      * after a lowercase letter or digit        ("playerType" -> "player_type")
      * at the end of an acronym, i.e. after an uppercase letter and before
        a lowercase one                          ("HTTPResponse" -> "http_response")

    Runs of capitals are therefore kept together ("playerID" -> "player_id"
    rather than "player_i_d"), and a capital that already follows an
    underscore does not produce a double underscore
    ("Player_Type" -> "player_type").

    Args:
        name: Column name to convert.

    Returns:
        The lower-cased snake_case name. Leading underscores are stripped,
        as in the previous implementation.
    """
    pieces = []
    for i, ch in enumerate(name):
        if ch.isupper():
            prev_ch = name[i - 1] if i > 0 else ''
            next_ch = name[i + 1] if i + 1 < len(name) else ''
            starts_word = (
                prev_ch.islower()
                or prev_ch.isdigit()
                or (prev_ch.isupper() and next_ch.islower())
            )
            if starts_word:
                pieces.append('_')
            pieces.append(ch.lower())
        else:
            pieces.append(ch)
    return ''.join(pieces).lstrip('_')

# Apply to_snake_case to every column name, keeping the column order.
snake_case_names = [to_snake_case(col_name) for col_name in df.columns]
df = df.toDF(*snake_case_names)

# 8. Check schema
#df.printSchema()
#display(df.limit(5))

StatementMeta(, 917c1b9f-cd02-4757-91bb-cb7c63caa5ce, 5, Finished, Available, Finished)

Number of rows: 6362804
Number of columns: 4
+--------------+----------+---------+----------+
|       play_id|   game_id|player_id|playerType|
+--------------+----------+---------+----------+
| 2006020985_19|2006020985|  8459579|   Unknown|
| 2009021094_20|2009021094|  8467950|   Unknown|
| 2009020183_17|2009020183|  8467511|   Unknown|
| 2005020349_30|2005020349|  8448535|   Unknown|
| 2009020909_14|2009020909|  8460712|   Unknown|
| 2009020647_25|2009020647|  8460704|   Unknown|
|2018020526_240|2018020526|  8478365|  Servedby|
|2017020109_333|2017020109|  8478131|  Servedby|
|2015020874_146|2015020874|  8477839|  Servedby|
|2017020489_299|2017020489|  8477407|  Servedby|
|2012020698_283|2012020698|  8473548|  Servedby|
| 2012020634_73|2012020634|  8469765|  Servedby|
|2013020053_276|2013020053|  8476887|  Servedby|
|2010020102_223|2010020102|  8471697|  Servedby|
|2011021226_263|2011021226|  8474571|  Servedby|
|2010020964_207|2010020964|  8474059|  Servedby|
|2019020680_243|20190206

SynapseWidget(Synapse.DataFrame, a0b9dafc-272d-40ae-a425-ae68b0b4a20a)

##### 3. Load data to silver table

In [4]:
# Cleaned dataframe produced by the transform step (candidate rows for Silver)
source_df = df
# Incremental-load key: a row is appended only if its game_id is not yet
# present in the target.
# NOTE(review): rows for a game_id that already exists in the target are never
# appended, even if they are new plays — confirm game-level granularity is intended.
key_col = "game_id"

# Path to Silver Lakehouse Delta table
# NOTE(review): the folder is "games_plays_players_silver" while the log
# messages below say "game_plays_players_silver" — confirm which name is intended.
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/games_plays_players_silver"

if DeltaTable.isDeltaTable(spark, target_path):
    # Distinct keys already present in the target table
    existing_df = spark.read.format("delta").load(target_path).select(key_col).distinct()

    # Keep only rows whose key is not in the target yet
    new_rows_df = source_df.join(existing_df, on=key_col, how="left_anti")

    # Count BEFORE writing. new_rows_df is lazy: evaluating it again after the
    # append re-runs the anti-join, which can then see the freshly appended
    # keys and report 0 rows. Counting once up front also replaces the
    # separate limit(1).count() emptiness check.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to game_plays_players_silver in Lakehouse_Silver.")
    else:
        print("No new rows to append. game_plays_players_silver is already up to date.")
else:
    # First load → create the Silver table
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created game_plays_players_silver in Lakehouse_Silver with {source_df.count()} rows.")


StatementMeta(, 917c1b9f-cd02-4757-91bb-cb7c63caa5ce, 6, Finished, Available, Finished)

No new rows to append. game_plays_players_silver is already up to date.


In [1]:
# NOTE(review): this cell is a bare triple-quoted string, so the code inside it
# is not executed. It appears to be an earlier draft of the import/load step.
"""

# Import functions and read table
import pyspark.sql.functions as F
from delta.tables import DeltaTable 

df = spark.read.format("delta").load("Tables/game_plays_players_bronze")
display(df.limit(5))

"""

StatementMeta(, f8a6ded5-a979-43d4-b133-e70b098c76b2, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6ecbba82-89d2-4ca4-a986-e69b6f0c7ea2)

In [2]:
# NOTE(review): this cell is a bare triple-quoted string, so the code inside it
# is not executed. It appears to be an earlier draft of the clean + load steps.
# The draft merges into the target on (play_id, player_id, game_id) and, on ANY
# exception raised by DeltaTable.forPath or the merge, falls back to
# overwriting the whole target path — do not re-enable it without narrowing
# that except clause.
"""

# 2. Trim spaces and standardize case
df_clean = df.withColumn("playerType", F.initcap(F.trim(F.col("playerType"))))

# 3. Drop duplicates
df_clean = df_clean.dropDuplicates()

# 4. Drop rows with nulls in key columns
df_clean = df_clean.na.drop(subset=["play_id", "game_id", "player_id", "playerType"])

# 5. Cast columns
df_clean = (
    df_clean
    .withColumn("play_id", F.col("play_id").cast("string"))
    .withColumn("game_id", F.col("game_id").cast("string"))
    .withColumn("player_id", F.col("player_id").cast("string"))
)

# 6. Count rows/columns
rows_final = df_clean.count()
cols_final = len(df_clean.columns)
print("\nNumber of rows:", rows_final)
print("Number of columns:", cols_final)
df_clean.show()

# 7. Rename all columns to snake_case
def to_snake_case(name):
    return ''.join(['_' + c.lower() if c.isupper() else c for c in name]).lstrip('_')

df_clean = df_clean.toDF(*[to_snake_case(c) for c in df_clean.columns])

# 8. Check schema
df_clean.printSchema()
display(df_clean.limit(5))

# 9. Target path to write cleaned data
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/games_plays_players_silver"

# 10. Try writing or merging into existing Delta table
try:
    delta_tbl = DeltaTable.forPath(spark, target_path)

    # Perform merge (example: based on play_id, player_id, game_id)
    delta_tbl.alias("tgt").merge(
        df_clean.alias("src"),
        "tgt.play_id = src.play_id AND tgt.player_id = src.player_id AND tgt.game_id = src.game_id"
    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

    print("Delta table updated successfully.")

except Exception as e:
    print("Delta table not found or merge failed. Writing new table...")
    df_clean.write.format("delta").mode("overwrite").save(target_path)
    print("Delta table written successfully.")

"""

StatementMeta(, f8a6ded5-a979-43d4-b133-e70b098c76b2, 4, Finished, Available, Finished)


Number of rows: 6362804
Number of columns: 4
+--------------+----------+---------+----------+
|       play_id|   game_id|player_id|playerType|
+--------------+----------+---------+----------+
| 2016020045_49|2016020045|  8477290|    Hitter|
|2016020045_155|2016020045|  8473512|     Loser|
|2017020812_222|2017020812|  8475164|    Hitter|
|2017020812_231|2017020812|  8475185|   Shooter|
|2017020812_307|2017020812|  8475158|    Winner|
| 2015020314_76|2015020314|  8473575|    Goalie|
|2015020314_147|2015020314|  8471262|   Blocker|
|2015020314_164|2015020314|  8474574|    Hittee|
|2015020849_255|2015020849|  8476460|     Loser|
| 2017020586_39|2017020586|  8469608|    Goalie|
| 2016020610_48|2016020610|  8476880|    Hittee|
| 2016020610_52|2016020610|  8471338|    Winner|
| 2016020610_58|2016020610|  8474651|    Goalie|
|2015020606_184|2015020606|  8471734|    Goalie|
|2017020240_194|2017020240|  8475780|     Loser|
| 2017020624_29|2017020624|  8473541|    Goalie|
| 2017020624_69|2017020

SynapseWidget(Synapse.DataFrame, a0a42db5-21ff-4533-bccb-1f376aa48e8d)

Delta table not found or merge failed. Writing new table...
Delta table written successfully.
