##### 1. Import and load table

In [7]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Load the Bronze game-scratches Delta table into a Spark DataFrame
bronze_reader = spark.read.format("delta")
df = bronze_reader.load("Tables/game_scratches_bronze")

StatementMeta(, f9a1548a-9c72-4e34-899a-65d34a51be2e, 9, Finished, Available, Finished)

##### 2. Clean and transform data

In [8]:
# 1. Cast the three identifier columns to string (done for visualization)
id_columns = ["game_id", "team_id", "player_id"]
for id_column in id_columns:
    df = df.withColumn(id_column, F.col(id_column).cast("string"))

# 2 + 3. Remove exact duplicate rows, then discard rows missing any identifier
df = df.dropDuplicates().dropna(subset=id_columns)

# Disabled diagnostic: the block below is a bare string literal, so none of it runs.
# NOTE(review): if re-enabled at this position it would inspect `df` after
# dropDuplicates() above rather than the original DataFrame, so it would
# presumably report 0 — confirm before relying on it.
"""
# 4. Count exact duplicate rows in the original DataFrame
dup_count = (
    df.groupBy(df.columns)
    .count()
    .filter(F.col("count") > 1)
    .count()
)
print("Number of exact duplicated rows:", dup_count)
"""

# 5. Function to convert camelCase or PascalCase to snake_case
def to_snake_case(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    An underscore is inserted before an uppercase letter that starts a new
    word. Runs of capitals are treated as one acronym instead of being split
    letter by letter, and no extra underscore is added where the name already
    has one.

    Args:
        name: Column or identifier name to convert.

    Returns:
        The lower-cased snake_case name with leading underscores stripped.

    Examples:
        gameId       -> game_id
        GameId       -> game_id
        playerID     -> player_id
        HTTPResponse -> http_response
        Game_Id      -> game_id
    """
    pieces = []
    for idx, ch in enumerate(name):
        if not ch.isupper():
            pieces.append(ch)
            continue

        prev_ch = name[idx - 1] if idx > 0 else ""
        next_ch = name[idx + 1] if idx + 1 < len(name) else ""

        # No separator at the very start or right after an existing underscore
        # (avoids results such as "game__id").
        at_boundary = prev_ch not in ("", "_")
        # A capital that follows a capital and is not followed by a lowercase
        # letter is in the middle (or at the end) of an acronym: keep it joined.
        inside_acronym = prev_ch.isupper() and not next_ch.islower()

        if at_boundary and not inside_acronym:
            pieces.append("_")
        pieces.append(ch.lower())

    # Leading underscores are stripped, as the original implementation did.
    return "".join(pieces).lstrip("_")

# 6. Apply to_snake_case to every column name, keeping the column order
snake_case_names = list(map(to_snake_case, df.columns))
df = df.toDF(*snake_case_names)

# 7. Optional inspection of the result (left disabled)
#df.printSchema()
#display(df.limit(5))

StatementMeta(, f9a1548a-9c72-4e34-899a-65d34a51be2e, 10, Finished, Available, Finished)

##### 3. Load data to silver table

In [9]:
# Incoming Bronze dataframe (as cleaned in the previous step)
source_df = df
# Column used to decide whether a source row is already present in Silver.
# NOTE(review): rows are matched on game_id only, so rows for a game that already
# exists in Silver are never appended, even with a new team_id / player_id —
# confirm this is the intended grain for game_scratches.
key_col = "game_id"

# Path to Silver Lakehouse Delta table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_scratches_silver"

if DeltaTable.isDeltaTable(spark, target_path):
    # Distinct keys already present in the target table
    existing_df = spark.read.format("delta").load(target_path).select(key_col).distinct()

    # Keep only source rows whose key is not in the target yet
    new_rows_df = source_df.join(existing_df, on=key_col, how="left_anti")

    # Count BEFORE writing. new_rows_df is lazy, so counting after the append
    # would re-run the anti-join against a target that already contains the
    # appended keys and report 0. Counting once also avoids a third evaluation.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to game_scratches_silver in Lakehouse_Silver.")
    else:
        print("No new rows to append. game_scratches_silver is already up to date.")
else:
    # First load → create the Silver table
    initial_row_count = source_df.count()
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created game_scratches_silver in Lakehouse_Silver with {initial_row_count} rows.")


StatementMeta(, f9a1548a-9c72-4e34-899a-65d34a51be2e, 11, Finished, Available, Finished)

No new rows to append. game_scratches_silver is already up to date.


In [2]:
"""
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from delta.tables import DeltaTable  # ✅ Import this for Delta support

# Start Spark session
spark = SparkSession.builder.getOrCreate()

# Load the data
df = spark.read.format("delta").load("Tables/game_scratches_bronze")

# Show preview
display(df.limit(5))

# 1. Cast data types—for visualization
df_clean = (
    df
    .withColumn("game_id", F.col("game_id").cast("string"))
    .withColumn("team_id", F.col("team_id").cast("string"))
    .withColumn("player_id", F.col("player_id").cast("string"))
)

# 2. Drop exact duplicate rows
df_clean = df_clean.dropDuplicates()

# 3. Drop rows with nulls in key columns
df_clean = df_clean.na.drop(subset=["game_id", "team_id", "player_id"])

# 4. Count exact duplicate rows in the original DataFrame
dup_count = (
    df.groupBy(df.columns)
    .count()
    .filter(F.col("count") > 1)
    .count()
)
print("Number of exact duplicated rows:", dup_count)

# 5. Function to convert camelCase or PascalCase to snake_case
def to_snake_case(name):
    return ''.join(['_' + c.lower() if c.isupper() else c for c in name]).lstrip('_')

# 6. Rename all columns to snake_case
df_clean = df_clean.toDF(*[to_snake_case(c) for c in df_clean.columns])  # Use df_clean, not df

# 7. Show schema and sample
df_clean.printSchema()
display(df_clean.limit(5))

# 8. Path to Silver table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/game_scratches_silver"

# 9. Write or merge into Silver table
try:
    # Try to load existing Delta table
    delta_tbl = DeltaTable.forPath(spark, target_path)

    # Merge data (adjust keys as needed)
    delta_tbl.alias("tgt").merge(
        df_clean.alias("src"),
        "tgt.game_id = src.game_id AND tgt.player_id = src.player_id AND tgt.team_id = src.team_id"
    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

    print("Delta table updated successfully.")

except Exception as e:
    # Table does not exist or other error – create new
    print("Delta table not found or merge failed. Writing new table...")
    df_clean.write.format("delta").mode("overwrite").save(target_path)
    print("Delta table written successfully.")
    
    """

StatementMeta(, 655fe5e3-aaef-4cd8-81c0-7d0ab2ea67fd, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, abac8c47-039e-4bc8-9f9f-60c946832d20)

Number of exact duplicated rows: 17584
root
 |-- game_id: string (nullable = true)
 |-- team_id: string (nullable = true)
 |-- player_id: string (nullable = true)



SynapseWidget(Synapse.DataFrame, 5dae2332-7548-4663-9a74-24895124be85)

Delta table not found or merge failed. Writing new table...
Delta table written successfully.
