## Clean/transform team_info_bronze -> team_info_silver

##### 1. Import and load table

In [3]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window

# Read the bronze table into a Spark DataFrame; every later cell transforms `df`.
# NOTE(review): `spark` is not defined in this notebook — presumably the session
# object injected by the notebook runtime; confirm.
df = spark.table("team_info_bronze")

StatementMeta(, 829716e3-de2a-4107-b54e-14aac379b958, 5, Finished, Available, Finished)

##### 2. Clean and transform data

In [4]:
# 1 + 2. Drop the 'link' column, then discard rows that are exact duplicates
# across all remaining columns.
df = df.drop("link").dropDuplicates()

# 3. Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    An underscore is inserted before an uppercase letter only when it starts
    a new word:
      * it follows a lowercase letter or a digit (``shortName`` -> ``short_name``), or
      * it ends a run of capitals and is followed by a lowercase letter
        (``HTTPResponse`` -> ``http_response``).

    Runs of capitals are therefore kept together (``teamID`` -> ``team_id``
    rather than ``team_i_d``) and names that already contain underscores do
    not get doubled ones (``team_Id`` -> ``team_id``).

    Args:
        name: Column name to convert.

    Returns:
        The lower-cased snake_case name with any leading underscores removed.
    """
    pieces = []
    for index, char in enumerate(name):
        if not char.isupper():
            pieces.append(char)
            continue

        previous = name[index - 1] if index > 0 else ''
        following = name[index + 1] if index + 1 < len(name) else ''

        # Word boundary: lower/digit -> Upper, or the last capital of an
        # acronym that is immediately followed by a lowercase letter.
        after_lower_or_digit = previous.islower() or previous.isdigit()
        closes_acronym = previous.isupper() and following.islower()
        if after_lower_or_digit or closes_acronym:
            pieces.append('_')
        pieces.append(char.lower())

    # Leading underscores are stripped, as the previous implementation did.
    return ''.join(pieces).lstrip('_')

# Rename every column to its snake_case form (column order is unchanged).
df = df.toDF(*map(to_snake_case, df.columns))

# Explicit rename for special case (currently disabled)
#df = df.withColumnRenamed("face_off_wins", "faceoff_wins")

# 4: Cast the identifier columns to string, one column per statement
df = df.withColumn("franchise_id", F.col("franchise_id").cast("string"))
df = df.withColumn("team_id", F.col("team_id").cast("string"))
# Disabled preview / row-count check: everything below is wrapped in a bare
# triple-quoted string, so none of it executes. Because the string is the last
# expression in the cell, its repr is echoed as the cell output.
"""

# Show results
display(df.limit(10))

rows_final = df.count()
cols_final = len(df.columns)
print("")
print("Number of rows:", rows_final)
print("Number of columns:", cols_final)
"""


StatementMeta(, 829716e3-de2a-4107-b54e-14aac379b958, 6, Finished, Available, Finished)

'\n\n# Show results\ndisplay(df.limit(10))\n\nrows_final = df.count()\ncols_final = len(df.columns)\nprint("")\nprint("Number of rows:", rows_final)\nprint("Number of columns:", cols_final)\n'

##### 3. Load data to silver table

In [5]:
# Incremental load into team_info_silver:
#   * target exists  -> append only source rows whose key is not yet present
#   * target missing -> create the table from the full source DataFrame

# Incoming Bronze dataframe (already cleaned by the previous cell)
source_df = df
key_col = "franchise_id"
# NOTE(review): rows are matched on franchise_id alone, so a source row whose
# franchise_id already exists in Silver is skipped even if its other columns
# differ. Confirm franchise_id is the intended unique key for this table.

# Path to Silver Lakehouse Delta table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/team_info_silver"

if DeltaTable.isDeltaTable(spark, target_path):
    # Distinct keys already stored in the target table
    existing_df = spark.read.format("delta").load(target_path).select(key_col).distinct()

    # Keep only source rows whose key is absent from the target
    new_rows_df = source_df.join(existing_df, on=key_col, how="left_anti")

    # Count once, BEFORE writing. new_rows_df is lazy and depends on the target
    # table, so counting after the append may re-run the anti-join against the
    # updated table and report 0 rows. Reusing the value also avoids a second
    # Spark job just to build the log message.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to team_info_silver in Lakehouse_Silver.")
    else:
        print("No new rows to append. team_info_silver is already up to date.")
else:
    # First load → create the Silver table
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created team_info_silver in Lakehouse_Silver with {source_df.count()} rows.")


StatementMeta(, 829716e3-de2a-4107-b54e-14aac379b958, 7, Finished, Available, Finished)

No new rows to append. team_info_silver is already up to date.


##### 1. Preliminary Validation Check
Check data type and row count.

In [4]:
# Disabled legacy validation cell: the whole body is one triple-quoted string,
# so nothing below executes. The disabled code reads `df_team_info`, which no
# active code visible in this notebook assigns.
"""

# Get summary stats
df_stats = df_team_info.describe()
display(df_stats)

# Check for schema data type 
df_team_info.printSchema()

display(df_team_info.limit(5))

#Total rows and columns
total_rows = df_team_info.count()
total_cols = len(df_team_info.columns)

print(f"Total Rows: {total_rows}")
print(f"Total Columns: {total_cols}")

"""

StatementMeta(, ce0908e5-af23-4224-9a43-6d889795249b, 36, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 99cdf496-f2b5-4065-b171-d47ec1ae2cf7)

root
 |-- team_id: integer (nullable = true)
 |-- franchiseId: integer (nullable = true)
 |-- shortName: string (nullable = true)
 |-- teamName: string (nullable = true)
 |-- abbreviation: string (nullable = true)
 |-- link: string (nullable = true)



SynapseWidget(Synapse.DataFrame, 463fbc88-a109-47de-8bdf-77e021b6b47d)

Total Rows: 33
Total Columns: 6


##### 1. Preliminary Validation Check
Check for null values.

In [5]:
# Disabled legacy null-check cell: the body is a single string literal, so
# nothing below executes.
# NOTE(review): if re-enabled, `sum`, `when` and `col` are used unqualified, but
# the visible import cell only brings in `F` and `Window`; they would need to be
# F.sum / F.when / F.col (a bare `sum` would resolve to the Python builtin).
"""

df_team_info.toPandas().info()

# Check for any null values
df_null_counts = df_team_info.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_team_info.columns
])

display(df_null_counts.toPandas())

"""

StatementMeta(, ce0908e5-af23-4224-9a43-6d889795249b, 37, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   team_id       33 non-null     int32 
 1   franchiseId   33 non-null     int32 
 2   shortName     33 non-null     object
 3   teamName      33 non-null     object
 4   abbreviation  33 non-null     object
 5   link          33 non-null     object
dtypes: int32(2), object(4)
memory usage: 1.4+ KB


SynapseWidget(Synapse.DataFrame, 7d9a3906-81d3-4813-808b-29d3fb7459fe)

##### 1. Preliminary Validation Check
Check for exact duplicate rows.

In [7]:
# Disabled legacy duplicate check: the body is a single string literal, so
# nothing below executes. The disabled code counts rows per full-column
# partition and keeps those occurring more than once.
"""

all_columns = df_team_info.columns

# Partition by every column and mark rows that occur more than once
w = Window.partitionBy(*all_columns)
df_duplicates = (
    df_team_info
    .withColumn("duplicate_count", F.count("*").over(w))
    .filter(F.col("duplicate_count") > 1)
    .drop("duplicate_count")
)

total_duplicate_rows = df_duplicates.count()                

print(f"Total duplicate rows: {total_duplicate_rows}")

"""

StatementMeta(, ce0908e5-af23-4224-9a43-6d889795249b, 38, Finished, Available, Finished)

Total duplicate rows: 0


##### 2. Clean / Transform: Standardize column names
Rename columns:  
- 'franchiseId' to 'franchise_id'
- 'shortName' to 'short_name'
- 'teamName' to 'team_name'

In [1]:
# Disabled legacy rename cell: the body is a single string literal, so nothing
# below executes. The active cleaning cell renames columns via to_snake_case
# instead of this explicit mapping.
"""

# Rename column names

# Dictionary of columns to rename: old_name -> new_name
columns_to_rename = {
    "franchiseId": "franchise_id",
    "shortName": "short_name",
    "teamName": "team_name"
}

# Apply renaming
for old_name, new_name in columns_to_rename.items():
    df_team_info = df_team_info.withColumnRenamed(old_name, new_name)

# Show the updated DataFrame
#display(df_team_info.limit(10))

"""

StatementMeta(, , -1, SessionStarting, , SessionStarting)

##### 2. Clean / Transform: Change data type
- team_id : integer to string 
- franchise_id : integer to string 

In [9]:
# Disabled legacy cast cell: the body is a single string literal, so nothing
# below executes.
# NOTE(review): if re-enabled, `col` is used unqualified but only `F` and
# `Window` are imported in the visible import cell; it would need to be F.col.
"""

# Convert data type from integer to string

df_team_info = (
    df_team_info
    .withColumn("team_id", col("team_id").cast("string"))
    .withColumn("franchise_id", col("franchise_id").cast("string"))
)

#df_team_info.printSchema()

"""

StatementMeta(, ce0908e5-af23-4224-9a43-6d889795249b, 40, Finished, Available, Finished)

root
 |-- team_id: string (nullable = true)
 |-- franchise_id: string (nullable = true)
 |-- short_name: string (nullable = true)
 |-- team_name: string (nullable = true)
 |-- abbreviation: string (nullable = true)
 |-- link: string (nullable = true)



##### 2. Clean / Transform: Drop the `link` column
Remove **link** column.

In [10]:
# Remove the 'link' column
#df_team_info = df_team_info.drop("link")

# Show the updated DataFrame
#display(df_team_info.limit(10))


StatementMeta(, ce0908e5-af23-4224-9a43-6d889795249b, 41, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5c3e8016-1796-4358-be89-6c9610bad197)

##### 3. Load table to Lakehouse_Silver 

In [41]:
# Disabled legacy load cell: the body is a single string literal, so nothing
# below executes. The disabled code would overwrite the Silver table
# unconditionally; the active load cell performs an append-if-new load instead.
"""

# Target path for Silver Lakehouse
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/team_info_silver"


# Write dataframe to Silver Lakehouse
df_team_info.write.format("delta").mode("overwrite").save(target_path)


print(f"✅ Initial load complete: new table created in Lakehouse_Silver with {df_team_info.count()} rows.")

"""



StatementMeta(, ce0908e5-af23-4224-9a43-6d889795249b, 43, Finished, Available, Finished)

✅ Initial load complete: new table created in Lakehouse_Silver with 33 rows.
