## Clean/transform player_info_bronze -> player_info_stage

##### 1. Import and load table

In [26]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql import functions as F, Window
from pyspark.sql.functions import col, when, lit, date_format, to_timestamp, to_date, split

# Read the raw Bronze table into a DataFrame
df = spark.read.table("player_info_bronze")

# Preview the first rows and print the source schema
display(df.limit(10))
df.printSchema()

StatementMeta(, 2239e5cb-ed94-468d-8433-072b677cf2ff, 28, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f7204706-2e52-43d8-a3ca-b65eace90775)

root
 |-- player_id: integer (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- birthCity: string (nullable = true)
 |-- primaryPosition: string (nullable = true)
 |-- birthDate: string (nullable = true)
 |-- birthStateProvince: string (nullable = true)
 |-- height: string (nullable = true)
 |-- height_cm: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- shootsCatches: string (nullable = true)



##### 2. Clean and transform data

In [27]:
# 1-2. Drop the height column, then remove rows that are exact duplicates
#      across all remaining columns
df = df.drop("height").dropDuplicates()


# 3. Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    An underscore is inserted only at a real word boundary:
      * a lowercase letter or digit followed by an uppercase letter
        ("firstName" -> "first_name"), or
      * the last capital of an acronym that is followed by a lowercase letter
        ("HTTPServer" -> "http_server").
    Runs of capitals are therefore kept together ("playerID" -> "player_id")
    and a capital that follows an existing underscore does not produce a
    second one ("Height_CM" -> "height_cm").

    Args:
        name: Column name to convert.

    Returns:
        The lower-cased snake_case name with leading underscores stripped.
    """
    pieces = []
    for idx, ch in enumerate(name):
        if ch.isupper():
            prev_ch = name[idx - 1] if idx > 0 else ""
            next_ch = name[idx + 1] if idx + 1 < len(name) else ""
            starts_new_word = (
                prev_ch.islower()
                or prev_ch.isdigit()
                or (prev_ch.isupper() and next_ch.islower())
            )
            if starts_new_word:
                pieces.append("_")
            pieces.append(ch.lower())
        else:
            pieces.append(ch)
    # Leading underscores are stripped, as in the previous implementation
    return "".join(pieces).lstrip("_")

# Rename every column to snake_case in a single pass
snake_case_names = [to_snake_case(column_name) for column_name in df.columns]
df = df.toDF(*snake_case_names)

# 4. Cast data types (column name -> target Spark type)
target_types = {
    "player_id": "string",
    "height_cm": "float",
    "weight": "int",
}
for column_name, spark_type in target_types.items():
    df = df.withColumn(column_name, F.col(column_name).cast(spark_type))

# 5. Parse birth_date, fix wrong centuries, then format as MM-dd-yyyy
# The source strings are parsed with the two-digit-year pattern "M/d/yy H:mm".
# Spark resolves "yy" relative to base year 2000, so e.g. year "76" is parsed
# as 2076 rather than 1976.
parsed_birth_ts = F.to_timestamp(F.col("birth_date"), "M/d/yy H:mm")

# A birth date can never lie in the future. Compare against the current
# timestamp (not only the year) so that mis-parsed dates falling later in the
# current calendar year are also moved back one century.
# NOTE(review): births more than 100 years ago cannot be told apart from recent
# ones with a two-digit year - confirm the source has no such players.
corrected_birth_ts = F.when(
    parsed_birth_ts > F.current_timestamp(),                      # parsed date is in the future (e.g., 2076)
    F.add_months(parsed_birth_ts, -12 * 100)                      # subtract 100 years
).otherwise(parsed_birth_ts)

df = df.withColumn(
    "birth_date",
    F.date_format(corrected_birth_ts, "MM-dd-yyyy")
)

# 6. Replace placeholder values ("NA" or empty string) in birth_state_province with NULL
province = F.col("birth_state_province")
df = df.withColumn(
    "birth_state_province",
    F.when(province.isin("NA", ""), F.lit(None)).otherwise(province),
)

# Preview the cleaned data
display(df.limit(10))

# Report the final shape of the cleaned DataFrame
rows_final, cols_final = df.count(), len(df.columns)
print("")
print("Number of rows:", rows_final)
print("Number of columns:", cols_final)

StatementMeta(, 2239e5cb-ed94-468d-8433-072b677cf2ff, 29, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 4e58f3d2-7f1e-408c-9862-683c881efadb)


Number of rows: 3926
Number of columns: 11


##### 3. Load data to stage in Silver Lakehouse to run Great Expectations check

In [28]:
# Cleaned DataFrame produced by the transform cell (originates from Bronze)
source_df = df
key_col = "player_id"

# Path to Silver Lakehouse Delta table
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/player_info_stage"

if DeltaTable.isDeltaTable(spark, target_path):
    # Only the distinct keys of the existing target are needed to find new rows
    existing_df = spark.read.format("delta").load(target_path).select(key_col).distinct()

    # Keep only source rows whose key is not yet present in the target
    # NOTE(review): the upstream dropDuplicates() removes only fully identical
    # rows; source rows sharing a player_id but differing elsewhere would all be
    # appended - confirm that player_id is unique in the source.
    new_rows_df = source_df.join(existing_df, on=key_col, how="left_anti")

    # Count once, BEFORE writing. new_rows_df is lazy and reads the target
    # table, so counting after the append re-runs the anti-join and may see the
    # rows just written, reporting 0 appended rows.
    new_row_count = new_rows_df.count()

    if new_row_count > 0:
        (new_rows_df.write
            .format("delta")
            .mode("append")
            .save(target_path))
        print(f"✅ Appended {new_row_count} new rows to player_info_stage in Lakehouse_Silver.")
    else:
        print("No new rows to append. player_info_stage is already up to date.")
else:
    # First load → create the Silver table
    (source_df.write
        .format("delta")
        .mode("overwrite")
        .save(target_path))
    print(f"✅ Initial load complete: created player_info_stage in Lakehouse_Silver with {source_df.count()} rows.")


StatementMeta(, 2239e5cb-ed94-468d-8433-072b677cf2ff, 30, Finished, Available, Finished)

✅ Initial load complete: created player_info_stage in Lakehouse_Silver with 3926 rows.


##### 1. Preliminary Validation Check
Check data type and row count.

In [2]:
"""

# Get summary stats
df_stats = df_player_info.describe()
display(df_stats)

# Check for schema data type 
df_player_info.printSchema()

display(df_player_info.limit(5))

#Total rows and columns
total_rows = df_player_info.count()
total_cols = len(df_player_info.columns)

print(f"Total Rows: {total_rows}")
print(f"Total Columns: {total_cols}")

"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, af3889f1-1989-4727-aced-a3af7037791c)

root
 |-- player_id: integer (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- birthCity: string (nullable = true)
 |-- primaryPosition: string (nullable = true)
 |-- birthDate: timestamp (nullable = true)
 |-- birthStateProvince: string (nullable = true)
 |-- height: string (nullable = true)
 |-- height_cm: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- shootsCatches: string (nullable = true)



SynapseWidget(Synapse.DataFrame, a3cbeacc-2a87-4318-b5de-94fec525d589)

Total Rows: 7850
Total Columns: 12


##### 1. Preliminary Validation Check
Check for null values.

In [4]:
"""

df_player_info.toPandas().info()

# Check for any null values
df_null_counts = df_player_info.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_player_info.columns
])

display(df_null_counts.toPandas())
"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 6, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7850 entries, 0 to 7849
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   player_id           7850 non-null   int32         
 1   firstName           7850 non-null   object        
 2   lastName            7850 non-null   object        
 3   nationality         7850 non-null   object        
 4   birthCity           7850 non-null   object        
 5   primaryPosition     7850 non-null   object        
 6   birthDate           7850 non-null   datetime64[us]
 7   birthStateProvince  7850 non-null   object        
 8   height              7850 non-null   object        
 9   height_cm           7850 non-null   object        
 10  weight              7850 non-null   object        
 11  shootsCatches       7850 non-null   object        
dtypes: datetime64[us](1), int32(1), object(10)
memory usage: 705.4+ KB


SynapseWidget(Synapse.DataFrame, de01ff2c-e7f1-499e-a735-c75e4c5ded7f)

##### 1. Preliminary Validation Check
Check for exact duplicate rows.

In [5]:
"""
all_columns = df_player_info.columns

# Partition by every column and mark rows that occur more than once
w = Window.partitionBy(*all_columns)
df_duplicates = (
    df_player_info
    .withColumn("duplicate_count", F.count("*").over(w))
    .filter(F.col("duplicate_count") > 1)
    .drop("duplicate_count")
)

total_duplicate_rows = df_duplicates.count()                

print(f"Total duplicate rows: {total_duplicate_rows}")
"""


StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 7, Finished, Available, Finished)

Total duplicate rows: 7850


##### 2. Clean / Transform: Change data type
- player_id : integer to string
- height_cm : string to float
- weight : string to integer

In [6]:
"""
# Convert data type from integer to string

df_player_info = (
    df_player_info
    .withColumn("player_id", col("player_id").cast("string"))
    .withColumn("height_cm", col("height_cm").cast(FloatType()))
    .withColumn("weight", col("weight").cast(IntegerType()))
)

df_player_info.printSchema()
"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 8, Finished, Available, Finished)

root
 |-- player_id: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- birthCity: string (nullable = true)
 |-- primaryPosition: string (nullable = true)
 |-- birthDate: timestamp (nullable = true)
 |-- birthStateProvince: string (nullable = true)
 |-- height: string (nullable = true)
 |-- height_cm: float (nullable = true)
 |-- weight: integer (nullable = true)
 |-- shootsCatches: string (nullable = true)



##### 2. Clean / Transform: Format column - BirthDate
Format the birthDate column as DD-MM-YYYY, excluding the time component.

In [7]:
"""
# Format birthDate as DD-MM-YYYY
df_player_info = df_player_info.withColumn("birthDate", date_format(col("birthDate"), "dd-MM-yyyy"))
display(df_player_info.limit(10))
"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 830a717f-f84d-4e16-925b-cea2be37d2a2)

In [8]:
"""
# Filter rows where birthStateProvince is "NA"
df_birth_state_province_na = df_player_info.filter(col("birthStateProvince") == "NA")

# Count rows with 'NA' as birthStateProvince
print(f"Rows with 'NA' as birthStateProvince: {df_birth_state_province_na.count()}")

display(df_birth_state_province_na.limit(30).toPandas())
"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 10, Finished, Available, Finished)

Rows with 'NA' as birthStateProvince: 2246


SynapseWidget(Synapse.DataFrame, db89629b-9df7-4706-a618-1ed809864610)

##### 2. Clean / Transform: Convert 'NA' to Null - birthStateProvince 
Convert the value 'NA' (Not Applicable) to null.

In [9]:

"""
df_player_info = df_player_info.withColumn(
    "birthStateProvince",
    when((col("birthStateProvince") == "NA") | (col("birthStateProvince") == ""), lit(None))
    .otherwise(col("birthStateProvince"))
)

display(df_player_info.limit(10))

# Check how many rows match NULL in birthStateProvince
# Count rows where birthStateProvince is NULL

null_count = df_player_info.filter(col("birthStateProvince").isNull()).count()

display(df_player_info.filter(col("birthStateProvince").isNull()).limit(30))

print(f"Rows with NULL as birthStateProvince: {null_count}")

display(df_player_info.limit(10))
"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, ae0a9279-b18f-47e6-a8c9-a897f6fd74ca)

SynapseWidget(Synapse.DataFrame, 3845e972-49d2-4993-8356-e0d2d69add1d)

Rows with NULL as birthStateProvince: 2246


SynapseWidget(Synapse.DataFrame, 3e19bd43-d6b4-4a98-b4f6-07d27fe5aa7f)

In [10]:
#df_player_info.toPandas().info()

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 12, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7850 entries, 0 to 7849
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   player_id           7850 non-null   object 
 1   firstName           7850 non-null   object 
 2   lastName            7850 non-null   object 
 3   nationality         7850 non-null   object 
 4   birthCity           7850 non-null   object 
 5   primaryPosition     7850 non-null   object 
 6   birthDate           7850 non-null   object 
 7   birthStateProvince  5604 non-null   object 
 8   height              7850 non-null   object 
 9   height_cm           7844 non-null   float32
 10  weight              7844 non-null   float64
 11  shootsCatches       7850 non-null   object 
dtypes: float32(1), float64(1), object(10)
memory usage: 705.4+ KB


##### 2. Clean / Transform: Drop column - height
Remove **height** column.

In [11]:
"""
# Remove height column
df_player_info = df_player_info.drop("height")

# Show the updated DataFrame
display(df_player_info.limit(10))
"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f3d7269a-f32b-47fe-b77f-09426e88262c)

##### 2. Clean / Transform: Standardize column name
Standardize column names to snake_case format.

In [12]:
"""
# Function to convert camelCase / PascalCase to snake_case
def to_snake_case(name):
    return ''.join(['_' + c.lower() if c.isupper() else c for c in name]).lstrip('_')

# Apply to all columns in one line
df_player_info = df_player_info.toDF(*[to_snake_case(c) for c in df_player_info.columns])


df_player_info.printSchema()
display(df_player_info.limit(5))

"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 14, Finished, Available, Finished)

root
 |-- player_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- birth_city: string (nullable = true)
 |-- primary_position: string (nullable = true)
 |-- birth_date: string (nullable = true)
 |-- birth_state_province: string (nullable = true)
 |-- height_cm: float (nullable = true)
 |-- weight: integer (nullable = true)
 |-- shoots_catches: string (nullable = true)



SynapseWidget(Synapse.DataFrame, 48ebdcbd-28ec-4f1c-bc96-b95b2022ce8c)

In [13]:
#df_player_info.printSchema()

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 15, Finished, Available, Finished)

root
 |-- player_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- birth_city: string (nullable = true)
 |-- primary_position: string (nullable = true)
 |-- birth_date: string (nullable = true)
 |-- birth_state_province: string (nullable = true)
 |-- height_cm: float (nullable = true)
 |-- weight: integer (nullable = true)
 |-- shoots_catches: string (nullable = true)



##### 3. Load table to Lakehouse_Silver 

In [14]:
"""
# Target path for Silver Lakehouse
target_path = "abfss://dc478dd4-e53e-4f21-add0-2e376dc173fe@onelake.dfs.fabric.microsoft.com/ce7ef0e9-78af-44db-b5ee-839dcf1c9e98/Tables/player_info_silver"


# Write dataframe to Silver Lakehouse
df_player_info.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(target_path)


print(f"✅ Initial load complete: new table created in Lakehouse_Silver with {df_player_info.count()} rows.")
"""

StatementMeta(, 002e7e54-f5ed-4ab9-926a-ec2d509dd774, 16, Finished, Available, Finished)

✅ Initial load complete: new table created in Lakehouse_Silver with 7850 rows.
