#  Bronze to Silver Layer Notebook

##  Objective:
This notebook reads data from the Bronze layer, performs cleaning operations, joins relevant datasets, and stores the cleaned data in the Silver layer.

---

##  Step 1: Start Spark Session
Required to perform data operations in PySpark.

---

##  Step 2: Read Parquet Files from Bronze
Parquet files written in the Bronze container are now read into DataFrames.

---

##  Step 3: Data Cleaning
- Dropping null or irrelevant records.
- Renaming columns for consistency.
- Filling missing values (e.g., NA → 0 or default).

---

##  Step 4: Joining Datasets
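Joins are planned between:
- Matches + Players
- Player-Team + Player Performance
- Stadium + Match

Only the first of these is implemented in the code below: player performance is inner-joined to match performance on the match ID. The result is a unified data view (the master table); the remaining joins are not yet performed in this notebook.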

---

##  Step 5: Write to Silver Container
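The cleaned tables are written to the Silver container in Parquet format using `overwrite` mode for further analytics. The cleaned tables and the joined master table are also written over JDBC to the `silver_db` schema of the SQL Server database.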


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session used by every cell below.
session_builder = SparkSession.builder.appName("Statement of Work")
spark = session_builder.getOrCreate()

In [0]:
# List the DBFS mount points to confirm the Bronze and Silver containers are attached.
mount_table = dbutils.fs.mounts()
display(mount_table)

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/blob_storage,wasbs://raw@iplblob.blob.core.windows.net,
/Volumes,UnityCatalogVolumes,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/mnt/adls_storage/silver,wasbs://silver-ipl@adlsipl.blob.core.windows.net,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/mnt/adls_storage/bronze,wasbs://bronze-ipl@adlsipl.blob.core.windows.net,
/Volume,DbfsReserved,
/volumes,DbfsReserved,


In [0]:
# Show which Parquet datasets are available under the Bronze mount.
bronze_listing = dbutils.fs.ls("/mnt/adls_storage/bronze")
display(bronze_listing)

path,name,size,modificationTime
dbfs:/mnt/adls_storage/bronze/match_performance_bronze.parquet/,match_performance_bronze.parquet/,0,0
dbfs:/mnt/adls_storage/bronze/match_stadium_bronze.parquet/,match_stadium_bronze.parquet/,0,0
dbfs:/mnt/adls_storage/bronze/player_performance_bronze.parquet/,player_performance_bronze.parquet/,0,0
dbfs:/mnt/adls_storage/bronze/player_team_bronze.parquet/,player_team_bronze.parquet/,0,0
dbfs:/mnt/adls_storage/bronze/stadium_bronze.parquet/,stadium_bronze.parquet/,0,0
dbfs:/mnt/adls_storage/bronze/team_bronze.parquet/,team_bronze.parquet/,0,0


In [0]:
# Load the six Bronze Parquet datasets, one DataFrame per dataset.
# The targets on the left are unpacked in the same order as the paths on the right.
(
    df_match_performance,
    df_match_stadium,
    df_player_performance,
    df_player_team,
    df_stadium,
    df_team,
) = (
    spark.read.parquet(bronze_path)
    for bronze_path in (
        "/mnt/adls_storage/bronze/match_performance_bronze.parquet",
        "/mnt/adls_storage/bronze/match_stadium_bronze.parquet",
        "/mnt/adls_storage/bronze/player_performance_bronze.parquet",
        "/mnt/adls_storage/bronze/player_team_bronze.parquet",
        "/mnt/adls_storage/bronze/stadium_bronze.parquet",
        "/mnt/adls_storage/bronze/team_bronze.parquet",
    )
)

In [0]:
# Preview the raw match-performance rows read from Bronze.
df_match_performance.display()


match_id,team_id,opponent_team_id,runs_scored,wickets_taken,match_result,ingestion_date,source_file
201,4,1,213.0,7,Tie,2025-04-05,match_performance.csv
202,1,3,202.0,10,Loss,2025-04-05,match_performance.csv
203,2,3,238.0,8,Win,2025-04-05,match_performance.csv
204,4,3,,9,Loss,2025-04-05,match_performance.csv
205,4,3,123.0,8,Tie,2025-04-05,match_performance.csv
206,1,2,215.0,6,Loss,2025-04-05,match_performance.csv
207,5,3,211.0,5,Win,2025-04-05,match_performance.csv
208,3,5,171.0,6,Unknown,2025-04-05,match_performance.csv
209,4,1,201.0,5,Tie,2025-04-05,match_performance.csv
210,4,3,233.0,7,Win,2025-04-05,match_performance.csv


In [0]:
# Preview the raw match-to-stadium mapping read from Bronze.
df_match_stadium.display()

match_id,stadium_id,ingestion_date,source_file
201,301,2025-04-05,match_stadium.csv
202,304,2025-04-05,match_stadium.csv
203,302,2025-04-05,match_stadium.csv
204,304,2025-04-05,match_stadium.csv
205,304,2025-04-05,match_stadium.csv
206,305,2025-04-05,match_stadium.csv
207,305,2025-04-05,match_stadium.csv
208,302,2025-04-05,match_stadium.csv
209,301,2025-04-05,match_stadium.csv
210,305,2025-04-05,match_stadium.csv


In [0]:
# Preview the raw player-performance rows read from Bronze.
df_player_performance.display()

match_id,player_id,runs_scored,wickets_taken,ball_taken,ingestion_date,source_file
214,101,82,2,50,2025-04-05,player_performance.csv
211,102,97,2,50,2025-04-05,player_performance.csv
207,103,29,2,50,2025-04-05,player_performance.csv
218,104,14,3,50,2025-04-05,player_performance.csv
209,105,5,1,50,2025-04-05,player_performance.csv
218,106,44,1,50,2025-04-05,player_performance.csv
202,107,68,2,50,2025-04-05,player_performance.csv
213,108,94,4,50,2025-04-05,player_performance.csv
215,109,93,2,50,2025-04-05,player_performance.csv
219,110,75,3,50,2025-04-05,player_performance.csv


In [0]:
# Preview the raw player-to-team rows read from Bronze.
df_player_team.display()

player_id,player_name,team_id,player_role,ingestion_date,source_file
101,Thomas Mendez,1,All-rounder,2025-04-05,player_team.csv
102,Laurie Brooks,5,Batsman,2025-04-05,player_team.csv
103,Warren Morgan,5,All-rounder,2025-04-05,player_team.csv
104,Amber Bell,2,Bowler,2025-04-05,player_team.csv
105,Shawn Bowers,1,Batsman,2025-04-05,player_team.csv
106,,2,Bowler,2025-04-05,player_team.csv
107,Autumn Chang,3,All-rounder,2025-04-05,player_team.csv
108,Jared Knox,2,Bowler,2025-04-05,player_team.csv
109,Kevin Conrad,2,Bowler,2025-04-05,player_team.csv
110,Alan Padilla,4,All-rounder,2025-04-05,player_team.csv


In [0]:
# Preview the raw stadium rows read from Bronze.
df_stadium.display()

stadium_id,stadium_name,city,capacity,ingestion_date,source_file
301,Wankhede Stadium,Mumbai,33108.0,2025-04-05,stadium.csv
302,M. A. Chidambaram Stadium,Chennai,,2025-04-05,stadium.csv
303,Arun Jaitley Stadium,Delhi,41820.0,2025-04-05,stadium.csv
304,Eden Gardens,Kolkata,66000.0,2025-04-05,stadium.csv
305,Rajiv Gandhi Intl. Cricket Stadium,,55000.0,2025-04-05,stadium.csv


In [0]:
# Preview the raw team rows read from Bronze.
df_team.display()

team_id,team_name,home_ground,captain,ingestion_date,source_file
1,Mumbai Indians,Wankhede Stadium,Rohit Sharma,2025-04-05,team.csv
2,Chennai Super Kings,M. A. Chidambaram Stadium,MS Dhoni,2025-04-05,team.csv
3,Delhi Capitals,,Rishabh Pant,2025-04-05,team.csv
4,Kolkata Knight Riders,Eden Gardens,Shreyas Iyer,2025-04-05,team.csv
5,Sunrisers Hyderabad,Rajiv Gandhi Intl. Cricket Stadium,,2025-04-05,team.csv


In [0]:
# Print the number of NULLs in each column of the raw match-performance data.
# All counts are computed in a single aggregation (one Spark job) instead of
# running a separate filter + count job per column; the printed output is the
# same "<column> <null count>" line per column, in column order.
from pyspark.sql import functions as F

null_counts = df_match_performance.agg(
    *[
        # count() ignores NULLs, so it counts only rows where the column is NULL.
        F.count(F.when(df_match_performance[column_name].isNull(), 1)).alias(column_name)
        for column_name in df_match_performance.columns
    ]
).first()

for column_name, null_count in zip(df_match_performance.columns, null_counts):
    print(column_name, null_count)

match_id 0
team_id 0
opponent_team_id 0
runs_scored 1
wickets_taken 0
match_result 0
ingestion_date 0
source_file 0


In [0]:
# Clean the match-performance table:
#   * missing runs_scored is filled with 0
#   * a match_result of 'Unknown' is relabelled 'Cancelled'
#   * match_id is renamed so it stays distinguishable from the player
#     table's match_id after the join further down.
# The replacement is restricted to match_result; without `subset` it would
# rewrite the literal 'Unknown' in every string column of the table.
df_match_performance_cleaned = (
    df_match_performance
    .fillna({'runs_scored': 0})
    .na.replace('Unknown', 'Cancelled', subset=['match_result'])
    .withColumnRenamed('match_id', 'match_performance_match_id')
)


In [0]:
# Inspect the cleaned match-performance table.
display(df_match_performance_cleaned)

match_performance_match_id,team_id,opponent_team_id,runs_scored,wickets_taken,match_result,ingestion_date,source_file
201,4,1,213.0,7,Tie,2025-04-05,match_performance.csv
202,1,3,202.0,10,Loss,2025-04-05,match_performance.csv
203,2,3,238.0,8,Win,2025-04-05,match_performance.csv
204,4,3,0.0,9,Loss,2025-04-05,match_performance.csv
205,4,3,123.0,8,Tie,2025-04-05,match_performance.csv
206,1,2,215.0,6,Loss,2025-04-05,match_performance.csv
207,5,3,211.0,5,Win,2025-04-05,match_performance.csv
208,3,5,171.0,6,Cancelled,2025-04-05,match_performance.csv
209,4,1,201.0,5,Tie,2025-04-05,match_performance.csv
210,4,3,233.0,7,Win,2025-04-05,match_performance.csv


In [0]:
# Print the number of NULLs in each column of the raw match-stadium mapping.
# All counts are computed in a single aggregation (one Spark job) instead of
# running a separate filter + count job per column; the printed output is the
# same "<column> <null count>" line per column, in column order.
from pyspark.sql import functions as F

null_counts = df_match_stadium.agg(
    *[
        # count() ignores NULLs, so it counts only rows where the column is NULL.
        F.count(F.when(df_match_stadium[column_name].isNull(), 1)).alias(column_name)
        for column_name in df_match_stadium.columns
    ]
).first()

for column_name, null_count in zip(df_match_stadium.columns, null_counts):
    print(column_name, null_count)

match_id 0
stadium_id 0
ingestion_date 0
source_file 0


In [0]:
# Discard match/stadium mappings that have no stadium_id.
required_columns = ['stadium_id']
df_match_stadium_cleaned = df_match_stadium.na.drop(subset=required_columns)

In [0]:
# Inspect the cleaned match-stadium mapping.
display(df_match_stadium_cleaned)

match_id,stadium_id,ingestion_date,source_file
201,301,2025-04-05,match_stadium.csv
202,304,2025-04-05,match_stadium.csv
203,302,2025-04-05,match_stadium.csv
204,304,2025-04-05,match_stadium.csv
205,304,2025-04-05,match_stadium.csv
206,305,2025-04-05,match_stadium.csv
207,305,2025-04-05,match_stadium.csv
208,302,2025-04-05,match_stadium.csv
209,301,2025-04-05,match_stadium.csv
210,305,2025-04-05,match_stadium.csv


In [0]:
# Print the number of NULLs in each column of the raw player-performance data.
# All counts are computed in a single aggregation (one Spark job) instead of
# running a separate filter + count job per column; the printed output is the
# same "<column> <null count>" line per column, in column order.
from pyspark.sql import functions as F

null_counts = df_player_performance.agg(
    *[
        # count() ignores NULLs, so it counts only rows where the column is NULL.
        F.count(F.when(df_player_performance[column_name].isNull(), 1)).alias(column_name)
        for column_name in df_player_performance.columns
    ]
).first()

for column_name, null_count in zip(df_player_performance.columns, null_counts):
    print(column_name, null_count)

match_id 0
player_id 0
runs_scored 0
wickets_taken 0
ball_taken 0
ingestion_date 0
source_file 0


In [0]:
# Clean the player-performance table:
#   1. default missing numeric stats to 0,
#   2. drop rows that lack either key (match_id / player_id),
#   3. prefix the stat columns with "player_" so they differ from the
#      match-level runs_scored / wickets_taken columns.
df_player_performance_cleaned = (
    df_player_performance
    .na.fill({'runs_scored': 0, 'wickets_taken': 0, 'ball_taken': 0})
    .na.drop(subset=['match_id', 'player_id'])
    .withColumnRenamed('runs_scored', 'player_runs_scored')
    .withColumnRenamed('wickets_taken', 'player_wickets_taken')
)

In [0]:
# Inspect the cleaned player-performance table.
display(df_player_performance_cleaned)

match_id,player_id,player_runs_scored,player_wickets_taken,ball_taken,ingestion_date,source_file
214,101,82,2,50,2025-04-05,player_performance.csv
211,102,97,2,50,2025-04-05,player_performance.csv
207,103,29,2,50,2025-04-05,player_performance.csv
218,104,14,3,50,2025-04-05,player_performance.csv
209,105,5,1,50,2025-04-05,player_performance.csv
218,106,44,1,50,2025-04-05,player_performance.csv
202,107,68,2,50,2025-04-05,player_performance.csv
213,108,94,4,50,2025-04-05,player_performance.csv
215,109,93,2,50,2025-04-05,player_performance.csv
219,110,75,3,50,2025-04-05,player_performance.csv


In [0]:
# Re-display the raw player-team rows before cleaning them.
display(df_player_team)

player_id,player_name,team_id,player_role,ingestion_date,source_file
101,Thomas Mendez,1,All-rounder,2025-04-05,player_team.csv
102,Laurie Brooks,5,Batsman,2025-04-05,player_team.csv
103,Warren Morgan,5,All-rounder,2025-04-05,player_team.csv
104,Amber Bell,2,Bowler,2025-04-05,player_team.csv
105,Shawn Bowers,1,Batsman,2025-04-05,player_team.csv
106,,2,Bowler,2025-04-05,player_team.csv
107,Autumn Chang,3,All-rounder,2025-04-05,player_team.csv
108,Jared Knox,2,Bowler,2025-04-05,player_team.csv
109,Kevin Conrad,2,Bowler,2025-04-05,player_team.csv
110,Alan Padilla,4,All-rounder,2025-04-05,player_team.csv


In [0]:
# Clean the player-team table:
#   1. drop rows that lack either key (player_id / team_id),
#   2. label missing names and roles as 'Unknown',
#   3. rename team_id to player_team_id.
df_player_team_cleaned = (
    df_player_team
    .na.drop(subset=['player_id', 'team_id'])
    .na.fill({'player_name': 'Unknown', 'player_role': 'Unknown'})
    .withColumnRenamed('team_id', 'player_team_id')
)

In [0]:
# Inspect the cleaned player-team table.
display(df_player_team_cleaned)

player_id,player_name,player_team_id,player_role,ingestion_date,source_file
101,Thomas Mendez,1,All-rounder,2025-04-05,player_team.csv
102,Laurie Brooks,5,Batsman,2025-04-05,player_team.csv
103,Warren Morgan,5,All-rounder,2025-04-05,player_team.csv
104,Amber Bell,2,Bowler,2025-04-05,player_team.csv
105,Shawn Bowers,1,Batsman,2025-04-05,player_team.csv
106,Unknown,2,Bowler,2025-04-05,player_team.csv
107,Autumn Chang,3,All-rounder,2025-04-05,player_team.csv
108,Jared Knox,2,Bowler,2025-04-05,player_team.csv
109,Kevin Conrad,2,Bowler,2025-04-05,player_team.csv
110,Alan Padilla,4,All-rounder,2025-04-05,player_team.csv


In [0]:
# Re-display the raw stadium rows before cleaning them.
display(df_stadium)

stadium_id,stadium_name,city,capacity,ingestion_date,source_file
301,Wankhede Stadium,Mumbai,33108.0,2025-04-05,stadium.csv
302,M. A. Chidambaram Stadium,Chennai,,2025-04-05,stadium.csv
303,Arun Jaitley Stadium,Delhi,41820.0,2025-04-05,stadium.csv
304,Eden Gardens,Kolkata,66000.0,2025-04-05,stadium.csv
305,Rajiv Gandhi Intl. Cricket Stadium,,55000.0,2025-04-05,stadium.csv


In [0]:
from pyspark.sql.functions import when, col

# Defaults applied to stadium attributes that are missing in Bronze.
stadium_defaults = {'stadium_name': 'Unknown', 'city': 'Unknown', 'capacity': 30000}

# Rows for this stadium get their city set explicitly (see below).
is_rajiv_gandhi_stadium = col("stadium_name") == "Rajiv Gandhi Intl. Cricket Stadium"

# Step 1: a stadium row without an id cannot be referenced, so drop it.
df_stadium_with_id = df_stadium.dropna(subset=['stadium_id'])

# Step 2: fill the remaining gaps with the defaults above.
df_stadium_filled = df_stadium_with_id.fillna(stadium_defaults)

# Step 3: override the city for the Rajiv Gandhi stadium with "Hyderabad";
# every other row keeps the city it has after step 2.
df_stadium_cleaned = df_stadium_filled.withColumn(
    "city",
    when(is_rajiv_gandhi_stadium, "Hyderabad").otherwise(col("city")),
)

In [0]:
# Inspect the cleaned stadium table.
display(df_stadium_cleaned)

stadium_id,stadium_name,city,capacity,ingestion_date,source_file
301,Wankhede Stadium,Mumbai,33108,2025-04-05,stadium.csv
302,M. A. Chidambaram Stadium,Chennai,30000,2025-04-05,stadium.csv
303,Arun Jaitley Stadium,Delhi,41820,2025-04-05,stadium.csv
304,Eden Gardens,Kolkata,66000,2025-04-05,stadium.csv
305,Rajiv Gandhi Intl. Cricket Stadium,Hyderabad,55000,2025-04-05,stadium.csv


In [0]:
# Re-display the raw team rows before cleaning them.
display(df_team)

team_id,team_name,home_ground,captain,ingestion_date,source_file
1,Mumbai Indians,Wankhede Stadium,Rohit Sharma,2025-04-05,team.csv
2,Chennai Super Kings,M. A. Chidambaram Stadium,MS Dhoni,2025-04-05,team.csv
3,Delhi Capitals,,Rishabh Pant,2025-04-05,team.csv
4,Kolkata Knight Riders,Eden Gardens,Shreyas Iyer,2025-04-05,team.csv
5,Sunrisers Hyderabad,Rajiv Gandhi Intl. Cricket Stadium,,2025-04-05,team.csv


In [0]:
%python
from pyspark.sql.functions import col, when

df_team_cleaned = (
    df_team
    .dropna(subset=['team_id'])  # Remove rows where team_id is NULL
    .fillna({'captain': 'Unknown'})  # Fill missing values
    .withColumn(
        "home_ground",
        when(col("team_name") == "Delhi Capitals", "Arun Jaitley Stadium")
        .otherwise(col("home_ground"))  # Use col() instead of df_stadium
    )
)

In [0]:
# Inspect the cleaned team table.
df_team_cleaned.display()

team_id,team_name,home_ground,captain,ingestion_date,source_file
1,Mumbai Indians,Wankhede Stadium,Rohit Sharma,2025-04-05,team.csv
2,Chennai Super Kings,M. A. Chidambaram Stadium,MS Dhoni,2025-04-05,team.csv
3,Delhi Capitals,Arun Jaitley Stadium,Rishabh Pant,2025-04-05,team.csv
4,Kolkata Knight Riders,Eden Gardens,Shreyas Iyer,2025-04-05,team.csv
5,Sunrisers Hyderabad,Rajiv Gandhi Intl. Cricket Stadium,Unknown,2025-04-05,team.csv


In [0]:
# Build the master table: each player-performance row is paired with the
# match-performance rows that share its match id. The join is inner, so rows
# without a counterpart on the other side are left out.
same_match = (
    df_player_performance_cleaned.match_id
    == df_match_performance_cleaned.match_performance_match_id
)

df_join = (
    df_player_performance_cleaned
    .join(df_match_performance_cleaned, on=same_match, how='inner')
    # Stamp every row with the time this notebook processed it.
    .withColumn('Process_Date', current_timestamp())
)

In [0]:
# Remove the Bronze ingestion bookkeeping columns from the joined table.
bronze_audit_columns = ['ingestion_date', 'source_file']
df_join = df_join.drop(*bronze_audit_columns)

In [0]:
# Inspect the joined master table.
display(df_join)

match_id,player_id,player_runs_scored,player_wickets_taken,ball_taken,match_performance_match_id,team_id,opponent_team_id,runs_scored,wickets_taken,match_result,Process_Date
202,107,68,2,50,202,1,3,202.0,10,Loss,2025-04-05T06:16:16.232+0000
203,118,80,1,50,203,2,3,238.0,8,Win,2025-04-05T06:16:16.232+0000
204,111,38,2,50,204,4,3,0.0,9,Loss,2025-04-05T06:16:16.232+0000
205,115,85,0,50,205,4,3,123.0,8,Tie,2025-04-05T06:16:16.232+0000
207,103,29,2,50,207,5,3,211.0,5,Win,2025-04-05T06:16:16.232+0000
208,117,35,4,50,208,3,5,171.0,6,Cancelled,2025-04-05T06:16:16.232+0000
209,105,5,1,50,209,4,1,201.0,5,Tie,2025-04-05T06:16:16.232+0000
211,102,97,2,50,211,5,1,144.0,10,Loss,2025-04-05T06:16:16.232+0000
213,119,21,3,50,213,4,2,190.0,9,Loss,2025-04-05T06:16:16.232+0000
213,108,94,4,50,213,4,2,190.0,9,Loss,2025-04-05T06:16:16.232+0000


In [0]:
# Detach the Silver mount so that the next cell can mount it again.
# dbutils.fs.unmount raises when the mount point does not exist, which would
# stop a "Run All" on a workspace where Silver has not been mounted yet, so
# only unmount when the mount point is currently listed.
silver_mount_point = "/mnt/adls_storage/silver"

if any(mount.mountPoint == silver_mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(silver_mount_point)

/mnt/adls_storage/silver has been unmounted.
Out[33]: True

In [0]:
# Mount the Silver container at /mnt/adls_storage/silver.
#
# The storage account key is read from a Databricks secret scope instead of
# being written into the notebook, where it is visible to every reader and
# ends up in version control.
# NOTE(review): the key that was previously hard-coded in this cell must be
# treated as compromised and rotated in Azure. The scope and key names below
# are placeholders - create them (or adjust the names) before running.
storage_account_key = dbutils.secrets.get(
    scope="ipl-secrets",
    key="adlsipl-storage-account-key",
)

dbutils.fs.mount(
  source="wasbs://silver-ipl@adlsipl.blob.core.windows.net",
  mount_point="/mnt/adls_storage/silver",
  extra_configs={
      "fs.azure.account.key.adlsipl.blob.core.windows.net": storage_account_key
  }
)

Out[34]: True

In [0]:
# List the DBFS mount points again to confirm the Silver mount is present.
mount_table = dbutils.fs.mounts()
display(mount_table)

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/blob_storage,wasbs://raw@iplblob.blob.core.windows.net,
/Volumes,UnityCatalogVolumes,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/mnt/adls_storage/silver,wasbs://silver-ipl@adlsipl.blob.core.windows.net,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/mnt/adls_storage/bronze,wasbs://bronze-ipl@adlsipl.blob.core.windows.net,
/Volume,DbfsReserved,
/volumes,DbfsReserved,


In [0]:
# Show what currently exists under the Silver mount.
silver_listing = dbutils.fs.ls("/mnt/adls_storage/silver")
display(silver_listing)

path,name,size,modificationTime
dbfs:/mnt/adls_storage/silver/match_performance_silver.parquet/,match_performance_silver.parquet/,0,0
dbfs:/mnt/adls_storage/silver/match_stadium_silver.parquet/,match_stadium_silver.parquet/,0,0
dbfs:/mnt/adls_storage/silver/player_performance_silver.parquet/,player_performance_silver.parquet/,0,0
dbfs:/mnt/adls_storage/silver/player_team_silver.parquet/,player_team_silver.parquet/,0,0
dbfs:/mnt/adls_storage/silver/stadium_silver.parquet/,stadium_silver.parquet/,0,0
dbfs:/mnt/adls_storage/silver/team_silver.parquet/,team_silver.parquet/,0,0


In [0]:
# Persist every cleaned table to the Silver mount as Parquet.
# Each entry pairs a cleaned DataFrame with its destination path; the writes
# run in the order listed and replace whatever is already at the path.
silver_targets = (
    (df_match_performance_cleaned, "/mnt/adls_storage/silver/match_performance_silver.parquet"),
    (df_match_stadium_cleaned, "/mnt/adls_storage/silver/match_stadium_silver.parquet"),
    (df_player_performance_cleaned, "/mnt/adls_storage/silver/player_performance_silver.parquet"),
    (df_player_team_cleaned, "/mnt/adls_storage/silver/player_team_silver.parquet"),
    (df_stadium_cleaned, "/mnt/adls_storage/silver/stadium_silver.parquet"),
    (df_team_cleaned, "/mnt/adls_storage/silver/team_silver.parquet"),
)

for cleaned_df, silver_path in silver_targets:
    cleaned_df.write.format("parquet").mode("overwrite").save(silver_path)

In [0]:
# JDBC connection string for the SQL Server database that receives the Silver tables.
# (Plain string: the previous f-string prefix had no placeholders to format.)
jdbc_url = "jdbc:sqlserver://ipldata123.database.windows.net:1433;databaseName=ipl_data"

# Connection properties passed to every JDBC write below.
# The password is read from a Databricks secret scope instead of being written
# into the notebook, where it is visible to every reader and ends up in
# version control.
# NOTE(review): the password that was previously hard-coded in this cell must
# be treated as compromised and changed on the server. The scope and key names
# below are placeholders - create them (or adjust the names) before running.
connection_properties = {
    "user": "satyamsingh",
    "password": dbutils.secrets.get(scope="ipl-secrets", key="sql-password"),
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}
 
 

In [0]:
# Write the joined master table to silver_db.master_table over JDBC,
# overwriting the existing table.
(
    df_join.write
    .format("jdbc")
    .options(url=jdbc_url, dbtable="silver_db.master_table", **connection_properties)
    .mode("OVERWRITE")
    .save()
)
 

In [0]:
# Write the cleaned match-performance table to silver_db.match_performance
# over JDBC, overwriting the existing table.
(
    df_match_performance_cleaned.write
    .format("jdbc")
    .options(url=jdbc_url, dbtable="silver_db.match_performance", **connection_properties)
    .mode("OVERWRITE")
    .save()
)
 

In [0]:
# Write the cleaned match-stadium mapping to silver_db.match_stadium
# over JDBC, overwriting the existing table.
(
    df_match_stadium_cleaned.write
    .format("jdbc")
    .options(url=jdbc_url, dbtable="silver_db.match_stadium", **connection_properties)
    .mode("OVERWRITE")
    .save()
)
 

In [0]:
# Write the cleaned player-performance table to silver_db.player_performance
# over JDBC, overwriting the existing table.
(
    df_player_performance_cleaned.write
    .format("jdbc")
    .options(url=jdbc_url, dbtable="silver_db.player_performance", **connection_properties)
    .mode("OVERWRITE")
    .save()
)
 

In [0]:
# Write the cleaned player-team table to silver_db.player_team
# over JDBC, overwriting the existing table.
(
    df_player_team_cleaned.write
    .format("jdbc")
    .options(url=jdbc_url, dbtable="silver_db.player_team", **connection_properties)
    .mode("OVERWRITE")
    .save()
)
 

In [0]:
# Write the cleaned stadium table to silver_db.stadium
# over JDBC, overwriting the existing table.
(
    df_stadium_cleaned.write
    .format("jdbc")
    .options(url=jdbc_url, dbtable="silver_db.stadium", **connection_properties)
    .mode("OVERWRITE")
    .save()
)
 

In [0]:
# Write the cleaned team table to silver_db.team
# over JDBC, overwriting the existing table.
(
    df_team_cleaned.write
    .format("jdbc")
    .options(url=jdbc_url, dbtable="silver_db.team", **connection_properties)
    .mode("OVERWRITE")
    .save()
)
 