In [0]:
# Declare a text widget (default: empty string) used to pass a data-source label into this notebook.
# NOTE(review): the widget name "p_data_souce" looks like a typo of "p_data_source". The same
# spelling is read back by dbutils.widgets.get below, so a rename must be applied to both cells
# and to every caller that passes this parameter — confirm before changing.
dbutils.widgets.text("p_data_souce", "")

In [0]:
# Read the widget's current value; it is later written into the "data_source" column.
# NOTE(review): "p_data_souce" (sic) must match the name used when the widget is declared above.
v_data_source = dbutils.widgets.get("p_data_souce")

In [0]:
%run "../Includes/configs"

In [0]:
%run "../SetUp/setup"

### Access Azure Data Lake using Service Principal
**Steps to follow:**
1. Register an Azure AD Application / Service Principal
2. Generate a secret (password) for the application
3. Set the Spark config with the App / Client Id, Directory / Tenant Id and Secret
4. Assign the "Storage Blob Data Contributor" role on the Data Lake to the Service Principal

path,name,size,modificationTime
abfss://raw@forrmulaa1dl.dfs.core.windows.net/circuits.csv,circuits.csv,10044,1767877118000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/constructors.json,constructors.json,30415,1767877118000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/drivers.json,drivers.json,180812,1767877118000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/lap_times/,lap_times/,0,1767877145000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/pit_stops.json,pit_stops.json,1369387,1767877119000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/qualifying/,qualifying/,0,1767877183000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/races.csv,races.csv,116847,1767877118000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/results.json,results.json,7165641,1767877120000


_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8
circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,http://en.wikipedia.org/wiki/Circuit_de_Monaco
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,http://en.wikipedia.org/wiki/Circuit_Gilles_Villeneuve
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,http://en.wikipedia.org/wiki/Circuit_de_Nevers_Magny-Cours
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,http://en.wikipedia.org/wiki/Silverstone_Circuit


**Read the JSON file using the Spark DataFrame reader**

In [0]:
# Alternative: define the schema as a DDL-style string instead of a StructType:
#constructor_schema = "constructorId INT, constructorRef STRING, name STRING, nationality STRING, url STRING"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema applied when reading constructors.json, so Spark does not infer types.
# NOTE(review): constructorId is declared non-nullable; Spark file readers may relax
# user-supplied nullability to nullable on load — confirm if this is relied upon.
constructor_schema = (
    StructType()
    .add("constructorId", IntegerType(), nullable=False)
    .add("constructorRef", StringType(), nullable=True)
    .add("name", StringType(), nullable=True)
    .add("nationality", StringType(), nullable=True)
    .add("url", StringType(), nullable=True)
)

In [0]:
# Load constructors.json with the explicit schema defined above.
# raw_folder_path is not defined in this notebook — presumably provided by one of the %run includes.
constructor_df = (
    spark.read
    .schema(constructor_schema)
    .json(f"{raw_folder_path}/constructors.json")
)

In [0]:
# Preview the constructors data as loaded with the explicit schema.
display(constructor_df)

constructorId,constructorRef,name,nationality,url
1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering
4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formula_One
5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso
6,ferrari,Ferrari,Italian,http://en.wikipedia.org/wiki/Scuderia_Ferrari
7,toyota,Toyota,Japanese,http://en.wikipedia.org/wiki/Toyota_Racing
8,super_aguri,Super Aguri,Japanese,http://en.wikipedia.org/wiki/Super_Aguri_F1
9,red_bull,Red Bull,Austrian,http://en.wikipedia.org/wiki/Red_Bull_Racing
10,force_india,Force India,Indian,http://en.wikipedia.org/wiki/Racing_Point_Force_India


**Drop the unwanted columns from the dataframe**

In [0]:
from pyspark.sql.functions import col, current_timestamp, lit

In [0]:
# Remove the "url" column, which is not carried into the processed output.
constructor_dropped_df = constructor_df.drop("url")

In [0]:
# Preview the DataFrame after the "url" column has been dropped.
display(constructor_dropped_df)

constructorId,constructorRef,name,nationality
1,mclaren,McLaren,British
2,bmw_sauber,BMW Sauber,German
3,williams,Williams,British
4,renault,Renault,French
5,toro_rosso,Toro Rosso,Italian
6,ferrari,Ferrari,Italian
7,toyota,Toyota,Japanese
8,super_aguri,Super Aguri,Japanese
9,red_bull,Red Bull,Austrian
10,force_india,Force India,Indian


In [0]:
# Rename the camelCase columns to snake_case, then add two audit columns:
#   ingestion_date - timestamp taken from current_timestamp()
#   data_source    - literal value read from the notebook widget (v_data_source)
constructor_final_df = (
    constructor_dropped_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit(v_data_source))
)

In [0]:
# Preview the final DataFrame: renamed columns plus ingestion_date and data_source.
display(constructor_final_df)

constructor_id,constructor_ref,name,nationality,ingestion_date,data_source
1,mclaren,McLaren,British,2026-01-10T11:27:53.294984Z,Ergast API
2,bmw_sauber,BMW Sauber,German,2026-01-10T11:27:53.294984Z,Ergast API
3,williams,Williams,British,2026-01-10T11:27:53.294984Z,Ergast API
4,renault,Renault,French,2026-01-10T11:27:53.294984Z,Ergast API
5,toro_rosso,Toro Rosso,Italian,2026-01-10T11:27:53.294984Z,Ergast API
6,ferrari,Ferrari,Italian,2026-01-10T11:27:53.294984Z,Ergast API
7,toyota,Toyota,Japanese,2026-01-10T11:27:53.294984Z,Ergast API
8,super_aguri,Super Aguri,Japanese,2026-01-10T11:27:53.294984Z,Ergast API
9,red_bull,Red Bull,Austrian,2026-01-10T11:27:53.294984Z,Ergast API
10,force_india,Force India,Indian,2026-01-10T11:27:53.294984Z,Ergast API


**Write the output in Parquet format**

In [0]:
# Persist the final DataFrame as Parquet under "<processed_folder_path>/constructors".
# mode("overwrite") replaces whatever already exists at that location, so re-runs do not append.
# processed_folder_path is not defined in this notebook — presumably provided by a %run include.
(
    constructor_final_df.write
    .mode("overwrite")
    .parquet(f"{processed_folder_path}/constructors")
)