In [0]:
dbutils.widgets.text("p_data_source", "")

In [0]:
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../Includes/configs"

In [0]:
%run "../SetUp/setup"

### Access Azure Data Lake using Service Principal
**Steps to follow:**
1. Register Azure AD Application/ Service Principal
2. Generate a secret/ password for the application
3. Set spark config with App/ Client Id, Directory/ Tenant Id & Secret
4. Assign role "Storage Blob Data Contributor" to the Data Lake

path,name,size,modificationTime
abfss://raw@forrmulaa1dl.dfs.core.windows.net/circuits.csv,circuits.csv,10044,1767877118000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/constructors.json,constructors.json,30415,1767877118000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/drivers.json,drivers.json,180812,1767877118000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/lap_times/,lap_times/,0,1767877145000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/pit_stops.json,pit_stops.json,1369387,1767877119000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/qualifying/,qualifying/,0,1767877183000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/races.csv,races.csv,116847,1767877118000
abfss://raw@forrmulaa1dl.dfs.core.windows.net/results.json,results.json,7165641,1767877120000


_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8
circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,http://en.wikipedia.org/wiki/Circuit_de_Monaco
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,http://en.wikipedia.org/wiki/Circuit_Gilles_Villeneuve
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,http://en.wikipedia.org/wiki/Circuit_de_Nevers_Magny-Cours
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,http://en.wikipedia.org/wiki/Silverstone_Circuit


**Read the JSON file using spark dataframe reader API**

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

In [0]:
pitstops_schema = StructType(fields=[StructField("raceId", IntegerType(), False),
                                     StructField("driverId", IntegerType(), True),
                                     StructField("stop", StringType(), True),
                                     StructField("lap", IntegerType(), True),
                                     StructField("time", IntegerType(), True),
                                     StructField("duration", IntegerType(), True),
                                     StructField("milliseconds", IntegerType(), True)])

In [0]:
pitstops_df = spark.read.option("multiline", True).schema(pitstops_schema).json(f"{raw_folder_path}/pit_stops.json")

In [0]:
display(pitstops_df) #spark by default doesn't deal with multi-line files, so we added option("multiline", True)

raceId,driverId,stop,lap,time,duration,milliseconds
841,153,1,1,,,26898
841,30,1,1,,,25021
841,17,1,11,,,23426
841,4,1,12,,,23251
841,13,1,13,,,23842
841,22,1,13,,,23643
841,20,1,14,,,22603
841,814,1,14,,,24863
841,816,1,14,,,25259
841,67,1,15,,,25342


In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
pitstops_final_df = pitstops_df.withColumnRenamed("raceId", "race_id").withColumnRenamed("driverId", "driver_id").withColumn("ingestion_date", current_timestamp()).withColumn("data_source", lit(v_data_source))

In [0]:
pitstops_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/pit_stops")

In [0]:
display(spark.read.parquet(f"{processed_folder_path}/pit_stops"))

race_id,driver_id,stop,lap,time,duration,milliseconds,ingestion_date,data_source
841,153,1,1,,,26898,2026-01-10T11:37:37.920874Z,Ergast API
841,30,1,1,,,25021,2026-01-10T11:37:37.920874Z,Ergast API
841,17,1,11,,,23426,2026-01-10T11:37:37.920874Z,Ergast API
841,4,1,12,,,23251,2026-01-10T11:37:37.920874Z,Ergast API
841,13,1,13,,,23842,2026-01-10T11:37:37.920874Z,Ergast API
841,22,1,13,,,23643,2026-01-10T11:37:37.920874Z,Ergast API
841,20,1,14,,,22603,2026-01-10T11:37:37.920874Z,Ergast API
841,814,1,14,,,24863,2026-01-10T11:37:37.920874Z,Ergast API
841,816,1,14,,,25259,2026-01-10T11:37:37.920874Z,Ergast API
841,67,1,15,,,25342,2026-01-10T11:37:37.920874Z,Ergast API
