### Ingest circuits.csv file

In [1]:
# Declare the "p_data_source" text widget (second argument "" is its default value)
# so the notebook can be parameterised by the caller.
dbutils.widgets.text("p_data_source", "")
# Read the widget's current value; it is stamped onto every row later as `data_source`.
v_data_source = dbutils.widgets.get("p_data_source")

To use databricks widgets interactively in your notebook, please install databricks sdk using:
	pip install 'databricks-sdk[notebook]'
Falling back to default_value_only implementation for databricks widgets.


In [2]:
from formula1.formula1_constants import raw_folder_path, processed_folder_path, presentation_folder_path

In [3]:
from formula1.formula1_utils import add_ingestion_date, re_arrange_partition_column, df_column_to_list

##### Step 1 - Read the CSV file using the Spark DataFrame reader

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [6]:
# Explicit schema for circuits.csv: column name, Spark type, nullable flag.
# Only the identifier column is declared non-nullable.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [7]:
# Load the raw circuits file, treating the first row as a header and applying
# the explicit schema instead of letting Spark infer column types.
circuits_df = spark.read.csv(
    f"{raw_folder_path}/circuits.csv",
    schema=circuits_schema,
    header=True,
)

##### Step 2 - Select only the required columns

In [8]:
from pyspark.sql.functions import col

In [9]:
# Keep every column of the raw frame except `url`, in the original order.
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

##### Step 3 - Rename the columns as required

In [10]:
from pyspark.sql.functions import lit

In [11]:
# Switch the camelCase / abbreviated source names to snake_case, then tag each
# row with the data source supplied through the notebook widget.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    .withColumn("data_source", lit(v_data_source))
)

##### Step 4 - Add ingestion date to the dataframe

In [12]:
# `add_ingestion_date` is a project helper imported from formula1.formula1_utils.
# NOTE(review): presumably it appends an `ingestion_date` timestamp column
# (the captured output below shows one) — confirm against the helper's source.
circuits_final_df = add_ingestion_date(circuits_renamed_df)

##### Step 5 - Write data to the data lake as Parquet

In [13]:
# Persist the final frame as Parquet under the processed folder; "overwrite"
# replaces any existing output at that path, so re-running the notebook is safe.
circuits_final_df.write.parquet(
    f"{processed_folder_path}/circuits",
    mode="overwrite",
)

In [None]:
# Optional sanity check, left disabled: uncomment to read the Parquet output
# back from the processed folder and preview it.
#display(spark.read.parquet(f"{processed_folder_path}/circuits"))

Unnamed: 0,circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,ingestion_date
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,,2026-01-18 18:03:03.213112
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,,2026-01-18 18:03:03.213112
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,,2026-01-18 18:03:03.213112
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,,2026-01-18 18:03:03.213112
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,,2026-01-18 18:03:03.213112
5,6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,,2026-01-18 18:03:03.213112
6,7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,,2026-01-18 18:03:03.213112
7,8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,,2026-01-18 18:03:03.213112
8,9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,,2026-01-18 18:03:03.213112
9,10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,,2026-01-18 18:03:03.213112
