### Bronze Layer: Ingest races.csv

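Load the raw `races.csv` file from the landing zone, add ingestion metadata (ingestion timestamp and data source), and save the result as Parquet in the bronze (raw) layer.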

In [None]:
# Notebook parameter: declare a text widget named "p_data_source" (empty
# string by default) and read its current value into v_data_source.
dbutils.widgets.text(name="p_data_source", defaultValue="")
v_data_source = dbutils.widgets.get("p_data_source")

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType
from pyspark.sql.functions import current_timestamp, lit, to_timestamp, concat, col

In [None]:
from formula1.formula1_constants import landing_folder_path, raw_folder_path

##### Step 1 - Define schema and read CSV from landing zone

In [None]:
# Explicit schema for races.csv, so column types are declared up front rather
# than inferred. Each entry below is (column name, Spark type, nullable).
# Only raceId is declared non-nullable; "time" is kept as a plain string.
# NOTE(review): Spark may treat every column as nullable when reading from
# files regardless of the flag given here — confirm before relying on it.
races_schema = StructType(fields=[
    StructField(column_name, spark_type, is_nullable)
    for column_name, spark_type, is_nullable in (
        ("raceId", IntegerType(), False),
        ("year", IntegerType(), True),
        ("round", IntegerType(), True),
        ("circuitId", IntegerType(), True),
        ("name", StringType(), True),
        ("date", DateType(), True),
        ("time", StringType(), True),
        ("url", StringType(), True),
    )
])

In [None]:
# Read races.csv from the landing folder using the explicit races_schema.
# header=True tells Spark the first line holds column names, not data.
races_df = spark.read.csv(
    f"{landing_folder_path}/races.csv",
    schema=races_schema,
    header=True,
)

##### Step 2 - Add ingestion metadata

In [None]:
# Stamp every row with the time of ingestion ...
races_df = races_df.withColumn("ingestion_date", current_timestamp())
# ... and with the data source passed in through the notebook widget.
races_df = races_df.withColumn("data_source", lit(v_data_source))

##### Step 3 - Write to bronze/raw layer as parquet

In [None]:
# Persist the DataFrame as Parquet under the raw folder; "overwrite" replaces
# whatever a previous run left at this path.
races_df.write.parquet(f"{raw_folder_path}/races", mode="overwrite")
