## Read Data

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Explicit schema for the races CSV: integer identifiers/counters, everything else as text.
# Only raceId is declared non-nullable; date and time are kept as plain strings at this stage.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", StringType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load races.csv from the raw container, treating the first line as a header
# and applying the explicit schema instead of letting Spark infer one.
races_df = (
    spark.read
    .schema(races_schema)
    .option("header", True)
    .csv("abfss://raw@formula1project2025.dfs.core.windows.net/races.csv")
)

# Show the column names and types of the loaded DataFrame as a quick sanity check.
races_df.printSchema()

## Transform Data

In [0]:
# Keep only the columns used downstream; "url" is left out of the projection.
races_selected_df = races_df.select(
    ["raceId", "year", "round", "circuitId", "name", "date", "time"]
)

In [0]:
# Rename the camelCase identifier columns to snake_case (and year -> race_year);
# the remaining columns keep their names.
races_renamed_df = (
    races_selected_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)

In [0]:
# Add two derived columns:
#   - ingestion_date: the current timestamp at the time the query runs
#   - race_timestamp: "<date> <time>" joined with a single space and parsed as yyyy-MM-dd HH:mm:ss
# NOTE(review): how rows whose date/time text does not match the pattern should be
# handled is not visible from this cell — TODO confirm against the source data.
races_new_cols_df = (
    races_renamed_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "race_timestamp",
        to_timestamp(concat("date", lit(" "), "time"), "yyyy-MM-dd HH:mm:ss"),
    )
)

In [0]:
# The separate date and time text columns have been combined into race_timestamp,
# so remove them from the final DataFrame.
races_final_df = races_new_cols_df.drop("date").drop("time")

In [0]:
# Render the final DataFrame in the notebook to inspect it before it is written out.
display(races_final_df)

## Write Data

In [0]:
# Write the final races data as Parquet to the processed container,
# partitioned by race_year and using the "overwrite" save mode.
(
    races_final_df.write
    .partitionBy("race_year")
    .mode("overwrite")
    .parquet("abfss://processed@formula1project2025.dfs.core.windows.net/races")
)

In [0]:
# Read the Parquet output back from the processed container and render it,
# to confirm what was written.
display(
    spark.read
    .format("parquet")
    .load("abfss://processed@formula1project2025.dfs.core.windows.net/races")
)