### Ingest results.json file

##### Step 1 - Read the JSON file using the Spark DataFrameReader API

In [1]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [2]:
# Explicit schema for results.json, so Spark does not have to infer types.
# Only resultId is declared non-nullable; every other field keeps the
# default nullable=True of StructType.add().
results_schema = (
    StructType()
    .add("resultId", IntegerType(), False)
    .add("raceId", IntegerType())
    .add("driverId", IntegerType())
    .add("constructorId", IntegerType())
    .add("number", IntegerType())
    .add("grid", IntegerType())
    .add("position", IntegerType())
    .add("positionText", StringType())
    .add("positionOrder", IntegerType())
    .add("points", FloatType())
    .add("laps", IntegerType())
    .add("time", StringType())
    .add("milliseconds", IntegerType())
    .add("fastestLap", IntegerType())
    .add("rank", IntegerType())
    .add("fastestLapTime", StringType())
    .add("fastestLapSpeed", FloatType())
    .add("statusId", StringType())  # dropped again in Step 3
)

In [3]:
# Load the raw results file, applying the explicit schema defined above
# rather than letting the reader infer one.
results_df = (
    spark.read
    .schema(results_schema)
    .json("/mnt/gualterformula1dl/raw/results.json")
)

##### Step 2 - Rename columns and add new columns

In [4]:
from pyspark.sql.functions import current_timestamp

In [5]:
# camelCase source column -> snake_case target column.
# Columns not listed here keep their original names.
column_renames = {
    "resultId": "result_id",
    "raceId": "race_id",
    "driverId": "driver_id",
    "constructorId": "constructor_id",
    "positionText": "position_text",
    "positionOrder": "position_order",
    "fastestLap": "fastest_lap",
    "fastestLapTime": "fastest_lap_time",
    "fastestLapSpeed": "fastest_lap_speed",
}

# Apply the renames one at a time, in the order listed above.
results_with_columns_df = results_df
for source_name, target_name in column_renames.items():
    results_with_columns_df = results_with_columns_df.withColumnRenamed(
        source_name, target_name
    )

# Add an ingestion_date column populated from current_timestamp().
results_with_columns_df = results_with_columns_df.withColumn(
    "ingestion_date", current_timestamp()
)

##### Step 3 - Drop the unwanted column

In [6]:
from pyspark.sql.functions import col

In [7]:
# statusId is the only column removed; all other columns pass through.
unwanted_column = col("statusId")
results_final_df = results_with_columns_df.drop(unwanted_column)

##### Step 4 - Write the output to the processed container in Parquet format

In [8]:
# Write the final DataFrame as Parquet, partitioned by race_id.
# "overwrite" mode replaces whatever already exists at the target path.
(
    results_final_df.write
    .mode("overwrite")
    .partitionBy('race_id')
    .parquet("/mnt/gualterformula1dl/processed/results")
)