## Create Spark session and context

In [1]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Build (or reuse) a session in local mode with two worker threads and
# two shuffle partitions.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", "2")
    .appName("Analysis")
    .master("local[2]")
    .getOrCreate()
)

# YARN alternative: set the master to "yarn" instead of local mode, e.g.
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext



In [2]:
# Execute the shared configuration notebook in this kernel's namespace.
# processed_folder_path and presentation_folder_path are used by later cells
# but are not defined in this notebook, so they are presumably set there —
# TODO confirm.
%run "../includes/configuration"

## Read all the data required

In [3]:
# Generic column names are renamed at load time so that same-named columns
# from different tables can be told apart after the joins.
drivers_df = (
    spark.read.parquet(f"{processed_folder_path}/drivers")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("nationality", "driver_nationality")
)

constructors_df = (
    spark.read.parquet(f"{processed_folder_path}/constructors")
    .withColumnRenamed("name", "team")
)

circuits_df = (
    spark.read.parquet(f"{processed_folder_path}/circuits")
    .withColumnRenamed("location", "circuit_location")
)

races_df = (
    spark.read.parquet(f"{processed_folder_path}/races")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

results_df = (
    spark.read.parquet(f"{processed_folder_path}/results")
    .withColumnRenamed("time", "race_time")
)

## Join circuits to races

In [4]:
# Inner-join every race to its circuit on circuit_id, then keep only the race
# details plus the circuit location.
race_circuits_df = (
    races_df
    .join(
        circuits_df,
        on=races_df["circuit_id"] == circuits_df["circuit_id"],
        how="inner",
    )
    .select(
        races_df["race_id"],
        races_df["race_year"],
        races_df["race_name"],
        races_df["race_date"],
        circuits_df["circuit_location"],
    )
)

In [5]:
# Print the first five rows of the race/circuit join as a quick sanity check.
race_circuits_df.show(5)

+-------+---------+--------------------+-------------------+----------------+
|race_id|race_year|           race_name|          race_date|circuit_location|
+-------+---------+--------------------+-------------------+----------------+
|   1053|     2021|Emilia Romagna Gr...|2021-04-18 13:00:00|           Imola|
|   1052|     2021|  Bahrain Grand Prix|2021-03-28 15:00:00|          Sakhir|
|   1051|     2021|Australian Grand ...|2021-11-21 06:00:00|       Melbourne|
|   1054|     2021|                 TBC|               null|         Nürburg|
|   1055|     2021|  Spanish Grand Prix|2021-05-09 13:00:00|        Montmeló|
+-------+---------+--------------------+-------------------+----------------+
only showing top 5 rows



In [6]:
## Join results to all other dataframes

In [7]:
# Attach race/circuit, driver and constructor details to every result row.
# Each join is an inner join on the matching id column of results_df.
race_results_df = (
    results_df
    .join(
        race_circuits_df,
        on=results_df["race_id"] == race_circuits_df["race_id"],
        how="inner",
    )
    .join(
        drivers_df,
        on=results_df["driver_id"] == drivers_df["driver_id"],
        how="inner",
    )
    .join(
        constructors_df,
        on=results_df["constructor_id"] == constructors_df["constructor_id"],
        how="inner",
    )
)

In [8]:
# List the joined column names to spot duplicates. The joins compare key
# columns with explicit equality expressions, so same-named columns from both
# sides are all kept (race_id, driver_id, constructor_id and ingestion_date
# each appear more than once in the output below).
race_results_df.columns

['result_id',
 'race_id',
 'driver_id',
 'constructor_id',
 'number',
 'grid',
 'position',
 'position_text',
 'position_order',
 'points',
 'laps',
 'race_time',
 'milliseconds',
 'fastest_lap',
 'rank',
 'fastest_lap_time',
 'fastest_lap_speed',
 'ingestion_date',
 'race_id',
 'race_year',
 'race_name',
 'race_date',
 'circuit_location',
 'driver_id',
 'driver_ref',
 'driver_number',
 'code',
 'driver_name',
 'dob',
 'driver_nationality',
 'ingestion_date',
 'constructor_id',
 'constructor_ref',
 'team',
 'nationality',
 'ingestion_date']

In [9]:
# Keep only the presentation columns (all of them unambiguous by name) and
# stamp each row with the time this DataFrame was built.
final_df = (
    race_results_df
    .select(
        "race_year",
        "race_name",
        "race_date",
        "circuit_location",
        "driver_name",
        "driver_number",
        "driver_nationality",
        "team",
        "grid",
        "fastest_lap",
        "race_time",
        "points",
        "position",
    )
    .withColumn("created_date", current_timestamp())
)

In [18]:
# Confirm the result holds only the selected columns plus created_date.
final_df.columns

['race_year',
 'race_name',
 'race_date',
 'circuit_location',
 'driver_name',
 'driver_number',
 'driver_nationality',
 'team',
 'grid',
 'fastest_lap',
 'race_time',
 'points',
 'position',
 'created_date']

In [10]:
# Spot-check one race: the five highest-scoring results of the 2020 Abu Dhabi Grand Prix.
(
    final_df
    .filter("race_year == 2020 and race_name == 'Abu Dhabi Grand Prix'")
    .orderBy(final_df["points"].desc())
    .show(5)
)

+---------+--------------------+-------------------+----------------+---------------+-------------+------------------+--------+----+-----------+-----------+------+--------+--------------------+
|race_year|           race_name|          race_date|circuit_location|    driver_name|driver_number|driver_nationality|    team|grid|fastest_lap|  race_time|points|position|        created_date|
+---------+--------------------+-------------------+----------------+---------------+-------------+------------------+--------+----+-----------+-----------+------+--------+--------------------+
|     2020|Abu Dhabi Grand Prix|2020-12-13 13:10:00|       Abu Dhabi| Max Verstappen|           33|             Dutch|Red Bull|   1|         14|1:36:28.645|  25.0|       1|2023-08-20 17:41:...|
|     2020|Abu Dhabi Grand Prix|2020-12-13 13:10:00|       Abu Dhabi|Valtteri Bottas|           77|           Finnish|Mercedes|   2|         40|    +15.976|  18.0|       2|2023-08-20 17:41:...|
|     2020|Abu Dhabi Grand Pri

In [11]:
# Persist the presentation table as parquet, replacing any previous output.
final_df.write.parquet(f"{presentation_folder_path}/race_results", mode="overwrite")