## Create the Spark session and context

In [12]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Build (or reuse) a SparkSession that runs locally on two threads, with the
# number of shuffle partitions set to 2.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", "2")
    .appName("Analysis")
    .master("local[2]")
    .getOrCreate()
)

# Alternative for a YARN cluster with Hive support (use instead of the builder above):
# spark = (
#     SparkSession.builder
#     .appName("Spark SQL basic example")
#     .enableHiveSupport()
#     .master("yarn")
#     .getOrCreate()
# )

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [3]:
# Execute the shared configuration notebook in this kernel's namespace.
# NOTE(review): presumably this is what defines `presentation_folder_path`,
# which later cells use but never assign — confirm against the included file.
%run "../includes/configuration"

## Read the data required

In [4]:
# Load the race results dataset (parquet) from the presentation folder.
race_results_df = (
    spark.read
    .parquet(f"{presentation_folder_path}/race_results")
)

In [5]:
# Preview the first five rows to check the loaded columns.
race_results_df.show(n=5)

+---------+--------------------+-------------------+----------------+---------------+-------------+------------------+------------+----+-----------+---------+------+--------+--------------------+
|race_year|           race_name|          race_date|circuit_location|    driver_name|driver_number|driver_nationality|        team|grid|fastest_lap|race_time|points|position|        created_date|
+---------+--------------------+-------------------+----------------+---------------+-------------+------------------+------------+----+-----------+---------+------+--------+--------------------+
|     2018|Australian Grand ...|2018-03-25 05:10:00|       Melbourne|Sergey Sirotkin|           35|           Russian|    Williams|  19|          3|       \N|   0.0|    null|2023-08-17 15:42:...|
|     2018|Australian Grand ...|2018-03-25 05:10:00|       Melbourne|Marcus Ericsson|            9|           Swedish|      Sauber|  17|          4|       \N|   0.0|    null|2023-08-17 15:42:...|
|     2018|Australia

In [10]:
# Aggregate race results into one standings row per (year, driver, nationality, team):
#   total_points - sum of the points scored
#   wins         - number of results with position == 1 (when() yields null for
#                  every other row, and count() skips nulls)
# `sum` and `count` here are the Spark SQL functions brought in by the star
# import, not the Python builtins.
# NOTE(review): because `team` is a grouping key, a driver who raced for more
# than one team in a season gets a separate row per team — confirm this is intended.
driver_standings_df = (
    race_results_df
    .groupBy("race_year", "driver_name", "driver_nationality", "team")
    .agg(
        sum("points").alias("total_points"),
        count(when(col("position") == 1, True)).alias("wins"),
    )
)

In [11]:
# Spot-check the aggregated standings for the 2020 season.
driver_standings_df.where("race_year == 2020").show()

+---------+------------------+------------------+------------+------------+----+
|race_year|       driver_name|driver_nationality|        team|total_points|wins|
+---------+------------------+------------------+------------+------------+----+
|     2020|  Daniel Ricciardo|        Australian|     Renault|       119.0|   0|
|     2020|   Romain Grosjean|            French|Haas F1 Team|         2.0|   0|
|     2020|    Kimi Räikkönen|           Finnish|  Alfa Romeo|         4.0|   0|
|     2020|      Esteban Ocon|            French|     Renault|        62.0|   0|
|     2020|      Sergio Pérez|           Mexican|Racing Point|       125.0|   1|
|     2020|      Carlos Sainz|           Spanish|     McLaren|       105.0|   0|
|     2020|   Nico Hülkenberg|            German|Racing Point|        10.0|   0|
|     2020| Pietro Fittipaldi|         Brazilian|Haas F1 Team|         0.0|   0|
|     2020|    George Russell|           British|    Mercedes|         3.0|   0|
|     2020|    Max Verstappe

In [14]:
# Rank drivers within each season: most points first, with wins as the tie-breaker.
# rank() gives tied rows the same rank and leaves a gap after them (e.g. 6, 6, 8).
driver_rank_spec = (
    Window
    .partitionBy("race_year")
    .orderBy(col("total_points").desc(), col("wins").desc())
)

final_df = driver_standings_df.withColumn(
    "rank",
    rank().over(driver_rank_spec),
)

In [16]:
# Spot-check the ranked standings for the 2020 season.
final_df.where("race_year == 2020").show()

+---------+------------------+------------------+------------+------------+----+----+
|race_year|       driver_name|driver_nationality|        team|total_points|wins|rank|
+---------+------------------+------------------+------------+------------+----+----+
|     2020|    Lewis Hamilton|           British|    Mercedes|       347.0|  11|   1|
|     2020|   Valtteri Bottas|           Finnish|    Mercedes|       223.0|   2|   2|
|     2020|    Max Verstappen|             Dutch|    Red Bull|       214.0|   2|   3|
|     2020|      Sergio Pérez|           Mexican|Racing Point|       125.0|   1|   4|
|     2020|  Daniel Ricciardo|        Australian|     Renault|       119.0|   0|   5|
|     2020|      Carlos Sainz|           Spanish|     McLaren|       105.0|   0|   6|
|     2020|   Alexander Albon|              Thai|    Red Bull|       105.0|   0|   6|
|     2020|   Charles Leclerc|        Monegasque|     Ferrari|        98.0|   0|   8|
|     2020|      Lando Norris|           British|     

In [17]:
# Persist the ranked standings as parquet in the presentation folder,
# replacing any output already present at that path.
final_df.write.parquet(
    f"{presentation_folder_path}/driver_standings",
    mode="overwrite",
)