## Create the Spark session and context

In [12]:
# Entry point for Spark 2.x and later: SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Build (or reuse) a local session: master local[2], shuffle partitions set to 2.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Analysis")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

# Alternative for running on YARN (enable Hive support and use .master("yarn")):
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# SparkContext that backs the session.
sc = spark.sparkContext

In [13]:
# Execute ../includes/configuration through IPython's %run magic.
# NOTE(review): presumably this is what defines presentation_folder_path, which
# later cells use to build their read/write paths — confirm against that file.
%run "../includes/configuration"

## Read the required data

In [14]:
# Read the race_results parquet dataset located under presentation_folder_path.
race_results_df = (
    spark.read
    .parquet(f"{presentation_folder_path}/race_results")
)

In [15]:
# Preview the first five rows to check the columns that were loaded.
race_results_df.show(n=5)

+---------+--------------------+-------------------+----------------+---------------+-------------+------------------+------------+----+-----------+---------+------+--------+--------------------+
|race_year|           race_name|          race_date|circuit_location|    driver_name|driver_number|driver_nationality|        team|grid|fastest_lap|race_time|points|position|        created_date|
+---------+--------------------+-------------------+----------------+---------------+-------------+------------------+------------+----+-----------+---------+------+--------+--------------------+
|     2018|Australian Grand ...|2018-03-25 05:10:00|       Melbourne|Sergey Sirotkin|           35|           Russian|    Williams|  19|          3|       \N|   0.0|    null|2023-08-17 15:42:...|
|     2018|Australian Grand ...|2018-03-25 05:10:00|       Melbourne|Marcus Ericsson|            9|           Swedish|      Sauber|  17|          4|       \N|   0.0|    null|2023-08-17 15:42:...|
|     2018|Australia

In [16]:
# Aggregate race results into one row per (race_year, team):
#   total_points - sum of the points column over the group's rows
#   wins         - number of rows whose position equals 1; when() without an
#                  otherwise() yields NULL for non-matching rows and count()
#                  ignores NULLs, so only first-place rows are counted
# The aggregates are called through the F alias so that sum/count are
# explicitly the Spark SQL functions, instead of depending on the wildcard
# import having shadowed Python's built-in sum.
constructor_standings_df = (
    race_results_df
    .groupBy("race_year", "team")
    .agg(
        F.sum("points").alias("total_points"),
        F.count(F.when(F.col("position") == 1, True)).alias("wins"),
    )
)

In [17]:
# Spot-check the aggregated standings for the 2020 season.
constructor_standings_df.where(col("race_year") == 2020).show()

+---------+------------+------------+----+
|race_year|        team|total_points|wins|
+---------+------------+------------+----+
|     2020|     Ferrari|       131.0|   0|
|     2020|     McLaren|       202.0|   0|
|     2020|    Red Bull|       319.0|   2|
|     2020|     Renault|       181.0|   0|
|     2020|Racing Point|       210.0|   1|
|     2020|Haas F1 Team|         3.0|   0|
|     2020|    Williams|         0.0|   0|
|     2020|  Alfa Romeo|         8.0|   0|
|     2020|  AlphaTauri|       107.0|   1|
|     2020|    Mercedes|       573.0|  13|
+---------+------------+------------+----+



In [18]:
# Rank teams within each race_year: highest total_points first, then most wins.
constructor_rank_spec = (
    Window
    .partitionBy("race_year")
    .orderBy(col("total_points").desc(), col("wins").desc())
)

# Attach the rank column computed over that per-year window.
final_df = constructor_standings_df.withColumn(
    "rank",
    rank().over(constructor_rank_spec),
)

In [19]:
# Spot-check the ranked standings for the 2020 season.
final_df.where(col("race_year") == 2020).show()

+---------+------------+------------+----+----+
|race_year|        team|total_points|wins|rank|
+---------+------------+------------+----+----+
|     2020|    Mercedes|       573.0|  13|   1|
|     2020|    Red Bull|       319.0|   2|   2|
|     2020|Racing Point|       210.0|   1|   3|
|     2020|     McLaren|       202.0|   0|   4|
|     2020|     Renault|       181.0|   0|   5|
|     2020|     Ferrari|       131.0|   0|   6|
|     2020|  AlphaTauri|       107.0|   1|   7|
|     2020|  Alfa Romeo|         8.0|   0|   8|
|     2020|Haas F1 Team|         3.0|   0|   9|
|     2020|    Williams|         0.0|   0|  10|
+---------+------------+------------+----+----+



In [20]:
# Write the ranked standings as parquet, overwriting any existing data at the path.
final_df.write.parquet(
    f"{presentation_folder_path}/constructor_standings",
    mode="overwrite",
)