In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.functions import *
spark = SparkSession.builder.config("spark.sql.shuffle.partitions", "2").appName("Analysis").master("local[2]").getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext



In [3]:
%run "../includes/configuration"

In [4]:
race_results_df = spark.read.parquet(f"{presentation_folder_path}/race_results")

In [5]:
race_results_df.count()

24960

In [6]:
demo_df = race_results_df.filter("race_year == 2020")

In [7]:
demo_df.count()

340

In [8]:
demo_df.select(count("*")).show()

+--------+
|count(1)|
+--------+
|     340|
+--------+



In [9]:
demo_df.select(countDistinct("race_name")).show()

+-------------------------+
|count(DISTINCT race_name)|
+-------------------------+
|                       17|
+-------------------------+



In [10]:
demo_df.filter("driver_name == 'Lewis Hamilton'").select(sum("points")).show()

+-----------+
|sum(points)|
+-----------+
|      347.0|
+-----------+



In [11]:
demo_df.filter("driver_name == 'Lewis Hamilton'").select(sum("points"), countDistinct("race_name"))\
.withColumnRenamed("sum(points)", "total_points").withColumnRenamed("count(DISTINCT race_name)", "number of races").show()

+------------+---------------+
|total_points|number of races|
+------------+---------------+
|       347.0|             16|
+------------+---------------+



In [12]:
demo_df.groupBy("driver_name").sum("points").show()

+------------------+-----------+
|       driver_name|sum(points)|
+------------------+-----------+
|  Daniel Ricciardo|      119.0|
|      Lance Stroll|       75.0|
|   Kevin Magnussen|        1.0|
|      Sergio Pérez|      125.0|
|    Lewis Hamilton|      347.0|
|      Lando Norris|       97.0|
|   Valtteri Bottas|      223.0|
|       Jack Aitken|        0.0|
|    Max Verstappen|      214.0|
|   Romain Grosjean|        2.0|
|    George Russell|        3.0|
|    Kimi Räikkönen|        4.0|
|   Alexander Albon|      105.0|
|      Daniil Kvyat|       32.0|
|   Nicholas Latifi|        0.0|
|  Sebastian Vettel|       33.0|
|Antonio Giovinazzi|        4.0|
|      Esteban Ocon|       62.0|
|      Pierre Gasly|       75.0|
|      Carlos Sainz|      105.0|
+------------------+-----------+
only showing top 20 rows



In [13]:
demo_df.groupBy("driver_name").agg(sum("points").alias("total_points"), countDistinct("race_name").alias("number_of_races")).show()

+------------------+------------+---------------+
|       driver_name|total_points|number_of_races|
+------------------+------------+---------------+
|  Daniel Ricciardo|       119.0|             17|
|      Lance Stroll|        75.0|             16|
|      Lando Norris|        97.0|             17|
|      Sergio Pérez|       125.0|             15|
|   Valtteri Bottas|       223.0|             17|
|   Kevin Magnussen|         1.0|             17|
|    Lewis Hamilton|       347.0|             16|
|       Jack Aitken|         0.0|              1|
|    George Russell|         3.0|             17|
|   Alexander Albon|       105.0|             17|
|   Nicholas Latifi|         0.0|             17|
|Antonio Giovinazzi|         4.0|             17|
|      Esteban Ocon|        62.0|             17|
|      Pierre Gasly|        75.0|             17|
|  Sebastian Vettel|        33.0|             17|
|   Charles Leclerc|        98.0|             17|
|   Romain Grosjean|         2.0|             15|


In [14]:
## Window functions

In [15]:
demo_df = race_results_df.filter("race_year in (2019, 2020)")

In [16]:
demo_df.show()

+---------+--------------------+-------------------+----------------+------------------+-------------+------------------+------------+----+-----------+-----------+------+--------+--------------------+
|race_year|           race_name|          race_date|circuit_location|       driver_name|driver_number|driver_nationality|        team|grid|fastest_lap|  race_time|points|position|        created_date|
+---------+--------------------+-------------------+----------------+------------------+-------------+------------------+------------+----+-----------+-----------+------+--------+--------------------+
|     2019|Australian Grand ...|2019-03-17 05:10:00|       Melbourne|      Carlos Sainz|           55|           Spanish|     McLaren|  18|          9|         \N|   0.0|    null|2023-08-17 15:42:...|
|     2019|Australian Grand ...|2019-03-17 05:10:00|       Melbourne|  Daniel Ricciardo|            3|        Australian|     Renault|  12|         18|         \N|   0.0|    null|2023-08-17 15:42:

In [17]:
demo_grouped_df = demo_df\
.groupBy("race_year", "driver_name")\
.agg(sum("points").alias("total_points"), countDistinct("race_name").alias("number_of_races"))

In [18]:
demo_grouped_df.show()

+---------+------------------+------------+---------------+
|race_year|       driver_name|total_points|number_of_races|
+---------+------------------+------------+---------------+
|     2019|      Carlos Sainz|        96.0|             21|
|     2019|   Alexander Albon|        92.0|             21|
|     2019|    Kimi Räikkönen|        43.0|             21|
|     2019|   Charles Leclerc|       264.0|             21|
|     2019|    Lewis Hamilton|       413.0|             21|
|     2019|   Romain Grosjean|         8.0|             21|
|     2019|   Nico Hülkenberg|        37.0|             21|
|     2019|      Lance Stroll|        21.0|             21|
|     2019|   Kevin Magnussen|        20.0|             21|
|     2019|    Max Verstappen|       278.0|             21|
|     2019|      Sergio Pérez|        52.0|             21|
|     2020|      Lance Stroll|        75.0|             16|
|     2020|   Kevin Magnussen|         1.0|             17|
|     2020|   Alexander Albon|       105

In [19]:
driverRankSpec = Window.partitionBy("race_year").orderBy(desc("total_points"))

In [20]:
demo_grouped_df.withColumn("rank", rank().over(driverRankSpec)).show()

+---------+------------------+------------+---------------+----+
|race_year|       driver_name|total_points|number_of_races|rank|
+---------+------------------+------------+---------------+----+
|     2019|    Lewis Hamilton|       413.0|             21|   1|
|     2019|   Valtteri Bottas|       326.0|             21|   2|
|     2019|    Max Verstappen|       278.0|             21|   3|
|     2019|   Charles Leclerc|       264.0|             21|   4|
|     2019|  Sebastian Vettel|       240.0|             21|   5|
|     2019|      Carlos Sainz|        96.0|             21|   6|
|     2019|      Pierre Gasly|        95.0|             21|   7|
|     2019|   Alexander Albon|        92.0|             21|   8|
|     2019|  Daniel Ricciardo|        54.0|             21|   9|
|     2019|      Sergio Pérez|        52.0|             21|  10|
|     2019|      Lando Norris|        49.0|             21|  11|
|     2019|    Kimi Räikkönen|        43.0|             21|  12|
|     2019|   Nico Hülken

In [21]:
spark.sql("SELECT * FROM global_temp.gv_race_results")

AnalysisException: Table or view not found: global_temp.gv_race_results; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [global_temp, gv_race_results], [], false
