## Create Spark session and context

In [1]:
# Entry point for Spark 2.x and later: SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Local-mode session named "Analysis" with spark.sql.shuffle.partitions set to "2".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Analysis")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

# Alternative for a YARN cluster (note .master("yarn") and Hive support):
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext



In [2]:
# Run the shared configuration file located at ../includes/configuration.
# NOTE(review): presumably this is where processed_folder_path (used when
# loading the race results) gets defined — confirm against that file.
%run "../includes/configuration"

In [3]:
# Read the calculated_race_results parquet dataset from the folder named by
# processed_folder_path.
calculated_race_results = spark.read.parquet(
    f"{processed_folder_path}/calculated_race_results"
)

In [4]:
# Register the DataFrame as a temp view (same name) so the SQL cells below can
# refer to it in their FROM clause.
calculated_race_results.createOrReplaceTempView(
    "calculated_race_results"
)

## Dominant drivers of all time

In [5]:
# All-time ranking: per driver, the number of races plus total and average
# calculated points, keeping only drivers with at least `min_races` races and
# ordering by average points (highest first).
min_races = 50  # qualification threshold applied in the HAVING clause

# No trailing ';' inside the SQL text: it is not part of the statement and
# some Spark versions' parsers reject it when passed to spark.sql().
query = f"""
SELECT
    driver_name,
    COUNT(1) AS total_races,
    SUM(calculated_points) AS total_points,
    AVG(calculated_points) AS avg_points
FROM calculated_race_results
GROUP BY driver_name
HAVING total_races >= {min_races}
ORDER BY avg_points DESC
"""

# Execute the query so this section actually displays its ranking; `query` is
# reassigned by the following section.
spark.sql(query).show()

## Dominant drivers of the last decade (2011 - 2020)

In [26]:
# Ranking window and qualification threshold as named values, so the period or
# the minimum race count can be changed without editing the SQL text.
decade_start, decade_end = 2011, 2020  # inclusive bounds used by BETWEEN
min_races = 50                         # minimum races for a driver to be ranked

# Per driver: race count, total and average calculated points within the
# window, ordered by average points (highest first).
# No trailing ';' inside the SQL text: it is not part of the statement and
# some Spark versions' parsers reject it when passed to spark.sql().
query = f"""
SELECT
    driver_name,
    COUNT(1) AS total_races,
    SUM(calculated_points) AS total_points,
    AVG(calculated_points) AS avg_points
FROM calculated_race_results
WHERE race_year BETWEEN {decade_start} AND {decade_end}
GROUP BY driver_name
HAVING total_races >= {min_races}
ORDER BY avg_points DESC
"""

In [27]:
# Run the SQL held in `query` and print the resulting ranking as a text table.
driver_ranking = spark.sql(query)
driver_ranking.show()

+----------------+-----------+------------+------------------+
|     driver_name|total_races|total_points|        avg_points|
+----------------+-----------+------------+------------------+
|  Lewis Hamilton|        177|        1478| 8.350282485875706|
|Sebastian Vettel|        164|        1282| 7.817073170731708|
|    Nico Rosberg|         93|         673| 7.236559139784946|
|  Max Verstappen|         88|         605|             6.875|
| Valtteri Bottas|        117|         793| 6.777777777777778|
| Fernando Alonso|         95|         584| 6.147368421052631|
|  Kimi Räikkönen|        121|         721| 5.958677685950414|
|Daniel Ricciardo|        111|         639| 5.756756756756757|
|   Jenson Button|         69|         377| 5.463768115942029|
|    Felipe Massa|         94|         427| 4.542553191489362|
| Romain Grosjean|         59|         244| 4.135593220338983|
|    Sergio Pérez|        116|         446|3.8448275862068964|
|    Carlos Sainz|         65|         244| 3.753846153

## Dominant drivers of decade 2001 - 2010

In [29]:
# Ranking window and qualification threshold as named values, so the period or
# the minimum race count can be changed without editing the SQL text.
decade_start, decade_end = 2001, 2010  # inclusive bounds used by BETWEEN
min_races = 50                         # minimum races for a driver to be ranked

# Per driver: race count, total and average calculated points within the
# window, ordered by average points (highest first).
# No trailing ';' inside the SQL text: it is not part of the statement and
# some Spark versions' parsers reject it when passed to spark.sql().
query = f"""
SELECT
    driver_name,
    COUNT(1) AS total_races,
    SUM(calculated_points) AS total_points,
    AVG(calculated_points) AS avg_points
FROM calculated_race_results
WHERE race_year BETWEEN {decade_start} AND {decade_end}
GROUP BY driver_name
HAVING total_races >= {min_races}
ORDER BY avg_points DESC
"""

In [30]:
# Run the SQL held in `query` and print the resulting ranking as a text table.
driver_ranking = spark.sql(query)
driver_ranking.show()

+--------------------+-----------+------------+------------------+
|         driver_name|total_races|total_points|        avg_points|
+--------------------+-----------+------------+------------------+
|  Michael Schumacher|        104|         832|               8.0|
|      Lewis Hamilton|         55|         425|7.7272727272727275|
|  Juan Pablo Montoya|         58|         427| 7.362068965517241|
|     Fernando Alonso|        114|         827| 7.254385964912281|
|      Kimi Räikkönen|        106|         758| 7.150943396226415|
|  Rubens Barrichello|        113|         696|  6.15929203539823|
|       Jenson Button|        102|         601| 5.892156862745098|
|        Felipe Massa|         93|         533| 5.731182795698925|
|     Ralf Schumacher|         70|         399|               5.7|
|       Robert Kubica|         51|         283| 5.549019607843137|
|     David Coulthard|         79|         424| 5.367088607594937|
|        Jarno Trulli|         83|         409| 4.927710843373