## Create Spark session and context

In [1]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Local session: master local[2], with spark.sql.shuffle.partitions set to 2.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", "2")
    .appName("Analysis")
    .master("local[2]")
    .getOrCreate()
)

# To run on YARN instead, build the session with .master("yarn"), for example:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# SparkContext that backs the session above.
sc = spark.sparkContext



In [2]:
# Execute the shared configuration file inside this kernel so the names it defines can be used below.
# NOTE(review): presumably this is where processed_folder_path (used when reading the parquet data) is defined — confirm.
%run "../includes/configuration"

In [3]:
# Read the calculated race results (Parquet) from under the processed data folder.
results_path = f"{processed_folder_path}/calculated_race_results"
calculated_race_results = spark.read.parquet(results_path)

In [4]:
# Register the DataFrame as a temporary view so the SQL queries below can reference it by name.
calculated_race_results.createOrReplaceTempView(name="calculated_race_results")

## Dominant teams of all time

In [6]:
# Per-team summary over every row of the calculated_race_results view:
#   total_races  - COUNT(1), i.e. the number of rows for the team
#                  NOTE(review): equals "races entered" only if the view holds one row per team per race — confirm.
#   total_points - sum of calculated_points
#   avg_points   - mean of calculated_points
# Teams with fewer than 100 rows are dropped; the rest are ranked by avg_points, highest first.
query = """
SELECT 
    team_name,
    COUNT(1) AS total_races,
    SUM(calculated_points) AS total_points,
    AVG(calculated_points) AS avg_points
FROM calculated_race_results
GROUP BY team_name
HAVING total_races >= 100
ORDER BY avg_points DESC;
"""

In [7]:
# Run the query defined in the preceding cell and keep the result for export.
top_teams = spark.sql(query)

# Export the result as CSV with a header row.
# Spark's default save mode raises an error when the target directory already exists,
# so re-running this cell (or Restart & Run All) would fail; "overwrite" makes the export repeatable.
# NOTE(review): this is an absolute, machine-specific location — consider moving it to the shared configuration.
top_teams_csv_path = "/home/sunbeam/Desktop/FastLaneForecast/filtered_csv/top_teams"
top_teams.write.mode("overwrite").csv(top_teams_csv_path, header=True)

## Dominant teams of the last decade (2011–2020)

In [7]:
# Same per-team summary, restricted to race_year 2011 through 2020 (BETWEEN is inclusive at both ends):
#   total_races  - COUNT(1), i.e. the number of rows for the team in that period
#                  NOTE(review): equals "races entered" only if the view holds one row per team per race — confirm.
#   total_points - sum of calculated_points
#   avg_points   - mean of calculated_points
# Teams with fewer than 100 rows in the period are dropped; the rest are ranked by avg_points, highest first.
query = """
SELECT 
    team_name,
    COUNT(1) AS total_races,
    SUM(calculated_points) AS total_points,
    AVG(calculated_points) AS avg_points
FROM calculated_race_results
WHERE race_year BETWEEN 2011 AND 2020
GROUP BY team_name
HAVING total_races >= 100
ORDER BY avg_points DESC;
"""

In [8]:
# Run the query currently held in `query` and print the result as a text table.
last_decade_teams = spark.sql(query)
last_decade_teams.show()

+-----------+-----------+------------+------------------+
|  team_name|total_races|total_points|        avg_points|
+-----------+-----------+------------+------------------+
|   Mercedes|        331|        2580| 7.794561933534744|
|   Red Bull|        313|        2201| 7.031948881789138|
|    Ferrari|        314|        2095| 6.671974522292993|
|    McLaren|        204|         984| 4.823529411764706|
|   Williams|        125|         531|             4.248|
|Force India|        189|         671|3.5502645502645502|
| Toro Rosso|        119|         316|2.6554621848739495|
+-----------+-----------+------------+------------------+



## Dominant teams of the decade 2001–2010

In [9]:
# Same per-team summary, restricted to race_year 2001 through 2010 (BETWEEN is inclusive at both ends):
#   total_races  - COUNT(1), i.e. the number of rows for the team in that period
#                  NOTE(review): equals "races entered" only if the view holds one row per team per race — confirm.
#   total_points - sum of calculated_points
#   avg_points   - mean of calculated_points
# Teams with fewer than 100 rows in the period are dropped; the rest are ranked by avg_points, highest first.
query = """
SELECT 
    team_name,
    COUNT(1) AS total_races,
    SUM(calculated_points) AS total_points,
    AVG(calculated_points) AS avg_points
FROM calculated_race_results
WHERE race_year BETWEEN 2001 AND 2010
GROUP BY team_name
HAVING total_races >= 100
ORDER BY avg_points DESC;
"""

In [10]:
# Run the query currently held in `query` and print the result as a text table.
decade_2001_2010_teams = spark.sql(query)
decade_2001_2010_teams.show()

+---------+-----------+------------+------------------+
|team_name|total_races|total_points|        avg_points|
+---------+-----------+------------+------------------+
|  Ferrari|        279|        2125| 7.616487455197133|
|  McLaren|        244|        1729| 7.086065573770492|
|  Renault|        201|        1187|5.9054726368159205|
| Red Bull|        120|         650| 5.416666666666667|
| Williams|        179|         962| 5.374301675977653|
|   Toyota|        130|         535| 4.115384615384615|
|   Sauber|        107|         369|3.4485981308411215|
+---------+-----------+------------+------------------+

