In [None]:
# what is the relationship between the results and circuits? 

In [1]:
# import packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# import datasets

results_raw = pd.read_csv("data_raw/results.csv")
circuits_raw = pd.read_csv("data_raw/circuits.csv")
races_raw = pd.read_csv("data_raw/races.csv")
lap_times_raw = pd.read_csv("data_raw/lap_times.csv")
constructors_raw = pd.read_csv("data_raw/constructors.csv")


In [8]:
# rename columns

results_rename = {"time":"results_time", "milliseconds":"results_milliseconds"}
results = results_raw.rename(columns = results_rename)

circuits_rename = {"name":"circuit_name"}
circuits = circuits_raw.rename(columns = circuits_rename)

lap_rename = {"time":"lap_time", "milliseconds":"lap_milliseconds"}
lap_times = lap_times_raw.rename(columns = lap_rename)

constructor_rename = {"name":"constructor_name"}
constructors = constructors_raw.rename(columns = constructor_rename)


In [10]:
# merge datasets

circuits_races_merge = pd.merge(circuits[['circuitId', 'country', 'circuit_name']],
                                races_raw[['raceId', 'year', 'circuitId']],
                                on = "circuitId",
                                how = "left")

circuits_races_merge

circuits_races_results_merge = pd.merge(circuits_races_merge,
                                        results[['raceId', 'constructorId', 'results_milliseconds', 'points', 'fastestLapSpeed', 'statusId']],
                                        on = "raceId",
                                        how = "left")

circuits_races_results_merge

circuits_races_results_constructors_merge = pd.merge(circuits_races_results_merge,
                                                     constructors[['constructorId', 'constructor_name']],
                                                     on = "constructorId",
                                                     how = "left")

formula1_merge = pd.merge(circuits_races_results_constructors_merge,
                          lap_times[['raceId', 'driverId', 'lap_milliseconds']],
                          on = "raceId",
                          how = "left")

formula1_merge


Unnamed: 0,circuitId,country,circuit_name,raceId,year,constructorId,results_milliseconds,points,fastestLapSpeed,statusId,constructor_name,driverId,lap_milliseconds
0,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,109088.0
1,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,93740.0
2,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,91600.0
3,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,91067.0
4,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,92129.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11479420,79,USA,Miami International Autodrome,1078,2022,51.0,\N,0.0,203.520,47.0,Alfa Romeo,20.0,94298.0
11479421,79,USA,Miami International Autodrome,1078,2022,51.0,\N,0.0,203.520,47.0,Alfa Romeo,20.0,94217.0
11479422,79,USA,Miami International Autodrome,1078,2022,51.0,\N,0.0,203.520,47.0,Alfa Romeo,20.0,93631.0
11479423,79,USA,Miami International Autodrome,1078,2022,51.0,\N,0.0,203.520,47.0,Alfa Romeo,20.0,119970.0


In [103]:
# clean the data (get rid of NaN)

formula1_merge.dtypes

formula1_merge["results_milliseconds"].str.isnumeric()
formula1_merge["fastestLapSpeed"].str.isnumeric()

subset_results = formula1_merge.query("results_milliseconds.str.isnumeric() == False") # tells us which rows have values that are not numeric
list_unique_results_millseconds = pd.unique(subset_results["results_milliseconds"])
print(list_unique_results_millseconds) # tells us what these non-numeric values are

list_old_results = ['\\N']
list_new_results = [np.nan]
formula1_merge["results_milliseconds"] = formula1_merge['results_milliseconds'].replace(list_old_results, list_new_results)
formula1_merge["results_milliseconds_numeric"] = pd.to_numeric(formula1_merge["results_milliseconds"])

subset_speed = formula1_merge.query("fastestLapSpeed.str.isnumeric() == False")
list_unique_speed = pd.unique(subset_speed["fastestLapSpeed"])
print(list_unique_speed)

list_old_speed = ['\\N']
list_new_speed = [np.nan]
formula1_merge["fastestLapSpeed"] = formula1_merge['fastestLapSpeed'].replace(list_old_speed, list_new_speed)

formula1_merge["fastestLapSpeed_numeric"] = pd.to_numeric(formula1_merge["fastestLapSpeed"])
display(formula1_merge)

formula1_clean = formula1_merge.dropna()
formula1_clean


[]
['216.891' '214.344' '214.706' ... '208.351' '208.423' '203.520']


Unnamed: 0,circuitId,country,circuit_name,raceId,year,constructorId,results_milliseconds,points,fastestLapSpeed,statusId,constructor_name,driverId,lap_milliseconds,results_milliseconds_numeric,fastestLapSpeed_numeric
0,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,109088.0,5655784.0,216.891
1,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,93740.0,5655784.0,216.891
2,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,91600.0,5655784.0,216.891
3,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,91067.0,5655784.0,216.891
4,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,92129.0,5655784.0,216.891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11479420,79,USA,Miami International Autodrome,1078,2022,51.0,,0.0,203.520,47.0,Alfa Romeo,20.0,94298.0,,203.520
11479421,79,USA,Miami International Autodrome,1078,2022,51.0,,0.0,203.520,47.0,Alfa Romeo,20.0,94217.0,,203.520
11479422,79,USA,Miami International Autodrome,1078,2022,51.0,,0.0,203.520,47.0,Alfa Romeo,20.0,93631.0,,203.520
11479423,79,USA,Miami International Autodrome,1078,2022,51.0,,0.0,203.520,47.0,Alfa Romeo,20.0,119970.0,,203.520


Unnamed: 0,circuitId,country,circuit_name,raceId,year,constructorId,results_milliseconds,points,fastestLapSpeed,statusId,constructor_name,driverId,lap_milliseconds,results_milliseconds_numeric,fastestLapSpeed_numeric
0,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,109088.0,5655784.0,216.891
1,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,93740.0,5655784.0,216.891
2,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,91600.0,5655784.0,216.891
3,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,91067.0,5655784.0,216.891
4,1,Australia,Albert Park Grand Prix Circuit,1,2009,23.0,5655784,10.0,216.891,1.0,Brawn,1.0,92129.0,5655784.0,216.891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11474144,79,USA,Miami International Autodrome,1078,2022,210.0,5737563,0.0,210.565,1.0,Haas F1 Team,20.0,93479.0,5737563.0,210.565
11474145,79,USA,Miami International Autodrome,1078,2022,210.0,5737563,0.0,210.565,1.0,Haas F1 Team,20.0,94298.0,5737563.0,210.565
11474146,79,USA,Miami International Autodrome,1078,2022,210.0,5737563,0.0,210.565,1.0,Haas F1 Team,20.0,94217.0,5737563.0,210.565
11474147,79,USA,Miami International Autodrome,1078,2022,210.0,5737563,0.0,210.565,1.0,Haas F1 Team,20.0,93631.0,5737563.0,210.565


In [104]:
# groupby and aggregate stufffff (to reduce the dataset that we're working with)

formula1_agg = (formula1_clean.groupby("circuit_name")
                .agg(mean_result_time = ('results_milliseconds_numeric','mean'),
                     mean_lap_time = ('lap_milliseconds','mean'),
                     mean_lap_speed = ('fastestLapSpeed_numeric','mean'),
                     mean_points = ('points','mean'),
                     sum_points = ('points','sum'),
                     count_obs = ('points',len)).reset_index())


In [101]:
# we're interested in results, so we will look at mean_result_time, mean_points, and 
# sum_points

# plot data


