In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from matplotlib import cm
import fastf1 as ff1
from fastf1.core import Laps
from fastf1 import utils
from fastf1 import plotting
plotting.setup_mpl()
from timple.timedelta import strftimedelta
import datetime
from datetime import datetime

In [2]:
import os

# Folder handed to fastf1 as its on-disk cache.
# NOTE: the path is relative, so it resolves against the notebook's current
# working directory.
cache_dir = 'Desktop/Thesis/Documents/GitHub/ThesisF1/Cache'

# exist_ok=True makes this a no-op when the folder already exists and removes
# the gap between a separate os.path.exists() check and the creation.
os.makedirs(cache_dir, exist_ok=True)

ff1.Cache.enable_cache(cache_dir)

In [3]:
# Event schedules for the four seasons 2019-2022, one frame per season.
schedule2019, schedule2020, schedule2021, schedule2022 = [
    ff1.get_event_schedule(season) for season in (2019, 2020, 2021, 2022)
]

In [4]:
# Accumulator for the per-race tables: each race cell below appends its rows to it.
# Running this cell again resets it to an empty frame.
all_data = pd.DataFrame()

## Race 1

In [5]:
# Load race data (laps, car telemetry and weather are all requested)
race_2021 = ff1.get_session(2021, 1, 'R')
race_2021.load(telemetry=True, laps=True, weather=True)

# Create dataframe: one row per entry of the race results, as a plain pandas frame
aux = race_2021.results
data = pd.DataFrame(
    aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
)

# Add driver age at the time of the race
# Birth dates are read from a semicolon-separated file with dd.mm.yyyy dates.
birth = pd.read_csv("BIRTH2021.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): the schedule is looked up with the label 1; this assumes the
# schedule's index label equals the round number - confirm.
data["RaceCountry"] = schedule2021["Country"][1]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[1])
# Inner join: drivers without an entry in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in completed years (floor division by a 365.25-day year)
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifying data
qualification1_2021 = ff1.get_session(2021, 1, 'Q')
qualification1_2021.load()

# Add the best qualifying lap of each driver
fastest_laps = (
    qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
)
# Right join: the result has one row per driver present in the qualifying laps.
data = pd.merge(left=data, right=fastest_laps, on="DriverNumber", how='right')
data = data.rename(columns={'LapTime': 'BestQualiTime'})

# Add fastest lap point
# The driver is read by column label; a positional lookup (aux[1]) depends on the
# column order of the laps table and on deprecated integer-position fallback.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Driver "9" is removed from both the laps and the result frame before any
# statistic is computed.
# NOTE(review): the reason is not recorded in this notebook - presumably the car
# has too few timed laps; confirm and document.
aux = aux[aux['DriverNumber'] != "9"]
data = data[data["DriverNumber"] != "9"]

# Explicit copy: a derived column is added below, and writing into a slice of
# `aux` triggers pandas' SettingWithCopyWarning.
auxi = aux[["DriverNumber", "LapTime"]].copy()

mean_lap_time = auxi.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, on='DriverNumber', how='right')
data = data.rename(columns={'LapTime': 'AvgLapTime'})

# Standard deviation is computed on float milliseconds. Dividing by a
# one-millisecond Timedelta yields floats on every pandas version, whereas
# astype('timedelta64[ms]') stopped returning floats in pandas 2.0.
auxi['LapTime_ms'] = auxi['LapTime'] / pd.Timedelta(milliseconds=1)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, on='DriverNumber', how='right')
data = data.rename(columns={'LapTime_ms': 'SDLapTime'})

# Kept on purpose: the notebook's first cell runs `from datetime import datetime`,
# which shadows the module; code that calls datetime.timedelta relies on this
# re-import.
import datetime
# NaN-safe conversion back to a duration (a missing std dev becomes NaT instead
# of raising, as datetime.timedelta(milliseconds=nan) does).
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Fastest lap time set by anyone on each lap number ...
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best").reset_index()

# ... and every lap's gap to that per-lap best.
merged = pd.merge(aux, best_lap, on='LapNumber')
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff'].mean().reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, on='DriverNumber', how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna() first: the right joins earlier in this cell can leave rows without an
# abbreviation, and np.unique cannot sort a mix of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# Look up this race's id in races.csv. Taking the single match explicitly
# replaces int(<Series>), which pandas has deprecated, and turns a missing or
# duplicated race into a readable error.
b = races.loc[(races["year"] == 2021) & (races["round"] == 1), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one 2021 round 1 row in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of this race made by the drivers selected above
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

# Only the driver code is needed from drivers.csv to join back on the abbreviation.
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')

# Per driver: mean of 'milliseconds' and highest 'stop' value
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without a recorded stop keep missing values.
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once; missing values become NaT and the column
# gets a single timedelta dtype instead of mixed objects.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so compounds a driver never used still get a column
all_combinations = pd.MultiIndex.from_product(
    [drivers2, compounds], names=['DriverNumber', 'Compound']
)
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps per driver and compound, completed with zeros for the unused pairs
compound_counts = (
    aux.groupby(['DriverNumber', 'Compound'])
    .size()
    .reset_index(name='Laps')
)
compound_counts = pd.merge(
    all_combinations_df,
    compound_counts,
    on=['DriverNumber', 'Compound'],
    how='outer',
).fillna(0)

# Wide layout: one row per driver, one column per compound
compound_counts_pivot = compound_counts.pivot(
    index='DriverNumber', columns='Compound', values='Laps'
).fillna(0)

# Move the driver number from the index into a regular (last) column
indexi = list(compound_counts_pivot.index)
compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep=";")

data = (
    merged_data_compound
    .merge(engines, left_on='TeamName', right_on='Car', how="left")
    .drop(columns=["Car"])
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Weather samples of the loaded race session, as a plain DataFrame.
rainy = pd.DataFrame(race_2021.weather_data)

# Tally of the 'Rainfall' column: number of samples per distinct value.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide from tallied rainfall flags whether the session counts as rainy.

    Parameters
    ----------
    counts : mapping-like
        Object with a ``.get(key, default)`` method whose ``True`` / ``False``
        entries hold the number of rainy / dry samples.

    Returns
    -------
    bool
        ``False`` only when the dry samples strictly outnumber the rainy ones;
        a tie, or a tally with neither key, gives ``True``.
    """
    dry_samples = counts.get(False, 0)
    rainy_samples = counts.get(True, 0)
    return not (dry_samples > rainy_samples)

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver; the records are collected in a list and turned
# into a frame once, instead of growing a DataFrame with pd.concat on every pass.
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    n_samples = len(tele)
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # share of samples (in %) at the driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / n_samples * 100,
        # sum of the 'Brake' column relative to the number of samples, in %
        'Brake': tele['Brake'].sum() / n_samples * 100,
    })
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
# NOTE(review): hard-coded identifier; its relation to the raceId values in
# races.csv is not shown in this notebook - confirm.
data["raceID"] = "38"

# Idempotent append: rows stored by a previous run of this cell are replaced
# rather than duplicated.
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "38"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Bahrain Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '4', '11', '16', '3', '55', '22', '18', '7', '99', '31', '63', '5', '47', '10', '6', '14', '9']
core           INFO 	Loading data for Bahrain Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 2

In [6]:
# Load race data (laps, car telemetry and weather are all requested)
race_2021 = ff1.get_session(2021, 2, 'R')
race_2021.load(telemetry=True, laps=True, weather=True)

# Create dataframe: one row per entry of the race results, as a plain pandas frame
aux = race_2021.results
data = pd.DataFrame(
    aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
)

# Add driver age at the time of the race
# Birth dates are read from a semicolon-separated file with dd.mm.yyyy dates.
birth = pd.read_csv("BIRTH2021.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): the schedule is looked up with the label 2; this assumes the
# schedule's index label equals the round number - confirm.
data["RaceCountry"] = schedule2021["Country"][2]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[2])
# Inner join: drivers without an entry in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in completed years (floor division by a 365.25-day year)
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifying data
qualification1_2021 = ff1.get_session(2021, 2, 'Q')
qualification1_2021.load()

# Add the best qualifying lap of each driver
fastest_laps = (
    qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
)
# Right join: the result has one row per driver present in the qualifying laps.
data = pd.merge(left=data, right=fastest_laps, on="DriverNumber", how='right')
data = data.rename(columns={'LapTime': 'BestQualiTime'})

# Add fastest lap point
# The driver is read by column label; a positional lookup (aux[1]) depends on the
# column order of the laps table and on deprecated integer-position fallback.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Driver "6" is removed from both the laps and the result frame before any
# statistic is computed.
# NOTE(review): the reason is not recorded in this notebook - presumably the car
# has too few timed laps; confirm and document.
aux = aux[aux['DriverNumber'] != "6"]
data = data[data["DriverNumber"] != "6"]

# Explicit copy: a derived column is added below, and writing into a slice of
# `aux` triggers pandas' SettingWithCopyWarning.
auxi = aux[["DriverNumber", "LapTime"]].copy()

mean_lap_time = auxi.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, on='DriverNumber', how='right')
data = data.rename(columns={'LapTime': 'AvgLapTime'})

# Standard deviation is computed on float milliseconds. Dividing by a
# one-millisecond Timedelta yields floats on every pandas version, whereas
# astype('timedelta64[ms]') stopped returning floats in pandas 2.0.
auxi['LapTime_ms'] = auxi['LapTime'] / pd.Timedelta(milliseconds=1)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, on='DriverNumber', how='right')
data = data.rename(columns={'LapTime_ms': 'SDLapTime'})

# Kept on purpose: the notebook's first cell runs `from datetime import datetime`,
# which shadows the module; code that calls datetime.timedelta relies on this
# re-import.
import datetime
# NaN-safe conversion back to a duration (a missing std dev becomes NaT instead
# of raising, as datetime.timedelta(milliseconds=nan) does).
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Fastest lap time set by anyone on each lap number ...
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best").reset_index()

# ... and every lap's gap to that per-lap best.
merged = pd.merge(aux, best_lap, on='LapNumber')
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff'].mean().reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, on='DriverNumber', how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna() first: the right joins earlier in this cell can leave rows without an
# abbreviation, and np.unique cannot sort a mix of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# Look up this race's id in races.csv. Taking the single match explicitly
# replaces int(<Series>), which pandas has deprecated, and turns a missing or
# duplicated race into a readable error.
b = races.loc[(races["year"] == 2021) & (races["round"] == 2), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one 2021 round 2 row in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of this race made by the drivers selected above
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

# Only the driver code is needed from drivers.csv to join back on the abbreviation.
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')

# Per driver: mean of 'milliseconds' and highest 'stop' value
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without a recorded stop keep missing values.
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once; missing values become NaT and the column
# gets a single timedelta dtype instead of mixed objects.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so compounds a driver never used still get a column
all_combinations = pd.MultiIndex.from_product(
    [drivers2, compounds], names=['DriverNumber', 'Compound']
)
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps per driver and compound, completed with zeros for the unused pairs
compound_counts = (
    aux.groupby(['DriverNumber', 'Compound'])
    .size()
    .reset_index(name='Laps')
)
compound_counts = pd.merge(
    all_combinations_df,
    compound_counts,
    on=['DriverNumber', 'Compound'],
    how='outer',
).fillna(0)

# Wide layout: one row per driver, one column per compound
compound_counts_pivot = compound_counts.pivot(
    index='DriverNumber', columns='Compound', values='Laps'
).fillna(0)

# Move the driver number from the index into a regular (last) column
indexi = list(compound_counts_pivot.index)
compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep=";")

data = (
    merged_data_compound
    .merge(engines, left_on='TeamName', right_on='Car', how="left")
    .drop(columns=["Car"])
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Weather samples of the loaded race session, as a plain DataFrame.
rainy = pd.DataFrame(race_2021.weather_data)

# Tally of the 'Rainfall' column: number of samples per distinct value.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide from tallied rainfall flags whether the session counts as rainy.

    Parameters
    ----------
    counts : mapping-like
        Object with a ``.get(key, default)`` method whose ``True`` / ``False``
        entries hold the number of rainy / dry samples.

    Returns
    -------
    bool
        ``False`` only when the dry samples strictly outnumber the rainy ones;
        a tie, or a tally with neither key, gives ``True``.
    """
    dry_samples = counts.get(False, 0)
    rainy_samples = counts.get(True, 0)
    return not (dry_samples > rainy_samples)

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number are dropped first, as in the other race cells.
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver; the records are collected in a list and turned
# into a frame once, instead of growing a DataFrame with pd.concat on every pass.
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    n_samples = len(tele)
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # share of samples (in %) at the driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / n_samples * 100,
        # sum of the 'Brake' column relative to the number of samples, in %
        'Brake': tele['Brake'].sum() / n_samples * 100,
    })
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
# NOTE(review): hard-coded identifier; its relation to the raceId values in
# races.csv is not shown in this notebook - confirm.
data["raceID"] = "39"

# Idempotent append: rows stored by a previous run of this cell are replaced
# rather than duplicated.
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "39"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
Traceback (most recent call last):
  File "/Users/soniamuthi/opt/anaconda3/lib/python3.9/site-packages/urllib3/connectionpool.py", line 703, in urlopen
    httplib_response = self._make_request(
  File "/Users/soniamuthi/opt/anaconda3/lib/python3.9/site-packages/urllib3/connectionpool.py", line 449, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/soniamuthi/opt/anaconda3/lib/python3.9/site-packages/urllib3/connectionpool.py", line 444, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/soniamuthi/opt/anaconda3/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/Users/soniamuthi/opt/anaconda3/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/Users/soniamuthi/opt/anaconda3/lib/python3.9/h

## Race 3

In [7]:
# Load race data (laps, car telemetry and weather are all requested)
race_2021 = ff1.get_session(2021, 3, 'R')
race_2021.load(telemetry=True, laps=True, weather=True)

# Create dataframe: one row per entry of the race results, as a plain pandas frame
aux = race_2021.results
data = pd.DataFrame(
    aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
)

# Add driver age at the time of the race
# Birth dates are read from a semicolon-separated file with dd.mm.yyyy dates.
birth = pd.read_csv("BIRTH2021.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): the schedule is looked up with the label 3; this assumes the
# schedule's index label equals the round number - confirm.
data["RaceCountry"] = schedule2021["Country"][3]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[3])
# Inner join: drivers without an entry in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in completed years (floor division by a 365.25-day year)
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifying data
qualification1_2021 = ff1.get_session(2021, 3, 'Q')
qualification1_2021.load()

# Add the best qualifying lap of each driver
fastest_laps = (
    qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
)
# Right join: the result has one row per driver present in the qualifying laps.
data = pd.merge(left=data, right=fastest_laps, on="DriverNumber", how='right')
data = data.rename(columns={'LapTime': 'BestQualiTime'})

# Add fastest lap point
# The driver is read by column label; a positional lookup (aux[1]) depends on the
# column order of the laps table and on deprecated integer-position fallback.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Driver "7" is removed from both the laps and the result frame before any
# statistic is computed.
# NOTE(review): the reason is not recorded in this notebook - presumably the car
# has too few timed laps; confirm and document.
aux = aux[aux['DriverNumber'] != "7"]
data = data[data["DriverNumber"] != "7"]

# Explicit copy: a derived column is added below, and writing into a slice of
# `aux` triggers pandas' SettingWithCopyWarning.
auxi = aux[["DriverNumber", "LapTime"]].copy()

mean_lap_time = auxi.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, on='DriverNumber', how='right')
data = data.rename(columns={'LapTime': 'AvgLapTime'})

# Standard deviation is computed on float milliseconds. Dividing by a
# one-millisecond Timedelta yields floats on every pandas version, whereas
# astype('timedelta64[ms]') stopped returning floats in pandas 2.0.
auxi['LapTime_ms'] = auxi['LapTime'] / pd.Timedelta(milliseconds=1)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, on='DriverNumber', how='right')
data = data.rename(columns={'LapTime_ms': 'SDLapTime'})

# Kept on purpose: the notebook's first cell runs `from datetime import datetime`,
# which shadows the module; code that calls datetime.timedelta relies on this
# re-import.
import datetime
# NaN-safe conversion back to a duration (a missing std dev becomes NaT instead
# of raising, as datetime.timedelta(milliseconds=nan) does).
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Fastest lap time set by anyone on each lap number ...
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best").reset_index()

# ... and every lap's gap to that per-lap best.
merged = pd.merge(aux, best_lap, on='LapNumber')
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff'].mean().reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, on='DriverNumber', how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna() first: the right joins earlier in this cell can leave rows without an
# abbreviation, and np.unique cannot sort a mix of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# Look up this race's id in races.csv. Taking the single match explicitly
# replaces int(<Series>), which pandas has deprecated, and turns a missing or
# duplicated race into a readable error.
b = races.loc[(races["year"] == 2021) & (races["round"] == 3), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one 2021 round 3 row in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of this race made by the drivers selected above
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

# Only the driver code is needed from drivers.csv to join back on the abbreviation.
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')

# Per driver: mean of 'milliseconds' and highest 'stop' value
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without a recorded stop keep missing values.
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once; missing values become NaT and the column
# gets a single timedelta dtype instead of mixed objects.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so compounds a driver never used still get a column
all_combinations = pd.MultiIndex.from_product(
    [drivers2, compounds], names=['DriverNumber', 'Compound']
)
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps per driver and compound, completed with zeros for the unused pairs
compound_counts = (
    aux.groupby(['DriverNumber', 'Compound'])
    .size()
    .reset_index(name='Laps')
)
compound_counts = pd.merge(
    all_combinations_df,
    compound_counts,
    on=['DriverNumber', 'Compound'],
    how='outer',
).fillna(0)

# Wide layout: one row per driver, one column per compound
compound_counts_pivot = compound_counts.pivot(
    index='DriverNumber', columns='Compound', values='Laps'
).fillna(0)

# Move the driver number from the index into a regular (last) column
indexi = list(compound_counts_pivot.index)
compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep=";")

data = (
    merged_data_compound
    .merge(engines, left_on='TeamName', right_on='Car', how="left")
    .drop(columns=["Car"])
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Weather samples of the loaded race session, as a plain DataFrame.
rainy = pd.DataFrame(race_2021.weather_data)

# Tally of the 'Rainfall' column: number of samples per distinct value.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide from tallied rainfall flags whether the session counts as rainy.

    Parameters
    ----------
    counts : mapping-like
        Object with a ``.get(key, default)`` method whose ``True`` / ``False``
        entries hold the number of rainy / dry samples.

    Returns
    -------
    bool
        ``False`` only when the dry samples strictly outnumber the rainy ones;
        a tie, or a tally with neither key, gives ``True``.
    """
    dry_samples = counts.get(False, 0)
    rainy_samples = counts.get(True, 0)
    return not (dry_samples > rainy_samples)

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver; the records are collected in a list and turned
# into a frame once, instead of growing a DataFrame with pd.concat on every pass.
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    n_samples = len(tele)
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # share of samples (in %) at the driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / n_samples * 100,
        # sum of the 'Brake' column relative to the number of samples, in %
        'Brake': tele['Brake'].sum() / n_samples * 100,
    })
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
# NOTE(review): hard-coded identifier; its relation to the raceId values in
# races.csv is not shown in this notebook - confirm.
data["raceID"] = "40"

# Idempotent append: rows stored by a previous run of this cell are replaced
# rather than duplicated.
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "40"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Portuguese Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '11', '4', '16', '31', '14', '3', '10', '55', '99', '5', '18', '22', '63', '47', '6', '9', '7']
core           INFO 	Loading data for Portuguese Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for 

## Race 4

In [8]:
# Load race data (laps, car telemetry and weather are all requested)
race_2021 = ff1.get_session(2021, 4, 'R')
race_2021.load(telemetry=True, laps=True, weather=True)

# Create dataframe: one row per entry of the race results, as a plain pandas frame
aux = race_2021.results
data = pd.DataFrame(
    aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
)

# Add driver age at the time of the race
# Birth dates are read from a semicolon-separated file with dd.mm.yyyy dates.
birth = pd.read_csv("BIRTH2021.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): the schedule is looked up with the label 4; this assumes the
# schedule's index label equals the round number - confirm.
data["RaceCountry"] = schedule2021["Country"][4]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[4])
# Inner join: drivers without an entry in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in completed years (floor division by a 365.25-day year)
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifying data
qualification1_2021 = ff1.get_session(2021, 4, 'Q')
qualification1_2021.load()

# Add the best qualifying lap of each driver
fastest_laps = (
    qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
)
# Right join: the result has one row per driver present in the qualifying laps.
data = pd.merge(left=data, right=fastest_laps, on="DriverNumber", how='right')
data = data.rename(columns={'LapTime': 'BestQualiTime'})

# Add fastest lap point
# The driver is read by column label; a positional lookup (aux[1]) depends on the
# column order of the laps table and on deprecated integer-position fallback.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

 

# Add average and std dev of lap times
aux = race_2021.laps

# Explicit copy: a derived column is added below, and writing into a slice of
# `aux` triggers pandas' SettingWithCopyWarning.
auxi = aux[["DriverNumber", "LapTime"]].copy()

mean_lap_time = auxi.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, on='DriverNumber', how='right')
data = data.rename(columns={'LapTime': 'AvgLapTime'})

# Standard deviation is computed on float milliseconds. Dividing by a
# one-millisecond Timedelta yields floats on every pandas version, whereas
# astype('timedelta64[ms]') stopped returning floats in pandas 2.0.
auxi['LapTime_ms'] = auxi['LapTime'] / pd.Timedelta(milliseconds=1)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, on='DriverNumber', how='right')
data = data.rename(columns={'LapTime_ms': 'SDLapTime'})

# Kept on purpose: the notebook's first cell runs `from datetime import datetime`,
# which shadows the module; code that calls datetime.timedelta relies on this
# re-import.
import datetime
# NaN-safe conversion back to a duration (a missing std dev becomes NaT instead
# of raising, as datetime.timedelta(milliseconds=nan) does).
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Fastest lap time set by anyone on each lap number ...
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best").reset_index()

# ... and every lap's gap to that per-lap best.
merged = pd.merge(aux, best_lap, on='LapNumber')
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff'].mean().reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, on='DriverNumber', how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna() first: the right joins earlier in this cell can leave rows without an
# abbreviation, and np.unique cannot sort a mix of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# Look up this race's id in races.csv. Taking the single match explicitly
# replaces int(<Series>), which pandas has deprecated, and turns a missing or
# duplicated race into a readable error.
b = races.loc[(races["year"] == 2021) & (races["round"] == 4), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one 2021 round 4 row in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of this race made by the drivers selected above
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

# Only the driver code is needed from drivers.csv to join back on the abbreviation.
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')

# Per driver: mean of 'milliseconds' and highest 'stop' value
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without a recorded stop keep missing values.
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once; missing values become NaT and the column
# gets a single timedelta dtype instead of mixed objects.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so compounds a driver never used still get a column
all_combinations = pd.MultiIndex.from_product(
    [drivers2, compounds], names=['DriverNumber', 'Compound']
)
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps per driver and compound, completed with zeros for the unused pairs
compound_counts = (
    aux.groupby(['DriverNumber', 'Compound'])
    .size()
    .reset_index(name='Laps')
)
compound_counts = pd.merge(
    all_combinations_df,
    compound_counts,
    on=['DriverNumber', 'Compound'],
    how='outer',
).fillna(0)

# Wide layout: one row per driver, one column per compound
compound_counts_pivot = compound_counts.pivot(
    index='DriverNumber', columns='Compound', values='Laps'
).fillna(0)

# Move the driver number from the index into a regular (last) column
indexi = list(compound_counts_pivot.index)
compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep=";")

data = (
    merged_data_compound
    .merge(engines, left_on='TeamName', right_on='Car', how="left")
    .drop(columns=["Car"])
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a session as rainy from its rainfall sample counts.

    Parameters
    ----------
    counts : object with a dict-style ``.get(key, default)``
        Number of weather samples per rainfall flag, keyed by ``True``/``False``.

    Returns
    -------
    bool
        ``False`` only when the ``False`` count is strictly greater than the
        ``True`` count; ties (including both keys missing) return ``True``.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Flag the whole race as rainy or dry and store the flag on every driver row.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be looked up in the telemetry.
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']

# Collect one record per driver and build the frame once, instead of
# re-concatenating a growing DataFrame on every iteration.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Percentage of samples at this driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the 'Brake' column over the sample count, as a percentage.
        # NOTE(review): assumes 'Brake' is a boolean / 0-1 flag -- confirm.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Explicit columns keep the 'Driver' merge key present even with no drivers.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

# Right join: the result has exactly one row per driver with telemetry.
data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
# NOTE(review): hard-coded race identifier, stored as a string -- the
# numbering scheme is not visible in this cell.
data["raceID"] = "41"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Spanish Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '16', '11', '3', '55', '4', '31', '10', '18', '7', '5', '63', '99', '6', '14', '47', '9', '22']
core           INFO 	Loading data for Spanish Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 5

In [9]:
# Load race data (2021 season, round 5) with telemetry, laps and weather
race_2021 = ff1.get_session(2021, 5, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe: one row per driver in the race results.
# .copy() yields an independent frame, so the column assignments below never
# write into a slice of race_2021.results.
aux = race_2021.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]).copy()

# Add driver age at the time of the race
# BIRTH2021.csv is ';'-separated; its "Birth" column holds dd.mm.YYYY dates.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): 5 is used as an index label of schedule2021 and is assumed to
# coincide with the round number -- confirm against the schedule.
data["RaceCountry"] = schedule2021["Country"][5]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[5])
# Inner join: drivers without a row in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by a 365.25-day year.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, 5, 'Q')
qualification1_2021.load()

# Add the best qualifications lap (minimum lap time per driver number)
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = fastest_laps.rename("BestQualiTime").reset_index()
# Right join: one row per driver that appears in the qualifying laps.
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")

# Add fastest lap point
# Flags the driver who set the fastest race lap; no other condition is checked.
# The driver number is read by label: a positional lookup (aux[1]) depends on
# the column order of the laps table and on a deprecated pandas fallback.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Drivers "16" and "47" are removed from both the laps and the driver table
# before any lap statistics are computed.
# NOTE(review): presumably they have too few laps in this race for the
# statistics below -- confirm before changing.
aux = aux[aux['DriverNumber'] != "16"]
data = data[data["DriverNumber"] != "16"]

aux = aux[aux['DriverNumber'] != "47"]
data = data[data["DriverNumber"] != "47"]

auxi = aux[["DriverNumber", "LapTime"]]

# Mean lap time per driver. Right join: one row per driver with race laps.
mean_lap_time = auxi.groupby('DriverNumber')['LapTime'].mean().rename('AvgLapTime').reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on='DriverNumber')

# Std dev of lap times per driver, computed on float milliseconds.
# .assign() builds a new frame (no chained assignment on a slice) and
# total_seconds() yields numeric values regardless of the pandas version.
auxi = auxi.assign(LapTime_ms=auxi['LapTime'].dt.total_seconds() * 1000)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().rename('SDLapTime').reset_index()
data = pd.merge(left=data, right=auxi, how='right', on='DriverNumber')

# Kept from the original cell: rebinds the global name `datetime` to the
# module, which `datetime.timedelta(...)` calls elsewhere in this notebook expect.
import datetime
# Milliseconds -> timedelta; an undefined std dev (NaN) becomes NaT instead of
# raising, which datetime.timedelta(milliseconds=NaN) would do.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# For every lap number take the fastest lap time among the remaining drivers,
# then average each driver's gap to that per-lap best.
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best").reset_index()

merged = pd.merge(aux, best_lap, on='LapNumber')

merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().rename('AvgSplitTime').reset_index()

data = pd.merge(left=data, right=driver_avg_diff, how='right', on='DriverNumber')
# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. Missing abbreviations (rows left without
# a match by an earlier right join) are skipped rather than sorted together
# with strings, which np.unique cannot do.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of 2021 round 5; the scalar is extracted explicitly instead of
# calling int() on a whole Series.
b = races[(races["year"] == 2021) & (races["round"] == 5)]
b = b["raceId"]
b = int(b.iloc[0])

# Pit stops of this race's drivers in this race only.
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration (ms) and highest stop number.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without any recorded pit stop keep their row.
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once: drivers without a pit stop get NaT and the
# column has a single timedelta dtype instead of mixed objects.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Every driver x compound pair, so compounds a driver never used show up as 0.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
# One row per driver, one column per compound.
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
compound_counts_pivot = compound_counts_pivot.reset_index()

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
# (disabled feature, kept for reference)
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per 'Rainfall' value
rainy = pd.DataFrame(race_2021.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a session as rainy from its rainfall sample counts.

    Parameters
    ----------
    counts : object with a dict-style ``.get(key, default)``
        Number of weather samples per rainfall flag, keyed by ``True``/``False``.

    Returns
    -------
    bool
        ``False`` only when the ``False`` count is strictly greater than the
        ``True`` count; ties (including both keys missing) return ``True``.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Flag the whole race as rainy or dry and store the flag on every driver row.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be looked up in the telemetry.
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']

# Collect one record per driver and build the frame once, instead of
# re-concatenating a growing DataFrame on every iteration.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Percentage of samples at this driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the 'Brake' column over the sample count, as a percentage.
        # NOTE(review): assumes 'Brake' is a boolean / 0-1 flag -- confirm.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Explicit columns keep the 'Driver' merge key present even with no drivers.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

# Right join: the result has exactly one row per driver with telemetry.
data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
# NOTE(review): hard-coded race identifier, stored as a string -- the
# numbering scheme is not visible in this cell.
data["raceID"] = "42"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Monaco Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '55', '4', '11', '5', '10', '44', '18', '31', '99', '7', '3', '14', '63', '6', '22', '9', '47', '77', '16']
core           INFO 	Loading data for Monaco Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_d

## Race 6

In [10]:
# Load race data (2021 season, round 6) with telemetry, laps and weather
race_2021 = ff1.get_session(2021, 6, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe: one row per driver in the race results.
# .copy() yields an independent frame, so the column assignments below never
# write into a slice of race_2021.results.
aux = race_2021.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]).copy()

# Add driver age at the time of the race
# BIRTH2021.csv is ';'-separated; its "Birth" column holds dd.mm.YYYY dates.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): 6 is used as an index label of schedule2021 and is assumed to
# coincide with the round number -- confirm against the schedule.
data["RaceCountry"] = schedule2021["Country"][6]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[6])
# Inner join: drivers without a row in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by a 365.25-day year.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, 6, 'Q')
qualification1_2021.load()

# Add the best qualifications lap (minimum lap time per driver number)
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = fastest_laps.rename("BestQualiTime").reset_index()
# Right join: one row per driver that appears in the qualifying laps.
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")

# Add fastest lap point
# Flags the driver who set the fastest race lap; no other condition is checked.
# The driver number is read by label: a positional lookup (aux[1]) depends on
# the column order of the laps table and on a deprecated pandas fallback.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

 

# Add average and std dev of lap times
aux = race_2021.laps

auxi = aux[["DriverNumber", "LapTime"]]

# Mean lap time per driver. Right join: one row per driver with race laps.
mean_lap_time = auxi.groupby('DriverNumber')['LapTime'].mean().rename('AvgLapTime').reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on='DriverNumber')

# Std dev of lap times per driver, computed on float milliseconds.
# .assign() builds a new frame (no chained assignment on a slice) and
# total_seconds() yields numeric values regardless of the pandas version.
auxi = auxi.assign(LapTime_ms=auxi['LapTime'].dt.total_seconds() * 1000)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().rename('SDLapTime').reset_index()
data = pd.merge(left=data, right=auxi, how='right', on='DriverNumber')

# Kept from the original cell: rebinds the global name `datetime` to the
# module, which `datetime.timedelta(...)` calls elsewhere in this notebook expect.
import datetime
# Milliseconds -> timedelta; an undefined std dev (NaN) becomes NaT instead of
# raising, which datetime.timedelta(milliseconds=NaN) would do.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# For every lap number take the fastest lap time among all drivers, then
# average each driver's gap to that per-lap best.
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best").reset_index()

merged = pd.merge(aux, best_lap, on='LapNumber')

merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().rename('AvgSplitTime').reset_index()

data = pd.merge(left=data, right=driver_avg_diff, how='right', on='DriverNumber')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. Missing abbreviations (rows left without
# a match by an earlier right join) are skipped rather than sorted together
# with strings, which np.unique cannot do.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of 2021 round 6; the scalar is extracted explicitly instead of
# calling int() on a whole Series.
b = races[(races["year"] == 2021) & (races["round"] == 6)]
b = b["raceId"]
b = int(b.iloc[0])

# Pit stops of this race's drivers in this race only.
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration (ms) and highest stop number.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without any recorded pit stop keep their row.
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once: drivers without a pit stop get NaT and the
# column has a single timedelta dtype instead of mixed objects.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Every driver x compound pair, so compounds a driver never used show up as 0.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
# One row per driver, one column per compound.
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
compound_counts_pivot = compound_counts_pivot.reset_index()

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
# (disabled feature, kept for reference)
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per 'Rainfall' value
rainy = pd.DataFrame(race_2021.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a session as rainy from its rainfall sample counts.

    Parameters
    ----------
    counts : object with a dict-style ``.get(key, default)``
        Number of weather samples per rainfall flag, keyed by ``True``/``False``.

    Returns
    -------
    bool
        ``False`` only when the ``False`` count is strictly greater than the
        ``True`` count; ties (including both keys missing) return ``True``.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Flag the whole race as rainy or dry and store the flag on every driver row.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be looked up in the telemetry.
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']

# Collect one record per driver and build the frame once, instead of
# re-concatenating a growing DataFrame on every iteration.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Percentage of samples at this driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the 'Brake' column over the sample count, as a percentage.
        # NOTE(review): assumes 'Brake' is a boolean / 0-1 flag -- confirm.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Explicit columns keep the 'Driver' merge key present even with no drivers.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

# Right join: the result has exactly one row per driver with telemetry.
data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
# NOTE(review): hard-coded race identifier, stored as a string -- the
# numbering scheme is not visible in this cell.
data["raceID"] = "43"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Azerbaijan Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  result = result.append(new_last).reset_index(drop=True)
  result = result.append(new_last).reset_index(drop=True)
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['11', '5', '10', '16', '4', '14', '22', '55', '3', '7', '99', '77', '47', '9', '44', '6', '63', '33', '18', '31']
core           INFO 	Loading data for Azerbaijan Grand Prix - Qu

## Race 7

In [11]:
# Load race data (2021 season, round 7) with telemetry, laps and weather
race_2021 = ff1.get_session(2021, 7, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe: one row per driver in the race results.
# .copy() yields an independent frame, so the column assignments below never
# write into a slice of race_2021.results.
aux = race_2021.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]).copy()

# Add driver age at the time of the race
# BIRTH2021.csv is ';'-separated; its "Birth" column holds dd.mm.YYYY dates.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): 7 is used as an index label of schedule2021 and is assumed to
# coincide with the round number -- confirm against the schedule.
data["RaceCountry"] = schedule2021["Country"][7]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[7])
# Inner join: drivers without a row in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by a 365.25-day year.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, 7, 'Q')
qualification1_2021.load()

# Add the best qualifications lap (minimum lap time per driver number)
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = fastest_laps.rename("BestQualiTime").reset_index()
# Right join: one row per driver that appears in the qualifying laps.
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")

# Add fastest lap point
# Flags the driver who set the fastest race lap; no other condition is checked.
# The driver number is read by label: a positional lookup (aux[1]) depends on
# the column order of the laps table and on a deprecated pandas fallback.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

auxi = aux[["DriverNumber", "LapTime"]]

# Mean lap time per driver. Right join: one row per driver with race laps.
mean_lap_time = auxi.groupby('DriverNumber')['LapTime'].mean().rename('AvgLapTime').reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on='DriverNumber')

# Std dev of lap times per driver, computed on float milliseconds.
# .assign() builds a new frame (no chained assignment on a slice) and
# total_seconds() yields numeric values regardless of the pandas version.
auxi = auxi.assign(LapTime_ms=auxi['LapTime'].dt.total_seconds() * 1000)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().rename('SDLapTime').reset_index()
data = pd.merge(left=data, right=auxi, how='right', on='DriverNumber')

# Kept from the original cell: rebinds the global name `datetime` to the
# module, which `datetime.timedelta(...)` calls elsewhere in this notebook expect.
import datetime
# Milliseconds -> timedelta; an undefined std dev (NaN) becomes NaT instead of
# raising, which datetime.timedelta(milliseconds=NaN) would do.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# For every lap number take the fastest lap time among all drivers, then
# average each driver's gap to that per-lap best.
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best").reset_index()

merged = pd.merge(aux, best_lap, on='LapNumber')

merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().rename('AvgSplitTime').reset_index()

data = pd.merge(left=data, right=driver_avg_diff, how='right', on='DriverNumber')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. Missing abbreviations (rows left without
# a match by an earlier right join) are skipped rather than sorted together
# with strings, which np.unique cannot do.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of 2021 round 7; the scalar is extracted explicitly instead of
# calling int() on a whole Series.
b = races[(races["year"] == 2021) & (races["round"] == 7)]
b = b["raceId"]
b = int(b.iloc[0])

# Pit stops of this race's drivers in this race only.
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration (ms) and highest stop number.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without any recorded pit stop keep their row.
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once: drivers without a pit stop get NaT and the
# column has a single timedelta dtype instead of mixed objects.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Every driver x compound pair, so compounds a driver never used show up as 0.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
# One row per driver, one column per compound.
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
compound_counts_pivot = compound_counts_pivot.reset_index()

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
# (disabled feature, kept for reference)
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per 'Rainfall' value
rainy = pd.DataFrame(race_2021.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a session as rainy from its rainfall sample counts.

    Parameters
    ----------
    counts : object with a dict-style ``.get(key, default)``
        Number of weather samples per rainfall flag, keyed by ``True``/``False``.

    Returns
    -------
    bool
        ``False`` only when the ``False`` count is strictly greater than the
        ``True`` count; ties (including both keys missing) return ``True``.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Flag the whole race as rainy or dry and store the flag on every driver row.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be looked up in the telemetry.
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']

# Collect one record per driver and build the frame once, instead of
# re-concatenating a growing DataFrame on every iteration.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Percentage of samples at this driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the 'Brake' column over the sample count, as a percentage.
        # NOTE(review): assumes 'Brake' is a boolean / 0-1 flag -- confirm.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Explicit columns keep the 'Driver' merge key present even with no drivers.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

# Right join: the result has exactly one row per driver with telemetry.
data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
# NOTE(review): hard-coded race identifier, stored as a string -- the
# numbering scheme is not visible in this cell.
data["raceID"] = "44"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for French Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '44', '11', '77', '4', '3', '10', '14', '5', '18', '55', '63', '22', '31', '99', '16', '7', '6', '47', '9']
core           INFO 	Loading data for French Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_d

## Race 8

In [12]:
# Load race data
race_2021 = ff1.get_session(2021, 8, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
data["RaceCountry"] = schedule2021["Country"][8]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[8])
#formatted_date = timestamp.strftime('%d.%m.%Y')
data = pd.merge(data, birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, 8, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = pd.DataFrame(fastest_laps)
fastest_laps["DriverNumber"] = fastest_laps.index
fastest_laps = fastest_laps.reset_index(drop=True)
data = pd.merge(left=data, right=fastest_laps, how='right')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux[1]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

aux = aux[aux['DriverNumber'] != "10"]
data = data[data["DriverNumber"] != "10"] 

auxi = aux[["DriverNumber", "LapTime"]]
auxi

mean_lap_time = auxi.groupby('DriverNumber')['LapTime'].mean()
mean_lap_time = pd.DataFrame(mean_lap_time)

mean_lap_time["DriverNumber"] = mean_lap_time.index
mean_lap_time = mean_lap_time.reset_index(drop=True)

data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

auxi = aux[["DriverNumber", "LapTime"]]
auxi['LapTime_ms'] = auxi['LapTime'].astype('timedelta64[ms]')
auxi = auxi.groupby(["DriverNumber"])["LapTime_ms"].std()
auxi = pd.DataFrame(auxi)

auxi["DriverNumber"] = auxi.index
auxi = auxi.reset_index(drop=True)

data = pd.merge(left=data, right=auxi, how='right')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})
    
import datetime
data['SDLapTime'] = data['SDLapTime'].apply(lambda x: datetime.timedelta(milliseconds=x))

# Add average split times variable
besttime = pd.DataFrame(aux.groupby("LapNumber")["LapTime"].min())

besttime["LapNumber"] = besttime.index
besttime  = besttime .reset_index(drop=True)

best_lap = besttime.groupby('LapNumber')['LapTime'].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))

merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()

driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

b = races[(races["year"] == 2021) & (races["round"] == 8)]
b = b["raceId"]
b = int(b)

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = pd.DataFrame(merged_df)
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

transformed_df = merged_df.groupby('code').agg({
'milliseconds': 'mean',
'stop': 'max'
}).reset_index()

transformed_df = pd.DataFrame(transformed_df)

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")

merged_data = pd.DataFrame(merged_data)
merged_data = merged_data.drop("code", axis = 1)

mask = merged_data['milliseconds'].notna()
merged_data.loc[mask, 'milliseconds'] = merged_data.loc[mask, 'milliseconds'].apply(lambda x: datetime.timedelta(milliseconds=x))

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = compound_counts_pivot.index
indexi = list(indexi)

compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi
compound_counts_pivot = pd.DataFrame(compound_counts_pivot)

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether the race is treated as rainy.

    `counts` maps the values False / True to a number of occurrences; any
    object offering ``.get(key, default)`` works. A missing key counts as 0.

    Returns False only when False is strictly more frequent than True.
    Ties, including the case where neither key is present, return True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemtry variables
data.dropna(subset=['DriverNumber'], inplace=True)
driversnumbers = np.unique(data["DriverNumber"])

agg = pd.DataFrame()
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    avg_speed = tele['Speed'].mean()
    max_speed = tele['Speed'].max()
    avg_rpm = tele['RPM'].mean()
    max_rpm = tele['RPM'].max()
    avg_throttle = tele['Throttle'].mean()
    max_throttle = tele['Throttle'].max()
    max_throttle_pct = (tele['Throttle'] == max_throttle).sum() / len(tele) * 100
    break_percentage = tele['Brake'].sum() / len(tele) * 100

    driver_data = {
        'Driver': i,
        'AverageSpeed': avg_speed,
        'MaxSpeed': max_speed,
        'AverageRPM': avg_rpm,
        'MaxRPM': max_rpm,
        'AverageThrottle': avg_throttle,
        'MaxThrottlePct': max_throttle_pct,
        'Brake': break_percentage
        }
    
    agg = pd.concat([agg, pd.DataFrame(driver_data, index=[0])], ignore_index=True)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "45"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Styrian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '44', '77', '11', '4', '55', '16', '18', '14', '22', '7', '5', '3', '31', '99', '47', '6', '9', '63', '10']
core           INFO 	Loading data for Styrian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 9

In [13]:
# Fetch the 2021 round-9 race session together with telemetry, lap and weather data.
race_2021 = ff1.get_session(2021, 9, 'R')
race_2021.load(telemetry=True, laps=True, weather=True)

# Start the per-driver table from the classification columns of interest.
aux = race_2021.results
result_columns = ["Abbreviation", "DriverNumber", "TeamName", "Position",
                  "GridPosition", "Status", "Points"]
data = pd.DataFrame(aux[result_columns])

# Birth dates keyed by driver abbreviation (dates in the file are day.month.year).
birth = pd.read_csv("BIRTH2021.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Race-level constants taken from entry 9 of the 2021 schedule.
dati = schedule2021["EventDate"]
data["RaceCountry"] = schedule2021["Country"][9]
data["Year"] = 2021
data["RaceDate"] = pd.to_datetime(dati[9])

# Age in whole 365.25-day years on the event date; the two helper date
# columns are dropped once the age has been derived.
data = data.merge(birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Qualifying session of the same round.
qualification1_2021 = ff1.get_session(2021, 9, 'Q')
qualification1_2021.load()

# Quickest qualifying lap per driver, with DriverNumber as a regular column.
quali_laps = qualification1_2021.laps
fastest_laps = quali_laps.groupby("DriverNumber")["LapTime"].min().to_frame()
fastest_laps["DriverNumber"] = fastest_laps.index
fastest_laps = fastest_laps.reset_index(drop=True)

# Right join: the resulting rows follow the qualifying table.
data = data.merge(fastest_laps, how='right').rename(columns={'LapTime': 'BestQualiTime'})

# Add fastest lap point
# Flag (1/0) the driver who set the fastest lap of the race.
# The driver number is read by column name: positional access (`aux[1]`)
# depends on the column order of the laps table and on pandas' deprecated
# integer fallback for label-indexed Series.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Driver "31" is removed from both the lap table and the result table
# (presumably too few laps for these statistics -- TODO confirm).
aux = aux[aux['DriverNumber'] != "31"]
data = data[data["DriverNumber"] != "31"]

# Explicit copy: a column is added below, and writing into a slice of the
# laps table would trigger pandas' SettingWithCopyWarning.
auxi = aux[["DriverNumber", "LapTime"]].copy()

# Mean lap time per driver.
mean_lap_time = pd.DataFrame(auxi.groupby('DriverNumber')['LapTime'].mean())
mean_lap_time["DriverNumber"] = mean_lap_time.index
mean_lap_time = mean_lap_time.reset_index(drop=True)

data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns={'LapTime': 'AvgLapTime'})

# Standard deviation of lap time per driver, computed on float milliseconds.
# total_seconds() yields floats on every pandas version, whereas
# astype('timedelta64[ms]') only did so before pandas 2.
auxi['LapTime_ms'] = auxi['LapTime'].dt.total_seconds() * 1000
auxi = pd.DataFrame(auxi.groupby("DriverNumber")["LapTime_ms"].std())
auxi["DriverNumber"] = auxi.index
auxi = auxi.reset_index(drop=True)

data = pd.merge(left=data, right=auxi, how='right')
data = data.rename(columns={'LapTime_ms': 'SDLapTime'})

# Rebinds `datetime` to the module (the first cell leaves it bound to the
# class); kept so code relying on `datetime.timedelta` keeps working.
import datetime
# to_timedelta maps NaN (e.g. a driver with a single timed lap) to NaT
# instead of raising, as datetime.timedelta(milliseconds=nan) would.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Average gap between a driver's lap time and the quickest time set on the
# same lap number by any driver.
lap_minimum = aux.groupby("LapNumber")["LapTime"].min()
besttime = lap_minimum.to_frame()
besttime["LapNumber"] = besttime.index
besttime = besttime.reset_index(drop=True)

best_lap = besttime.groupby('LapNumber')['LapTime'].min()

# Attach the per-lap reference time as LapTime_best and take the difference.
merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = data.merge(driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver ids of everyone in the current table. Missing abbreviations are
# dropped first, since np.unique would have to sort a mix of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()
filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# raceId of 2021 round 9. Exactly one row must match; int() on a Series is
# deprecated and fails with an unhelpful error on zero or several matches.
b = races.loc[(races["year"] == 2021) & (races["round"] == 9), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one row for 2021 round 9 in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of those drivers in that race, without the per-stop detail columns.
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis=1)

# Per driver code: mean stop duration (ms) and the highest stop number.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once: drivers without a recorded stop become
# NaT, and the column gets a single timedelta dtype instead of timedelta
# objects being written into a float column row by row.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
# Every (driver, compound) pair, so a compound a driver never used still
# ends up as a 0 in that driver's row.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps per driver and compound, computed once.
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)

# One row per driver, one column per compound. DriverNumber is moved from
# the index to a trailing column so it can serve as the merge key.
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = list(compound_counts_pivot.index)
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Attach the engine lookup to each row by team name; the lookup's own team
# column ("Car") is dropped after the join.
engines = pd.read_csv("engines2021.csv", sep=";")
data = data.merge(engines, left_on='TeamName', right_on='Car', how="left").drop(columns=["Car"])

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Weather samples of the race session, and how often each Rainfall value occurs.
rainy = pd.DataFrame(race_2021.weather_data)
counts = rainy.loc[:, 'Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether the race is treated as rainy.

    `counts` maps the values False / True to a number of occurrences; any
    object offering ``.get(key, default)`` works. A missing key counts as 0.

    Returns False only when False is strictly more frequent than True.
    Ties, including the case where neither key is present, return True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be matched to car telemetry.
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver, collected in a list and turned into a
# frame once (instead of growing a DataFrame with concat inside the loop).
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share of samples (in %) at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the number of samples, in %.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Explicit columns keep the 'Driver' merge key present even with no drivers.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
data["raceID"] = "46"
# NOTE: this appends to all_data; executing the cell again appends the same rows a second time.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Austrian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '77', '4', '44', '55', '11', '3', '16', '10', '14', '63', '22', '18', '99', '7', '6', '5', '47', '9', '31']
core           INFO 	Loading data for Austrian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timi

## Race 10

In [14]:
# Fetch the 2021 round-10 race session together with telemetry, lap and weather data.
race_2021 = ff1.get_session(2021, 10, 'R')
race_2021.load(telemetry=True, laps=True, weather=True)

# Start the per-driver table from the classification columns of interest.
aux = race_2021.results
result_columns = ["Abbreviation", "DriverNumber", "TeamName", "Position",
                  "GridPosition", "Status", "Points"]
data = pd.DataFrame(aux[result_columns])

# Birth dates keyed by driver abbreviation (dates in the file are day.month.year).
birth = pd.read_csv("BIRTH2021.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Race-level constants taken from entry 10 of the 2021 schedule.
dati = schedule2021["EventDate"]
data["RaceCountry"] = schedule2021["Country"][10]
data["Year"] = 2021
data["RaceDate"] = pd.to_datetime(dati[10])

# Age in whole 365.25-day years on the event date; the two helper date
# columns are dropped once the age has been derived.
data = data.merge(birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Qualifying session of the same round.
qualification1_2021 = ff1.get_session(2021, 10, 'Q')
qualification1_2021.load()

# Quickest qualifying lap per driver, with DriverNumber as a regular column.
quali_laps = qualification1_2021.laps
fastest_laps = quali_laps.groupby("DriverNumber")["LapTime"].min().to_frame()
fastest_laps["DriverNumber"] = fastest_laps.index
fastest_laps = fastest_laps.reset_index(drop=True)

# Right join: the resulting rows follow the qualifying table.
data = data.merge(fastest_laps, how='right').rename(columns={'LapTime': 'BestQualiTime'})

# Add fastest lap point
# Flag (1/0) the driver who set the fastest lap of the race.
# The driver number is read by column name: positional access (`aux[1]`)
# depends on the column order of the laps table and on pandas' deprecated
# integer fallback for label-indexed Series.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Driver "33" is removed from both the lap table and the result table
# (presumably too few laps for these statistics -- TODO confirm).
aux = aux[aux['DriverNumber'] != "33"]
data = data[data["DriverNumber"] != "33"]

# Explicit copy: a column is added below, and writing into a slice of the
# laps table would trigger pandas' SettingWithCopyWarning.
auxi = aux[["DriverNumber", "LapTime"]].copy()

# Mean lap time per driver.
mean_lap_time = pd.DataFrame(auxi.groupby('DriverNumber')['LapTime'].mean())
mean_lap_time["DriverNumber"] = mean_lap_time.index
mean_lap_time = mean_lap_time.reset_index(drop=True)

data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns={'LapTime': 'AvgLapTime'})

# Standard deviation of lap time per driver, computed on float milliseconds.
# total_seconds() yields floats on every pandas version, whereas
# astype('timedelta64[ms]') only did so before pandas 2.
auxi['LapTime_ms'] = auxi['LapTime'].dt.total_seconds() * 1000
auxi = pd.DataFrame(auxi.groupby("DriverNumber")["LapTime_ms"].std())
auxi["DriverNumber"] = auxi.index
auxi = auxi.reset_index(drop=True)

data = pd.merge(left=data, right=auxi, how='right')
data = data.rename(columns={'LapTime_ms': 'SDLapTime'})

# Rebinds `datetime` to the module (the first cell leaves it bound to the
# class); kept so code relying on `datetime.timedelta` keeps working.
import datetime
# to_timedelta maps NaN (e.g. a driver with a single timed lap) to NaT
# instead of raising, as datetime.timedelta(milliseconds=nan) would.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Average gap between a driver's lap time and the quickest time set on the
# same lap number by any driver.
lap_minimum = aux.groupby("LapNumber")["LapTime"].min()
besttime = lap_minimum.to_frame()
besttime["LapNumber"] = besttime.index
besttime = besttime.reset_index(drop=True)

best_lap = besttime.groupby('LapNumber')['LapTime'].min()

# Attach the per-lap reference time as LapTime_best and take the difference.
merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = data.merge(driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver ids of everyone in the current table. Missing abbreviations are
# dropped first, since np.unique would have to sort a mix of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()
filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# raceId of 2021 round 10. Exactly one row must match; int() on a Series is
# deprecated and fails with an unhelpful error on zero or several matches.
b = races.loc[(races["year"] == 2021) & (races["round"] == 10), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one row for 2021 round 10 in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of those drivers in that race, without the per-stop detail columns.
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis=1)

# Per driver code: mean stop duration (ms) and the highest stop number.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once: drivers without a recorded stop become
# NaT, and the column gets a single timedelta dtype instead of timedelta
# objects being written into a float column row by row.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
# Every (driver, compound) pair, so a compound a driver never used still
# ends up as a 0 in that driver's row.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps per driver and compound, computed once.
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)

# One row per driver, one column per compound. DriverNumber is moved from
# the index to a trailing column so it can serve as the merge key.
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = list(compound_counts_pivot.index)
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Attach the engine lookup to each row by team name; the lookup's own team
# column ("Car") is dropped after the join.
engines = pd.read_csv("engines2021.csv", sep=";")
data = data.merge(engines, left_on='TeamName', right_on='Car', how="left").drop(columns=["Car"])

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Weather samples of the race session, and how often each Rainfall value occurs.
rainy = pd.DataFrame(race_2021.weather_data)
counts = rainy.loc[:, 'Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether the race is treated as rainy.

    `counts` maps the values False / True to a number of occurrences; any
    object offering ``.get(key, default)`` works. A missing key counts as 0.

    Returns False only when False is strictly more frequent than True.
    Ties, including the case where neither key is present, return True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be matched to car telemetry.
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver, collected in a list and turned into a
# frame once (instead of growing a DataFrame with concat inside the loop).
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share of samples (in %) at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the number of samples, in %.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Explicit columns keep the 'Driver' merge key present even with no drivers.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
data["raceID"] = "47"
# NOTE: this appends to all_data; executing the cell again appends the same rows a second time.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for British Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '16', '77', '4', '3', '55', '14', '18', '31', '22', '10', '63', '99', '6', '7', '11', '9', '47', '5', '33']
core           INFO 	Loading data for British Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 11

In [15]:
# Fetch the 2021 round-11 race session together with telemetry, lap and weather data.
race_2021 = ff1.get_session(2021, 11, 'R')
race_2021.load(telemetry=True, laps=True, weather=True)

# Start the per-driver table from the classification columns of interest.
aux = race_2021.results
result_columns = ["Abbreviation", "DriverNumber", "TeamName", "Position",
                  "GridPosition", "Status", "Points"]
data = pd.DataFrame(aux[result_columns])

# Birth dates keyed by driver abbreviation (dates in the file are day.month.year).
birth = pd.read_csv("BIRTH2021.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Race-level constants taken from entry 11 of the 2021 schedule.
dati = schedule2021["EventDate"]
data["RaceCountry"] = schedule2021["Country"][11]
data["Year"] = 2021
data["RaceDate"] = pd.to_datetime(dati[11])

# Age in whole 365.25-day years on the event date; the two helper date
# columns are dropped once the age has been derived.
data = data.merge(birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Qualifying session of the same round.
qualification1_2021 = ff1.get_session(2021, 11, 'Q')
qualification1_2021.load()

# Quickest qualifying lap per driver, with DriverNumber as a regular column.
quali_laps = qualification1_2021.laps
fastest_laps = quali_laps.groupby("DriverNumber")["LapTime"].min().to_frame()
fastest_laps["DriverNumber"] = fastest_laps.index
fastest_laps = fastest_laps.reset_index(drop=True)

# Right join: the resulting rows follow the qualifying table.
data = data.merge(fastest_laps, how='right').rename(columns={'LapTime': 'BestQualiTime'})

# Add fastest lap point
# Flag (1/0) the driver who set the fastest lap of the race.
# The driver number is read by column name: positional access (`aux[1]`)
# depends on the column order of the laps table and on pandas' deprecated
# integer fallback for label-indexed Series.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# These drivers are removed from both the lap table and the result table
# (presumably too few laps for these statistics -- TODO confirm).
excluded_drivers = ["47", "11", "16", "18", "77", "9", "4"]
aux = aux[~aux['DriverNumber'].isin(excluded_drivers)]
data = data[~data["DriverNumber"].isin(excluded_drivers)]

# Explicit copy: a column is added below, and writing into a slice of the
# laps table would trigger pandas' SettingWithCopyWarning.
auxi = aux[["DriverNumber", "LapTime"]].copy()

# Mean lap time per driver.
mean_lap_time = pd.DataFrame(auxi.groupby('DriverNumber')['LapTime'].mean())
mean_lap_time["DriverNumber"] = mean_lap_time.index
mean_lap_time = mean_lap_time.reset_index(drop=True)

data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns={'LapTime': 'AvgLapTime'})

# Standard deviation of lap time per driver, computed on float milliseconds.
# total_seconds() yields floats on every pandas version, whereas
# astype('timedelta64[ms]') only did so before pandas 2.
auxi['LapTime_ms'] = auxi['LapTime'].dt.total_seconds() * 1000
auxi = pd.DataFrame(auxi.groupby("DriverNumber")["LapTime_ms"].std())
auxi["DriverNumber"] = auxi.index
auxi = auxi.reset_index(drop=True)

data = pd.merge(left=data, right=auxi, how='right')
data = data.rename(columns={'LapTime_ms': 'SDLapTime'})

# Rebinds `datetime` to the module (the first cell leaves it bound to the
# class); kept so code relying on `datetime.timedelta` keeps working.
import datetime
# to_timedelta maps NaN (e.g. a driver with a single timed lap) to NaT
# instead of raising, as datetime.timedelta(milliseconds=nan) would.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Average gap between a driver's lap time and the quickest time set on the
# same lap number by any driver.
lap_minimum = aux.groupby("LapNumber")["LapTime"].min()
besttime = lap_minimum.to_frame()
besttime["LapNumber"] = besttime.index
besttime = besttime.reset_index(drop=True)

best_lap = besttime.groupby('LapNumber')['LapTime'].min()

# Attach the per-lap reference time as LapTime_best and take the difference.
merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = data.merge(driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver ids of everyone in the current table. Missing abbreviations are
# dropped first, since np.unique would have to sort a mix of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()
filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# raceId of 2021 round 11. Exactly one row must match; int() on a Series is
# deprecated and fails with an unhelpful error on zero or several matches.
b = races.loc[(races["year"] == 2021) & (races["round"] == 11), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one row for 2021 round 11 in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of those drivers in that race, without the per-stop detail columns.
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis=1)

# Per driver code: mean stop duration (ms) and the highest stop number.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once: drivers without a recorded stop become
# NaT, and the column gets a single timedelta dtype instead of timedelta
# objects being written into a float column row by row.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
# Every (driver, compound) pair, so a compound a driver never used still
# ends up as a 0 in that driver's row.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps per driver and compound, computed once.
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)

# One row per driver, one column per compound. DriverNumber is moved from
# the index to a trailing column so it can serve as the merge key.
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = list(compound_counts_pivot.index)
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Attach the engine lookup to each row by team name; the lookup's own team
# column ("Car") is dropped after the join.
engines = pd.read_csv("engines2021.csv", sep=";")
data = data.merge(engines, left_on='TeamName', right_on='Car', how="left").drop(columns=["Car"])

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Weather samples of the race session, and how often each Rainfall value occurs.
rainy = pd.DataFrame(race_2021.weather_data)
counts = rainy.loc[:, 'Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether the race is treated as rainy.

    `counts` maps the values False / True to a number of occurrences; any
    object offering ``.get(key, default)`` works. A missing key counts as 0.

    Returns False only when False is strictly more frequent than True.
    Ties, including the case where neither key is present, return True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be matched to car telemetry.
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver, collected in a list and turned into a
# frame once (instead of growing a DataFrame with concat inside the loop).
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share of samples (in %) at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the number of samples, in %.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Explicit columns keep the 'Driver' merge key present even with no drivers.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
data["raceID"] = "48"
# NOTE: this appends to all_data; executing the cell again appends the same rows a second time.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Hungarian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['31', '44', '55', '14', '10', '22', '6', '63', '33', '7', '3', '47', '99', '9', '4', '77', '11', '16', '18', '5']
core           INFO 	Loading data for Hungarian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for ti

## Race 13

In [16]:
# Build the per-driver feature table (`data`) for one 2021 race: results, age,
# best qualifying lap, lap-time statistics, pit stops, tyre usage and engine.
# NOTE(review): every `how='right'` merge below keeps exactly the drivers found
# in the right-hand frame; a driver absent from `data` comes back as a row of
# NaNs and a driver absent on the right is dropped - confirm this is intended.

# Keep `datetime` bound to the module: the notebook's import cell ends with
# `from datetime import datetime`, which leaves the name pointing at the class.
# Nothing below needs it any more, but other cells may rely on this binding.
import datetime

# Load race data
round_number = 13  # 2021 championship round processed by this cell
race_2021 = ff1.get_session(2021, round_number, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Label-based lookups; assumes the schedule index label equals the round number - TODO confirm
data["RaceCountry"] = schedule2021["Country"][round_number]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[round_number])
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Whole years between birth and race date, counting a year as 365.25 days
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, round_number, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
aux = race_2021.laps.pick_fastest()
# Look the driver up by label: a positional `aux[1]` depends on column order
# and integer keys on a labelled Series are deprecated in pandas.
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Work on an explicit copy (no SettingWithCopyWarning) and express lap times as
# float milliseconds by division, which behaves the same on every pandas version.
auxi = aux[["DriverNumber", "LapTime"]].copy()
auxi['LapTime_ms'] = auxi['LapTime'] / pd.Timedelta(milliseconds=1)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# pd.to_timedelta maps a NaN std (fewer than two timed laps) to NaT instead of raising
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit="ms")

# Add average split times variable
# Fastest time set by anyone on each lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))

# Gap of every lap to the fastest lap with the same lap number
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on="DriverNumber")

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

b = races[(races["year"] == 2021) & (races["round"] == round_number)]
# .item() still insists on exactly one matching race, without the deprecated int(Series)
b = int(b["raceId"].item())

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration (ms) and highest stop counter
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once; drivers without a recorded stop become NaT
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit="ms")

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
# Full driver x compound grid so unused compounds still get a 0 column
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
compound_counts_pivot = compound_counts_pivot.reset_index()

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# NOTE: the "previous podiums at this race" feature (podiums2021.csv -> PrevPodiumRace)
# is currently disabled in this notebook.

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

# Tally of the Rainfall flag over the weather samples
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy from its rainfall-flag tally.

    `counts` is any object with a dict-style `.get` that maps the flags
    False/True to how often each was observed (here: a `value_counts()` result).
    Returns False only when the False tally is strictly larger than the True
    tally; a tie or an empty tally therefore yields True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not (dry_samples > wet_samples)

# Flag the race as rainy/dry, add per-driver telemetry summaries and append
# the finished race table to `all_data`.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one summary record per driver and build the frame once afterwards;
# growing a DataFrame with pd.concat inside the loop re-copies it on every pass.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (in %) of samples at this driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column per sample, in %; presumably Brake is a 0/1 flag - TODO confirm
        'Brake': tele['Brake'].sum() / len(tele) * 100
        })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "49"
# NOTE: re-running this cell appends this race's rows to all_data a second time
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Dutch Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '44', '77', '10', '16', '14', '55', '11', '31', '4', '3', '18', '5', '99', '88', '6', '63', '47', '22', '9']
core           INFO 	Loading data for Dutch Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_da

## Race 14

In [17]:
# Build the per-driver feature table (`data`) for one 2021 race: results, age,
# best qualifying lap, lap-time statistics, pit stops, tyre usage and engine.
# NOTE(review): every `how='right'` merge below keeps exactly the drivers found
# in the right-hand frame; a driver absent from `data` comes back as a row of
# NaNs and a driver absent on the right is dropped - confirm this is intended.

# Keep `datetime` bound to the module: the notebook's import cell ends with
# `from datetime import datetime`, which leaves the name pointing at the class.
# Nothing below needs it any more, but other cells may rely on this binding.
import datetime

# Load race data
round_number = 14  # 2021 championship round processed by this cell
race_2021 = ff1.get_session(2021, round_number, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Label-based lookups; assumes the schedule index label equals the round number - TODO confirm
data["RaceCountry"] = schedule2021["Country"][round_number]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[round_number])
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Whole years between birth and race date, counting a year as 365.25 days
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, round_number, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
aux = race_2021.laps.pick_fastest()
# Look the driver up by label: a positional `aux[1]` depends on column order
# and integer keys on a labelled Series are deprecated in pandas.
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Driver 22 is excluded from this race's laps and results; the reason is not
# recorded in the notebook - TODO document it.
aux = aux[aux['DriverNumber'] != "22"]
data = data[data["DriverNumber"] != "22"]

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Work on an explicit copy (no SettingWithCopyWarning) and express lap times as
# float milliseconds by division, which behaves the same on every pandas version.
auxi = aux[["DriverNumber", "LapTime"]].copy()
auxi['LapTime_ms'] = auxi['LapTime'] / pd.Timedelta(milliseconds=1)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# pd.to_timedelta maps a NaN std (fewer than two timed laps) to NaT instead of raising
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit="ms")

# Add average split times variable
# Fastest time set by anyone on each lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))

# Gap of every lap to the fastest lap with the same lap number
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on="DriverNumber")

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

b = races[(races["year"] == 2021) & (races["round"] == round_number)]
# .item() still insists on exactly one matching race, without the deprecated int(Series)
b = int(b["raceId"].item())

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration (ms) and highest stop counter
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once; drivers without a recorded stop become NaT
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit="ms")

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
# Full driver x compound grid so unused compounds still get a 0 column
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
compound_counts_pivot = compound_counts_pivot.reset_index()

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# NOTE: the "previous podiums at this race" feature (podiums2021.csv -> PrevPodiumRace)
# is currently disabled in this notebook.

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

# Tally of the Rainfall flag over the weather samples
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy from its rainfall-flag tally.

    `counts` is any object with a dict-style `.get` that maps the flags
    False/True to how often each was observed (here: a `value_counts()` result).
    Returns False only when the False tally is strictly larger than the True
    tally; a tie or an empty tally therefore yields True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not (dry_samples > wet_samples)

# Flag the race as rainy/dry, add per-driver telemetry summaries and append
# the finished race table to `all_data`.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one summary record per driver and build the frame once afterwards;
# growing a DataFrame with pd.concat inside the loop re-copies it on every pass.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (in %) of samples at this driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column per sample, in %; presumably Brake is a 0/1 flag - TODO confirm
        'Brake': tele['Brake'].sum() / len(tele) * 100
        })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "50"
# NOTE: re-running this cell appends this race's rows to all_data a second time
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Italian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['3', '4', '77', '16', '11', '55', '18', '14', '63', '31', '6', '5', '99', '88', '47', '9', '44', '33', '10', '22']
core           INFO 	Loading data for Italian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timin

## Race 15

In [18]:
# Build the per-driver feature table (`data`) for one 2021 race: results, age,
# best qualifying lap, lap-time statistics, pit stops, tyre usage and engine.
# NOTE(review): every `how='right'` merge below keeps exactly the drivers found
# in the right-hand frame; a driver absent from `data` comes back as a row of
# NaNs and a driver absent on the right is dropped - confirm this is intended.

# Keep `datetime` bound to the module: the notebook's import cell ends with
# `from datetime import datetime`, which leaves the name pointing at the class.
# Nothing below needs it any more, but other cells may rely on this binding.
import datetime

# Load race data
round_number = 15  # 2021 championship round processed by this cell
race_2021 = ff1.get_session(2021, round_number, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Label-based lookups; assumes the schedule index label equals the round number - TODO confirm
data["RaceCountry"] = schedule2021["Country"][round_number]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[round_number])
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Whole years between birth and race date, counting a year as 365.25 days
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, round_number, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
aux = race_2021.laps.pick_fastest()
# Look the driver up by label: a positional `aux[1]` depends on column order
# and integer keys on a labelled Series are deprecated in pandas.
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Work on an explicit copy (no SettingWithCopyWarning) and express lap times as
# float milliseconds by division, which behaves the same on every pandas version.
auxi = aux[["DriverNumber", "LapTime"]].copy()
auxi['LapTime_ms'] = auxi['LapTime'] / pd.Timedelta(milliseconds=1)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# pd.to_timedelta maps a NaN std (fewer than two timed laps) to NaT instead of raising
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit="ms")

# Add average split times variable
# Fastest time set by anyone on each lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))

# Gap of every lap to the fastest lap with the same lap number
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on="DriverNumber")

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

b = races[(races["year"] == 2021) & (races["round"] == round_number)]
# .item() still insists on exactly one matching race, without the deprecated int(Series)
b = int(b["raceId"].item())

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration (ms) and highest stop counter
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once; drivers without a recorded stop become NaT
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit="ms")

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
# Full driver x compound grid so unused compounds still get a 0 column
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
compound_counts_pivot = compound_counts_pivot.reset_index()

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# NOTE: the "previous podiums at this race" feature (podiums2021.csv -> PrevPodiumRace)
# is currently disabled in this notebook.

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

# Tally of the Rainfall flag over the weather samples
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy from its rainfall-flag tally.

    `counts` is any object with a dict-style `.get` that maps the flags
    False/True to how often each was observed (here: a `value_counts()` result).
    Returns False only when the False tally is strictly larger than the True
    tally; a tie or an empty tally therefore yields True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not (dry_samples > wet_samples)

# Flag the race as rainy/dry, add per-driver telemetry summaries and append
# the finished race table to `all_data`.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one summary record per driver and build the frame once afterwards;
# growing a DataFrame with pd.concat inside the loop re-copies it on every pass.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (in %) of samples at this driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column per sample, in %; presumably Brake is a 0/1 flag - TODO confirm
        'Brake': tele['Brake'].sum() / len(tele) * 100
        })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "51"
# NOTE: re-running this cell appends this race's rows to all_data a second time
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Russian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '55', '3', '77', '14', '4', '7', '11', '63', '18', '5', '10', '31', '16', '99', '22', '9', '6', '47']
core           INFO 	Loading data for Russian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 16

In [19]:
# Build the per-driver feature table (`data`) for one 2021 race: results, age,
# best qualifying lap, lap-time statistics, pit stops, tyre usage and engine.
# NOTE(review): every `how='right'` merge below keeps exactly the drivers found
# in the right-hand frame; a driver absent from `data` comes back as a row of
# NaNs and a driver absent on the right is dropped - confirm this is intended.

# Keep `datetime` bound to the module: the notebook's import cell ends with
# `from datetime import datetime`, which leaves the name pointing at the class.
# Nothing below needs it any more, but other cells may rely on this binding.
import datetime

# Load race data
round_number = 16  # 2021 championship round processed by this cell
race_2021 = ff1.get_session(2021, round_number, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Label-based lookups; assumes the schedule index label equals the round number - TODO confirm
data["RaceCountry"] = schedule2021["Country"][round_number]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[round_number])
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Whole years between birth and race date, counting a year as 365.25 days
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, round_number, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
aux = race_2021.laps.pick_fastest()
# Look the driver up by label: a positional `aux[1]` depends on column order
# and integer keys on a labelled Series are deprecated in pandas.
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Work on an explicit copy (no SettingWithCopyWarning) and express lap times as
# float milliseconds by division, which behaves the same on every pandas version.
auxi = aux[["DriverNumber", "LapTime"]].copy()
auxi['LapTime_ms'] = auxi['LapTime'] / pd.Timedelta(milliseconds=1)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right', on="DriverNumber")
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# pd.to_timedelta maps a NaN std (fewer than two timed laps) to NaT instead of raising
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit="ms")

# Add average split times variable
# Fastest time set by anyone on each lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))

# Gap of every lap to the fastest lap with the same lap number
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on="DriverNumber")

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

b = races[(races["year"] == 2021) & (races["round"] == round_number)]
# .item() still insists on exactly one matching race, without the deprecated int(Series)
b = int(b["raceId"].item())

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration (ms) and highest stop counter
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once; drivers without a recorded stop become NaT
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit="ms")

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
# Full driver x compound grid so unused compounds still get a 0 column
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
compound_counts_pivot = compound_counts_pivot.reset_index()

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# NOTE: the "previous podiums at this race" feature (podiums2021.csv -> PrevPodiumRace)
# is currently disabled in this notebook.

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

# Tally of the Rainfall flag over the weather samples
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy from its rainfall tallies.

    Parameters
    ----------
    counts : mapping-like object with ``.get``
        Tally per rainfall flag, looked up under the keys ``False`` and
        ``True``; a missing key counts as zero.

    Returns
    -------
    bool
        ``False`` only when the ``False`` tally is strictly larger than the
        ``True`` tally; ``True`` otherwise (including ties and empty input).
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data.dropna(subset=['DriverNumber'], inplace=True)
driversnumbers = np.unique(data["DriverNumber"])

agg = pd.DataFrame()
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    avg_speed = tele['Speed'].mean()
    max_speed = tele['Speed'].max()
    avg_rpm = tele['RPM'].mean()
    max_rpm = tele['RPM'].max()
    avg_throttle = tele['Throttle'].mean()
    max_throttle = tele['Throttle'].max()
    max_throttle_pct = (tele['Throttle'] == max_throttle).sum() / len(tele) * 100
    break_percentage = tele['Brake'].sum() / len(tele) * 100

    driver_data = {
        'Driver': i,
        'AverageSpeed': avg_speed,
        'MaxSpeed': max_speed,
        'AverageRPM': avg_rpm,
        'MaxRPM': max_rpm,
        'AverageThrottle': avg_throttle,
        'MaxThrottlePct': max_throttle_pct,
        'Brake': break_percentage
        }
    
    agg = pd.concat([agg, pd.DataFrame(driver_data, index=[0])], ignore_index=True)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "52"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Turkish Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['77', '33', '11', '16', '44', '10', '4', '55', '18', '31', '99', '7', '3', '22', '63', '14', '6', '5', '47', '9']
core           INFO 	Loading data for Turkish Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 17

In [20]:
# Load race data
# Round 17 of the 2021 season, race session ('R'), loaded with telemetry,
# lap and weather data.
race_2021 = ff1.get_session(2021, 17, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
# Start from the session results, restricted to the columns listed here.
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
# BIRTH2021.csv is ';'-separated; its "Birth" column holds day.month.year dates.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
# Rename the two columns so the first one matches the merge key used below
# (the assignment requires the file to have exactly two columns).
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): [17] is a lookup by index label on the schedule, not by round
# number; presumably the two coincide for the 2021 schedule - confirm.
data["RaceCountry"] = schedule2021["Country"][17]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[17])
#formatted_date = timestamp.strftime('%d.%m.%Y')
# Inner join: drivers without a row in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by 365.25 days.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
# Birth and RaceDate were only needed to derive AgeAtGP.
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, 17, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
# Minimum LapTime per DriverNumber over the qualifying laps.
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = pd.DataFrame(fastest_laps)
# Move DriverNumber from the index into a regular column so it can act as merge key.
fastest_laps["DriverNumber"] = fastest_laps.index
fastest_laps = fastest_laps.reset_index(drop=True)
# No `on=` is given, so pandas joins on the columns both frames share;
# how='right' keeps every driver that appears in fastest_laps.
data = pd.merge(left=data, right=fastest_laps, how='right')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
# pick_fastest() returns the fastest lap of the race. Its driver number is read
# by column label instead of by position, so the lookup does not depend on the
# column order of the laps table or on pandas' deprecated positional fallback
# for integer keys.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
# 1 for the driver who set the fastest lap, 0 for everyone else.
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Mean lap time per driver, with DriverNumber as a regular column for merging.
mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()

data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Standard deviation of the lap times, computed on float milliseconds.
# Dividing by a one-millisecond Timedelta yields floats on every pandas version
# (astype('timedelta64[ms]') gives floats on pandas 1.x but a timedelta dtype on
# pandas 2.x), and building a fresh frame avoids writing a new column into a
# slice of the laps table.
auxi = pd.DataFrame({
    "DriverNumber": aux["DriverNumber"],
    "LapTime_ms": aux["LapTime"] / pd.Timedelta(milliseconds=1),
})
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# The notebook's first cell ends with `from datetime import datetime`, which
# shadows the module; re-import it so `datetime.timedelta` stays usable for the
# rest of the notebook, as before.
import datetime
# to_timedelta maps a missing deviation (driver with fewer than two timed laps)
# to NaT, whereas datetime.timedelta(milliseconds=nan) raises ValueError.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Fastest lap time set by any driver on each lap number.
per_lap_min = aux.groupby("LapNumber")["LapTime"].min()
besttime = per_lap_min.reset_index()[["LapTime", "LapNumber"]]
best_lap = besttime.groupby('LapNumber')['LapTime'].min()

# Attach that per-lap best to every lap and measure each lap's gap to it.
merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

# Mean gap per driver.
driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. dropna() guards against rows that the
# right-joins above left without an abbreviation: np.unique cannot sort a mix
# of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
driver_ids = filtered_drivers["driverId"]

# raceId of round 17 of 2021 in races.csv. The first match is taken explicitly
# because calling int() on a whole Series is deprecated in recent pandas.
race_rows = races[(races["year"] == 2021) & (races["round"] == 17)]
race_id = int(race_rows["raceId"].iloc[0])

# Pit stops of this race made by the drivers above.
auxpit = pit_stops[pit_stops["driverId"].isin(driver_ids) & (pit_stops["raceId"] == race_id)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean of the 'milliseconds' column and highest 'stop' value.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once so it gets a proper timedelta dtype; drivers
# without a recorded stop become NaT instead of leaving timedeltas written into
# part of a numeric column.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so a compound nobody used still becomes a
# zero-filled column.
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps driven per driver and compound, completed with the missing pairs as 0.
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = all_combinations_df.merge(
    compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# Wide layout: one row per driver, one column per compound.
compound_counts_pivot = (
    compound_counts
    .pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
)
indexi = compound_counts_pivot.index.tolist()
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')
data = merged_data_compound

# Add engine manufacturer for each team
# engines2021.csv is ';'-separated and keyed by its "Car" column, which is
# matched against TeamName and dropped again after the join.
engines = pd.read_csv("engines2021.csv", sep=";")
data = (
    pd.merge(data, engines, how="left", left_on="TeamName", right_on="Car")
    .drop(columns="Car")
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Tally of the Rainfall values over the race's weather data.
rainy = pd.DataFrame(race_2021.weather_data)
counts = rainy["Rainfall"].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy from its rainfall tallies.

    Parameters
    ----------
    counts : mapping-like object with ``.get``
        Tally per rainfall flag, looked up under the keys ``False`` and
        ``True``; a missing key counts as zero.

    Returns
    -------
    bool
        ``False`` only when the ``False`` tally is strictly larger than the
        ``True`` tally; ``True`` otherwise (including ties and empty input).
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once, instead of growing a
# DataFrame with pd.concat inside the loop (quadratic copying, and concatenating
# onto an empty frame is deprecated in recent pandas).
telemetry_columns = [
    'Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
    'AverageThrottle', 'MaxThrottlePct', 'Brake',
]
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (in %) of telemetry samples at the driver's maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the sample count, in %.
        # NOTE(review): presumably Brake is a per-sample on/off flag - confirm.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Passing the column list keeps the 'Driver' merge key present even when no
# driver was processed.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "53"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for United States Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '44', '11', '16', '3', '77', '55', '4', '22', '5', '99', '18', '7', '63', '6', '47', '9', '14', '31', '10']
core           INFO 	Loading data for United States Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached dat

## Race 18

In [21]:
# Load race data
# Round 18 of the 2021 season, race session ('R'), loaded with telemetry,
# lap and weather data.
race_2021 = ff1.get_session(2021, 18, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
# Start from the session results, restricted to the columns listed here.
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
# BIRTH2021.csv is ';'-separated; its "Birth" column holds day.month.year dates.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
# Rename the two columns so the first one matches the merge key used below
# (the assignment requires the file to have exactly two columns).
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): [18] is a lookup by index label on the schedule, not by round
# number; presumably the two coincide for the 2021 schedule - confirm.
data["RaceCountry"] = schedule2021["Country"][18]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[18])
#formatted_date = timestamp.strftime('%d.%m.%Y')
# Inner join: drivers without a row in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by 365.25 days.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
# Birth and RaceDate were only needed to derive AgeAtGP.
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, 18, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
# Minimum LapTime per DriverNumber over the qualifying laps.
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = pd.DataFrame(fastest_laps)
# Move DriverNumber from the index into a regular column so it can act as merge key.
fastest_laps["DriverNumber"] = fastest_laps.index
fastest_laps = fastest_laps.reset_index(drop=True)
# No `on=` is given, so pandas joins on the columns both frames share;
# how='right' keeps every driver that appears in fastest_laps.
data = pd.merge(left=data, right=fastest_laps, how='right')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
# pick_fastest() returns the fastest lap of the race. Its driver number is read
# by column label instead of by position, so the lookup does not depend on the
# column order of the laps table or on pandas' deprecated positional fallback
# for integer keys.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
# 1 for the driver who set the fastest lap, 0 for everyone else.
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Drivers "22" and "47" are removed from both the laps and the driver table
# for this race.
# NOTE(review): presumably they were excluded for lack of usable lap times in
# this race - confirm the reason before relying on it.
excluded_drivers = ["22", "47"]
aux = aux[~aux['DriverNumber'].isin(excluded_drivers)]
data = data[~data["DriverNumber"].isin(excluded_drivers)]

# Mean lap time per driver, with DriverNumber as a regular column for merging.
mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()

data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Standard deviation of the lap times, computed on float milliseconds.
# Dividing by a one-millisecond Timedelta yields floats on every pandas version
# (astype('timedelta64[ms]') gives floats on pandas 1.x but a timedelta dtype on
# pandas 2.x), and building a fresh frame avoids writing a new column into a
# slice of the laps table.
auxi = pd.DataFrame({
    "DriverNumber": aux["DriverNumber"],
    "LapTime_ms": aux["LapTime"] / pd.Timedelta(milliseconds=1),
})
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# The notebook's first cell ends with `from datetime import datetime`, which
# shadows the module; re-import it so `datetime.timedelta` stays usable for the
# rest of the notebook, as before.
import datetime
# to_timedelta maps a missing deviation (driver with fewer than two timed laps)
# to NaT, whereas datetime.timedelta(milliseconds=nan) raises ValueError.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Fastest lap time set by any driver on each lap number.
per_lap_min = aux.groupby("LapNumber")["LapTime"].min()
besttime = per_lap_min.reset_index()[["LapTime", "LapNumber"]]
best_lap = besttime.groupby('LapNumber')['LapTime'].min()

# Attach that per-lap best to every lap and measure each lap's gap to it.
merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

# Mean gap per driver.
driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. dropna() guards against rows that the
# right-joins above left without an abbreviation: np.unique cannot sort a mix
# of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
driver_ids = filtered_drivers["driverId"]

# raceId of round 18 of 2021 in races.csv. The first match is taken explicitly
# because calling int() on a whole Series is deprecated in recent pandas.
race_rows = races[(races["year"] == 2021) & (races["round"] == 18)]
race_id = int(race_rows["raceId"].iloc[0])

# Pit stops of this race made by the drivers above.
auxpit = pit_stops[pit_stops["driverId"].isin(driver_ids) & (pit_stops["raceId"] == race_id)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean of the 'milliseconds' column and highest 'stop' value.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once so it gets a proper timedelta dtype; drivers
# without a recorded stop become NaT instead of leaving timedeltas written into
# part of a numeric column.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so a compound nobody used still becomes a
# zero-filled column.
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps driven per driver and compound, completed with the missing pairs as 0.
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = all_combinations_df.merge(
    compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# Wide layout: one row per driver, one column per compound.
compound_counts_pivot = (
    compound_counts
    .pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
)
indexi = compound_counts_pivot.index.tolist()
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')
data = merged_data_compound

# Add engine manufacturer for each team
# engines2021.csv is ';'-separated and keyed by its "Car" column, which is
# matched against TeamName and dropped again after the join.
engines = pd.read_csv("engines2021.csv", sep=";")
data = (
    pd.merge(data, engines, how="left", left_on="TeamName", right_on="Car")
    .drop(columns="Car")
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Tally of the Rainfall values over the race's weather data.
rainy = pd.DataFrame(race_2021.weather_data)
counts = rainy["Rainfall"].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy from its rainfall tallies.

    Parameters
    ----------
    counts : mapping-like object with ``.get``
        Tally per rainfall flag, looked up under the keys ``False`` and
        ``True``; a missing key counts as zero.

    Returns
    -------
    bool
        ``False`` only when the ``False`` tally is strictly larger than the
        ``True`` tally; ``True`` otherwise (including ties and empty input).
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once, instead of growing a
# DataFrame with pd.concat inside the loop (quadratic copying, and concatenating
# onto an empty frame is deprecated in recent pandas).
telemetry_columns = [
    'Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
    'AverageThrottle', 'MaxThrottlePct', 'Brake',
]
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (in %) of telemetry samples at the driver's maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the sample count, in %.
        # NOTE(review): presumably Brake is a per-sample on/off flag - confirm.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Passing the column list keeps the 'Driver' merge key present even when no
# driver was processed.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "54"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Mexico City Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '44', '11', '10', '16', '55', '5', '7', '14', '4', '99', '3', '31', '18', '77', '63', '6', '9', '47', '22']
core           INFO 	Loading data for Mexico City Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
Traceback (most recent call last):
  File

## Race 19

In [22]:
# Load race data
# Round 19 of the 2021 season, race session ('R'), loaded with telemetry,
# lap and weather data.
race_2021 = ff1.get_session(2021, 19, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
# Start from the session results, restricted to the columns listed here.
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
# BIRTH2021.csv is ';'-separated; its "Birth" column holds day.month.year dates.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
# Rename the two columns so the first one matches the merge key used below
# (the assignment requires the file to have exactly two columns).
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): [19] is a lookup by index label on the schedule, not by round
# number; presumably the two coincide for the 2021 schedule - confirm.
data["RaceCountry"] = schedule2021["Country"][19]
data["Year"] = 2021
dati = schedule2021["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[19])
#formatted_date = timestamp.strftime('%d.%m.%Y')
# Inner join: drivers without a row in the birth file are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by 365.25 days.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
# Birth and RaceDate were only needed to derive AgeAtGP.
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, 19, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
# Minimum LapTime per DriverNumber over the qualifying laps.
fastest_laps = qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = pd.DataFrame(fastest_laps)
# Move DriverNumber from the index into a regular column so it can act as merge key.
fastest_laps["DriverNumber"] = fastest_laps.index
fastest_laps = fastest_laps.reset_index(drop=True)
# No `on=` is given, so pandas joins on the columns both frames share;
# how='right' keeps every driver that appears in fastest_laps.
data = pd.merge(left=data, right=fastest_laps, how='right')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
# pick_fastest() returns the fastest lap of the race. Its driver number is read
# by column label instead of by position, so the lookup does not depend on the
# column order of the laps table or on pandas' deprecated positional fallback
# for integer keys.
aux = race_2021.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
# 1 for the driver who set the fastest lap, 0 for everyone else.
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2021.laps

# Mean lap time per driver, with DriverNumber as a regular column for merging.
mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()

data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Standard deviation of the lap times, computed on float milliseconds.
# Dividing by a one-millisecond Timedelta yields floats on every pandas version
# (astype('timedelta64[ms]') gives floats on pandas 1.x but a timedelta dtype on
# pandas 2.x), and building a fresh frame avoids writing a new column into a
# slice of the laps table.
auxi = pd.DataFrame({
    "DriverNumber": aux["DriverNumber"],
    "LapTime_ms": aux["LapTime"] / pd.Timedelta(milliseconds=1),
})
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# The notebook's first cell ends with `from datetime import datetime`, which
# shadows the module; re-import it so `datetime.timedelta` stays usable for the
# rest of the notebook, as before.
import datetime
# to_timedelta maps a missing deviation (driver with fewer than two timed laps)
# to NaT, whereas datetime.timedelta(milliseconds=nan) raises ValueError.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Fastest lap time set by any driver on each lap number.
per_lap_min = aux.groupby("LapNumber")["LapTime"].min()
besttime = per_lap_min.reset_index()[["LapTime", "LapNumber"]]
best_lap = besttime.groupby('LapNumber')['LapTime'].min()

# Attach that per-lap best to every lap and measure each lap's gap to it.
merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

# Mean gap per driver.
driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. dropna() guards against rows that the
# right-joins above left without an abbreviation: np.unique cannot sort a mix
# of strings and NaN.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
driver_ids = filtered_drivers["driverId"]

# raceId of round 19 of 2021 in races.csv. The first match is taken explicitly
# because calling int() on a whole Series is deprecated in recent pandas.
race_rows = races[(races["year"] == 2021) & (races["round"] == 19)]
race_id = int(race_rows["raceId"].iloc[0])

# Pit stops of this race made by the drivers above.
auxpit = pit_stops[pit_stops["driverId"].isin(driver_ids) & (pit_stops["raceId"] == race_id)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean of the 'milliseconds' column and highest 'stop' value.
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once so it gets a proper timedelta dtype; drivers
# without a recorded stop become NaT instead of leaving timedeltas written into
# part of a numeric column.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so a compound nobody used still becomes a
# zero-filled column.
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

# Laps driven per driver and compound, completed with the missing pairs as 0.
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = all_combinations_df.merge(
    compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# Wide layout: one row per driver, one column per compound.
compound_counts_pivot = (
    compound_counts
    .pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
)
indexi = compound_counts_pivot.index.tolist()
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')
data = merged_data_compound

# Add engine manufacturer for each team
# engines2021.csv is ';'-separated and keyed by its "Car" column, which is
# matched against TeamName and dropped again after the join.
engines = pd.read_csv("engines2021.csv", sep=";")
data = (
    pd.merge(data, engines, how="left", left_on="TeamName", right_on="Car")
    .drop(columns="Car")
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2021.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Tally of the Rainfall values over the race's weather data.
rainy = pd.DataFrame(race_2021.weather_data)
counts = rainy["Rainfall"].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy from its rainfall tallies.

    Parameters
    ----------
    counts : mapping-like object with ``.get``
        Tally per rainfall flag, looked up under the keys ``False`` and
        ``True``; a missing key counts as zero.

    Returns
    -------
    bool
        ``False`` only when the ``False`` tally is strictly larger than the
        ``True`` tally; ``True`` otherwise (including ties and empty input).
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once, instead of growing a
# DataFrame with pd.concat inside the loop (quadratic copying, and concatenating
# onto an empty frame is deprecated in recent pandas).
telemetry_columns = [
    'Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
    'AverageThrottle', 'MaxThrottlePct', 'Brake',
]
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (in %) of telemetry samples at the driver's maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the sample count, in %.
        # NOTE(review): presumably Brake is a per-sample on/off flag - confirm.
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

# Passing the column list keeps the 'Driver' merge key present even when no
# driver was processed.
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "55"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for São Paulo Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '11', '16', '55', '10', '31', '14', '4', '5', '7', '63', '99', '22', '6', '9', '47', '3', '18']
core           INFO 	Loading data for São Paulo Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for ti

## Race 20

In [23]:
# Load race data
race_2021 = ff1.get_session(2021, 20, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe
aux = race_2021.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data)

# Add driver age at the time of the race
# --- Per-race feature build, 2021 round 20 (continues from the session load above) ---
# `data` (one row per driver from the race results) and `race_2021` (the loaded race
# session) are expected to exist already. The round number is set once so the
# qualifying load and the race lookup in races.csv cannot drift apart.
round_number = 20

# Driver birth dates; the first CSV column is renamed so it can be joined on.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Country and date come from the session's own event instead of indexing the
# schedule by label, which is only right while that label equals the round number.
event = race_2021.event
data["RaceCountry"] = event["Country"]
data["Year"] = 2021
data["RaceDate"] = pd.to_datetime(event["EventDate"])

# Age in whole years on race day. The inner join drops drivers that have no
# entry in BIRTH2021.csv.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, round_number, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
# NOTE: how='right' keeps one row per driver present in the right-hand table, so
# from here on the row set follows the lap tables, not the race results.
best_quali = (
    qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
    .rename("BestQualiTime")
    .reset_index()
)
data = pd.merge(left=data, right=best_quali, on="DriverNumber", how='right')

# Add fastest lap point
# The driver is looked up by column name; a positional lookup silently picks the
# wrong field if the lap columns are ever ordered differently.
# NOTE(review): this flags the fastest-lap driver regardless of finishing position.
fastest_lap = race_2021.laps.pick_fastest()
data['FLapPoint'] = (data['DriverNumber'] == fastest_lap['DriverNumber']).astype(int)

# Add average and std dev of lap times
laps = race_2021.laps

avg_lap = laps.groupby('DriverNumber')['LapTime'].mean().rename('AvgLapTime').reset_index()
data = pd.merge(left=data, right=avg_lap, on='DriverNumber', how='right')

# The standard deviation is computed on milliseconds as floats and converted back
# with pd.to_timedelta, so a driver with fewer than two timed laps gets NaT instead
# of raising. Rounded to microseconds, the resolution of datetime.timedelta.
lap_ms = laps['LapTime'].dt.total_seconds() * 1000
sd_ms = lap_ms.groupby(laps['DriverNumber']).std()
sd_lap = pd.to_timedelta(sd_ms, unit='ms').dt.round('us').rename('SDLapTime').reset_index()
data = pd.merge(left=data, right=sd_lap, on='DriverNumber', how='right')

# Add average split times variable
# For every lap: gap between the driver's lap time and the quickest time any
# driver set on that same lap number, then averaged per driver.
best_on_lap = laps.groupby('LapNumber')['LapTime'].transform('min')
gap_to_best = laps['LapTime'] - best_on_lap
driver_avg_diff = (
    gap_to_best.groupby(laps['DriverNumber']).mean()
    .rename('AvgSplitTime')
    .reset_index()
)
data = pd.merge(left=data, right=driver_avg_diff, on='DriverNumber', how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna: the right-merges above can leave rows without an abbreviation.
driver_codes = data["Abbreviation"].dropna().unique()
driver_ids = drivers.loc[drivers['code'].isin(driver_codes), "driverId"]

# .item() insists on exactly one matching race instead of coercing a Series.
race_id = int(
    races.loc[(races["year"] == 2021) & (races["round"] == round_number), "raceId"].item()
)

race_pits = pit_stops[(pit_stops["raceId"] == race_id) & pit_stops["driverId"].isin(driver_ids)]
pit_summary = (
    race_pits.merge(drivers[["driverId", "code"]], on='driverId')
    .groupby('code')
    .agg({'milliseconds': 'mean', 'stop': 'max'})
    .reset_index()
)

data = data.merge(pit_summary, left_on='Abbreviation', right_on='code', how="left")
data = data.drop(columns="code")
# Drivers without a recorded stop keep NaT here.
data['milliseconds'] = pd.to_timedelta(data['milliseconds'], unit='ms').dt.round('us')
data = data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
key_columns = ['DriverNumber', 'Compound']

compound_counts = laps.groupby(key_columns).size().reset_index(name='Laps')
# Every driver x compound pair, so unused compounds show up as 0 laps.
all_combinations_df = pd.MultiIndex.from_product(
    [laps['DriverNumber'].unique(), compounds], names=key_columns
).to_frame(index=False)

compound_counts = pd.merge(all_combinations_df, compound_counts, on=key_columns, how='outer').fillna(0)
compound_counts_pivot = (
    compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

data = pd.merge(data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# TODO: previous podiums at this race (podiums2021.csv -> 'PrevPodiumRace') is not wired in.

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

# Number of weather samples with and without rainfall.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Return False only when dry samples strictly outnumber wet samples.

    counts: mapping-like object with a ``.get`` method, keyed by the boolean
        rainfall flag and holding the number of samples for each value.

    Returns True otherwise, which includes a tie and an empty `counts`.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Flag the whole race as wet or dry from the weather-sample counts.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemtry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver; the frame is built once after the loop instead
# of being re-concatenated on every iteration.
telemetry_columns = [
    'Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
    'AverageThrottle', 'MaxThrottlePct', 'Brake',
]
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Percentage of samples at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column over the sample count, in percent
        # (assumes Brake is a boolean / 0-1 flag - TODO confirm).
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "56"
# NOTE: re-running this cell appends the same race to all_data again.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Qatar Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '14', '11', '31', '18', '55', '16', '4', '5', '10', '3', '22', '7', '99', '47', '63', '9', '6', '77']
core           INFO 	Loading data for Qatar Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_dat

## Race 21

In [24]:
# --- Per-race feature build, 2021 round 21 -------------------------------------
# The round number is set once so the race load, the qualifying load and the
# race lookup in races.csv cannot drift apart.
round_number = 21

# Load race data
race_2021 = ff1.get_session(2021, round_number, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe: one row per driver in the race results
result_columns = ["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]
data = pd.DataFrame(race_2021.results[result_columns])

# Add driver age at the time of the race
# The first CSV column is renamed so it can be joined on.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Country and date come from the session's own event instead of indexing the
# schedule by label, which is only right while that label equals the round number.
event = race_2021.event
data["RaceCountry"] = event["Country"]
data["Year"] = 2021
data["RaceDate"] = pd.to_datetime(event["EventDate"])

# Age in whole years on race day. The inner join drops drivers that have no
# entry in BIRTH2021.csv.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, round_number, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
# NOTE: how='right' keeps one row per driver present in the right-hand table, so
# from here on the row set follows the lap tables, not the race results.
best_quali = (
    qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
    .rename("BestQualiTime")
    .reset_index()
)
data = pd.merge(left=data, right=best_quali, on="DriverNumber", how='right')

# Add fastest lap point
# The driver is looked up by column name; a positional lookup silently picks the
# wrong field if the lap columns are ever ordered differently.
# NOTE(review): this flags the fastest-lap driver regardless of finishing position.
fastest_lap = race_2021.laps.pick_fastest()
data['FLapPoint'] = (data['DriverNumber'] == fastest_lap['DriverNumber']).astype(int)

# Add average and std dev of lap times
laps = race_2021.laps

avg_lap = laps.groupby('DriverNumber')['LapTime'].mean().rename('AvgLapTime').reset_index()
data = pd.merge(left=data, right=avg_lap, on='DriverNumber', how='right')

# The standard deviation is computed on milliseconds as floats and converted back
# with pd.to_timedelta, so a driver with fewer than two timed laps gets NaT instead
# of raising. Rounded to microseconds, the resolution of datetime.timedelta.
lap_ms = laps['LapTime'].dt.total_seconds() * 1000
sd_ms = lap_ms.groupby(laps['DriverNumber']).std()
sd_lap = pd.to_timedelta(sd_ms, unit='ms').dt.round('us').rename('SDLapTime').reset_index()
data = pd.merge(left=data, right=sd_lap, on='DriverNumber', how='right')

# Add average split times variable
# For every lap: gap between the driver's lap time and the quickest time any
# driver set on that same lap number, then averaged per driver.
best_on_lap = laps.groupby('LapNumber')['LapTime'].transform('min')
gap_to_best = laps['LapTime'] - best_on_lap
driver_avg_diff = (
    gap_to_best.groupby(laps['DriverNumber']).mean()
    .rename('AvgSplitTime')
    .reset_index()
)
data = pd.merge(left=data, right=driver_avg_diff, on='DriverNumber', how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna: the right-merges above can leave rows without an abbreviation.
driver_codes = data["Abbreviation"].dropna().unique()
driver_ids = drivers.loc[drivers['code'].isin(driver_codes), "driverId"]

# .item() insists on exactly one matching race instead of coercing a Series.
race_id = int(
    races.loc[(races["year"] == 2021) & (races["round"] == round_number), "raceId"].item()
)

race_pits = pit_stops[(pit_stops["raceId"] == race_id) & pit_stops["driverId"].isin(driver_ids)]
pit_summary = (
    race_pits.merge(drivers[["driverId", "code"]], on='driverId')
    .groupby('code')
    .agg({'milliseconds': 'mean', 'stop': 'max'})
    .reset_index()
)

data = data.merge(pit_summary, left_on='Abbreviation', right_on='code', how="left")
data = data.drop(columns="code")
# Drivers without a recorded stop keep NaT here.
data['milliseconds'] = pd.to_timedelta(data['milliseconds'], unit='ms').dt.round('us')
data = data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
key_columns = ['DriverNumber', 'Compound']

compound_counts = laps.groupby(key_columns).size().reset_index(name='Laps')
# Every driver x compound pair, so unused compounds show up as 0 laps.
all_combinations_df = pd.MultiIndex.from_product(
    [laps['DriverNumber'].unique(), compounds], names=key_columns
).to_frame(index=False)

compound_counts = pd.merge(all_combinations_df, compound_counts, on=key_columns, how='outer').fillna(0)
compound_counts_pivot = (
    compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

data = pd.merge(data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# TODO: previous podiums at this race (podiums2021.csv -> 'PrevPodiumRace') is not wired in.

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

# Number of weather samples with and without rainfall.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Return False only when dry samples strictly outnumber wet samples.

    counts: mapping-like object with a ``.get`` method, keyed by the boolean
        rainfall flag and holding the number of samples for each value.

    Returns True otherwise, which includes a tie and an empty `counts`.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Flag the whole race as wet or dry from the weather-sample counts.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemtry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver; the frame is built once after the loop instead
# of being re-concatenated on every iteration.
telemetry_columns = [
    'Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
    'AverageThrottle', 'MaxThrottlePct', 'Brake',
]
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Percentage of samples at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column over the sample count, in percent
        # (assumes Brake is a boolean / 0-1 flag - TODO confirm).
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "57"
# NOTE: re-running this cell appends the same race to all_data again.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  result = result.append(new_last).reset_index(drop=True)
  result = result.append(new_last).reset_index(drop=True)
  result = result.append(new_last).reset_index(drop=True)
  result = result.append(new_last).reset_index(drop=True)
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '31', '3', '10', '16', '55', '99', '4', '

## Race 22

In [25]:
# --- Per-race feature build, 2021 round 22 -------------------------------------
# The round number is set once so the race load, the qualifying load and the
# race lookup in races.csv cannot drift apart.
round_number = 22

# Load race data
race_2021 = ff1.get_session(2021, round_number, 'R')
race_2021.load(telemetry = True, laps = True, weather = True)

# Create dataframe: one row per driver in the race results
result_columns = ["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]
data = pd.DataFrame(race_2021.results[result_columns])

# Add driver age at the time of the race
# The first CSV column is renamed so it can be joined on.
birth = pd.read_csv("BIRTH2021.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Country and date come from the session's own event instead of indexing the
# schedule by label, which is only right while that label equals the round number.
event = race_2021.event
data["RaceCountry"] = event["Country"]
data["Year"] = 2021
data["RaceDate"] = pd.to_datetime(event["EventDate"])

# Age in whole years on race day. The inner join drops drivers that have no
# entry in BIRTH2021.csv.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2021 = ff1.get_session(2021, round_number, 'Q')
qualification1_2021.load()

# Add the best qualifications lap
# NOTE: how='right' keeps one row per driver present in the right-hand table, so
# from here on the row set follows the lap tables, not the race results.
best_quali = (
    qualification1_2021.laps.groupby("DriverNumber")["LapTime"].min()
    .rename("BestQualiTime")
    .reset_index()
)
data = pd.merge(left=data, right=best_quali, on="DriverNumber", how='right')

# Add fastest lap point
# The driver is looked up by column name; a positional lookup silently picks the
# wrong field if the lap columns are ever ordered differently.
# NOTE(review): this flags the fastest-lap driver regardless of finishing position.
fastest_lap = race_2021.laps.pick_fastest()
data['FLapPoint'] = (data['DriverNumber'] == fastest_lap['DriverNumber']).astype(int)

# Add average and std dev of lap times
laps = race_2021.laps

# Driver number "9" is removed from both the laps and the driver rows for this race.
# NOTE(review): presumably because #9 has no race data here - confirm.
laps = laps[laps['DriverNumber'] != "9"]
data = data[data["DriverNumber"] != "9"]

avg_lap = laps.groupby('DriverNumber')['LapTime'].mean().rename('AvgLapTime').reset_index()
data = pd.merge(left=data, right=avg_lap, on='DriverNumber', how='right')

# The standard deviation is computed on milliseconds as floats and converted back
# with pd.to_timedelta, so a driver with fewer than two timed laps gets NaT instead
# of raising. Rounded to microseconds, the resolution of datetime.timedelta.
lap_ms = laps['LapTime'].dt.total_seconds() * 1000
sd_ms = lap_ms.groupby(laps['DriverNumber']).std()
sd_lap = pd.to_timedelta(sd_ms, unit='ms').dt.round('us').rename('SDLapTime').reset_index()
data = pd.merge(left=data, right=sd_lap, on='DriverNumber', how='right')

# Add average split times variable
# For every lap: gap between the driver's lap time and the quickest time any
# driver set on that same lap number, then averaged per driver.
best_on_lap = laps.groupby('LapNumber')['LapTime'].transform('min')
gap_to_best = laps['LapTime'] - best_on_lap
driver_avg_diff = (
    gap_to_best.groupby(laps['DriverNumber']).mean()
    .rename('AvgSplitTime')
    .reset_index()
)
data = pd.merge(left=data, right=driver_avg_diff, on='DriverNumber', how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna: the right-merges above can leave rows without an abbreviation.
driver_codes = data["Abbreviation"].dropna().unique()
driver_ids = drivers.loc[drivers['code'].isin(driver_codes), "driverId"]

# .item() insists on exactly one matching race instead of coercing a Series.
race_id = int(
    races.loc[(races["year"] == 2021) & (races["round"] == round_number), "raceId"].item()
)

race_pits = pit_stops[(pit_stops["raceId"] == race_id) & pit_stops["driverId"].isin(driver_ids)]
pit_summary = (
    race_pits.merge(drivers[["driverId", "code"]], on='driverId')
    .groupby('code')
    .agg({'milliseconds': 'mean', 'stop': 'max'})
    .reset_index()
)

data = data.merge(pit_summary, left_on='Abbreviation', right_on='code', how="left")
data = data.drop(columns="code")
# Drivers without a recorded stop keep NaT here.
data['milliseconds'] = pd.to_timedelta(data['milliseconds'], unit='ms').dt.round('us')
data = data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
key_columns = ['DriverNumber', 'Compound']

compound_counts = laps.groupby(key_columns).size().reset_index(name='Laps')
# Every driver x compound pair, so unused compounds show up as 0 laps.
all_combinations_df = pd.MultiIndex.from_product(
    [laps['DriverNumber'].unique(), compounds], names=key_columns
).to_frame(index=False)

compound_counts = pd.merge(all_combinations_df, compound_counts, on=key_columns, how='outer').fillna(0)
compound_counts_pivot = (
    compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

data = pd.merge(data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team
engines = pd.read_csv("engines2021.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# TODO: previous podiums at this race (podiums2021.csv -> 'PrevPodiumRace') is not wired in.

# Get weather data
rainy = pd.DataFrame(race_2021.weather_data)

# Number of weather samples with and without rainfall.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Return False only when dry samples strictly outnumber wet samples.

    counts: mapping-like object with a ``.get`` method, keyed by the boolean
        rainfall flag and holding the number of samples for each value.

    Returns True otherwise, which includes a tie and an empty `counts`.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Flag the whole race as wet or dry from the weather-sample counts.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemtry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver; the frame is built once after the loop instead
# of being re-concatenated on every iteration.
telemetry_columns = [
    'Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
    'AverageThrottle', 'MaxThrottlePct', 'Brake',
]
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2021.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Percentage of samples at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column over the sample count, in percent
        # (assumes Brake is a boolean / 0-1 flag - TODO confirm).
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "58"
# NOTE: re-running this cell appends the same race to all_data again.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Abu Dhabi Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 19 drivers: ['33', '44', '55', '22', '10', '77', '4', '14', '31', '16', '5', '3', '18', '47', '11', '6', '99', '63', '7']
core           INFO 	Loading data for Abu Dhabi Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_

## Complete missing values

In [26]:
# no pit stops
# Drivers without a recorded stop get a zero duration and a zero stop count.
all_data['AvgPitTime'] = all_data['AvgPitTime'].fillna(pd.Timedelta("0 days"))

all_data['PitstopNo'] = all_data['PitstopNo'].fillna(0)

# Missing qualification times
# (driver abbreviation, race country, substitute best qualifying time)
# NOTE(review): the substitute times are hard-coded; their source is not shown here.
quali_fallbacks = [
    ('STR', 'Azerbaijan', '0 days 00:01:48.653'),
    ('GIO', 'Azerbaijan', '0 days 00:01:48.653'),
    ('TSU', 'France', '0 days 00:01:37.371'),
    ('TSU', 'Italy', '0 days 00:01:19.899'),
]

for abbreviation, country, lap_time in quali_fallbacks:
    # Only rows whose time is actually missing are filled. Driver + country alone
    # would also overwrite a real time if the same country appears in more than
    # one round of the season.
    needs_fill = (
        (all_data['Abbreviation'] == abbreviation)
        & (all_data['RaceCountry'] == country)
        & all_data['BestQualiTime'].isna()
    )
    all_data.loc[needs_fill, 'BestQualiTime'] = pd.Timedelta(lap_time)



In [27]:
# Write the assembled table to disk, including the header row and the index column.
all_data.to_csv('2021data.csv', header=True, index=True)