In [1]:
# Standard library
import os
from datetime import datetime
import datetime  # kept last on purpose: `datetime` must name the module, the cells below call `datetime.timedelta`

# Third-party
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import cm
from matplotlib.collections import LineCollection
from timple.timedelta import strftimedelta

# FastF1
import fastf1 as ff1
from fastf1 import plotting
from fastf1 import utils
from fastf1.core import Laps

plotting.setup_mpl()

In [2]:
# Point FastF1 at an on-disk cache directory before any session is loaded.
# NOTE(review): the path is relative, so it resolves against the notebook's working directory.
cache_dir = 'Desktop/Thesis/Documents/GitHub/ThesisF1/Cache'

# exist_ok=True creates the directory when it is missing and does nothing when it already exists
os.makedirs(cache_dir, exist_ok=True)

ff1.Cache.enable_cache(cache_dir)

In [3]:
# Event schedules for the 2019-2022 seasons, fetched in season order, one table per year
schedule2019, schedule2020, schedule2021, schedule2022 = (
    ff1.get_event_schedule(season) for season in (2019, 2020, 2021, 2022)
)

In [4]:
# Accumulator for the per-race feature tables: every race cell below concatenates its rows onto it
all_data = pd.DataFrame()

## Race 1

In [5]:
# Race features, part 1 (2020 round 1): classification, driver age, qualifying pace and
# lap-time statistics. Reads `ff1`, `pd` and `schedule2020` from earlier cells; leaves
# `race_2020` (loaded race session), `aux` (the race laps) and `data` (one row per driver).

# Load race data
race_2020 = ff1.get_session(2020, 1, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe; copy so the columns added below never write into a slice of the results
aux = race_2020.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data).copy()

# Driver birth dates; after the rename the two columns are driver abbreviation and birth date
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): the event is looked up by the hard-coded row label 2 of schedule2020,
# not by round number - confirm that this row is round 1.
data["RaceCountry"] = schedule2020["Country"][2]
data["Year"] = 2020
dati = schedule2020["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[2])

# Add driver age at the time of the race in whole years (floor division by 365.25 days).
# The inner merge drops drivers that have no row in the birth file.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 1, 'Q')
qualification1_2020.load()

# Add the best qualifications lap (minimum LapTime per driver over the session).
# how='right' keeps exactly the drivers present in the right-hand table.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = fastest_laps.rename("BestQualiTime").reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")

# Flag the driver who set the fastest race lap. The driver number is read by column name,
# so the lookup does not depend on the column order of the laps table.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean()
mean_lap_time = mean_lap_time.rename("AvgLapTime").reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on="DriverNumber")

# The standard deviation is taken over lap times expressed as float milliseconds and turned
# back into a timedelta with pd.to_timedelta, which maps a NaN deviation to NaT.
auxi = aux[["DriverNumber", "LapTime"]].assign(
    LapTime_ms=lambda laps: laps["LapTime"].dt.total_seconds() * 1000
)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std()
auxi = pd.to_timedelta(auxi, unit="ms").rename("SDLapTime").reset_index()
data = pd.merge(left=data, right=auxi, how='right', on="DriverNumber")

# Add average split times variable: per driver, the mean gap between the driver's lap time
# and the quickest time any driver set on the same lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best")
merged = pd.merge(aux, best_lap, on='LapNumber')
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on="DriverNumber")

# Race features, part 2 (2020 round 1): pit stops, tyre compounds, engine supplier and
# rainfall counts. Expects `data` (one row per driver), `aux` (race laps) and `race_2020`
# from the code above; leaves the extended `data` and the rainfall sample counts `counts`.

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])
filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# Resolve the raceId of this race; fail with a clear message unless exactly one row matches
b = races.loc[(races["year"] == 2020) & (races["round"] == 1), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one 2020 round 1 row in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of the selected drivers in this race, labelled with the driver code
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')

# Mean stop duration (milliseconds) and highest stop number per driver code
transformed_df = merged_df.groupby('code').agg(
    AvgPitTime=('milliseconds', 'mean'),
    PitstopNo=('stop', 'max'),
).reset_index()

# Left merge: drivers without a recorded pit stop keep a row with missing pit-stop values
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once; missing values become NaT
merged_data["AvgPitTime"] = pd.to_timedelta(merged_data["AvgPitTime"], unit="ms")

# Add number of laps on each type of compound
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# The outer merge with every driver/compound pair gives unused compounds a count of 0
compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = (
    compound_counts
    .pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# TODO: number of previous podiums at this race is not merged yet (disabled draft below)
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]
#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")
#data = data.drop(["Driver"], axis = 1)
#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per Rainfall value
rainy = pd.DataFrame(race_2020.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a race as rainy from counts of its Rainfall samples.

    Parameters
    ----------
    counts : mapping-like with ``.get``
        Number of samples keyed by ``True`` (rain) and ``False`` (no rain);
        a missing key counts as 0.

    Returns
    -------
    bool
        ``False`` only when the dry samples strictly outnumber the wet ones;
        ``True`` otherwise, which includes a tie and an empty ``counts``.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Race features, part 3 (2020 round 1): rain flag, per-driver telemetry aggregates and the
# append to `all_data`. Expects `counts`, `data`, `race_2020`, `determine_rain` and `all_data`.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
driversnumbers = np.unique(data["DriverNumber"])

# One record per driver, turned into a frame in a single step after the loop
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # share (in %) of samples at this driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # sum of the Brake column over the sample count, in %
        # (presumably Brake is an on/off flag, making this the braking share - confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "22"

# Replace any rows stored by an earlier run of this cell so that re-running it does not
# duplicate the race in all_data
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "22"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Austrian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['77', '16', '4', '44', '55', '11', '10', '31', '99', '5', '6', '26', '23', '7', '63', '8', '20', '18', '3', '33']
core           INFO 	Loading data for Austrian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timi

## Race 2

## Race 3

In [6]:
# Race features, part 1 (2020 round 3): classification, driver age, qualifying pace and
# lap-time statistics. Reads `ff1`, `pd` and `schedule2020` from earlier cells; leaves
# `race_2020` (loaded race session), `aux` (the race laps) and `data` (one row per driver).

# Load race data
race_2020 = ff1.get_session(2020, 3, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe; copy so the columns added below never write into a slice of the results
aux = race_2020.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data).copy()

# Driver birth dates; after the rename the two columns are driver abbreviation and birth date
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): the event is looked up by the hard-coded row label 4 of schedule2020,
# not by round number - confirm that this row is round 3.
data["RaceCountry"] = schedule2020["Country"][4]
data["Year"] = 2020
dati = schedule2020["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[4])

# Add driver age at the time of the race in whole years (floor division by 365.25 days).
# The inner merge drops drivers that have no row in the birth file.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 3, 'Q')
qualification1_2020.load()

# Add the best qualifications lap (minimum LapTime per driver over the session).
# how='right' keeps exactly the drivers present in the right-hand table.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = fastest_laps.rename("BestQualiTime").reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")

# Flag the driver who set the fastest race lap. The driver number is read by column name,
# so the lookup does not depend on the column order of the laps table.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean()
mean_lap_time = mean_lap_time.rename("AvgLapTime").reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on="DriverNumber")

# The standard deviation is taken over lap times expressed as float milliseconds and turned
# back into a timedelta with pd.to_timedelta, which maps a NaN deviation to NaT.
auxi = aux[["DriverNumber", "LapTime"]].assign(
    LapTime_ms=lambda laps: laps["LapTime"].dt.total_seconds() * 1000
)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std()
auxi = pd.to_timedelta(auxi, unit="ms").rename("SDLapTime").reset_index()
data = pd.merge(left=data, right=auxi, how='right', on="DriverNumber")

# Add average split times variable: per driver, the mean gap between the driver's lap time
# and the quickest time any driver set on the same lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best")
merged = pd.merge(aux, best_lap, on='LapNumber')
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on="DriverNumber")

# Race features, part 2 (2020 round 3): pit stops, tyre compounds, engine supplier and
# rainfall counts. Expects `data` (one row per driver), `aux` (race laps) and `race_2020`
# from the code above; leaves the extended `data` and the rainfall sample counts `counts`.

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])
filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# Resolve the raceId of this race; fail with a clear message unless exactly one row matches
b = races.loc[(races["year"] == 2020) & (races["round"] == 3), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one 2020 round 3 row in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of the selected drivers in this race, labelled with the driver code
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')

# Mean stop duration (milliseconds) and highest stop number per driver code
transformed_df = merged_df.groupby('code').agg(
    AvgPitTime=('milliseconds', 'mean'),
    PitstopNo=('stop', 'max'),
).reset_index()

# Left merge: drivers without a recorded pit stop keep a row with missing pit-stop values
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once; missing values become NaT
merged_data["AvgPitTime"] = pd.to_timedelta(merged_data["AvgPitTime"], unit="ms")

# Add number of laps on each type of compound
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# The outer merge with every driver/compound pair gives unused compounds a count of 0
compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = (
    compound_counts
    .pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# TODO: number of previous podiums at this race is not merged yet (disabled draft below)
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]
#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")
#data = data.drop(["Driver"], axis = 1)
#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per Rainfall value
rainy = pd.DataFrame(race_2020.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a race as rainy from counts of its Rainfall samples.

    Parameters
    ----------
    counts : mapping-like with ``.get``
        Number of samples keyed by ``True`` (rain) and ``False`` (no rain);
        a missing key counts as 0.

    Returns
    -------
    bool
        ``False`` only when the dry samples strictly outnumber the wet ones;
        ``True`` otherwise, which includes a tie and an empty ``counts``.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Race features, part 3 (2020 round 3): rain flag, per-driver telemetry aggregates and the
# append to `all_data`. Expects `counts`, `data`, `race_2020`, `determine_rain` and `all_data`.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
driversnumbers = np.unique(data["DriverNumber"])

# One record per driver, turned into a frame in a single step after the loop
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # share (in %) of samples at this driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # sum of the Brake column over the sample count, in %
        # (presumably Brake is an on/off flag, making this the braking share - confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "23"

# Replace any rows stored by an earlier run of this cell so that re-running it does not
# duplicate the race in all_data
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "23"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Hungarian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 19 drivers: ['44', '33', '77', '18', '23', '5', '11', '3', '55', '20', '16', '26', '4', '7', '8', '99', '63', '6', '10']
core           INFO 	Loading data for Hungarian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_d

## Race 4

In [7]:
# Race features, part 1 (2020 round 4): classification, driver age, qualifying pace and
# lap-time statistics. Reads `ff1`, `pd` and `schedule2020` from earlier cells; leaves
# `race_2020` (loaded race session), `aux` (the filtered race laps) and `data` (one row per driver).

# Load race data
race_2020 = ff1.get_session(2020, 4, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe; copy so the columns added below never write into a slice of the results
aux = race_2020.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data).copy()

# Driver birth dates; after the rename the two columns are driver abbreviation and birth date
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): the event is looked up by the hard-coded row label 5 of schedule2020,
# not by round number - confirm that this row is round 4.
data["RaceCountry"] = schedule2020["Country"][5]
data["Year"] = 2020
dati = schedule2020["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[5])

# Add driver age at the time of the race in whole years (floor division by 365.25 days).
# The inner merge drops drivers that have no row in the birth file.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 4, 'Q')
qualification1_2020.load()

# Add the best qualifications lap (minimum LapTime per driver over the session).
# how='right' keeps exactly the drivers present in the right-hand table.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = fastest_laps.rename("BestQualiTime").reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")

# Flag the driver who set the fastest race lap. The driver number is read by column name,
# so the lookup does not depend on the column order of the laps table.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

# Drivers 20 and 27 are removed from both the laps and the driver table for this race.
# NOTE(review): presumably because they have too few race laps for the statistics - confirm.
excluded_drivers = ["20", "27"]
aux = aux[~aux['DriverNumber'].isin(excluded_drivers)]
data = data[~data["DriverNumber"].isin(excluded_drivers)]

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean()
mean_lap_time = mean_lap_time.rename("AvgLapTime").reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on="DriverNumber")

# The standard deviation is taken over lap times expressed as float milliseconds and turned
# back into a timedelta with pd.to_timedelta, which maps a NaN deviation to NaT.
auxi = aux[["DriverNumber", "LapTime"]].assign(
    LapTime_ms=lambda laps: laps["LapTime"].dt.total_seconds() * 1000
)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std()
auxi = pd.to_timedelta(auxi, unit="ms").rename("SDLapTime").reset_index()
data = pd.merge(left=data, right=auxi, how='right', on="DriverNumber")

# Add average split times variable: per driver, the mean gap between the driver's lap time
# and the quickest time any driver set on the same lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best")
merged = pd.merge(aux, best_lap, on='LapNumber')
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on="DriverNumber")

# Race features, part 2 (2020 round 4): pit stops, tyre compounds, engine supplier and
# rainfall counts. Expects `data` (one row per driver), `aux` (race laps) and `race_2020`
# from the code above; leaves the extended `data` and the rainfall sample counts `counts`.

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])
filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# Resolve the raceId of this race; fail with a clear message unless exactly one row matches
b = races.loc[(races["year"] == 2020) & (races["round"] == 4), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one 2020 round 4 row in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of the selected drivers in this race, labelled with the driver code
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')

# Mean stop duration (milliseconds) and highest stop number per driver code
transformed_df = merged_df.groupby('code').agg(
    AvgPitTime=('milliseconds', 'mean'),
    PitstopNo=('stop', 'max'),
).reset_index()

# Left merge: drivers without a recorded pit stop keep a row with missing pit-stop values
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once; missing values become NaT
merged_data["AvgPitTime"] = pd.to_timedelta(merged_data["AvgPitTime"], unit="ms")

# Add number of laps on each type of compound
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# The outer merge with every driver/compound pair gives unused compounds a count of 0
compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = (
    compound_counts
    .pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# TODO: number of previous podiums at this race is not merged yet (disabled draft below)
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]
#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")
#data = data.drop(["Driver"], axis = 1)
#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per Rainfall value
rainy = pd.DataFrame(race_2020.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a race as rainy from counts of its Rainfall samples.

    Parameters
    ----------
    counts : mapping-like with ``.get``
        Number of samples keyed by ``True`` (rain) and ``False`` (no rain);
        a missing key counts as 0.

    Returns
    -------
    bool
        ``False`` only when the dry samples strictly outnumber the wet ones;
        ``True`` otherwise, which includes a tie and an empty ``counts``.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

# Race features, part 3 (2020 round 4): rain flag, per-driver telemetry aggregates and the
# append to `all_data`. Expects `counts`, `data`, `race_2020`, `determine_rain` and `all_data`.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be matched to telemetry, so they are dropped first
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One record per driver, turned into a frame in a single step after the loop
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # share (in %) of samples at this driver's own maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # sum of the Brake column over the sample count, in %
        # (presumably Brake is an on/off flag, making this the braking share - confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "24"

# Replace any rows stored by an earlier run of this cell so that re-running it does not
# duplicate the race in all_data
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "24"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for British Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '16', '3', '4', '31', '10', '23', '18', '5', '77', '63', '55', '99', '6', '8', '7', '26', '20', '27']
core           INFO 	Loading data for British Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 5

In [8]:
# Race features, part 1 (2020 round 5): classification, driver age, qualifying pace and
# lap-time statistics. Reads `ff1`, `pd` and `schedule2020` from earlier cells; leaves
# `race_2020` (loaded race session), `aux` (the race laps) and `data` (one row per driver).

# Load race data
race_2020 = ff1.get_session(2020, 5, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe; copy so the columns added below never write into a slice of the results
aux = race_2020.results
data = aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
data = pd.DataFrame(data).copy()

# Driver birth dates; after the rename the two columns are driver abbreviation and birth date
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# NOTE(review): the event is looked up by the hard-coded row label 6 of schedule2020,
# not by round number - confirm that this row is round 5.
data["RaceCountry"] = schedule2020["Country"][6]
data["Year"] = 2020
dati = schedule2020["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[6])

# Add driver age at the time of the race in whole years (floor division by 365.25 days).
# The inner merge drops drivers that have no row in the birth file.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 5, 'Q')
qualification1_2020.load()

# Add the best qualifications lap (minimum LapTime per driver over the session).
# how='right' keeps exactly the drivers present in the right-hand table.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min()
fastest_laps = fastest_laps.rename("BestQualiTime").reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on="DriverNumber")

# Flag the driver who set the fastest race lap. The driver number is read by column name,
# so the lookup does not depend on the column order of the laps table.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean()
mean_lap_time = mean_lap_time.rename("AvgLapTime").reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on="DriverNumber")

# The standard deviation is taken over lap times expressed as float milliseconds and turned
# back into a timedelta with pd.to_timedelta, which maps a NaN deviation to NaT.
auxi = aux[["DriverNumber", "LapTime"]].assign(
    LapTime_ms=lambda laps: laps["LapTime"].dt.total_seconds() * 1000
)
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std()
auxi = pd.to_timedelta(auxi, unit="ms").rename("SDLapTime").reset_index()
data = pd.merge(left=data, right=auxi, how='right', on="DriverNumber")

# Add average split times variable: per driver, the mean gap between the driver's lap time
# and the quickest time any driver set on the same lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min().rename("LapTime_best")
merged = pd.merge(aux, best_lap, on='LapNumber')
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on="DriverNumber")

# Race features, part 2 (2020 round 5): pit stops, tyre compounds, engine supplier and
# rainfall counts. Expects `data` (one row per driver), `aux` (race laps) and `race_2020`
# from the code above; leaves the extended `data` and the rainfall sample counts `counts`.

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])
filtered_drivers = drivers[drivers['code'].isin(listdrivers)]
k = filtered_drivers["driverId"]

# Resolve the raceId of this race; fail with a clear message unless exactly one row matches
b = races.loc[(races["year"] == 2020) & (races["round"] == 5), "raceId"]
if len(b) != 1:
    raise ValueError(f"expected exactly one 2020 round 5 row in races.csv, found {len(b)}")
b = int(b.iloc[0])

# Pit stops of the selected drivers in this race, labelled with the driver code
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')

# Mean stop duration (milliseconds) and highest stop number per driver code
transformed_df = merged_df.groupby('code').agg(
    AvgPitTime=('milliseconds', 'mean'),
    PitstopNo=('stop', 'max'),
).reset_index()

# Left merge: drivers without a recorded pit stop keep a row with missing pit-stop values
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Convert the whole column at once; missing values become NaT
merged_data["AvgPitTime"] = pd.to_timedelta(merged_data["AvgPitTime"], unit="ms")

# Add number of laps on each type of compound
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# The outer merge with every driver/compound pair gives unused compounds a count of 0
compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = (
    compound_counts
    .pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# TODO: number of previous podiums at this race is not merged yet (disabled draft below)
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]
#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")
#data = data.drop(["Driver"], axis = 1)
#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per Rainfall value
rainy = pd.DataFrame(race_2020.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a race as rainy from its Rainfall value counts.

    `counts` is any mapping-like object with `.get(key, default)` keyed by
    False (dry samples) and True (rainy samples). Returns False only when the
    dry samples strictly outnumber the rainy ones; ties and inputs holding
    neither key return True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    mostly_dry = dry_samples > wet_samples
    return not mostly_dry

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once; growing a DataFrame
# with pd.concat inside the loop re-copies every earlier row on each pass.
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    throttle = tele['Throttle']
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': throttle.mean(),
        # Share of samples (in %) at the driver's own maximum throttle value
        'MaxThrottlePct': (throttle == throttle.max()).sum() / len(tele) * 100,
        # Sum of Brake over the sample count, in %
        # (assumes Brake is a boolean / 0-1 flag per sample - TODO confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "25"

# Remove rows left by an earlier run of this cell so that re-executing it
# does not append the same race to all_data twice.
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "25"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for 70th Anniversary Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '44', '77', '16', '23', '18', '27', '31', '4', '26', '10', '5', '55', '3', '7', '8', '99', '63', '6', '20']
core           INFO 	Loading data for 70th Anniversary Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cach

## Race 6

In [9]:
# Load race data
race_2020 = ff1.get_session(2020, 6, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe from the classification columns used as features
aux = race_2020.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]])

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year. Both come from the event attached to the loaded
# session, so they always belong to the round requested above instead of to a
# hard-coded row position in schedule2020.
race_event = race_2020.event
data["RaceCountry"] = race_event["Country"]
data["Year"] = 2020
data["RaceDate"] = pd.to_datetime(race_event["EventDate"])
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by a 365.25-day year
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 6, 'Q')
qualification1_2020.load()

# Add the best qualifications lap (minimum lap time per driver).
# The right join keeps exactly the drivers that appear in the qualifying laps.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point: 1 for the driver of the fastest race lap, else 0.
# The driver number is read by label; a positional lookup would silently
# return a different field if the lap columns were ever reordered.
fastest_lap = race_2020.laps.pick_fastest()
auxDriverNo = fastest_lap["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

 

# Add average and std dev of lap times
aux = race_2020.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Lap times as float milliseconds. Dividing by a Timedelta yields floats on
# every pandas version, and assign() builds a new frame instead of writing a
# column into a slice of `aux`.
lap_times_ms = aux[["DriverNumber"]].assign(
    LapTime_ms=aux["LapTime"] / pd.Timedelta(milliseconds=1)
)
sd_lap_time = lap_times_ms.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=sd_lap_time, how='right')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# Kept so that `datetime` names the module for any later code in the
# notebook (the first cell rebinds the name to the datetime class).
import datetime
# to_timedelta turns a missing deviation into NaT, whereas
# datetime.timedelta(milliseconds=nan) raises ValueError.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable: each driver's mean gap to the quickest
# lap time set on the same lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna(): the right joins above can leave rows without an abbreviation
listdrivers = data["Abbreviation"].dropna().unique()
driver_ids = drivers.loc[drivers['code'].isin(listdrivers), "driverId"]

# Resolve the raceId of this round and fail with a clear message if the
# lookup is not unique (int() on a Series is deprecated and opaque on error)
race_ids = races.loc[(races["year"] == 2020) & (races["round"] == 6), "raceId"]
if len(race_ids) != 1:
    raise ValueError(f"expected exactly one raceId for 2020 round 6, found {len(race_ids)}")
race_id = int(race_ids.iloc[0])

# Stops of this race made by the drivers present in `data`
auxpit = pit_stops[pit_stops["driverId"].isin(driver_ids) & (pit_stops["raceId"] == race_id)]
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')[["code", "milliseconds", "stop"]]

# Per driver code: mean stop duration (milliseconds) and highest stop number
transformed_df = merged_df.groupby('code').agg({'milliseconds': 'mean', 'stop': 'max'}).reset_index()

# Left join keeps drivers without any recorded stop (their values stay missing)
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop(columns="code")

# Convert the whole column at once: missing values become NaT and the column
# gets a real timedelta dtype, instead of Python timedelta objects being
# written into a float column through a mask.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']

# Observed (driver, compound) lap counts
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Every driver paired with every compound, so unused compounds show up as 0
all_combinations = pd.MultiIndex.from_product(
    [aux['DriverNumber'].unique(), compounds], names=['DriverNumber', 'Compound']
)
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()
compound_counts = pd.merge(
    all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# One row per driver and one column per compound; DriverNumber is turned
# back into a regular column so it can serve as the merge key
compound_counts_pivot = (
    compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

data = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team (engines2020.csv is keyed by "Car")
engines = pd.read_csv("engines2020.csv", sep = ";")
data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left").drop(columns="Car")

# Disabled feature: number of previous podiums at this particular race
# wins = pd.read_csv("podiums2020.csv", sep = ";")[["Driver", "2"]]
# data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")
# data = data.drop(["Driver"], axis = 1).rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per Rainfall value
rainy = pd.DataFrame(race_2020.weather_data)
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a race as rainy from its Rainfall value counts.

    `counts` is any mapping-like object with `.get(key, default)` keyed by
    False (dry samples) and True (rainy samples). Returns False only when the
    dry samples strictly outnumber the rainy ones; ties and inputs holding
    neither key return True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    mostly_dry = dry_samples > wet_samples
    return not mostly_dry

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once; growing a DataFrame
# with pd.concat inside the loop re-copies every earlier row on each pass.
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    throttle = tele['Throttle']
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': throttle.mean(),
        # Share of samples (in %) at the driver's own maximum throttle value
        'MaxThrottlePct': (throttle == throttle.max()).sum() / len(tele) * 100,
        # Sum of Brake over the sample count, in %
        # (assumes Brake is a boolean / 0-1 flag per sample - TODO confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "26"

# Remove rows left by an earlier run of this cell so that re-executing it
# does not append the same race to all_data twice.
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "26"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Spanish Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '18', '11', '55', '5', '23', '10', '4', '3', '26', '31', '7', '20', '99', '63', '6', '8', '16']
core           INFO 	Loading data for Spanish Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 7

In [10]:
# Load race data
race_2020 = ff1.get_session(2020, 7, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe from the classification columns used as features
aux = race_2020.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]])

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year. Both come from the event attached to the loaded
# session, so they always belong to the round requested above instead of to a
# hard-coded row position in schedule2020.
race_event = race_2020.event
data["RaceCountry"] = race_event["Country"]
data["Year"] = 2020
data["RaceDate"] = pd.to_datetime(race_event["EventDate"])
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by a 365.25-day year
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 7, 'Q')
qualification1_2020.load()

# Add the best qualifications lap (minimum lap time per driver).
# The right join keeps exactly the drivers that appear in the qualifying laps.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point: 1 for the driver of the fastest race lap, else 0.
# The driver number is read by label; a positional lookup would silently
# return a different field if the lap columns were ever reordered.
fastest_lap = race_2020.laps.pick_fastest()
auxDriverNo = fastest_lap["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

# Driver 55 is left out of this race's lap data and result rows
# (presumably too few timed laps for the statistics below - TODO confirm)
aux = aux[aux['DriverNumber'] != "55"]
data = data[data["DriverNumber"] != "55"]

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Lap times as float milliseconds. Dividing by a Timedelta yields floats on
# every pandas version, and assign() builds a new frame instead of writing a
# column into a slice of `aux`.
lap_times_ms = aux[["DriverNumber"]].assign(
    LapTime_ms=aux["LapTime"] / pd.Timedelta(milliseconds=1)
)
sd_lap_time = lap_times_ms.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=sd_lap_time, how='right')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# Kept so that `datetime` names the module for any later code in the
# notebook (the first cell rebinds the name to the datetime class).
import datetime
# to_timedelta turns a missing deviation into NaT, whereas
# datetime.timedelta(milliseconds=nan) raises ValueError.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable: each driver's mean gap to the quickest
# lap time set on the same lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna(): the right joins above can leave rows without an abbreviation
listdrivers = data["Abbreviation"].dropna().unique()
driver_ids = drivers.loc[drivers['code'].isin(listdrivers), "driverId"]

# Resolve the raceId of this round and fail with a clear message if the
# lookup is not unique (int() on a Series is deprecated and opaque on error)
race_ids = races.loc[(races["year"] == 2020) & (races["round"] == 7), "raceId"]
if len(race_ids) != 1:
    raise ValueError(f"expected exactly one raceId for 2020 round 7, found {len(race_ids)}")
race_id = int(race_ids.iloc[0])

# Stops of this race made by the drivers present in `data`
auxpit = pit_stops[pit_stops["driverId"].isin(driver_ids) & (pit_stops["raceId"] == race_id)]
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')[["code", "milliseconds", "stop"]]

# Per driver code: mean stop duration (milliseconds) and highest stop number
transformed_df = merged_df.groupby('code').agg({'milliseconds': 'mean', 'stop': 'max'}).reset_index()

# Left join keeps drivers without any recorded stop (their values stay missing)
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop(columns="code")

# Convert the whole column at once: missing values become NaT and the column
# gets a real timedelta dtype, instead of Python timedelta objects being
# written into a float column through a mask.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']

# Observed (driver, compound) lap counts
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Every driver paired with every compound, so unused compounds show up as 0
all_combinations = pd.MultiIndex.from_product(
    [aux['DriverNumber'].unique(), compounds], names=['DriverNumber', 'Compound']
)
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()
compound_counts = pd.merge(
    all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# One row per driver and one column per compound; DriverNumber is turned
# back into a regular column so it can serve as the merge key
compound_counts_pivot = (
    compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

data = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team (engines2020.csv is keyed by "Car")
engines = pd.read_csv("engines2020.csv", sep = ";")
data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left").drop(columns="Car")

# Disabled feature: number of previous podiums at this particular race
# wins = pd.read_csv("podiums2020.csv", sep = ";")[["Driver", "2"]]
# data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")
# data = data.drop(["Driver"], axis = 1).rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per Rainfall value
rainy = pd.DataFrame(race_2020.weather_data)
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a race as rainy from its Rainfall value counts.

    `counts` is any mapping-like object with `.get(key, default)` keyed by
    False (dry samples) and True (rainy samples). Returns False only when the
    dry samples strictly outnumber the rainy ones; ties and inputs holding
    neither key return True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    mostly_dry = dry_samples > wet_samples
    return not mostly_dry

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once; growing a DataFrame
# with pd.concat inside the loop re-copies every earlier row on each pass.
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    throttle = tele['Throttle']
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': throttle.mean(),
        # Share of samples (in %) at the driver's own maximum throttle value
        'MaxThrottlePct': (throttle == throttle.max()).sum() / len(tele) * 100,
        # Sum of Brake over the sample count, in %
        # (assumes Brake is a boolean / 0-1 flag per sample - TODO confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "27"

# Remove rows left by an earlier run of this cell so that re-executing it
# does not append the same race to all_data twice.
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "27"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Belgian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '77', '33', '3', '31', '23', '4', '10', '18', '11', '26', '7', '5', '16', '8', '6', '20', '99', '63', '55']
core           INFO 	Loading data for Belgian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 8

In [11]:
# Load race data
race_2020 = ff1.get_session(2020, 8, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe from the classification columns used as features
aux = race_2020.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]])

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year. Both come from the event attached to the loaded
# session, so they always belong to the round requested above instead of to a
# hard-coded row position in schedule2020.
race_event = race_2020.event
data["RaceCountry"] = race_event["Country"]
data["Year"] = 2020
data["RaceDate"] = pd.to_datetime(race_event["EventDate"])
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by a 365.25-day year
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 8, 'Q')
qualification1_2020.load()

# Add the best qualifications lap (minimum lap time per driver).
# The right join keeps exactly the drivers that appear in the qualifying laps.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point: 1 for the driver of the fastest race lap, else 0.
# The driver number is read by label; a positional lookup would silently
# return a different field if the lap columns were ever reordered.
fastest_lap = race_2020.laps.pick_fastest()
auxDriverNo = fastest_lap["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times (no driver is excluded for this race)
aux = race_2020.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Lap times as float milliseconds. Dividing by a Timedelta yields floats on
# every pandas version, and assign() builds a new frame instead of writing a
# column into a slice of `aux`.
lap_times_ms = aux[["DriverNumber"]].assign(
    LapTime_ms=aux["LapTime"] / pd.Timedelta(milliseconds=1)
)
sd_lap_time = lap_times_ms.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=sd_lap_time, how='right')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# Kept so that `datetime` names the module for any later code in the
# notebook (the first cell rebinds the name to the datetime class).
import datetime
# to_timedelta turns a missing deviation into NaT, whereas
# datetime.timedelta(milliseconds=nan) raises ValueError.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable: each driver's mean gap to the quickest
# lap time set on the same lap number
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# dropna(): the right joins above can leave rows without an abbreviation
listdrivers = data["Abbreviation"].dropna().unique()
driver_ids = drivers.loc[drivers['code'].isin(listdrivers), "driverId"]

# Resolve the raceId of this round and fail with a clear message if the
# lookup is not unique (int() on a Series is deprecated and opaque on error)
race_ids = races.loc[(races["year"] == 2020) & (races["round"] == 8), "raceId"]
if len(race_ids) != 1:
    raise ValueError(f"expected exactly one raceId for 2020 round 8, found {len(race_ids)}")
race_id = int(race_ids.iloc[0])

# Stops of this race made by the drivers present in `data`
auxpit = pit_stops[pit_stops["driverId"].isin(driver_ids) & (pit_stops["raceId"] == race_id)]
merged_df = auxpit.merge(drivers[["driverId", "code"]], on='driverId')[["code", "milliseconds", "stop"]]

# Per driver code: mean stop duration (milliseconds) and highest stop number
transformed_df = merged_df.groupby('code').agg({'milliseconds': 'mean', 'stop': 'max'}).reset_index()

# Left join keeps drivers without any recorded stop (their values stay missing)
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop(columns="code")

# Convert the whole column at once: missing values become NaT and the column
# gets a real timedelta dtype, instead of Python timedelta objects being
# written into a float column through a mask.
merged_data['milliseconds'] = pd.to_timedelta(merged_data['milliseconds'], unit='ms')

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']

# Observed (driver, compound) lap counts
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Every driver paired with every compound, so unused compounds show up as 0
all_combinations = pd.MultiIndex.from_product(
    [aux['DriverNumber'].unique(), compounds], names=['DriverNumber', 'Compound']
)
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()
compound_counts = pd.merge(
    all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# One row per driver and one column per compound; DriverNumber is turned
# back into a regular column so it can serve as the merge key
compound_counts_pivot = (
    compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps')
    .fillna(0)
    .reset_index()
)

data = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team (engines2020.csv is keyed by "Car")
engines = pd.read_csv("engines2020.csv", sep = ";")
data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left").drop(columns="Car")

# Disabled feature: number of previous podiums at this particular race
# wins = pd.read_csv("podiums2020.csv", sep = ";")[["Driver", "2"]]
# data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")
# data = data.drop(["Driver"], axis = 1).rename(columns={'2': 'PrevPodiumRace'})

# Get weather data: number of weather samples per Rainfall value
rainy = pd.DataFrame(race_2020.weather_data)
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Classify a race as rainy from its Rainfall value counts.

    `counts` is any mapping-like object with `.get(key, default)` keyed by
    False (dry samples) and True (rainy samples). Returns False only when the
    dry samples strictly outnumber the rainy ones; ties and inputs holding
    neither key return True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    mostly_dry = dry_samples > wet_samples
    return not mostly_dry

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once; growing a DataFrame
# with pd.concat inside the loop re-copies every earlier row on each pass.
telemetry_columns = ['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
                     'AverageThrottle', 'MaxThrottlePct', 'Brake']
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    throttle = tele['Throttle']
    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': throttle.mean(),
        # Share of samples (in %) at the driver's own maximum throttle value
        'MaxThrottlePct': (throttle == throttle.max()).sum() / len(tele) * 100,
        # Sum of Brake over the sample count, in %
        # (assumes Brake is a boolean / 0-1 flag per sample - TODO confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })
agg = pd.DataFrame(telemetry_rows, columns=telemetry_columns)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "28"

# Remove rows left by an earlier run of this cell so that re-executing it
# does not append the same race to all_data twice.
if "raceID" in all_data.columns:
    all_data = all_data[all_data["raceID"] != "28"]
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Italian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['10', '55', '18', '4', '77', '3', '44', '31', '26', '11', '6', '8', '7', '63', '23', '99', '33', '16', '20', '5']
core           INFO 	Loading data for Italian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 9

In [12]:
# Load race data
race_2020 = ff1.get_session(2020, 9, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe from the classification columns used as features
aux = race_2020.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]])

# Add driver age at the time of the race
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year. Both come from the event attached to the loaded
# session, so they always belong to the round requested above instead of to a
# hard-coded row position in schedule2020.
race_event = race_2020.event
data["RaceCountry"] = race_event["Country"]
data["Year"] = 2020
data["RaceDate"] = pd.to_datetime(race_event["EventDate"])
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years: elapsed time floor-divided by a 365.25-day year
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 9, 'Q')
qualification1_2020.load()

# Add the best qualifications lap (minimum lap time per driver).
# The right join keeps exactly the drivers that appear in the qualifying laps.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point: 1 for the driver of the fastest race lap, else 0.
# The driver number is read by label; a positional lookup would silently
# return a different field if the lap columns were ever reordered.
fastest_lap = race_2020.laps.pick_fastest()
auxDriverNo = fastest_lap["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

# Drivers 33 and 10 are left out of this race's lap data and result rows
# (presumably too few timed laps for the statistics below - TODO confirm)
excluded_drivers = ["33", "10"]
aux = aux[~aux['DriverNumber'].isin(excluded_drivers)]
data = data[~data["DriverNumber"].isin(excluded_drivers)]

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Lap times as float milliseconds. Dividing by a Timedelta yields floats on
# every pandas version, and assign() builds a new frame instead of writing a
# column into a slice of `aux`.
lap_times_ms = aux[["DriverNumber"]].assign(
    LapTime_ms=aux["LapTime"] / pd.Timedelta(milliseconds=1)
)
sd_lap_time = lap_times_ms.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=sd_lap_time, how='right')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# Kept so that `datetime` names the module for any later code in the
# notebook (the first cell rebinds the name to the datetime class).
import datetime
# to_timedelta turns a missing deviation into NaT, whereas
# datetime.timedelta(milliseconds=nan) raises ValueError.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
besttime = pd.DataFrame(aux.groupby("LapNumber")["LapTime"].min())

besttime["LapNumber"] = besttime.index
besttime  = besttime .reset_index(drop=True)

best_lap = besttime.groupby('LapNumber')['LapTime'].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))

merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()

driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

listdrivers = np.unique(data["Abbreviation"])

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

b = races[(races["year"] == 2020) & (races["round"] == 9)]
b = b["raceId"]
b = int(b)

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = pd.DataFrame(merged_df)
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

transformed_df = merged_df.groupby('code').agg({
'milliseconds': 'mean',
'stop': 'max'
}).reset_index()

transformed_df = pd.DataFrame(transformed_df)

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")

merged_data = pd.DataFrame(merged_data)
merged_data = merged_data.drop("code", axis = 1)

mask = merged_data['milliseconds'].notna()
merged_data.loc[mask, 'milliseconds'] = merged_data.loc[mask, 'milliseconds'].apply(lambda x: datetime.timedelta(milliseconds=x))

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = compound_counts_pivot.index
indexi = list(indexi)

compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi
compound_counts_pivot = pd.DataFrame(compound_counts_pivot)

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
rainy = pd.DataFrame(race_2020.weather_data)

counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether a race counts as rainy.

    `counts` maps the Rainfall flag (True / False) to the number of weather
    samples carrying it; a flag that is absent counts as zero.

    Returns False only when the dry samples strictly outnumber the rainy
    ones. A tie (including no samples at all) is reported as rain.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemtry variables
data.dropna(subset=['DriverNumber'], inplace=True)
driversnumbers = np.unique(data["DriverNumber"])

agg = pd.DataFrame()
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    avg_speed = tele['Speed'].mean()
    max_speed = tele['Speed'].max()
    avg_rpm = tele['RPM'].mean()
    max_rpm = tele['RPM'].max()
    avg_throttle = tele['Throttle'].mean()
    max_throttle = tele['Throttle'].max()
    max_throttle_pct = (tele['Throttle'] == max_throttle).sum() / len(tele) * 100
    break_percentage = tele['Brake'].sum() / len(tele) * 100

    driver_data = {
        'Driver': i,
        'AverageSpeed': avg_speed,
        'MaxSpeed': max_speed,
        'AverageRPM': avg_rpm,
        'MaxRPM': max_rpm,
        'AverageThrottle': avg_throttle,
        'MaxThrottlePct': max_throttle_pct,
        'Brake': break_percentage
        }
    
    agg = pd.concat([agg, pd.DataFrame(driver_data, index=[0])], ignore_index=True)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "29"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Tuscan Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
  result = result.append(new_last).reset_index(drop=True)
  result = result.append(new_last).reset_index(drop=True)
  result = result.append(new_last).reset_index(drop=True)
  result = result.append(new_last).reset_index(drop=True)
  result = result.append(new_last).reset_index(drop=True)
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '7

## Race 10

In [13]:
# Load race data
race_2020 = ff1.get_session(2020, 10, 'R')
race_2020.load(telemetry=True, laps=True, weather=True)

# Create dataframe: one row per driver with the result columns used downstream
aux = race_2020.results
data = pd.DataFrame(
    aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
)

# Driver birth dates, used to derive the age at the time of the race
birth = pd.read_csv("BIRTH2020.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year.
# The values are read from the event attached to the loaded session rather
# than from a hard-coded schedule row, so they always describe the round
# requested above regardless of how many rows precede round 1 in the schedule.
event = race_2020.event
data["RaceCountry"] = event["Country"]
data["Year"] = 2020
data["RaceDate"] = pd.to_datetime(event["EventDate"])

# Inner join: only drivers that have a birth date on file are kept
data = pd.merge(data, birth, on="Abbreviation", how="inner")
# Age in whole years on race day (365.25-day years)
data["AgeAtGP"] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 10, 'Q')
qualification1_2020.load()

# Add the best qualifications lap
# Fastest qualifying lap per driver, flattened to the columns
# [LapTime, DriverNumber] with a fresh integer index.
fastest_laps = (
    qualification1_2020.laps
    .groupby("DriverNumber")["LapTime"]
    .min()
    .reset_index()[["LapTime", "DriverNumber"]]
)

# Right join on the shared DriverNumber column: the result has one row per
# driver present in the qualifying laps.
data = (
    data
    .merge(fastest_laps, how="right")
    .rename(columns={"LapTime": "BestQualiTime"})
)

# Add fastest lap point
# FLapPoint is 1 for the driver who set the fastest lap of the race, else 0.
# The driver number is read by label: a positional lookup (aux[1]) depends on
# the column order of the laps table and is deprecated for labelled Series.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

# Drivers 55 and 18 are removed from both the lap table and the result table.
# NOTE(review): presumably they have no usable lap times in this race - confirm.
excluded = ["55", "18"]
aux = aux[~aux["DriverNumber"].isin(excluded)]
data = data[~data["DriverNumber"].isin(excluded)]

auxi = aux[["DriverNumber", "LapTime"]]

# Mean lap time per driver; the right join keeps one row per driver with laps
mean_lap_time = auxi.groupby("DriverNumber")["LapTime"].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how="right")
data = data.rename(columns={"LapTime": "AvgLapTime"})

# Standard deviation of lap times per driver, computed on milliseconds.
# Dividing by a 1 ms Timedelta gives plain floats (NaT -> NaN) independent of
# the pandas version, and building a new frame avoids writing into a slice.
auxi = pd.DataFrame({
    "DriverNumber": aux["DriverNumber"],
    "LapTime_ms": aux["LapTime"] / pd.Timedelta(milliseconds=1),
})
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how="right")
data = data.rename(columns={"LapTime_ms": "SDLapTime"})

# The first notebook cell rebinds `datetime` to the class; this import is kept
# so the name refers to the module for any later code using datetime.timedelta.
import datetime
# to_timedelta maps NaN (drivers with fewer than two timed laps) to NaT
# instead of raising.
data["SDLapTime"] = pd.to_timedelta(data["SDLapTime"], unit="ms")

# Add average split times variable
# AvgSplitTime: each driver's mean gap to the fastest lap time that any driver
# set on the same lap number.
best_lap = aux.groupby("LapNumber")["LapTime"].min()
besttime = pd.DataFrame({"LapTime": best_lap.values, "LapNumber": best_lap.index})

# Attach the per-lap best time to every lap as LapTime_best
merged = pd.merge(aux, best_lap, on="LapNumber", suffixes=("", "_best"))
merged["diff"] = merged["LapTime"] - merged["LapTime_best"]

driver_avg_diff = (
    merged
    .groupby("DriverNumber")["diff"]
    .mean()
    .reset_index()
    .rename(columns={"diff": "AvgSplitTime"})
)

data = data.merge(driver_avg_diff, how="right")

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. Missing abbreviations (possible after the
# right joins above) are dropped first so they cannot break the lookup.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of this round in races.csv. `.item()` requires exactly one matching
# row, so a missing or duplicated round fails loudly here.
b = races.loc[(races["year"] == 2020) & (races["round"] == 10), "raceId"]
b = int(b.item())

# Pit stops of this race made by the drivers above
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

# Bring in the driver code so the stops can be joined on Abbreviation
merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis=1)

# Per driver: mean stop duration (ms) and the highest stop number
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without a recorded stop keep their row with missing values
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once so it is always a timedelta column;
# drivers without a recorded stop get NaT.
merged_data["milliseconds"] = pd.to_timedelta(merged_data["milliseconds"], unit="ms")

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so that a compound nobody used still ends up
# as a zero-filled column.
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = all_combinations.to_frame(index=False)

# Laps actually driven per (driver, compound)
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = all_combinations_df.merge(
    compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# Wide layout: one row per driver, one column per compound
compound_counts_pivot = compound_counts.pivot(
    index='DriverNumber', columns='Compound', values='Laps'
).fillna(0)
indexi = list(compound_counts_pivot.index)

# Move the driver numbers from the index into a regular column for the join
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = merged_data.merge(compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
# engines2020.csv is matched on its "Car" column against TeamName; the helper
# key column is dropped again after the join.
engines = pd.read_csv("engines2020.csv", sep=";")

data = (
    data
    .merge(engines, how="left", left_on="TeamName", right_on="Car")
    .drop(columns="Car")
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Count how often each value of the Rainfall flag occurs in the weather samples
rainy = pd.DataFrame(race_2020.weather_data)
counts = rainy.loc[:, "Rainfall"].value_counts()

def determine_rain(counts):
    """Decide whether a race counts as rainy.

    `counts` maps the Rainfall flag (True / False) to the number of weather
    samples carrying it; a flag that is absent counts as zero.

    Returns False only when the dry samples strictly outnumber the rainy
    ones. A tie (including no samples at all) is reported as rain.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver. The records are collected in a list and
# turned into a frame once, instead of re-concatenating a growing frame on
# every iteration.
telemetry_rows = []
for driver_number in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[driver_number])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': driver_number,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (%) of samples at this driver's maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the sample count, in percent
        # (assumes Brake is a per-sample on/off flag - TODO confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
data["raceID"] = "30"
# NOTE: re-running this cell appends this race to all_data again
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Russian Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['77', '33', '44', '11', '3', '16', '31', '26', '10', '23', '99', '20', '5', '7', '4', '6', '8', '63', '55', '18']
core           INFO 	Loading data for Russian Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 11

In [14]:
# Load race data
race_2020 = ff1.get_session(2020, 11, 'R')
race_2020.load(telemetry=True, laps=True, weather=True)

# Create dataframe: one row per driver with the result columns used downstream
aux = race_2020.results
data = pd.DataFrame(
    aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
)

# Driver birth dates, used to derive the age at the time of the race
birth = pd.read_csv("BIRTH2020.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year.
# The values are read from the event attached to the loaded session rather
# than from a hard-coded schedule row, so they always describe the round
# requested above regardless of how many rows precede round 1 in the schedule.
event = race_2020.event
data["RaceCountry"] = event["Country"]
data["Year"] = 2020
data["RaceDate"] = pd.to_datetime(event["EventDate"])

# Inner join: only drivers that have a birth date on file are kept
data = pd.merge(data, birth, on="Abbreviation", how="inner")
# Age in whole years on race day (365.25-day years)
data["AgeAtGP"] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 11, 'Q')
qualification1_2020.load()

# Add the best qualifications lap
# Fastest qualifying lap per driver, flattened to the columns
# [LapTime, DriverNumber] with a fresh integer index.
fastest_laps = (
    qualification1_2020.laps
    .groupby("DriverNumber")["LapTime"]
    .min()
    .reset_index()[["LapTime", "DriverNumber"]]
)

# Right join on the shared DriverNumber column: the result has one row per
# driver present in the qualifying laps.
data = (
    data
    .merge(fastest_laps, how="right")
    .rename(columns={"LapTime": "BestQualiTime"})
)

# Add fastest lap point
# FLapPoint is 1 for the driver who set the fastest lap of the race, else 0.
# The driver number is read by label: a positional lookup (aux[1]) depends on
# the column order of the laps table and is deprecated for labelled Series.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

auxi = aux[["DriverNumber", "LapTime"]]

# Mean lap time per driver; the right join keeps one row per driver with laps
mean_lap_time = auxi.groupby("DriverNumber")["LapTime"].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how="right")
data = data.rename(columns={"LapTime": "AvgLapTime"})

# Standard deviation of lap times per driver, computed on milliseconds.
# Dividing by a 1 ms Timedelta gives plain floats (NaT -> NaN) independent of
# the pandas version, and building a new frame avoids writing into a slice.
auxi = pd.DataFrame({
    "DriverNumber": aux["DriverNumber"],
    "LapTime_ms": aux["LapTime"] / pd.Timedelta(milliseconds=1),
})
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how="right")
data = data.rename(columns={"LapTime_ms": "SDLapTime"})

# The first notebook cell rebinds `datetime` to the class; this import is kept
# so the name refers to the module for any later code using datetime.timedelta.
import datetime
# to_timedelta maps NaN (drivers with fewer than two timed laps) to NaT
# instead of raising.
data["SDLapTime"] = pd.to_timedelta(data["SDLapTime"], unit="ms")

# Add average split times variable
# AvgSplitTime: each driver's mean gap to the fastest lap time that any driver
# set on the same lap number.
best_lap = aux.groupby("LapNumber")["LapTime"].min()
besttime = pd.DataFrame({"LapTime": best_lap.values, "LapNumber": best_lap.index})

# Attach the per-lap best time to every lap as LapTime_best
merged = pd.merge(aux, best_lap, on="LapNumber", suffixes=("", "_best"))
merged["diff"] = merged["LapTime"] - merged["LapTime_best"]

driver_avg_diff = (
    merged
    .groupby("DriverNumber")["diff"]
    .mean()
    .reset_index()
    .rename(columns={"diff": "AvgSplitTime"})
)

data = data.merge(driver_avg_diff, how="right")

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. Missing abbreviations (possible after the
# right joins above) are dropped first so they cannot break the lookup.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of this round in races.csv. `.item()` requires exactly one matching
# row, so a missing or duplicated round fails loudly here.
b = races.loc[(races["year"] == 2020) & (races["round"] == 11), "raceId"]
b = int(b.item())

# Pit stops of this race made by the drivers above
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

# Bring in the driver code so the stops can be joined on Abbreviation
merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis=1)

# Per driver: mean stop duration (ms) and the highest stop number
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without a recorded stop keep their row with missing values
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once so it is always a timedelta column;
# drivers without a recorded stop get NaT.
merged_data["milliseconds"] = pd.to_timedelta(merged_data["milliseconds"], unit="ms")

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so that a compound nobody used still ends up
# as a zero-filled column.
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = all_combinations.to_frame(index=False)

# Laps actually driven per (driver, compound)
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = all_combinations_df.merge(
    compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# Wide layout: one row per driver, one column per compound
compound_counts_pivot = compound_counts.pivot(
    index='DriverNumber', columns='Compound', values='Laps'
).fillna(0)
indexi = list(compound_counts_pivot.index)

# Move the driver numbers from the index into a regular column for the join
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = merged_data.merge(compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
# engines2020.csv is matched on its "Car" column against TeamName; the helper
# key column is dropped again after the join.
engines = pd.read_csv("engines2020.csv", sep=";")

data = (
    data
    .merge(engines, how="left", left_on="TeamName", right_on="Car")
    .drop(columns="Car")
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Count how often each value of the Rainfall flag occurs in the weather samples
rainy = pd.DataFrame(race_2020.weather_data)
counts = rainy.loc[:, "Rainfall"].value_counts()

def determine_rain(counts):
    """Decide whether a race counts as rainy.

    `counts` maps the Rainfall flag (True / False) to the number of weather
    samples carrying it; a flag that is absent counts as zero.

    Returns False only when the dry samples strictly outnumber the rainy
    ones. A tie (including no samples at all) is reported as rain.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver. The records are collected in a list and
# turned into a frame once, instead of re-concatenating a growing frame on
# every iteration.
telemetry_rows = []
for driver_number in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[driver_number])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': driver_number,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (%) of samples at this driver's maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the sample count, in percent
        # (assumes Brake is a per-sample on/off flag - TODO confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
data["raceID"] = "31"
# NOTE: re-running this cell appends this race to all_data again
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Eifel Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '3', '11', '55', '10', '16', '27', '8', '99', '5', '7', '20', '6', '26', '4', '23', '31', '77', '63']
core           INFO 	Loading data for Eifel Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_dat

## Race 12

In [15]:
# Load race data
race_2020 = ff1.get_session(2020, 12, 'R')
race_2020.load(telemetry=True, laps=True, weather=True)

# Create dataframe: one row per driver with the result columns used downstream
aux = race_2020.results
data = pd.DataFrame(
    aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]]
)

# Driver birth dates, used to derive the age at the time of the race
birth = pd.read_csv("BIRTH2020.csv", sep=";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year.
# The values are read from the event attached to the loaded session rather
# than from a hard-coded schedule row, so they always describe the round
# requested above regardless of how many rows precede round 1 in the schedule.
event = race_2020.event
data["RaceCountry"] = event["Country"]
data["Year"] = 2020
data["RaceDate"] = pd.to_datetime(event["EventDate"])

# Inner join: only drivers that have a birth date on file are kept
data = pd.merge(data, birth, on="Abbreviation", how="inner")
# Age in whole years on race day (365.25-day years)
data["AgeAtGP"] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 12, 'Q')
qualification1_2020.load()

# Add the best qualifications lap
# Fastest qualifying lap per driver, flattened to the columns
# [LapTime, DriverNumber] with a fresh integer index.
fastest_laps = (
    qualification1_2020.laps
    .groupby("DriverNumber")["LapTime"]
    .min()
    .reset_index()[["LapTime", "DriverNumber"]]
)

# Right join on the shared DriverNumber column: the result has one row per
# driver present in the qualifying laps.
data = (
    data
    .merge(fastest_laps, how="right")
    .rename(columns={"LapTime": "BestQualiTime"})
)

# Add fastest lap point
# FLapPoint is 1 for the driver who set the fastest lap of the race, else 0.
# The driver number is read by label: a positional lookup (aux[1]) depends on
# the column order of the laps table and is deprecated for labelled Series.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

auxi = aux[["DriverNumber", "LapTime"]]

# Mean lap time per driver; the right join keeps one row per driver with laps
mean_lap_time = auxi.groupby("DriverNumber")["LapTime"].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how="right")
data = data.rename(columns={"LapTime": "AvgLapTime"})

# Standard deviation of lap times per driver, computed on milliseconds.
# Dividing by a 1 ms Timedelta gives plain floats (NaT -> NaN) independent of
# the pandas version, and building a new frame avoids writing into a slice.
auxi = pd.DataFrame({
    "DriverNumber": aux["DriverNumber"],
    "LapTime_ms": aux["LapTime"] / pd.Timedelta(milliseconds=1),
})
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how="right")
data = data.rename(columns={"LapTime_ms": "SDLapTime"})

# The first notebook cell rebinds `datetime` to the class; this import is kept
# so the name refers to the module for any later code using datetime.timedelta.
import datetime
# to_timedelta maps NaN (drivers with fewer than two timed laps) to NaT
# instead of raising.
data["SDLapTime"] = pd.to_timedelta(data["SDLapTime"], unit="ms")

# Add average split times variable
# AvgSplitTime: each driver's mean gap to the fastest lap time that any driver
# set on the same lap number.
best_lap = aux.groupby("LapNumber")["LapTime"].min()
besttime = pd.DataFrame({"LapTime": best_lap.values, "LapNumber": best_lap.index})

# Attach the per-lap best time to every lap as LapTime_best
merged = pd.merge(aux, best_lap, on="LapNumber", suffixes=("", "_best"))
merged["diff"] = merged["LapTime"] - merged["LapTime_best"]

driver_avg_diff = (
    merged
    .groupby("DriverNumber")["diff"]
    .mean()
    .reset_index()
    .rename(columns={"diff": "AvgSplitTime"})
)

data = data.merge(driver_avg_diff, how="right")

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. Missing abbreviations (possible after the
# right joins above) are dropped first so they cannot break the lookup.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of this round in races.csv. `.item()` requires exactly one matching
# row, so a missing or duplicated round fails loudly here.
b = races.loc[(races["year"] == 2020) & (races["round"] == 12), "raceId"]
b = int(b.item())

# Pit stops of this race made by the drivers above
auxpit = pit_stops[pit_stops["driverId"].isin(k) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(["lap", "time", "duration"], axis=1)

# Bring in the driver code so the stops can be joined on Abbreviation
merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis=1)

# Per driver: mean stop duration (ms) and the highest stop number
transformed_df = merged_df.groupby('code').agg({
    'milliseconds': 'mean',
    'stop': 'max'
}).reset_index()

# Left join: drivers without a recorded stop keep their row with missing values
merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how="left")
merged_data = merged_data.drop("code", axis=1)

# Convert the whole column at once so it is always a timedelta column;
# drivers without a recorded stop get NaT.
merged_data["milliseconds"] = pd.to_timedelta(merged_data["milliseconds"], unit="ms")

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
drivers2 = aux['DriverNumber'].unique()

# Every (driver, compound) pair, so that a compound nobody used still ends up
# as a zero-filled column.
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = all_combinations.to_frame(index=False)

# Laps actually driven per (driver, compound)
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')
compound_counts = all_combinations_df.merge(
    compound_counts, on=['DriverNumber', 'Compound'], how='outer'
).fillna(0)

# Wide layout: one row per driver, one column per compound
compound_counts_pivot = compound_counts.pivot(
    index='DriverNumber', columns='Compound', values='Laps'
).fillna(0)
indexi = list(compound_counts_pivot.index)

# Move the driver numbers from the index into a regular column for the join
compound_counts_pivot = compound_counts_pivot.reset_index(drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = merged_data.merge(compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
# engines2020.csv is matched on its "Car" column against TeamName; the helper
# key column is dropped again after the join.
engines = pd.read_csv("engines2020.csv", sep=";")

data = (
    data
    .merge(engines, how="left", left_on="TeamName", right_on="Car")
    .drop(columns="Car")
)

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Count how often each value of the Rainfall flag occurs in the weather samples
rainy = pd.DataFrame(race_2020.weather_data)
counts = rainy.loc[:, "Rainfall"].value_counts()

def determine_rain(counts):
    """Decide whether a race counts as rainy.

    `counts` maps the Rainfall flag (True / False) to the number of weather
    samples carrying it; a flag that is absent counts as zero.

    Returns False only when the dry samples strictly outnumber the rainy
    ones. A tie (including no samples at all) is reported as rain.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not dry_samples > wet_samples

is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver. The records are collected in a list and
# turned into a frame once, instead of re-concatenating a growing frame on
# every iteration.
telemetry_rows = []
for driver_number in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[driver_number])
    max_throttle = tele['Throttle'].max()
    telemetry_rows.append({
        'Driver': driver_number,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share (%) of samples at this driver's maximum throttle value
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the sample count, in percent
        # (assumes Brake is a per-sample on/off flag - TODO confirm)
        'Brake': tele['Brake'].sum() / len(tele) * 100,
    })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
data["raceID"] = "32"
# NOTE: re-running this cell appends this race to all_data again
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Portuguese Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '77', '33', '16', '10', '55', '11', '31', '3', '5', '7', '23', '4', '63', '99', '20', '8', '6', '26', '18']
core           INFO 	Loading data for Portuguese Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for 

## Race 13

In [16]:
# Build the per-driver feature table for round 13 of the 2020 season.
# The round number is defined once so the race session, the qualifying
# session and the races.csv lookup cannot drift apart.
round_number = 13

# Load race data
race_2020 = ff1.get_session(2020, round_number, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe (wrapped in a plain DataFrame before new columns are added)
aux = race_2020.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]])

# Add driver age at the time of the race
# The "Birth" column is parsed first; afterwards both columns are renamed.
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Schedule row 14 is used for round 13.
# NOTE(review): presumably the schedule index is offset from the round number - confirm.
data["RaceCountry"] = schedule2020["Country"][14]
data["Year"] = 2020
dati = schedule2020["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[14])
# Inner join: drivers without an entry in BIRTH2020.csv are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years, using 365.25 days per year.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, round_number, 'Q')
qualification1_2020.load()

# Add the best qualifications lap
# Right join: the resulting rows follow the drivers present in the qualifying laps.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
# The driver number is read by label; the previous positional lookup (aux[1])
# silently depended on the column order of the laps table.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Work on an explicit copy so the helper column is not written onto a slice,
# and convert to float milliseconds via total_seconds() so the result does not
# depend on how the installed pandas handles astype('timedelta64[ms]').
auxi = aux[["DriverNumber", "LapTime"]].copy()
auxi['LapTime_ms'] = auxi['LapTime'].dt.total_seconds() * 1000
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# HEAD ends its imports with `from datetime import datetime`, so the name refers
# to the class until this line rebinds it to the module; the pit-stop conversion
# below needs `datetime.timedelta`.
import datetime
# A driver with fewer than two timed laps has a NaN std; to_timedelta maps it to
# NaT instead of raising like datetime.timedelta(milliseconds=nan) does.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Best lap time per lap number across all drivers, then each driver's mean gap to it.
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on='DriverNumber')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# The right joins above can leave rows without an abbreviation; skip those.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of this round in races.csv, taken as a scalar (int() on a whole Series
# is deprecated in recent pandas).
b = int(races.loc[(races["year"] == 2020) & (races["round"] == round_number), "raceId"].iloc[0])

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration in milliseconds and the highest stop number.
transformed_df = merged_df.groupby('code').agg({
'milliseconds': 'mean',
'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Only rows that have pit-stop data are converted; the others stay missing.
# NOTE(review): this writes timedelta values into a numeric column - newer
# pandas versions may warn about the dtype change; confirm.
mask = merged_data['milliseconds'].notna()
merged_data.loc[mask, 'milliseconds'] = merged_data.loc[mask, 'milliseconds'].apply(lambda x: datetime.timedelta(milliseconds=x))

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Full driver x compound grid so every listed compound becomes a column even if unused.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = list(compound_counts_pivot.index)

# Move the driver number from the index into a regular (last) column.
compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
# (currently disabled)
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
rainy = pd.DataFrame(race_2020.weather_data)

# Number of weather samples per Rainfall value.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether the race counts as rainy.

    `counts` maps the Rainfall flag (False/True) to a number of samples and
    must support `.get(key, default)`. The race is reported as dry (False)
    only when the dry samples strictly outnumber the rainy ones; a tie,
    including the case of no samples at all, is reported as rainy (True).
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not (dry_samples > wet_samples)

# One rain flag for the whole race, copied onto every driver row.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once, instead of growing a
# DataFrame with pd.concat on every iteration.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share of telemetry samples at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the number of samples, in percent.
        'Brake': tele['Brake'].sum() / len(tele) * 100
        })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "33"
# NOTE: re-running this cell appends this race's rows to all_data again.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '77', '3', '26', '16', '11', '55', '4', '7', '99', '6', '5', '18', '8', '23', '63', '33', '20', '31', '10']
core           INFO 	Loading data for Emilia Romagna Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached d

## Race 14

In [17]:
# Build the per-driver feature table for round 14 of the 2020 season.
# The round number is defined once so the race session, the qualifying
# session and the races.csv lookup cannot drift apart.
round_number = 14

# Load race data
race_2020 = ff1.get_session(2020, round_number, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe (wrapped in a plain DataFrame before new columns are added)
aux = race_2020.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]])

# Add driver age at the time of the race
# The "Birth" column is parsed first; afterwards both columns are renamed.
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Schedule row 15 is used for round 14.
# NOTE(review): presumably the schedule index is offset from the round number - confirm.
data["RaceCountry"] = schedule2020["Country"][15]
data["Year"] = 2020
dati = schedule2020["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[15])
# Inner join: drivers without an entry in BIRTH2020.csv are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years, using 365.25 days per year.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, round_number, 'Q')
qualification1_2020.load()

# Add the best qualifications lap
# Right join: the resulting rows follow the drivers present in the qualifying laps.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
# The driver number is read by label; the previous positional lookup (aux[1])
# silently depended on the column order of the laps table.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Work on an explicit copy so the helper column is not written onto a slice,
# and convert to float milliseconds via total_seconds() so the result does not
# depend on how the installed pandas handles astype('timedelta64[ms]').
auxi = aux[["DriverNumber", "LapTime"]].copy()
auxi['LapTime_ms'] = auxi['LapTime'].dt.total_seconds() * 1000
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# HEAD ends its imports with `from datetime import datetime`, so the name refers
# to the class until this line rebinds it to the module; the pit-stop conversion
# below needs `datetime.timedelta`.
import datetime
# A driver with fewer than two timed laps has a NaN std; to_timedelta maps it to
# NaT instead of raising like datetime.timedelta(milliseconds=nan) does.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Best lap time per lap number across all drivers, then each driver's mean gap to it.
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on='DriverNumber')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# The right joins above can leave rows without an abbreviation; skip those.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of this round in races.csv, taken as a scalar (int() on a whole Series
# is deprecated in recent pandas).
b = int(races.loc[(races["year"] == 2020) & (races["round"] == round_number), "raceId"].iloc[0])

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration in milliseconds and the highest stop number.
transformed_df = merged_df.groupby('code').agg({
'milliseconds': 'mean',
'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Only rows that have pit-stop data are converted; the others stay missing.
# NOTE(review): this writes timedelta values into a numeric column - newer
# pandas versions may warn about the dtype change; confirm.
mask = merged_data['milliseconds'].notna()
merged_data.loc[mask, 'milliseconds'] = merged_data.loc[mask, 'milliseconds'].apply(lambda x: datetime.timedelta(milliseconds=x))

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Full driver x compound grid so every listed compound becomes a column even if unused.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = list(compound_counts_pivot.index)

# Move the driver number from the index into a regular (last) column.
compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
# (currently disabled)
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
rainy = pd.DataFrame(race_2020.weather_data)

# Number of weather samples per Rainfall value.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether the race counts as rainy.

    `counts` maps the Rainfall flag (False/True) to a number of samples and
    must support `.get(key, default)`. The race is reported as dry (False)
    only when the dry samples strictly outnumber the rainy ones; a tie,
    including the case of no samples at all, is reported as rainy (True).
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not (dry_samples > wet_samples)

# One rain flag for the whole race, copied onto every driver row.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once, instead of growing a
# DataFrame with pd.concat on every iteration.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share of telemetry samples at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the number of samples, in percent.
        'Brake': tele['Brake'].sum() / len(tele) * 100
        })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "34"
# NOTE: re-running this cell appends this race's rows to all_data again.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Turkish Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '11', '5', '16', '55', '33', '23', '4', '18', '3', '31', '26', '10', '77', '7', '63', '20', '8', '6', '99']
core           INFO 	Loading data for Turkish Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing

## Race 15

In [18]:
# Build the per-driver feature table for round 15 of the 2020 season.
# The round number is defined once so the race session, the qualifying
# session and the races.csv lookup cannot drift apart.
round_number = 15

# Load race data
race_2020 = ff1.get_session(2020, round_number, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe (wrapped in a plain DataFrame before new columns are added)
aux = race_2020.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]])

# Add driver age at the time of the race
# The "Birth" column is parsed first; afterwards both columns are renamed.
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Schedule row 16 is used for round 15.
# NOTE(review): presumably the schedule index is offset from the round number - confirm.
data["RaceCountry"] = schedule2020["Country"][16]
data["Year"] = 2020
dati = schedule2020["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[16])
# Inner join: drivers without an entry in BIRTH2020.csv are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years, using 365.25 days per year.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, round_number, 'Q')
qualification1_2020.load()

# Add the best qualifications lap
# Right join: the resulting rows follow the drivers present in the qualifying laps.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
# The driver number is read by label; the previous positional lookup (aux[1])
# silently depended on the column order of the laps table.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

# Drivers removed from both the lap data and the feature table for this race.
# NOTE(review): the reason is not recorded here; presumably they completed too
# few laps for the lap-time statistics - confirm.
excluded_drivers = ["8", "18"]
aux = aux[~aux['DriverNumber'].isin(excluded_drivers)]
data = data[~data["DriverNumber"].isin(excluded_drivers)]

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Work on an explicit copy so the helper column is not written onto a slice,
# and convert to float milliseconds via total_seconds() so the result does not
# depend on how the installed pandas handles astype('timedelta64[ms]').
auxi = aux[["DriverNumber", "LapTime"]].copy()
auxi['LapTime_ms'] = auxi['LapTime'].dt.total_seconds() * 1000
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# HEAD ends its imports with `from datetime import datetime`, so the name refers
# to the class until this line rebinds it to the module; the pit-stop conversion
# below needs `datetime.timedelta`.
import datetime
# A driver with fewer than two timed laps has a NaN std; to_timedelta maps it to
# NaT instead of raising like datetime.timedelta(milliseconds=nan) does.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Best lap time per lap number across all drivers, then each driver's mean gap to it.
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on='DriverNumber')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# The right joins above can leave rows without an abbreviation; skip those.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of this round in races.csv, taken as a scalar (int() on a whole Series
# is deprecated in recent pandas).
b = int(races.loc[(races["year"] == 2020) & (races["round"] == round_number), "raceId"].iloc[0])

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration in milliseconds and the highest stop number.
transformed_df = merged_df.groupby('code').agg({
'milliseconds': 'mean',
'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Only rows that have pit-stop data are converted; the others stay missing.
# NOTE(review): this writes timedelta values into a numeric column - newer
# pandas versions may warn about the dtype change; confirm.
mask = merged_data['milliseconds'].notna()
merged_data.loc[mask, 'milliseconds'] = merged_data.loc[mask, 'milliseconds'].apply(lambda x: datetime.timedelta(milliseconds=x))

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Full driver x compound grid so every listed compound becomes a column even if unused.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = list(compound_counts_pivot.index)

# Move the driver number from the index into a regular (last) column.
compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
# (currently disabled)
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
rainy = pd.DataFrame(race_2020.weather_data)

# Number of weather samples per Rainfall value.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether the race counts as rainy.

    `counts` maps the Rainfall flag (False/True) to a number of samples and
    must support `.get(key, default)`. The race is reported as dry (False)
    only when the dry samples strictly outnumber the rainy ones; a tie,
    including the case of no samples at all, is reported as rainy (True).
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not (dry_samples > wet_samples)

# One rain flag for the whole race, copied onto every driver row.
is_rainy = determine_rain(counts)

data["Rain"] = is_rainy

# Add telemetry variables
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# Collect one record per driver and build the frame once, instead of growing a
# DataFrame with pd.concat on every iteration.
telemetry_rows = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    max_throttle = tele['Throttle'].max()

    telemetry_rows.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share of telemetry samples at the driver's own maximum throttle value.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / len(tele) * 100,
        # Sum of the Brake column relative to the number of samples, in percent.
        'Brake': tele['Brake'].sum() / len(tele) * 100
        })

agg = pd.DataFrame(telemetry_rows)

data = pd.merge(left=data, right=agg , how='right', left_on="DriverNumber", right_on = "Driver")
data["raceID"] = "35"
# NOTE: re-running this cell appends this race's rows to all_data again.
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Bahrain Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '23', '4', '55', '10', '3', '77', '31', '16', '26', '63', '5', '6', '7', '99', '20', '11', '18', '8']
core           INFO 	Loading data for Bahrain Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
Traceback (most recent call last):
  File "/Users

## Race 16

In [19]:
# Build the per-driver feature table for round 16 of the 2020 season.
# The round number is defined once so the race session, the qualifying
# session and the races.csv lookup cannot drift apart.
round_number = 16

# Load race data
race_2020 = ff1.get_session(2020, round_number, 'R')
race_2020.load(telemetry = True, laps = True, weather = True)

# Create dataframe (wrapped in a plain DataFrame before new columns are added)
aux = race_2020.results
data = pd.DataFrame(aux[["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]])

# Add driver age at the time of the race
# The "Birth" column is parsed first; afterwards both columns are renamed.
birth = pd.read_csv("BIRTH2020.csv", sep = ";")
birth["Birth"] = pd.to_datetime(birth["Birth"], format="%d.%m.%Y")
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Schedule row 17 is used for round 16.
# NOTE(review): presumably the schedule index is offset from the round number - confirm.
data["RaceCountry"] = schedule2020["Country"][17]
data["Year"] = 2020
dati = schedule2020["EventDate"]
data["RaceDate"] = pd.to_datetime(dati[17])
# Inner join: drivers without an entry in BIRTH2020.csv are dropped here.
data = pd.merge(data, birth, on='Abbreviation', how='inner')
# Age in whole years, using 365.25 days per year.
data['AgeAtGP'] = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, round_number, 'Q')
qualification1_2020.load()

# Add the best qualifications lap
# Right join: the resulting rows follow the drivers present in the qualifying laps.
fastest_laps = qualification1_2020.laps.groupby("DriverNumber")["LapTime"].min().reset_index()
data = pd.merge(left=data, right=fastest_laps, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime': 'BestQualiTime'})

# Add fastest lap point
# The driver number is read by label; the previous positional lookup (aux[1])
# silently depended on the column order of the laps table.
aux = race_2020.laps.pick_fastest()
auxDriverNo = aux["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

# Drivers removed from both the lap data and the feature table for this race.
# NOTE(review): the reason is not recorded here; presumably they completed too
# few laps for the lap-time statistics - confirm.
excluded_drivers = ["33", "16"]
aux = aux[~aux['DriverNumber'].isin(excluded_drivers)]
data = data[~data["DriverNumber"].isin(excluded_drivers)]

mean_lap_time = aux.groupby('DriverNumber')['LapTime'].mean().reset_index()
data = pd.merge(left=data, right=mean_lap_time, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime': 'AvgLapTime'})

# Work on an explicit copy so the helper column is not written onto a slice,
# and convert to float milliseconds via total_seconds() so the result does not
# depend on how the installed pandas handles astype('timedelta64[ms]').
auxi = aux[["DriverNumber", "LapTime"]].copy()
auxi['LapTime_ms'] = auxi['LapTime'].dt.total_seconds() * 1000
auxi = auxi.groupby("DriverNumber")["LapTime_ms"].std().reset_index()

data = pd.merge(left=data, right=auxi, how='right', on='DriverNumber')
data = data.rename(columns = {'LapTime_ms': 'SDLapTime'})

# HEAD ends its imports with `from datetime import datetime`, so the name refers
# to the class until this line rebinds it to the module; the pit-stop conversion
# below needs `datetime.timedelta`.
import datetime
# A driver with fewer than two timed laps has a NaN std; to_timedelta maps it to
# NaT instead of raising like datetime.timedelta(milliseconds=nan) does.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit='ms')

# Add average split times variable
# Best lap time per lap number across all drivers, then each driver's mean gap to it.
best_lap = aux.groupby("LapNumber")["LapTime"].min()

merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

driver_avg_diff = merged.groupby('DriverNumber')['diff'].mean().reset_index()
driver_avg_diff = driver_avg_diff.rename(columns={'diff': 'AvgSplitTime'})

data = pd.merge(left=data, right=driver_avg_diff, how='right', on='DriverNumber')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# The right joins above can leave rows without an abbreviation; skip those.
listdrivers = data["Abbreviation"].dropna().unique()

filtered_drivers = drivers[drivers['code'].isin(listdrivers)]

k = filtered_drivers["driverId"]

# raceId of this round in races.csv, taken as a scalar (int() on a whole Series
# is deprecated in recent pandas).
b = int(races.loc[(races["year"] == 2020) & (races["round"] == round_number), "raceId"].iloc[0])

auxpit = pit_stops[pit_stops["driverId"].isin(k)]
auxpit = auxpit[auxpit["raceId"] == b]
auxpit = auxpit.drop(["lap", "time", "duration"], axis = 1)

merged_df = auxpit.merge(drivers, on='driverId')
merged_df = merged_df.drop(["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"], axis = 1)

# Per driver code: mean stop duration in milliseconds and the highest stop number.
transformed_df = merged_df.groupby('code').agg({
'milliseconds': 'mean',
'stop': 'max'
}).reset_index()

merged_data = data.merge(transformed_df, left_on='Abbreviation', right_on='code', how = "left")
merged_data = merged_data.drop("code", axis = 1)

# Only rows that have pit-stop data are converted; the others stay missing.
# NOTE(review): this writes timedelta values into a numeric column - newer
# pandas versions may warn about the dtype change; confirm.
mask = merged_data['milliseconds'].notna()
merged_data.loc[mask, 'milliseconds'] = merged_data.loc[mask, 'milliseconds'].apply(lambda x: datetime.timedelta(milliseconds=x))

merged_data = merged_data.rename(columns = {'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
compound_counts = aux.groupby(['DriverNumber', 'Compound']).size().reset_index(name='Laps')

# Full driver x compound grid so every listed compound becomes a column even if unused.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product([drivers2, compounds], names=['DriverNumber', 'Compound'])
all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

compound_counts = pd.merge(all_combinations_df, compound_counts, on=['DriverNumber', 'Compound'], how='outer').fillna(0)
compound_counts_pivot = compound_counts.pivot(index='DriverNumber', columns='Compound', values='Laps').fillna(0)
indexi = list(compound_counts_pivot.index)

# Move the driver number from the index into a regular (last) column.
compound_counts_pivot = compound_counts_pivot.reset_index(level=0, drop=True)
compound_counts_pivot["DriverNumber"] = indexi

merged_data_compound = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

data = merged_data_compound

# Add engine manufacturer for each team
engines = pd.read_csv("engines2020.csv", sep = ";")

data = data.merge(engines, left_on='TeamName', right_on='Car', how = "left")
data = data.drop(["Car"], axis = 1)

# Get information on number of previous wins at that particular race
# (currently disabled)
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
rainy = pd.DataFrame(race_2020.weather_data)

# Number of weather samples per Rainfall value.
counts = rainy['Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy.

    `counts` is a mapping-like object supporting `.get(key, default)` that is
    keyed by the booleans False (no rainfall) and True (rainfall); a missing
    key is treated as a count of 0.

    Returns False only when the False count is strictly greater than the True
    count; a tie (including two missing keys) returns True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not (dry_samples > wet_samples)

is_rainy = determine_rain(counts)
data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be looked up in the car telemetry; drop them
# (re-binding `data` instead of mutating it in place).
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver. The records are collected in a list and turned
# into a frame once, instead of re-concatenating a growing frame on every pass.
telemetry_records = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    sample_count = len(tele)
    max_throttle = tele['Throttle'].max()
    telemetry_records.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share of samples (in %) at the highest throttle value seen for this driver.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / sample_count * 100,
        # Sum of the 'Brake' channel relative to the number of samples, in %.
        # NOTE(review): presumably 'Brake' is boolean, making this the share of
        # samples with the brake applied - confirm against the telemetry schema.
        'Brake': tele['Brake'].sum() / sample_count * 100,
    })

# Explicit columns keep the merge key 'Driver' present even if no driver was processed.
agg = pd.DataFrame(
    telemetry_records,
    columns=['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
             'AverageThrottle', 'MaxThrottlePct', 'Brake'],
)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
data["raceID"] = "36"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Sakhir Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['11', '31', '18', '55', '3', '23', '26', '77', '63', '4', '10', '5', '99', '7', '20', '89', '51', '6', '33', '16']
core           INFO 	Loading data for Sakhir Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_

## Race 17

In [20]:
# Load race data
race_2020 = ff1.get_session(2020, 17, 'R')
race_2020.load(telemetry=True, laps=True, weather=True)

# Create dataframe
# Plain DataFrame holding only the result columns used further down.
result_columns = ["Abbreviation", "DriverNumber", "TeamName", "Position", "GridPosition", "Status", "Points"]
aux = race_2020.results
data = pd.DataFrame(aux[result_columns])

# Add driver age at the time of the race
# The date column is parsed under the name "Birth" before the columns are renamed,
# so the file must already provide a column with that name.
birth = pd.read_csv("BIRTH2020.csv", sep=";")
birth = birth.assign(Birth=pd.to_datetime(birth["Birth"], format="%d.%m.%Y"))
birth.columns = ["Abbreviation", "Birth"]

# Add race country and year
# Label 18 of schedule2020 is used for round 17 (the Race 1 cell uses label 2 for round 1).
# NOTE(review): the offset presumably comes from extra rows ahead of round 1 in the
# schedule - confirm before reusing this cell for another season.
schedule_label = 18
data = data.assign(
    RaceCountry=schedule2020.loc[schedule_label, "Country"],
    Year=2020,
    RaceDate=pd.to_datetime(schedule2020.loc[schedule_label, "EventDate"]),
)

# Inner join: drivers without an entry in the birth file are dropped here.
data = data.merge(birth, how='inner', on='Abbreviation')

# Age in completed years (365.25-day years) on the event date.
age_at_gp = (data["RaceDate"] - data["Birth"]) // pd.Timedelta(days=365.25)
data = data.assign(AgeAtGP=age_at_gp).drop(columns=["Birth", "RaceDate"])

# Load qualifications data
qualification1_2020 = ff1.get_session(2020, 17, 'Q')
qualification1_2020.load()

# Add the best qualifications lap
# Shortest qualifying lap time per driver number.
fastest_laps = (
    qualification1_2020.laps
    .groupby("DriverNumber")["LapTime"]
    .min()
    .rename("BestQualiTime")
    .reset_index()
)

# NOTE(review): a right join keeps exactly the drivers that appear in the qualifying
# laps; a race driver without any qualifying lap is dropped from `data` here.
data = pd.merge(left=data, right=fastest_laps, on="DriverNumber", how='right')

# Add fastest lap point
# Flag (1/0) the driver who set the fastest lap of the race.
fastest_lap = race_2020.laps.pick_fastest()

# Read the driver number by label. Positional access (`[1]`) silently depends on
# the column order of the laps table and, on a label-indexed row, is deprecated
# in recent pandas versions.
auxDriverNo = fastest_lap["DriverNumber"]
data['FLapPoint'] = (data['DriverNumber'] == auxDriverNo).astype(int)

# Add average and std dev of lap times
aux = race_2020.laps

# Mean lap time per driver (stays a timedelta).
mean_lap_time = (
    aux.groupby("DriverNumber")["LapTime"]
    .mean()
    .rename("AvgLapTime")
    .reset_index()
)
# NOTE(review): right join - only drivers that appear in the race laps are kept.
data = pd.merge(left=data, right=mean_lap_time, on="DriverNumber", how='right')

# Standard deviation of the lap times per driver, computed on float milliseconds.
# `.dt.total_seconds() * 1000` yields floats on every pandas version, whereas
# `.astype('timedelta64[ms]')` only does so on pandas < 2. Building a fresh frame
# also avoids assigning a column onto a slice of the laps table.
lap_time_ms = pd.DataFrame({
    "DriverNumber": aux["DriverNumber"],
    "LapTime_ms": aux["LapTime"].dt.total_seconds() * 1000,
})
sd_lap_time = (
    lap_time_ms.groupby("DriverNumber")["LapTime_ms"]
    .std()
    .rename("SDLapTime")
    .reset_index()
)
data = pd.merge(left=data, right=sd_lap_time, on="DriverNumber", how='right')

# The notebook's import cell rebinds the name `datetime` to the class
# (`from datetime import datetime`); re-import the module because code further
# down in this cell calls `datetime.timedelta`.
import datetime

# Milliseconds -> timedelta. A driver with fewer than two timed laps has a NaN
# standard deviation, which becomes NaT here instead of raising. Rounding to
# microseconds matches the resolution of `datetime.timedelta`.
data['SDLapTime'] = pd.to_timedelta(data['SDLapTime'], unit="ms").dt.round("us")

# Add average split times variable
# Fastest lap time set by any driver on each lap number.
best_lap = aux.groupby('LapNumber')['LapTime'].min()

# Attach that per-lap best to every lap and measure each lap's gap to it.
merged = pd.merge(aux, best_lap, on='LapNumber', suffixes=('', '_best'))
merged['diff'] = merged['LapTime'] - merged['LapTime_best']

# Mean gap per driver.
driver_avg_diff = (
    merged.groupby('DriverNumber')['diff']
    .mean()
    .reset_index()
    .rename(columns={'diff': 'AvgSplitTime'})
)

data = pd.merge(left=data, right=driver_avg_diff, how='right')

# Get information about pit stops
# Add average pitstop time and number of pit stops
pit_stops = pd.read_csv("pit_stops.csv")
races = pd.read_csv("races.csv")
drivers = pd.read_csv("drivers.csv")

# Driver codes present in this race. The right joins earlier in this cell can leave
# rows without an abbreviation, and np.unique cannot sort NaN together with
# strings, so missing values are removed before taking the unique codes.
listdrivers = data["Abbreviation"].dropna().unique()
driver_ids = drivers.loc[drivers['code'].isin(listdrivers), "driverId"]

# Resolve the raceId of 2020 round 17. The single matching element is extracted
# explicitly: calling int() on a whole Series is deprecated in recent pandas and
# gives an opaque error when the match is not unique.
race_ids = races.loc[(races["year"] == 2020) & (races["round"] == 17), "raceId"]
if len(race_ids) != 1:
    raise ValueError(
        f"expected exactly one raceId for 2020 round 17 in races.csv, found {len(race_ids)}"
    )
b = int(race_ids.iloc[0])

# Pit stops of those drivers in this race, without the pit-stop columns unused below.
auxpit = pit_stops[pit_stops["driverId"].isin(driver_ids) & (pit_stops["raceId"] == b)]
auxpit = auxpit.drop(columns=["lap", "time", "duration"])

# Bring in the driver code and drop the remaining identifier / biography columns.
merged_df = auxpit.merge(drivers, on='driverId').drop(
    columns=["raceId", "driverId", "driverRef", "forename", "surname", "dob", "nationality", "url"]
)

# Per driver code: mean of 'milliseconds' and highest 'stop' value.
transformed_df = (
    merged_df.groupby('code')
    .agg({'milliseconds': 'mean', 'stop': 'max'})
    .reset_index()
)

# Left join: drivers without any recorded stop stay in the table with missing values.
merged_data = data.merge(
    transformed_df, left_on='Abbreviation', right_on='code', how="left"
).drop(columns="code")

# Convert the mean from milliseconds to datetime.timedelta, only where a value exists.
mask = merged_data['milliseconds'].notna()
merged_data.loc[mask, 'milliseconds'] = merged_data.loc[mask, 'milliseconds'].apply(lambda x: datetime.timedelta(milliseconds=x))

merged_data = merged_data.rename(columns={'milliseconds': 'AvgPitTime', "stop": "PitstopNo"})

# Add number of laps on each type of compound
# Rows of the race lap table per (driver, compound) pair that occurs.
laps_per_compound = (
    aux.groupby(['DriverNumber', 'Compound'])
    .size()
    .reset_index(name='Laps')
)

# Full driver x compound grid, so that a compound a driver never used shows up as 0.
drivers2 = aux['DriverNumber'].unique()
compounds = ['SOFT', 'MEDIUM', 'HARD', 'INTERMEDIATE', 'WET']
all_combinations = pd.MultiIndex.from_product(
    [drivers2, compounds], names=['DriverNumber', 'Compound']
)
all_combinations_df = all_combinations.to_frame(index=False)

compound_counts = pd.merge(
    all_combinations_df,
    laps_per_compound,
    on=['DriverNumber', 'Compound'],
    how='outer',
).fillna(0)

# One row per driver, one column per compound.
compound_counts_pivot = compound_counts.pivot(
    index='DriverNumber', columns='Compound', values='Laps'
).fillna(0)

# Move the driver number from the index into a regular (last) column.
compound_counts_pivot = compound_counts_pivot.assign(
    DriverNumber=compound_counts_pivot.index.tolist()
).reset_index(drop=True)

data = pd.merge(merged_data, compound_counts_pivot, on='DriverNumber')

# Add engine manufacturer for each team
# Left join on the team name: every row of `data` is kept even if the team has
# no entry in the engines file; the join key 'Car' is dropped afterwards.
engines = pd.read_csv("engines2020.csv", sep=";")
data = pd.merge(
    data, engines, how="left", left_on='TeamName', right_on='Car'
).drop(columns="Car")

# Get information on number of previous wins at that particular race
#wins = pd.read_csv("podiums2020.csv", sep = ";")
#wins = wins[["Driver", "2"]]

#data = pd.merge(left=data, right=wins , how='right', left_on="Abbreviation", right_on = "Driver")

#data = data.drop(["Driver"], axis = 1)

#data = data.rename(columns={'2': 'PrevPodiumRace'})

# Get weather data
# Count how often each 'Rainfall' value occurs in the session's weather samples.
rainy = pd.DataFrame(race_2020.weather_data)
counts = rainy.loc[:, 'Rainfall'].value_counts()

def determine_rain(counts):
    """Decide whether a session counts as rainy.

    `counts` is a mapping-like object supporting `.get(key, default)` that is
    keyed by the booleans False (no rainfall) and True (rainfall); a missing
    key is treated as a count of 0.

    Returns False only when the False count is strictly greater than the True
    count; a tie (including two missing keys) returns True.
    """
    dry_samples = counts.get(False, 0)
    wet_samples = counts.get(True, 0)
    return not (dry_samples > wet_samples)

is_rainy = determine_rain(counts)
data["Rain"] = is_rainy

# Add telemetry variables
# Rows without a driver number cannot be looked up in the car telemetry; drop them
# (re-binding `data` instead of mutating it in place).
data = data.dropna(subset=['DriverNumber'])
driversnumbers = np.unique(data["DriverNumber"])

# One summary record per driver. The records are collected in a list and turned
# into a frame once, instead of re-concatenating a growing frame on every pass.
telemetry_records = []
for i in driversnumbers:
    tele = pd.DataFrame(race_2020.car_data[i])
    sample_count = len(tele)
    max_throttle = tele['Throttle'].max()
    telemetry_records.append({
        'Driver': i,
        'AverageSpeed': tele['Speed'].mean(),
        'MaxSpeed': tele['Speed'].max(),
        'AverageRPM': tele['RPM'].mean(),
        'MaxRPM': tele['RPM'].max(),
        'AverageThrottle': tele['Throttle'].mean(),
        # Share of samples (in %) at the highest throttle value seen for this driver.
        'MaxThrottlePct': (tele['Throttle'] == max_throttle).sum() / sample_count * 100,
        # Sum of the 'Brake' channel relative to the number of samples, in %.
        # NOTE(review): presumably 'Brake' is boolean, making this the share of
        # samples with the brake applied - confirm against the telemetry schema.
        'Brake': tele['Brake'].sum() / sample_count * 100,
    })

# Explicit columns keep the merge key 'Driver' present even if no driver was processed.
agg = pd.DataFrame(
    telemetry_records,
    columns=['Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
             'AverageThrottle', 'MaxThrottlePct', 'Brake'],
)

data = pd.merge(left=data, right=agg, how='right', left_on="DriverNumber", right_on="Driver")
data["raceID"] = "37"
all_data = pd.concat([all_data, data])


core           INFO 	Loading data for Abu Dhabi Grand Prix - Race [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['33', '77', '44', '23', '4', '55', '3', '10', '31', '18', '26', '7', '16', '5', '63', '99', '6', '20', '51', '11']
core           INFO 	Loading data for Abu Dhabi Grand Prix - Qualifying [v2.3.0]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for t

## Complete missing values

In [21]:
# no pit stops
# Missing pit-stop values are replaced by a zero duration and a stop count of 0.
no_stop_defaults = {'AvgPitTime': pd.Timedelta("0 days"), 'PitstopNo': 0}
for column, default in no_stop_defaults.items():
    all_data[column] = all_data[column].fillna(default)

In [22]:
# Write the combined 2020 table to disk, including the index and the header row.
output_path = r'2020data.csv'
all_data.to_csv(output_path, index=True, header=True)