In [57]:
import seaborn as sns
import seaborn.objects as so
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import fastf1 as ff1
import fastf1.plotting

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# use a default session to set color maps etc
year = 2024
circuit = 'Japan'
session = 'Q'
race = ff1.get_session(year, circuit, session)

In [123]:
dp = pd.read_pickle('data/driver_performance.pkl')[['event', 'driver', 'points',
                                                    'total_brake_time_quali', 'avg_race_pace', 'lap_time_consistency',
                                                    'corner_avg', 'slow', 'medium','fast']]
drivers_to_exclude = ['DOO','BEA','LAW','COL','SAR','RIC','OCO'] #limit to drivers with close to a full season of information
dp = dp[~dp['driver'].isin(drivers_to_exclude)]
print('Drivers of Interest = ', dp['driver'].unique())
print(dp.info())
# Handle missing data - if any
print(dp.loc[dp.isnull().any(axis=1)])
# Use mean imputation for missing performance stats
dp['avg_race_pace'] = dp['avg_race_pace'].fillna(dp['avg_race_pace'].mean())
dp['lap_time_consistency'] = dp['lap_time_consistency'].fillna(dp['lap_time_consistency'].mean())
dp['corner_avg'] = dp['corner_avg'].fillna(dp['corner_avg'].mean())
dp['total_brake_time_quali'] = dp['total_brake_time_quali'].fillna(dp['total_brake_time_quali'].mean())
# Assume 0 points - either DNF or didn't take part in race
dp['points'] = dp['points'].fillna(0)
print(dp.loc[dp.isnull().any(axis=1)])
dp.head()

Drivers of Interest =  ['ALB' 'ALO' 'BOT' 'GAS' 'HAM' 'HUL' 'LEC' 'MAG' 'NOR' 'PER' 'PIA' 'RUS'
 'SAI' 'STR' 'TSU' 'VER' 'ZHO']
<class 'pandas.core.frame.DataFrame'>
Index: 405 entries, 0 to 478
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   event                   405 non-null    object 
 1   driver                  405 non-null    object 
 2   points                  393 non-null    float64
 3   total_brake_time_quali  402 non-null    float64
 4   avg_race_pace           393 non-null    float64
 5   lap_time_consistency    393 non-null    float64
 6   corner_avg              402 non-null    float64
 7   slow                    405 non-null    int64  
 8   medium                  405 non-null    int64  
 9   fast                    405 non-null    int64  
dtypes: float64(5), int64(3), object(2)
memory usage: 34.8+ KB
None
                        event driver  points  total_brake_time_quali

Unnamed: 0,event,driver,points,total_brake_time_quali,avg_race_pace,lap_time_consistency,corner_avg,slow,medium,fast
0,Abu Dhabi Grand Prix,ALB,0.0,1.596256,0.967115,0.458731,-1.011985,5,5,6
1,Abu Dhabi Grand Prix,ALO,2.0,0.589947,-0.225194,-0.056915,-0.706647,5,5,6
2,Abu Dhabi Grand Prix,BOT,0.0,-0.281068,1.111283,0.982101,-0.133451,5,5,6
5,Abu Dhabi Grand Prix,GAS,6.0,-0.726745,0.369307,-0.304784,0.435437,5,5,6
6,Abu Dhabi Grand Prix,HAM,12.0,1.313287,-1.031583,-0.686036,-0.816273,5,5,6


In [124]:
# For the last race 'Abu Dhabi', assume we don't yet know the driver performance characteristics, so will use their season averages instead.
driver_performance = dp.groupby('driver')[['total_brake_time_quali','avg_race_pace','lap_time_consistency','corner_avg']].mean().reset_index()
driver_performance = driver_performance.rename(columns={'total_brake_time_quali':'brake_time_season_avg','avg_race_pace':'race_pace_season_avg',
                                   'lap_time_consistency':'lap_time_consistency_season','corner_avg':'corner_season_avg'})
dp = pd.merge(dp, driver_performance, on='driver', how='left')
dp = dp.drop('event', axis=1)


In [125]:
# Hold out final race to use for final demonstration of predictive model.
last_race = dp.loc[0:19]
last_race_avgs = last_race[['driver', 'points', 'brake_time_season_avg', 'race_pace_season_avg',
       'lap_time_consistency_season', 'corner_season_avg' ,'slow', 'medium', 'fast']]
last_race_avgs = last_race_avgs.rename(columns={'brake_time_season_avg':'total_brake_time_quali', 'race_pace_season_avg':'avg_race_pace',
                                      'lap_time_consistency_season':'lap_time_consistency', 'corner_season_avg':'corner_avg'})
last_race_avgs = pd.get_dummies(last_race_avgs)
last_race = last_race[['driver', 'points', 'total_brake_time_quali', 'avg_race_pace',
       'lap_time_consistency', 'corner_avg', 'slow', 'medium', 'fast']]
last_race = pd.get_dummies(last_race)# rows for Abu Dhabi Grand Prix
# all other grand prix from 2024 season as input to modelling
model_data = dp.loc[20:]
model_data = model_data[['driver', 'points', 'total_brake_time_quali', 'avg_race_pace',
       'lap_time_consistency', 'corner_avg', 'slow', 'medium', 'fast']]
model_data = pd.get_dummies(model_data)
model_data

Unnamed: 0,points,total_brake_time_quali,avg_race_pace,lap_time_consistency,corner_avg,slow,medium,fast,driver_ALB,driver_ALO,...,driver_MAG,driver_NOR,driver_PER,driver_PIA,driver_RUS,driver_SAI,driver_STR,driver_TSU,driver_VER,driver_ZHO
20,0.0,-0.904810,0.939743,0.962046,-1.202838,2,5,7,False,False,...,False,False,False,False,False,False,False,False,False,False
21,0.0,0.095374,1.201057,0.261351,0.763690,2,5,7,False,False,...,False,False,False,False,False,False,False,False,False,False
22,2.0,0.893167,0.488009,-0.717406,-1.143779,2,5,7,False,False,...,False,False,False,False,False,False,False,False,False,False
23,19.0,0.367580,-1.710048,-1.016346,0.993167,2,5,7,False,False,...,False,False,False,False,False,False,False,False,False,False
24,1.0,0.548006,0.525185,1.153072,-0.963250,2,5,7,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,18.0,1.089549,-1.437405,-0.315827,0.134678,7,5,8,False,False,...,False,False,False,False,False,True,False,False,False,False
401,0.0,-0.350485,1.020456,-0.654757,-0.126953,7,5,8,False,False,...,False,False,False,False,False,False,True,False,False,False
402,0.0,-0.653085,0.510510,0.210076,0.078433,7,5,8,False,False,...,False,False,False,False,False,False,False,True,False,False
403,15.0,-0.398772,-1.201399,-1.003278,1.023542,7,5,8,False,False,...,False,False,False,False,False,False,False,False,True,False


In [136]:
# Split data into train and test
X_train = model_data.drop(['points'], axis=1)
y_train = model_data['points']
X_test = last_race_avgs.drop(['points'], axis=1)
y_test = last_race_avgs['points']
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [137]:
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

In [138]:
print("Mean Squared Error = ", mean_squared_error(y_pred, y_test))
print("R2 Score = ", r2_score(y_pred, y_test))

Mean Squared Error =  21.465939999999996
R2 Score =  0.30943125363445556


In [139]:
param_grid = {
    'n_estimators': [10, 100, 200],
    'max_depth': [5, 10,20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [140]:
rfr_cv = GridSearchCV(rfr, param_grid, cv=3, scoring='neg_mean_squared_error',n_jobs=-1)
rfr_cv.fit(X_train, y_train)
y_pred_avg = rfr_cv.predict(X_test)

In [141]:
print("Mean Squared Error = ", mean_squared_error(y_pred_avg, y_test))
print("R2 Score = ", r2_score(y_pred_avg, y_test))

Mean Squared Error =  21.751633280636895
R2 Score =  0.24996718311214094


In [142]:
# Let's try it if we use the actual values for driver performance instead of season averages
X_test = last_race.drop(['points'], axis=1)
y_test = last_race['points']
y_pred = rfr_cv.predict(X_test)
print("Mean Squared Error = ", mean_squared_error(y_pred, y_test))
print("R2 Score = ", r2_score(y_pred, y_test))

Mean Squared Error =  8.286104697673881
R2 Score =  0.8178989564368505


In [155]:
predicted_results = dp.loc[0:19][['driver','points']]
predicted_results['season_avg'] = y_pred_avg
predicted_results['known_performance'] = y_pred
predicted_results = predicted_results.sort_values(by='points', ascending=False)
predicted_results.head(15)

Unnamed: 0,driver,points,season_avg,known_performance
8,NOR,25.0,14.711079,20.103053
12,SAI,18.0,8.6935,23.125735
6,LEC,15.0,12.393134,13.759631
4,HAM,12.0,8.845328,9.646706
11,RUS,10.0,8.297406,8.56331
15,VER,8.0,16.327427,9.185019
3,GAS,6.0,0.346006,1.088774
18,ALO,4.0,2.497339,3.457416
5,HUL,4.0,0.756687,0.855621
1,ALO,2.0,1.959202,4.043234


In [157]:
predicted_results.to_csv('data/predictions.csv', index=False)