In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

In [52]:
df = pd.read_csv("mergedData.csv")

In [53]:
df = df.drop("Unnamed: 0", axis = 1)

In [54]:
df.columns

Index(['Abbreviation', 'DriverNumber', 'TeamName', 'Position', 'GridPosition',
       'Status', 'Points', 'RaceCountry', 'Year', 'AgeAtGP', 'BestQualiTime',
       'FLap', 'AvgLapTime', 'SDLapTime', 'AvgSplitTime', 'AvgPitTime',
       'PitstopNo', 'HARD', 'INTERMEDIATE', 'MEDIUM', 'SOFT', 'WET', 'Engine',
       'Rain', 'Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
       'AverageThrottle', 'MaxThrottlePct', 'Brake', 'raceID', 'CircuitType'],
      dtype='object')

In [55]:
df = df.drop(["DriverNumber", "Driver", "Points"], axis = 1)
aux = df

In [56]:
df = df.drop(["AverageRPM"], axis = 1)
df = df.drop(["AvgLapTime"], axis = 1)
df = df.drop(["AvgSplitTime"], axis = 1)

In [57]:
# create new columns based on Status values
df['carIssue'] = (df['Status'] == 'carIssue').astype(int)
df['driverIssue'] = (df['Status'] == 'driverIssue').astype(int)
df = df.drop('Status', axis=1)

In [58]:
#df['after_2020'] = df['Year'].apply(lambda x: 1 if x > 2020 else 0)

In [59]:
df

Unnamed: 0,Abbreviation,TeamName,Position,GridPosition,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,...,AverageSpeed,MaxSpeed,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,CircuitType,carIssue,driverIssue
0,GAS,Red Bull Racing,11.0,17.0,Australia,2019.0,22.0,83.020,0.0,2.723962,...,153.154148,326,12860,49.783810,38.567815,13.413764,1,street,0,0
1,PER,Racing Point,13.0,10.0,Australia,2019.0,29.0,82.532,0.0,2.908751,...,152.749173,331,13252,50.152656,24.220017,14.542388,1,street,0,0
2,LEC,Ferrari,5.0,5.0,Australia,2019.0,21.0,81.442,0.0,2.795510,...,155.409580,312,12319,56.245606,3.762081,15.816955,1,street,0,0
3,STR,Racing Point,9.0,16.0,Australia,2019.0,20.0,83.017,0.0,2.711268,...,153.104527,323,13073,49.155575,3.580463,13.238633,1,street,0,0
4,MAG,Haas F1 Team,6.0,7.0,Australia,2019.0,26.0,82.099,0.0,2.795637,...,155.144905,314,12771,58.128754,7.825777,20.419018,1,street,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,VET,Aston Martin,10.0,9.0,Abu Dhabi,2022.0,35.0,84.961,0.0,3.057131,...,125.301371,326,12956,52.953467,12.165976,24.576710,80,race,0,0
1543,SAI,Ferrari,4.0,4.0,Abu Dhabi,2022.0,28.0,84.242,0.0,3.799668,...,127.577142,327,12447,55.569944,13.248675,26.350495,80,race,0,0
1544,LAT,Williams,19.0,20.0,Abu Dhabi,2022.0,27.0,86.054,0.0,6.817590,...,119.446470,330,12788,49.385827,12.384819,26.045266,80,race,1,0
1545,RUS,Mercedes,5.0,6.0,Abu Dhabi,2022.0,23.0,84.511,0.0,4.489247,...,127.622639,329,12852,53.573226,11.601589,22.278853,80,race,0,0


In [60]:
df["RaceCountry"] = df["RaceCountry"].str.replace(" ", "")
df["TeamName"] = df["TeamName"].str.replace(" ", "")
df["Engine"] = df["Engine"].str.replace(" ", "")

In [61]:
#df.to_csv(r'df_points.csv', index=True, header=True)
#df.to_csv(r'df_position.csv', index=True, header=True)
#df.to_csv(r'plotdata_points.csv', index=True, header=True)
#df.to_csv(r'plotdata_position.csv', index=True, header=True)

In [62]:

# get a list of all categorical variables except "Abbreviation" and "raceID"
cat_vars = df.select_dtypes(include=['object']).columns.tolist()
cat_vars.remove("Abbreviation")
cat_vars.remove("RaceCountry")

# create dummy variables for all categorical variables except "Abbreviation" and "raceID"
dummies = pd.get_dummies(df[cat_vars], drop_first=True)

# drop the original categorical variables from the DataFrame
df = df.drop(cat_vars, axis=1)

# concatenate the dummy variables with the remaining variables in the DataFrame
df = pd.concat([df, dummies], axis=1)


In [66]:
df

Unnamed: 0,Abbreviation,Position,GridPosition,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
0,GAS,11.0,17.0,Australia,2019.0,22.0,83.020,0.0,2.723962,21.2690,...,0,1,0,0,0,1,0,0,0,1
1,PER,13.0,10.0,Australia,2019.0,29.0,82.532,0.0,2.908751,23.2340,...,1,0,0,0,0,0,1,0,0,1
2,LEC,5.0,5.0,Australia,2019.0,21.0,81.442,0.0,2.795510,22.3060,...,0,0,0,0,0,0,0,0,0,1
3,STR,9.0,16.0,Australia,2019.0,20.0,83.017,0.0,2.711268,22.4710,...,1,0,0,0,0,0,1,0,0,1
4,MAG,6.0,7.0,Australia,2019.0,26.0,82.099,0.0,2.795637,22.3880,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,VET,10.0,9.0,AbuDhabi,2022.0,35.0,84.961,0.0,3.057131,23.1860,...,0,0,0,0,0,0,1,0,0,0
1543,SAI,4.0,4.0,AbuDhabi,2022.0,28.0,84.242,0.0,3.799668,21.7525,...,0,0,0,0,0,0,0,0,0,0
1544,LAT,19.0,20.0,AbuDhabi,2022.0,27.0,86.054,0.0,6.817590,23.7985,...,0,0,0,0,1,0,1,0,0,0
1545,RUS,5.0,6.0,AbuDhabi,2022.0,23.0,84.511,0.0,4.489247,26.0870,...,0,0,0,0,0,0,1,0,0,0


In [67]:
df["Year"].value_counts()

2022.0    424
2019.0    414
2021.0    401
2020.0    308
Name: Year, dtype: int64

## MERF

In [70]:
from merf import MERF

In [74]:
df[["Year", "raceID"]]

Unnamed: 0,Year,raceID
0,2019.0,1
1,2019.0,1
2,2019.0,1
3,2019.0,1
4,2019.0,1
...,...,...
1542,2022.0,80
1543,2022.0,80
1544,2022.0,80
1545,2022.0,80


In [75]:
# Split the data into two datasets based on the Year column
df_before = df[df['Year'].isin([2019, 2020])]
df_after = df[df['Year'].isin([2021, 2022])]

## Before model

In [76]:
df_before["raceID"]

0       1
1       1
2       1
3       1
4       1
       ..
717    37
718    37
719    37
720    37
721    37
Name: raceID, Length: 722, dtype: int64

In [77]:
# Split the data into train and test sets based on the raceID column
train_df = df_before[(df_before['raceID'] >= 1) & (df_before['raceID'] <= 29)]
test_df = df_before[(df_before['raceID'] >= 30) & (df_before['raceID'] <= 37)]

In [78]:
X_train = train_df.drop('Position', axis=1)
Y_train = train_df['Position']
X_test = test_df.drop('Position', axis=1)
Y_test = test_df['Position']

In [79]:
X_train = X_train.drop('RaceCountry', axis=1)
X_train = X_train.drop('Year', axis=1)
X_test = X_test.drop('RaceCountry', axis=1)
X_test = X_test.drop('Year', axis=1)

In [80]:
x_train = X_train.drop(['Abbreviation', 'raceID'], axis = 1)
Z_train = X_train[['raceID']]
clusters_train = X_train['raceID']

In [81]:

from sklearn.svm import SVR

svr_model = SVR(kernel='rbf', gamma='scale')

mrf = MERF(fixed_effects_model=svr_model, max_iterations=30)
mrf.fit(x_train, Z_train, clusters_train, Y_train)

INFO     [merf.py:307] Training GLL is 2435.012355347997 at iteration 1.
INFO     [merf.py:307] Training GLL is 2395.2879496758464 at iteration 2.
INFO     [merf.py:307] Training GLL is 2382.0263857090836 at iteration 3.
INFO     [merf.py:307] Training GLL is 2372.2077972567336 at iteration 4.
INFO     [merf.py:307] Training GLL is 2364.401558292814 at iteration 5.
INFO     [merf.py:307] Training GLL is 2358.022992796393 at iteration 6.
INFO     [merf.py:307] Training GLL is 2352.725299922297 at iteration 7.
INFO     [merf.py:307] Training GLL is 2348.1462431501573 at iteration 8.
INFO     [merf.py:307] Training GLL is 2344.130624386462 at iteration 9.
INFO     [merf.py:307] Training GLL is 2340.554890117171 at iteration 10.
INFO     [merf.py:307] Training GLL is 2337.360798404492 at iteration 11.
INFO     [merf.py:307] Training GLL is 2334.452009283764 at iteration 12.
INFO     [merf.py:307] Training GLL is 2331.790672118035 at iteration 13.
INFO     [merf.py:307] Training GLL is 2329

<merf.merf.MERF at 0x7fbfc5926820>

In [82]:
train_preds = mrf.predict(x_train, Z_train, clusters_train)

In [83]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)


Test MSE: 32.5883534311462
Test RMSE: 5.708620974556482
Test MAE: 4.940584915454136
Test R-squared: -0.0006222312438370903


In [84]:
actual_values = Y_test
predictions = train_preds
# Calculate the mean of actual values
mean_actual = sum(actual_values) / len(actual_values)

# Calculate SSE and SST
sse = sum((actual - pred) ** 2 for actual, pred in zip(actual_values, predictions))
sst = sum((actual - mean_actual) ** 2 for actual in actual_values)

# Calculate R-squared
r2 = 1 - (sse / sst)
r2

0.00028128207952660134

# Test dataset

In [85]:
x_test = X_test.drop(['Abbreviation', 'raceID'], axis = 1)
Z_test = X_test[['raceID']]
clusters_test = X_test['raceID']

In [86]:
x_test

Unnamed: 0,GridPosition,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,PitstopNo,HARD,INTERMEDIATE,MEDIUM,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
568,9.0,24.0,93.000,0.0,4.414441,29.5895,2.0,24.0,0.0,11.0,...,0,0,0,0,0,0,0,0,0,1
569,4.0,30.0,92.317,0.0,3.598099,29.6020,1.0,33.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
570,10.0,22.0,93.239,0.0,3.308658,30.2610,1.0,25.0,0.0,28.0,...,0,0,0,0,0,0,0,0,0,1
571,18.0,28.0,94.681,0.0,3.643243,30.1880,1.0,33.0,0.0,19.0,...,0,0,0,0,0,0,0,0,0,1
572,15.0,24.0,93.008,0.0,3.737778,30.1610,2.0,26.0,0.0,26.0,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,18.0,25.0,98.443,0.0,7.947119,21.8705,2.0,24.0,0.0,30.0,...,0,0,0,0,1,0,1,0,0,0
718,16.0,21.0,98.045,0.0,8.040811,22.0850,1.0,44.0,0.0,10.0,...,0,0,0,0,1,0,1,0,0,0
719,15.0,41.0,97.555,0.0,8.160983,22.0120,1.0,44.0,0.0,10.0,...,0,0,0,0,0,0,0,0,0,0
720,2.0,31.0,95.271,0.0,3.679513,21.5870,1.0,45.0,0.0,10.0,...,0,0,0,0,0,0,1,0,0,0


In [87]:
y_pred = mrf.predict(x_test, Z_test, clusters_test)

In [88]:
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# Print the performance metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)


Test MSE: 31.23552633458314
Test RMSE: 5.588875229827836
Test MAE: 4.825901981369629
Test R-squared: 0.0006842910781438283


## After model

In [89]:
df_after["raceID"]

722     38
723     38
724     38
725     38
726     38
        ..
1542    80
1543    80
1544    80
1545    80
1546    80
Name: raceID, Length: 825, dtype: int64

In [90]:
df_after

Unnamed: 0,Abbreviation,Position,GridPosition,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
722,GAS,17.0,5.0,Bahrain,2021.0,24.0,89.809,0.0,9.870618,29.212667,...,0,0,0,0,0,0,0,0,0,0
723,PER,5.0,20.0,Bahrain,2021.0,31.0,90.659,0.0,9.238201,24.096333,...,0,1,0,0,0,1,0,0,0,0
724,ALO,19.0,9.0,Bahrain,2021.0,39.0,90.249,0.0,9.050427,24.574000,...,0,0,0,0,0,0,0,0,1,0
725,LEC,6.0,4.0,Bahrain,2021.0,23.0,89.678,0.0,7.326363,24.550500,...,0,0,0,0,0,0,0,0,0,0
726,STR,10.0,10.0,Bahrain,2021.0,22.0,90.601,0.0,9.904268,25.204500,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,VET,10.0,9.0,AbuDhabi,2022.0,35.0,84.961,0.0,3.057131,23.186000,...,0,0,0,0,0,0,1,0,0,0
1543,SAI,4.0,4.0,AbuDhabi,2022.0,28.0,84.242,0.0,3.799668,21.752500,...,0,0,0,0,0,0,0,0,0,0
1544,LAT,19.0,20.0,AbuDhabi,2022.0,27.0,86.054,0.0,6.817590,23.798500,...,0,0,0,0,1,0,1,0,0,0
1545,RUS,5.0,6.0,AbuDhabi,2022.0,23.0,84.511,0.0,4.489247,26.087000,...,0,0,0,0,0,0,1,0,0,0


In [91]:
# Split the data into train and test sets based on the raceID column
train_df = df_after[(df_after['raceID'] >= 38) & (df_after['raceID'] <= 72)]
test_df = df_after[(df_after['raceID'] >= 73) & (df_after['raceID'] <= 80)]

In [92]:
X_train = train_df.drop('Position', axis=1)
Y_train = train_df['Position']
X_test = test_df.drop('Position', axis=1)
Y_test = test_df['Position']

In [93]:
X_train = X_train.drop('RaceCountry', axis=1)
X_train = X_train.drop('Year', axis=1)
X_test = X_test.drop('RaceCountry', axis=1)
X_test = X_test.drop('Year', axis=1)

In [94]:
x_train = X_train.drop(['Abbreviation', 'raceID'], axis = 1)
Z_train = X_train[['raceID']]
clusters_train = X_train['raceID']

In [95]:

mrf.fit(x_train, Z_train, clusters_train, Y_train)


INFO     [merf.py:307] Training GLL is 2698.3374904130565 at iteration 1.
INFO     [merf.py:307] Training GLL is 2666.032114070161 at iteration 2.
INFO     [merf.py:307] Training GLL is 2660.537947490707 at iteration 3.
INFO     [merf.py:307] Training GLL is 2655.6251547166144 at iteration 4.
INFO     [merf.py:307] Training GLL is 2651.2173845561233 at iteration 5.
INFO     [merf.py:307] Training GLL is 2647.2953006595058 at iteration 6.
INFO     [merf.py:307] Training GLL is 2643.741791912943 at iteration 7.
INFO     [merf.py:307] Training GLL is 2640.5278139988404 at iteration 8.
INFO     [merf.py:307] Training GLL is 2637.577168035475 at iteration 9.
INFO     [merf.py:307] Training GLL is 2634.8414779653212 at iteration 10.
INFO     [merf.py:307] Training GLL is 2632.303225669452 at iteration 11.
INFO     [merf.py:307] Training GLL is 2629.934668593295 at iteration 12.
INFO     [merf.py:307] Training GLL is 2627.719330425508 at iteration 13.
INFO     [merf.py:307] Training GLL is 26

<merf.merf.MERF at 0x7fbfc5926820>

In [96]:
train_preds = mrf.predict(x_train, Z_train, clusters_train)

In [97]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)

Test MSE: 31.564983546760896
Test RMSE: 5.618272291973832
Test MAE: 4.850067647381201
Test R-squared: 0.0012309089697312148


# Test Dataset

In [98]:
x_test = X_test.drop(['Abbreviation', 'raceID'], axis = 1)
Z_test = X_test[['raceID']]
clusters_test = X_test['raceID']

In [99]:
y_pred = mrf.predict(x_test, Z_test, clusters_test)

In [100]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# Print the performance metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)


Test MSE: 31.555037790631737
Test RMSE: 5.617387096384914
Test MAE: 4.853123115063056
Test R-squared: 7.414877490208305e-05
