In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

Import the dataset and do the required pre-processing

In [2]:
# Load the merged dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("mergedData.csv")

In [142]:
# Drop the "Unnamed: 0" column (presumably an index saved by an earlier `to_csv`).
# `errors="ignore"` keeps this cell re-runnable: once the column is gone, a plain
# drop would raise KeyError on the second execution.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [143]:
# Remove the driver identifiers and "Points"; "Position" is kept because it is the
# modelling target further down. (To model points instead, drop "Position" here and
# keep "Points".)
df = df.drop(columns=["DriverNumber", "Driver", "Points"])

# Keep an independent snapshot of the frame at this stage. A plain `aux = df` only
# creates a second name for the same object, so the in-place column assignments made
# in the following cells would silently show up in `aux` as well.
aux = df.copy()

In [144]:
# Turn the two Status values of interest into 0/1 indicator columns, then discard
# the raw Status column.
for status_label in ("carIssue", "driverIssue"):
    df[status_label] = df["Status"].eq(status_label).astype(int)
df = df.drop(columns="Status")

In [145]:
# Remove spaces from the text labels of these columns (presumably so that the dummy
# column names derived from them later contain no spaces).
for label_col in ("RaceCountry", "TeamName", "Engine"):
    df[label_col] = df[label_col].str.replace(" ", "")

In [146]:

# Object-dtype columns are one-hot encoded, except "Abbreviation" and "RaceCountry",
# which stay in the frame as plain text columns.
cat_vars = list(df.select_dtypes(include=["object"]).columns)
for kept_as_text in ("Abbreviation", "RaceCountry"):
    cat_vars.remove(kept_as_text)

# drop_first=True omits the first level of every variable to avoid redundant columns.
dummies = pd.get_dummies(df[cat_vars], drop_first=True)

# Swap the original categorical columns for their dummy encoding.
df = pd.concat([df.drop(columns=cat_vars), dummies], axis=1)


In [148]:
# Exclude these three columns from the feature set. "Year" is kept on purpose: the
# before/after split below filters on it.
df = df.drop(columns=["AverageRPM", "AvgLapTime", "AvgSplitTime"])

In [149]:
# Display the preprocessed frame to check the resulting columns.
df

Unnamed: 0,Abbreviation,Position,GridPosition,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
0,GAS,11.0,17.0,Australia,2019.0,22.0,83.020,0.0,2.723962,21.2690,...,0,1,0,0,0,1,0,0,0,1
1,PER,13.0,10.0,Australia,2019.0,29.0,82.532,0.0,2.908751,23.2340,...,1,0,0,0,0,0,1,0,0,1
2,LEC,5.0,5.0,Australia,2019.0,21.0,81.442,0.0,2.795510,22.3060,...,0,0,0,0,0,0,0,0,0,1
3,STR,9.0,16.0,Australia,2019.0,20.0,83.017,0.0,2.711268,22.4710,...,1,0,0,0,0,0,1,0,0,1
4,MAG,6.0,7.0,Australia,2019.0,26.0,82.099,0.0,2.795637,22.3880,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,VET,10.0,9.0,AbuDhabi,2022.0,35.0,84.961,0.0,3.057131,23.1860,...,0,0,0,0,0,0,1,0,0,0
1543,SAI,4.0,4.0,AbuDhabi,2022.0,28.0,84.242,0.0,3.799668,21.7525,...,0,0,0,0,0,0,0,0,0,0
1544,LAT,19.0,20.0,AbuDhabi,2022.0,27.0,86.054,0.0,6.817590,23.7985,...,0,0,0,0,1,0,1,0,0,0
1545,RUS,5.0,6.0,AbuDhabi,2022.0,23.0,84.511,0.0,4.489247,26.0870,...,0,0,0,0,0,0,1,0,0,0


In [150]:
#df.to_csv(r'df_points.csv', index=True, header=True)
#df.to_csv(r'df_position.csv', index=True, header=True)
#df.to_csv(r'plotdata_points.csv', index=True, header=True)
#df.to_csv(r'plotdata_position.csv', index=True, header=True)

## MERF

In [152]:
from merf import MERF

In [157]:
# Partition the rows into two periods by season: 2019-2020 ("before") and
# 2021-2022 ("after").
df_before = df.loc[df["Year"].isin([2019, 2020])]
df_after = df.loc[df["Year"].isin([2021, 2022])]

## Before model (2019–2020 seasons)

## Train

In [159]:
# Hold out whole races: raceID 1-29 is used for training, raceID 30-37 for testing
# (both bounds inclusive).
train_df = df_before[df_before["raceID"].between(1, 29)]
test_df = df_before[df_before["raceID"].between(30, 37)]

In [160]:
# Separate the target ("Position") from the remaining columns for both splits.
Y_train = train_df["Position"]
Y_test = test_df["Position"]
X_train = train_df.drop(columns="Position")
X_test = test_df.drop(columns="Position")

In [161]:
# "RaceCountry" and "Year" are not used as predictors.
X_train = X_train.drop(columns=["RaceCountry", "Year"])
X_test = X_test.drop(columns=["RaceCountry", "Year"])

In [162]:
# MERF inputs for the training split: cluster labels, random-effects covariates and
# fixed-effects features.
# NOTE(review): Z holds the raw raceID values as the random-effects covariate; a
# random intercept per race would normally use a column of ones - confirm intent.
clusters_train = X_train["raceID"]
Z_train = X_train[["raceID"]]
x_train = X_train.drop(columns=["Abbreviation", "raceID"])

In [163]:
from sklearn.linear_model import LinearRegression

# Mixed-effects model with a linear regression as the fixed-effects learner; the
# fitting loop is capped at 30 iterations.
model = LinearRegression()
mrf = MERF(max_iterations=30, fixed_effects_model=model)

# Arguments in order: fixed-effects features, random-effects covariates, cluster
# labels, target.
mrf.fit(x_train, Z_train, clusters_train, Y_train)


INFO     [merf.py:307] Training GLL is 1596.4646623331869 at iteration 1.
INFO     [merf.py:307] Training GLL is 1551.7884174971261 at iteration 2.
INFO     [merf.py:307] Training GLL is 1542.105886004856 at iteration 3.
INFO     [merf.py:307] Training GLL is 1538.0425715316378 at iteration 4.
INFO     [merf.py:307] Training GLL is 1536.2229205798524 at iteration 5.
INFO     [merf.py:307] Training GLL is 1535.3740519146759 at iteration 6.
INFO     [merf.py:307] Training GLL is 1534.9766138964126 at iteration 7.
INFO     [merf.py:307] Training GLL is 1534.7997873223103 at iteration 8.
INFO     [merf.py:307] Training GLL is 1534.732889336695 at iteration 9.
INFO     [merf.py:307] Training GLL is 1534.7207718824654 at iteration 10.
INFO     [merf.py:307] Training GLL is 1534.7353817557064 at iteration 11.
INFO     [merf.py:307] Training GLL is 1534.7622258925526 at iteration 12.
INFO     [merf.py:307] Training GLL is 1534.7937351805408 at iteration 13.
INFO     [merf.py:307] Training GLL 

<merf.merf.MERF at 0x7f8ac2223130>

In [164]:
# In-sample predictions on the same rows and clusters the model was fitted on.
train_preds = mrf.predict(x_train, Z_train, clusters_train)

In [165]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample (training) fit quality. These numbers are computed from Y_train and
# train_preds, so they are labelled "Train" to keep them from being mistaken for the
# held-out results reported in the Test section.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)


Test MSE: 7.087361983712685
Test RMSE: 2.6622099811458684
Test MAE: 2.0889718378656745
Test R-squared: 0.7823832377183746
Test RMSPE: 0.22529975232393407


## Test

In [166]:
# MERF inputs for the held-out split, built the same way as for training.
clusters_test = X_test["raceID"]
Z_test = X_test[["raceID"]]
x_test = X_test.drop(columns=["Abbreviation", "raceID"])

In [167]:
# Display the held-out feature matrix to check its columns.
x_test

Unnamed: 0,GridPosition,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,PitstopNo,HARD,INTERMEDIATE,MEDIUM,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
568,9.0,24.0,93.000,0.0,4.414441,29.5895,2.0,24.0,0.0,11.0,...,0,0,0,0,0,0,0,0,0,1
569,4.0,30.0,92.317,0.0,3.598099,29.6020,1.0,33.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
570,10.0,22.0,93.239,0.0,3.308658,30.2610,1.0,25.0,0.0,28.0,...,0,0,0,0,0,0,0,0,0,1
571,18.0,28.0,94.681,0.0,3.643243,30.1880,1.0,33.0,0.0,19.0,...,0,0,0,0,0,0,0,0,0,1
572,15.0,24.0,93.008,0.0,3.737778,30.1610,2.0,26.0,0.0,26.0,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,18.0,25.0,98.443,0.0,7.947119,21.8705,2.0,24.0,0.0,30.0,...,0,0,0,0,1,0,1,0,0,0
718,16.0,21.0,98.045,0.0,8.040811,22.0850,1.0,44.0,0.0,10.0,...,0,0,0,0,1,0,1,0,0,0
719,15.0,41.0,97.555,0.0,8.160983,22.0120,1.0,44.0,0.0,10.0,...,0,0,0,0,0,0,0,0,0,0
720,2.0,31.0,95.271,0.0,3.679513,21.5870,1.0,45.0,0.0,10.0,...,0,0,0,0,0,0,1,0,0,0


In [168]:
# Predict the held-out races. The test raceIDs (30-37) do not overlap the training
# raceIDs (1-29), so every test cluster is new to the fitted model.
# NOTE(review): confirm in the merf docs how unseen clusters are handled
# (presumably only the fixed-effects part is used).
y_pred = mrf.predict(x_test, Z_test, clusters_test)

In [169]:
from sklearn.metrics import r2_score

# Held-out error metrics for the test races.
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# Report every metric with its label.
for metric_label, metric_value in (
    ("Test MSE:", mse),
    ("Test RMSE:", rmse),
    ("Test MAE:", mae),
    ("Test R-squared:", r2),
):
    print(metric_label, metric_value)

Test MSE: 34.24464770089943
Test RMSE: 5.851892659721249
Test MAE: 3.869540405529234
Test R-squared: -0.09558628938852576


In [170]:
# Manual cross-check of the coefficient of determination: R^2 = 1 - SSE / SST.
actual_values, predictions = Y_test, y_pred

# Mean of the observed target values.
mean_actual = sum(actual_values) / len(actual_values)

# SSE: squared residuals of the predictions; SST: squared deviations from the mean.
squared_residuals = [
    (actual - pred) ** 2 for actual, pred in zip(actual_values, predictions)
]
squared_deviations = [(actual - mean_actual) ** 2 for actual in actual_values]
sse = sum(squared_residuals)
sst = sum(squared_deviations)

r2 = 1 - (sse / sst)
r2

-0.09558628938852576

In [171]:
# Squared Pearson correlation between predictions and actual values. This is NOT the
# coefficient of determination: it only measures linear association and ignores bias
# or scale errors in the predictions, so it can be clearly positive while the true
# R-squared (1 - SSE/SST, computed above) is negative. It is stored under its own
# name so that `r2` keeps holding the coefficient of determination.
r = np.corrcoef(predictions, actual_values)[0, 1]
pearson_r2 = r ** 2
pearson_r2

0.3716447876071426

## After model (2021–2022 seasons)

## Train

In [174]:
# Hold out whole races: raceID 38-72 is used for training, raceID 73-80 for testing
# (both bounds inclusive).
train_df = df_after[df_after["raceID"].between(38, 72)]
test_df = df_after[df_after["raceID"].between(73, 80)]

In [175]:
# Separate the target ("Position") from the remaining columns for both splits.
Y_train = train_df["Position"]
Y_test = test_df["Position"]
X_train = train_df.drop(columns="Position")
X_test = test_df.drop(columns="Position")

In [176]:
# "RaceCountry" and "Year" are not used as predictors.
X_train = X_train.drop(columns=["RaceCountry", "Year"])
X_test = X_test.drop(columns=["RaceCountry", "Year"])

In [177]:
# MERF inputs for the training split: cluster labels, random-effects covariates and
# fixed-effects features.
# NOTE(review): Z holds the raw raceID values as the random-effects covariate; a
# random intercept per race would normally use a column of ones - confirm intent.
clusters_train = X_train["raceID"]
Z_train = X_train[["raceID"]]
x_train = X_train.drop(columns=["Abbreviation", "raceID"])

In [178]:
# Fit the "after" model on a fresh MERF instance with the same configuration as the
# "before" model. Calling fit() again on the already fitted object would mutate the
# 2019-2020 model (and its shared LinearRegression) in place instead of building an
# independent one.
# NOTE(review): the merf implementation also appears to append to its training
# history lists on every fit() call, which a new instance avoids - confirm.
mrf = MERF(fixed_effects_model=LinearRegression(), max_iterations=30)
mrf.fit(x_train, Z_train, clusters_train, Y_train)

INFO     [merf.py:307] Training GLL is 1798.3726199915113 at iteration 1.
INFO     [merf.py:307] Training GLL is 1774.986836037651 at iteration 2.
INFO     [merf.py:307] Training GLL is 1770.384372461241 at iteration 3.
INFO     [merf.py:307] Training GLL is 1766.8062783736425 at iteration 4.
INFO     [merf.py:307] Training GLL is 1763.7753608996136 at iteration 5.
INFO     [merf.py:307] Training GLL is 1761.1336138538284 at iteration 6.
INFO     [merf.py:307] Training GLL is 1758.7881329778447 at iteration 7.
INFO     [merf.py:307] Training GLL is 1756.6771808268707 at iteration 8.
INFO     [merf.py:307] Training GLL is 1754.7570908973603 at iteration 9.
INFO     [merf.py:307] Training GLL is 1752.9956196284452 at iteration 10.
INFO     [merf.py:307] Training GLL is 1751.3681702713134 at iteration 11.
INFO     [merf.py:307] Training GLL is 1749.8555013718844 at iteration 12.
INFO     [merf.py:307] Training GLL is 1748.4422572239962 at iteration 13.
INFO     [merf.py:307] Training GLL 

<merf.merf.MERF at 0x7f8ac2223130>

In [179]:
# In-sample predictions on the same rows and clusters the model was fitted on.
train_preds = mrf.predict(x_train, Z_train, clusters_train)

In [180]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample (training) fit quality. These numbers are computed from Y_train and
# train_preds, so they are labelled "Train" to keep them from being mistaken for the
# held-out results reported in the Test section.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)

Test MSE: 8.27348915822779
Test RMSE: 2.8763673545338033
Test MAE: 2.259793609210368
Test R-squared: 0.7382129081749542


## Test

In [181]:
# MERF inputs for the held-out split, built the same way as for training.
clusters_test = X_test["raceID"]
Z_test = X_test[["raceID"]]
x_test = X_test.drop(columns=["Abbreviation", "raceID"])

In [182]:
# Predict the held-out races. The test raceIDs (73-80) do not overlap the training
# raceIDs (38-72), so every test cluster is new to the fitted model.
# NOTE(review): confirm in the merf docs how unseen clusters are handled
# (presumably only the fixed-effects part is used).
y_pred = mrf.predict(x_test, Z_test, clusters_test)

In [183]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Held-out error metrics for the test races.
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# Report every metric with its label.
for metric_label, metric_value in (
    ("Test MSE:", mse),
    ("Test RMSE:", rmse),
    ("Test MAE:", mae),
    ("Test R-squared:", r2),
):
    print(metric_label, metric_value)


Test MSE: 7.302172787094027
Test RMSE: 2.7022532795972376
Test MAE: 2.1726833375904095
Test R-squared: 0.7686064777239628


In [184]:
# Manual cross-check of the coefficient of determination: R^2 = 1 - SSE / SST.
actual_values, predictions = Y_test, y_pred

# Mean of the observed target values.
mean_actual = sum(actual_values) / len(actual_values)

# SSE: squared residuals of the predictions; SST: squared deviations from the mean.
squared_residuals = [
    (actual - pred) ** 2 for actual, pred in zip(actual_values, predictions)
]
squared_deviations = [(actual - mean_actual) ** 2 for actual in actual_values]
sse = sum(squared_residuals)
sst = sum(squared_deviations)

r2 = 1 - (sse / sst)
r2

0.768606477723963