In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

Import the dataset and do the required pre-processing

In [2]:
# Read the merged dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer="mergedData.csv")

In [3]:
# Discard the "Unnamed: 0" column (presumably an index written out by an
# earlier to_csv call — TODO confirm).
df = df.drop(columns="Unnamed: 0")

In [5]:
# Remove the driver identifier columns and Points; Position is kept because it
# is used as the regression target further down.
df = df.drop(columns=["DriverNumber", "Driver", "Points"])
# df = df.drop(["DriverNumber", "Driver", "Position"], axis = 1)

# NOTE(review): `aux` is a second name for the same DataFrame object, not a
# copy, so in-place column assignments made through `df` before `df` is next
# rebound are visible through `aux` as well. Use df.copy() if a snapshot was
# intended — confirm.
aux = df

In [6]:
# Add 0/1 indicator columns for the 'carIssue' and 'driverIssue' Status values
# (assigned in place), then remove the original Status column.
df['carIssue'] = df['Status'].eq('carIssue').astype(int)
df['driverIssue'] = df['Status'].eq('driverIssue').astype(int)
df = df.drop(columns='Status')

In [8]:
# Remove the three averaged telemetry/timing columns in a single call.
df = df.drop(columns=["AverageRPM", "AvgLapTime", "AvgSplitTime"])

In [9]:
# Strip space characters out of the three label columns so that their values
# (and any dummy-column names derived from them) contain no spaces.
df = df.assign(
    RaceCountry=df["RaceCountry"].str.replace(" ", ""),
    TeamName=df["TeamName"].str.replace(" ", ""),
    Engine=df["Engine"].str.replace(" ", ""),
)

In [10]:
#df.to_csv(r'df_points.csv', index=True, header=True)
#df.to_csv(r'df_position.csv', index=True, header=True)
#df.to_csv(r'plotdata_points.csv', index=True, header=True)
#df.to_csv(r'plotdata_position.csv', index=True, header=True)

In [11]:
# Collect every object-dtype column, then exclude "Abbreviation" and
# "RaceCountry", which are kept as plain columns rather than dummy-encoded.
cat_vars = list(df.select_dtypes(include="object").columns)
cat_vars.remove("Abbreviation")
cat_vars.remove("RaceCountry")

# Dummy-encode the remaining categoricals; drop_first omits the first level of
# each variable.
dummies = pd.get_dummies(df[cat_vars], drop_first=True)

# Replace the original categorical columns with their dummy columns, appended
# to the right of the remaining columns.
df = pd.concat([df.drop(columns=cat_vars), dummies], axis=1)


## MERF

In [19]:
from merf import MERF

In [21]:
# Partition the rows by season: 2019-2020 form the "before" set and 2021-2022
# the "after" set.
df_before = df.loc[df['Year'].isin([2019, 2020])]
df_after = df.loc[df['Year'].isin([2021, 2022])]

## Before model (2019–2020 seasons)

## Train

In [23]:
# Hold out whole races by raceID: races 1-29 train the model, races 30-37 are
# reserved for testing (both bounds inclusive).
train_df = df_before[df_before['raceID'].between(1, 29)]
test_df = df_before[df_before['raceID'].between(30, 37)]

In [24]:
# Position is the regression target; every other column stays on the X side.
Y_train = train_df['Position']
Y_test = test_df['Position']
X_train = train_df.drop(columns='Position')
X_test = test_df.drop(columns='Position')

In [25]:
# RaceCountry and Year are not used as predictors; remove them from both sets.
X_train = X_train.drop(columns=['RaceCountry', 'Year'])
X_test = X_test.drop(columns=['RaceCountry', 'Year'])

In [26]:
# Assemble the three MERF training inputs:
#   x_train        - predictors without Abbreviation and raceID
#   Z_train        - one-column DataFrame holding raceID
#   clusters_train - raceID Series used as the cluster label
# NOTE(review): Z_train carries the raceID value itself while the clusters are
# also raceID; a random intercept is more commonly expressed with a column of
# ones — confirm this design is intended.
clusters_train = X_train['raceID']
Z_train = X_train.loc[:, ['raceID']]
x_train = X_train.drop(columns=['Abbreviation', 'raceID'])

In [27]:
from lightgbm import LGBMRegressor

# Gradient-boosted regressor that is handed to MERF as its fixed-effects model.
lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=10,
)

In [28]:
# Wrap the LightGBM regressor in MERF and fit it on the "before" training set.
mrf = MERF(fixed_effects_model=lgbm, max_iterations=30)
#mrf = MERF(fixed_effects_model = LinearRegression(), max_iterations=30)
mrf.fit(X=x_train, Z=Z_train, clusters=clusters_train, y=Y_train)


INFO     [merf.py:307] Training GLL is -1814.2977514988797 at iteration 1.
INFO     [merf.py:307] Training GLL is -3111.3117441397603 at iteration 2.
INFO     [merf.py:307] Training GLL is -3262.0117527020284 at iteration 3.
INFO     [merf.py:307] Training GLL is -3370.730770892507 at iteration 4.
INFO     [merf.py:307] Training GLL is -3300.4949597791015 at iteration 5.
INFO     [merf.py:307] Training GLL is -3264.4628998285457 at iteration 6.
INFO     [merf.py:307] Training GLL is -3361.562173619673 at iteration 7.
INFO     [merf.py:307] Training GLL is -3376.0196199996117 at iteration 8.
INFO     [merf.py:307] Training GLL is -3377.2413756351207 at iteration 9.
INFO     [merf.py:307] Training GLL is -3259.731444384213 at iteration 10.
INFO     [merf.py:307] Training GLL is -3333.4668419882823 at iteration 11.
INFO     [merf.py:307] Training GLL is -3391.2086924245114 at iteration 12.
INFO     [merf.py:307] Training GLL is -3410.028867853237 at iteration 13.
INFO     [merf.py:307] Tr

<merf.merf.MERF at 0x7fd1d1055a60>

In [29]:
# In-sample predictions on the rows the model was fitted on.
train_preds = mrf.predict(X=x_train, Z=Z_train, clusters=clusters_train)

In [30]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample (training) error metrics: these compare Y_train with train_preds,
# so they measure fit on data the model has already seen, not generalisation.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics, labelled as training metrics (they were
# previously printed under a "Test" label, which misdescribed them).
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)


Test MSE: 0.0017845513954319317
Test RMSE: 0.04224395099220635
Test MAE: 0.02954712143906553
Test R-squared: 0.9999452055224932


## Test

In [31]:
# Assemble the MERF inputs for the held-out races, mirroring the training
# layout: predictors, a one-column raceID frame, and the raceID cluster labels.
# NOTE(review): Z_test carries the raceID value itself while the clusters are
# also raceID; a random intercept is more commonly expressed with a column of
# ones — confirm this design is intended.
clusters_test = X_test['raceID']
Z_test = X_test.loc[:, ['raceID']]
x_test = X_test.drop(columns=['Abbreviation', 'raceID'])

In [32]:
# Display the held-out predictor frame for a visual check of its columns.
x_test

Unnamed: 0,GridPosition,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,PitstopNo,HARD,INTERMEDIATE,MEDIUM,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
568,9.0,24.0,93.000,0.0,4.414441,29.5895,2.0,24.0,0.0,11.0,...,0,0,0,0,0,0,0,0,0,1
569,4.0,30.0,92.317,0.0,3.598099,29.6020,1.0,33.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
570,10.0,22.0,93.239,0.0,3.308658,30.2610,1.0,25.0,0.0,28.0,...,0,0,0,0,0,0,0,0,0,1
571,18.0,28.0,94.681,0.0,3.643243,30.1880,1.0,33.0,0.0,19.0,...,0,0,0,0,0,0,0,0,0,1
572,15.0,24.0,93.008,0.0,3.737778,30.1610,2.0,26.0,0.0,26.0,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,18.0,25.0,98.443,0.0,7.947119,21.8705,2.0,24.0,0.0,30.0,...,0,0,0,0,1,0,1,0,0,0
718,16.0,21.0,98.045,0.0,8.040811,22.0850,1.0,44.0,0.0,10.0,...,0,0,0,0,1,0,1,0,0,0
719,15.0,41.0,97.555,0.0,8.160983,22.0120,1.0,44.0,0.0,10.0,...,0,0,0,0,0,0,0,0,0,0
720,2.0,31.0,95.271,0.0,3.679513,21.5870,1.0,45.0,0.0,10.0,...,0,0,0,0,0,0,1,0,0,0


In [33]:
# Predict finishing position for the held-out races.
# NOTE(review): the test raceIDs come from a range disjoint from the training
# raceIDs, so every test cluster is new to the model; presumably MERF falls
# back to the fixed-effects prediction for unseen clusters — confirm.
y_pred = mrf.predict(X=x_test, Z=Z_test, clusters=clusters_test)

In [34]:
# Out-of-sample error metrics on the held-out races.
mse = mean_squared_error(y_true=Y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
r2 = r2_score(y_true=Y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # root of the MSE, in the same units as Position

# Report the metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)


Test MSE: 10.651847412729614
Test RMSE: 3.263716809517887
Test MAE: 2.482383290847572
Test R-squared: 0.6592162931861992


## After model (2021–2022 seasons)

## Train

In [37]:
# Hold out whole races by raceID: races 38-72 train the model, races 73-80 are
# reserved for testing (both bounds inclusive).
train_df = df_after[df_after['raceID'].between(38, 72)]
test_df = df_after[df_after['raceID'].between(73, 80)]

In [38]:
# Position is the regression target; every other column stays on the X side.
Y_train = train_df['Position']
Y_test = test_df['Position']
X_train = train_df.drop(columns='Position')
X_test = test_df.drop(columns='Position')

In [39]:
# RaceCountry and Year are not used as predictors; remove them from both sets.
X_train = X_train.drop(columns=['RaceCountry', 'Year'])
X_test = X_test.drop(columns=['RaceCountry', 'Year'])

In [40]:
# Assemble the three MERF training inputs for the "after" data:
#   x_train        - predictors without Abbreviation and raceID
#   Z_train        - one-column DataFrame holding raceID
#   clusters_train - raceID Series used as the cluster label
# NOTE(review): Z_train carries the raceID value itself while the clusters are
# also raceID; a random intercept is more commonly expressed with a column of
# ones — confirm this design is intended.
clusters_train = X_train['raceID']
Z_train = X_train.loc[:, ['raceID']]
x_train = X_train.drop(columns=['Abbreviation', 'raceID'])

In [41]:

# Build a fresh MERF (with its own, unfitted LightGBM regressor) for the
# "after" data instead of refitting the object that holds the "before" model.
# Refitting in place overwrote the earlier model, and MERF's per-iteration
# history attributes appear to be appended to rather than reset by fit()
# (NOTE(review): confirm against the installed merf version).
# The hyperparameters mirror those used for the "before" model.
mrf = MERF(
    LGBMRegressor(max_depth=10, learning_rate=0.05, n_estimators=1000),
    max_iterations=30,
)
#mrf = MERF(fixed_effects_model = LinearRegression(), max_iterations=30)
mrf.fit(x_train, Z_train, clusters_train, Y_train)


INFO     [merf.py:307] Training GLL is -2298.994457420132 at iteration 1.
INFO     [merf.py:307] Training GLL is -3694.9260565979284 at iteration 2.
INFO     [merf.py:307] Training GLL is -3740.356689626113 at iteration 3.
INFO     [merf.py:307] Training GLL is -3663.7045493709566 at iteration 4.
INFO     [merf.py:307] Training GLL is -3804.452324511743 at iteration 5.
INFO     [merf.py:307] Training GLL is -3635.948743023698 at iteration 6.
INFO     [merf.py:307] Training GLL is -3793.622996571447 at iteration 7.
INFO     [merf.py:307] Training GLL is -3617.753548993378 at iteration 8.
INFO     [merf.py:307] Training GLL is -3791.6654470411268 at iteration 9.
INFO     [merf.py:307] Training GLL is -3738.580460283372 at iteration 10.
INFO     [merf.py:307] Training GLL is -3820.277162151804 at iteration 11.
INFO     [merf.py:307] Training GLL is -3918.1384085007917 at iteration 12.
INFO     [merf.py:307] Training GLL is -3825.8194505949473 at iteration 13.
INFO     [merf.py:307] Traini

<merf.merf.MERF at 0x7fd1d1055a60>

In [42]:
# In-sample predictions on the rows the model was fitted on.
train_preds = mrf.predict(X=x_train, Z=Z_train, clusters=clusters_train)

In [43]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample (training) error metrics: these compare Y_train with train_preds,
# so they measure fit on data the model has already seen, not generalisation.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics, labelled as training metrics (they were
# previously printed under a "Test" label, which misdescribed them).
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)

Test MSE: 0.002784377358501824
Test RMSE: 0.052767199646199
Test MAE: 0.037812576651521206
Test R-squared: 0.9999118976241722


## Test

In [44]:
# Assemble the MERF inputs for the held-out races, mirroring the training
# layout: predictors, a one-column raceID frame, and the raceID cluster labels.
# NOTE(review): Z_test carries the raceID value itself while the clusters are
# also raceID; a random intercept is more commonly expressed with a column of
# ones — confirm this design is intended.
clusters_test = X_test['raceID']
Z_test = X_test.loc[:, ['raceID']]
x_test = X_test.drop(columns=['Abbreviation', 'raceID'])

In [45]:
# Predict finishing position for the held-out races.
# NOTE(review): the test raceIDs come from a range disjoint from the training
# raceIDs, so every test cluster is new to the model; presumably MERF falls
# back to the fixed-effects prediction for unseen clusters — confirm.
y_pred = mrf.predict(X=x_test, Z=Z_test, clusters=clusters_test)

In [46]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Out-of-sample error metrics on the held-out races.
mse = mean_squared_error(y_true=Y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
r2 = r2_score(y_true=Y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # root of the MSE, in the same units as Position

# Report the metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)


Test MSE: 10.28143895532415
Test RMSE: 3.2064682994416382
Test MAE: 2.4254420680307494
Test R-squared: 0.6741985648239796
