In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

Import the dataset and do the required pre-processing

In [2]:
# Load the merged dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("mergedData.csv")

In [3]:
# Remove the "Unnamed: 0" column (presumably a saved index from an earlier CSV export - confirm).
df = df.drop(columns="Unnamed: 0")

In [4]:
# Points is kept as the outcome variable, so Position is dropped instead of
# Points, together with DriverNumber and Driver.
df = df.drop(["DriverNumber", "Driver", "Position"], axis = 1)
# Keep an independent snapshot: a plain `aux = df` would only alias the frame,
# so later in-place column assignments on `df` would also show up in `aux`.
aux = df.copy()

In [5]:
# Turn the two Status values of interest into 0/1 indicator columns,
# then discard the original Status column.
df['carIssue'] = df['Status'].eq('carIssue').astype(int)
df['driverIssue'] = df['Status'].eq('driverIssue').astype(int)
df = df.drop(columns='Status')

In [7]:
# Drop the three averaged telemetry/timing columns in a single call.
df = df.drop(columns=["AverageRPM", "AvgLapTime", "AvgSplitTime"])

In [9]:
# Strip spaces from the string-valued columns so their values form
# space-free labels.
for col in ("RaceCountry", "TeamName", "Engine"):
    df[col] = df[col].str.replace(" ", "")

In [9]:
# Disabled export of the current frame to plotdata_position.csv; kept commented out so it does not run.
#df.to_csv(r'plotdata_position.csv', index=True, header=True)

In [10]:

# Collect the object-typed columns to one-hot encode; "Abbreviation" and
# "RaceCountry" are deliberately left out of the encoding.
cat_vars = list(df.select_dtypes(include=['object']).columns)
for kept_as_is in ("Abbreviation", "RaceCountry"):
    cat_vars.remove(kept_as_is)

# Build the indicator columns (the first level of each variable is dropped).
dummies = pd.get_dummies(df[cat_vars], drop_first=True)

# Swap the original categorical columns for their indicator columns.
df = pd.concat([df.drop(columns=cat_vars), dummies], axis=1)


In [11]:
# Keep only the rows whose Points value is non-zero (drivers who obtained points).
df = df.loc[df["Points"].ne(0)]

In [12]:
# Partition the rows by Year: 2019-2020 go to df_before, 2021-2022 to df_after.
df_before = df.loc[df['Year'].isin([2019, 2020])]
df_after = df.loc[df['Year'].isin([2021, 2022])]

## MERF

In [13]:
from merf import MERF

## Before model (2019–2020 seasons)

## Train

In [14]:
# Hold out by raceID within df_before: 1-29 (inclusive) for training, 30-37 (inclusive) for testing.
train_df = df_before[df_before['raceID'].between(1, 29)]
test_df = df_before[df_before['raceID'].between(30, 37)]

In [15]:
# Separate the Points target from the predictors for both splits.
Y_train, Y_test = train_df['Points'], test_df['Points']
X_train = train_df.drop(columns='Points')
X_test = test_df.drop(columns='Points')

In [16]:
# RaceCountry and Year are removed from the predictors of both splits.
X_train = X_train.drop(columns=['RaceCountry', 'Year'])
X_test = X_test.drop(columns=['RaceCountry', 'Year'])

In [17]:
# Build the three inputs handed to mrf.fit from the training predictors:
#   clusters_train - the raceID column as a Series
#   Z_train        - the raceID column as a one-column DataFrame
#   x_train        - all remaining predictors, without Abbreviation and raceID
clusters_train = X_train['raceID']
Z_train = X_train[['raceID']]
x_train = X_train.drop(columns=['Abbreviation', 'raceID'])

In [18]:
import xgboost as xgb

# Hyperparameters of the gradient-boosted regressor used as MERF's fixed-effects model.
xgb_params = dict(
    objective='reg:squarederror',
    max_depth=8,
    learning_rate=0.08,
    n_estimators=300,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Wrap the regressor in MERF (capped at 30 iterations) and fit it on the training split.
mrf = MERF(fixed_effects_model=xgb_model, max_iterations=30)
mrf.fit(x_train, Z_train, clusters_train, Y_train)

INFO     [merf.py:307] Training GLL is -818.1723410146841 at iteration 1.
INFO     [merf.py:307] Training GLL is -1582.825784420217 at iteration 2.
INFO     [merf.py:307] Training GLL is -2344.2230080765703 at iteration 3.
INFO     [merf.py:307] Training GLL is -3082.3650496935647 at iteration 4.
INFO     [merf.py:307] Training GLL is -3720.807936065543 at iteration 5.
INFO     [merf.py:307] Training GLL is -4068.6057156332336 at iteration 6.
INFO     [merf.py:307] Training GLL is -4127.5845186886745 at iteration 7.
INFO     [merf.py:307] Training GLL is -4139.740225399527 at iteration 8.
INFO     [merf.py:307] Training GLL is -4252.609601184473 at iteration 9.
INFO     [merf.py:307] Training GLL is -4155.757743357531 at iteration 10.
INFO     [merf.py:307] Training GLL is -4137.415832254544 at iteration 11.
INFO     [merf.py:307] Training GLL is -4178.334245126116 at iteration 12.
INFO     [merf.py:307] Training GLL is -4187.424744322849 at iteration 13.
INFO     [merf.py:307] Trainin

<merf.merf.MERF at 0x7fe26298a850>

In [19]:
# Disabled alternative that would bind the return value of fit() to `regressor`; kept commented out so it does not run.
#regressor = mrf.fit(x_train, Z_train, clusters_train, Y_train)

In [20]:
# In-sample predictions: score the same training inputs that were passed to mrf.fit.
train_preds = mrf.predict(x_train, Z_train, clusters_train)

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample fit quality: compare the training targets with the model's
# predictions for those same training rows.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics. They are labelled "Train" because they are
# computed on Y_train / train_preds, not on the held-out split.
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)


Test MSE: 1.0937024289716511e-06
Test RMSE: 0.0010458022896186693
Test MAE: 0.000721949194541852
Test R-squared: 0.9999999788787217


## Test

In [22]:
# Build the three inputs handed to mrf.predict from the test predictors:
#   clusters_test - the raceID column as a Series
#   Z_test        - the raceID column as a one-column DataFrame
#   x_test        - all remaining predictors, without Abbreviation and raceID
clusters_test = X_test['raceID']
Z_test = X_test[['raceID']]
x_test = X_test.drop(columns=['Abbreviation', 'raceID'])

In [23]:
# Display the test feature matrix (Abbreviation and raceID already removed).
x_test

Unnamed: 0,GridPosition,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,PitstopNo,HARD,INTERMEDIATE,MEDIUM,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
568,9.0,24.0,93.000,0.0,4.414441,29.5895,2.0,24.0,0.0,11.0,...,0,0,0,0,0,0,0,0,0,1
569,4.0,30.0,92.317,0.0,3.598099,29.6020,1.0,33.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
570,10.0,22.0,93.239,0.0,3.308658,30.2610,1.0,25.0,0.0,28.0,...,0,0,0,0,0,0,0,0,0,1
572,15.0,24.0,93.008,0.0,3.737778,30.1610,2.0,26.0,0.0,26.0,...,0,1,0,0,0,1,0,0,0,1
573,11.0,26.0,93.249,0.0,3.362108,29.6620,1.0,30.0,0.0,23.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,1.0,23.0,95.246,0.0,3.469862,21.2890,1.0,45.0,0.0,10.0,...,0,1,0,0,0,1,0,0,0,0
712,4.0,21.0,95.497,0.0,7.104119,21.8400,1.0,45.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
713,3.0,35.0,95.332,0.0,3.931841,22.4060,1.0,45.0,0.0,10.0,...,0,0,0,0,0,0,1,0,0,0
716,6.0,26.0,95.815,0.0,9.226260,22.1400,1.0,45.0,0.0,10.0,...,0,0,0,0,0,0,0,0,1,0


In [24]:
# Predict Points for the held-out races.
# NOTE(review): the test raceIDs (30-37) do not overlap the training raceIDs (1-29),
# so every test cluster is unseen at fit time - confirm how MERF.predict treats unseen clusters.
y_pred = mrf.predict(x_test, Z_test, clusters_test)

In [25]:
# Held-out error: compare the test targets with the predictions for the test rows.
mse = mean_squared_error(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)
rmse = np.sqrt(mse)

# Print the performance metrics
for label, value in (
    ("Test MSE:", mse),
    ("Test RMSE:", rmse),
    ("Test MAE:", mae),
    ("Test R-squared:", r2),
):
    print(label, value)


Test MSE: 34.5937695881409
Test RMSE: 5.881646843201392
Test MAE: 4.498683062195778
Test R-squared: 0.3333249260331297


In [28]:
# Get feature importances
importance = mrf.trained_fe_model.feature_importances_

importance_df_before = pd.DataFrame({'Feature': x_train.columns, 'Importance': importance})

importance_df_before = importance_df_before.sort_values('Importance', ascending=False).reset_index(drop=True)
print(importance_df_before)

                     Feature  Importance
0       TeamName_RacingPoint    0.205741
1               GridPosition    0.163604
2        TeamName_HaasF1Team    0.076255
3          TeamName_Mercedes    0.074623
4                  PitstopNo    0.069269
5               INTERMEDIATE    0.054051
6     TeamName_RedBullRacing    0.051240
7                       HARD    0.031202
8                   MaxSpeed    0.030236
9   TeamName_AlfaRomeoRacing    0.025095
10          TeamName_McLaren    0.023150
11                    MEDIUM    0.021886
12            MaxThrottlePct    0.020069
13          TeamName_Ferrari    0.018695
14                     Brake    0.017824
15                 SDLapTime    0.015919
16                      SOFT    0.014112
17        TeamName_ToroRosso    0.012100
18                   AgeAtGP    0.012061
19           AverageThrottle    0.011261
20                    MaxRPM    0.011133
21             BestQualiTime    0.008501
22              AverageSpeed    0.007821
23              

## After model (2021–2022 seasons)

## Train

In [31]:
# Split the data into train and test sets based on the raceID column
train_df = df_after[(df_after['raceID'] >= 38) & (df_after['raceID'] <= 72)]
test_df = df_after[(df_after['raceID'] >= 73) & (df_after['raceID'] <= 80)]

In [32]:
# Separate the Points target from the predictors for both splits.
Y_train, Y_test = train_df['Points'], test_df['Points']
X_train = train_df.drop(columns='Points')
X_test = test_df.drop(columns='Points')

In [33]:
# RaceCountry and Year are removed from the predictors of both splits.
X_train = X_train.drop(columns=['RaceCountry', 'Year'])
X_test = X_test.drop(columns=['RaceCountry', 'Year'])

In [34]:
# Build the three inputs handed to mrf.fit from the training predictors:
#   clusters_train - the raceID column as a Series
#   Z_train        - the raceID column as a one-column DataFrame
#   x_train        - all remaining predictors, without Abbreviation and raceID
clusters_train = X_train['raceID']
Z_train = X_train[['raceID']]
x_train = X_train.drop(columns=['Abbreviation', 'raceID'])

In [35]:
# Re-create the MERF wrapper before fitting on the df_after training split, so
# this fit starts from a fresh instance rather than the one fitted earlier.
# NOTE(review): MERF appears to keep its per-iteration training histories on the
# instance across fit() calls, which would mix both fits' diagnostics - confirm
# against the installed merf version.
mrf = MERF(fixed_effects_model=xgb_model, max_iterations=30)
mrf.fit(x_train, Z_train, clusters_train, Y_train)

INFO     [merf.py:307] Training GLL is -1156.4901275319978 at iteration 1.
INFO     [merf.py:307] Training GLL is -2072.7546998689268 at iteration 2.
INFO     [merf.py:307] Training GLL is -2987.7841754854853 at iteration 3.
INFO     [merf.py:307] Training GLL is -3881.9599211375403 at iteration 4.
INFO     [merf.py:307] Training GLL is -4661.450462566137 at iteration 5.
INFO     [merf.py:307] Training GLL is -5013.230426990707 at iteration 6.
INFO     [merf.py:307] Training GLL is -4865.938757800317 at iteration 7.
INFO     [merf.py:307] Training GLL is -5102.08300296113 at iteration 8.
INFO     [merf.py:307] Training GLL is -4832.081579774717 at iteration 9.
INFO     [merf.py:307] Training GLL is -4724.234464335062 at iteration 10.
INFO     [merf.py:307] Training GLL is -4923.589080953077 at iteration 11.
INFO     [merf.py:307] Training GLL is -4911.834004352975 at iteration 12.
INFO     [merf.py:307] Training GLL is -5099.217299629781 at iteration 13.
INFO     [merf.py:307] Training

<merf.merf.MERF at 0x7fe26298a850>

In [36]:
# In-sample predictions: score the same training inputs that were passed to mrf.fit.
train_preds = mrf.predict(x_train, Z_train, clusters_train)

In [37]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample fit quality: compare the training targets with the model's
# predictions for those same training rows.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics. They are labelled "Train" because they are
# computed on Y_train / train_preds, not on the held-out split.
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)

Test MSE: 9.139790307852647e-06
Test RMSE: 0.0030232086113684987
Test MAE: 0.002047876688255661
Test R-squared: 0.9999998245079853


## Test

In [38]:
# Build the three inputs handed to mrf.predict from the test predictors:
#   clusters_test - the raceID column as a Series
#   Z_test        - the raceID column as a one-column DataFrame
#   x_test        - all remaining predictors, without Abbreviation and raceID
clusters_test = X_test['raceID']
Z_test = X_test[['raceID']]
x_test = X_test.drop(columns=['Abbreviation', 'raceID'])

In [39]:
# Predict Points for the held-out races.
# NOTE(review): the test raceIDs (73-80) do not overlap the training raceIDs (38-72),
# so every test cluster is unseen at fit time - confirm how MERF.predict treats unseen clusters.
y_pred = mrf.predict(x_test, Z_test, clusters_test)

In [40]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Held-out error: compare the test targets with the predictions for the test rows.
mse = mean_squared_error(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)
rmse = np.sqrt(mse)

# Print the performance metrics
for label, value in (
    ("Test MSE:", mse),
    ("Test RMSE:", rmse),
    ("Test MAE:", mae),
    ("Test R-squared:", r2),
):
    print(label, value)


Test MSE: 24.65488428205937
Test RMSE: 4.965368494085748
Test MAE: 3.734183921664953
Test R-squared: 0.5248625114268766


In [43]:
# Get feature importance scores
importance = mrf.trained_fe_model.feature_importances_
importance_df_after = pd.DataFrame({'Feature': x_train.columns, 'Importance': importance})
importance_df_after = importance_df_after.sort_values('Importance', ascending=False).reset_index(drop=True)
print(importance_df_after)

                     Feature  Importance
0          TeamName_Mercedes    0.366763
1     TeamName_RedBullRacing    0.232540
2           TeamName_Ferrari    0.083730
3               GridPosition    0.082617
4                  PitstopNo    0.024776
5                        WET    0.018630
6                    AgeAtGP    0.016902
7                      Brake    0.013593
8                     MEDIUM    0.013409
9                   MaxSpeed    0.012517
10                      SOFT    0.010155
11            MaxThrottlePct    0.010019
12                    MaxRPM    0.009329
13                      FLap    0.009134
14              INTERMEDIATE    0.008830
15                AvgPitTime    0.008801
16        CircuitType_street    0.007940
17           AverageThrottle    0.007611
18            Engine_RedBull    0.007138
19      TeamName_AstonMartin    0.007001
20              AverageSpeed    0.006829
21             BestQualiTime    0.005879
22         TeamName_Williams    0.005586
23       TeamNam

In [44]:
# Merge feature importance scores before and after based on features
importance = pd.merge(importance_df_before, importance_df_after, on="Feature")
importance

Unnamed: 0,Feature,Importance_x,Importance_y
0,TeamName_RacingPoint,0.205741,0.0
1,GridPosition,0.163604,0.082617
2,TeamName_HaasF1Team,0.076255,0.000361
3,TeamName_Mercedes,0.074623,0.366763
4,PitstopNo,0.069269,0.024776
5,INTERMEDIATE,0.054051,0.00883
6,TeamName_RedBullRacing,0.05124,0.23254
7,HARD,0.031202,0.005468
8,MaxSpeed,0.030236,0.012517
9,TeamName_AlfaRomeoRacing,0.025095,0.000634
