In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

In [2]:
# Load mergedData.csv (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="mergedData.csv")

In [3]:
# Discard the "Unnamed: 0" column (presumably an index saved with the CSV — confirm).
df = df.drop(columns="Unnamed: 0")

In [4]:
# Drop DriverNumber, Driver and Position from the working frame.
df = df.drop(columns=["DriverNumber", "Driver", "Position"])
# Keep an independent snapshot of the frame at this point. A plain `aux = df`
# only aliases the same object, so later in-place column assignments on `df`
# would silently appear in `aux` as well.
aux = df.copy()

In [5]:
# Turn the two Status values of interest into 0/1 indicator columns named
# after the status itself, then discard the original Status column.
for status_flag in ("carIssue", "driverIssue"):
    df[status_flag] = df["Status"].eq(status_flag).astype(int)
df = df.drop(columns="Status")

In [6]:
#df['after_2020'] = df['Year'].apply(lambda x: 1 if x > 2020 else 0)

In [7]:
# Remove the three averaged telemetry/timing columns in a single call.
df = df.drop(columns=["AverageRPM", "AvgLapTime", "AvgSplitTime"])

In [8]:
# Strip spaces from the text labels (e.g. so that the dummy-variable column
# names generated from them contain no spaces).
for text_col in ("RaceCountry", "TeamName", "Engine"):
    df[text_col] = df[text_col].str.replace(" ", "")

In [9]:
#df.to_csv(r'plotdata_position.csv', index=True, header=True)

In [10]:

# "Abbreviation" and "RaceCountry" are kept as plain text columns and are
# NOT one-hot encoded.
keep_as_text = {"Abbreviation", "RaceCountry"}

# every other object-dtype column is treated as a categorical variable
# (filtering instead of list.remove() avoids a ValueError if one of the
# excluded columns is not present)
cat_vars = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col not in keep_as_text
]

# create dummy variables for those columns, dropping the first level of each
dummies = pd.get_dummies(df[cat_vars], drop_first=True)

# replace the original categorical columns with their dummy encodings
df = pd.concat([df.drop(columns=cat_vars), dummies], axis=1)


In [12]:
# Partition the rows by the Year column: 2019-2020 ("before") vs. 2021-2022 ("after")
before_mask = df['Year'].isin([2019, 2020])
after_mask = df['Year'].isin([2021, 2022])
df_before = df.loc[before_mask]
df_after = df.loc[after_mask]

## MERF

In [13]:
from merf import MERF

In [14]:
#df = df.drop(["RaceCountry", "Year"], axis = 1)

In [15]:
#y = df.loc[:, ["Position"]]
#X = df.drop(["Position"], axis = 1)

In [16]:
#X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Before model

In [17]:
# Split by raceID (bounds inclusive): races 1-29 form the training set,
# races 30-37 the test set.
train_df = df_before[df_before['raceID'].between(1, 29)]
test_df = df_before[df_before['raceID'].between(30, 37)]

In [18]:
# Separate the target column (Points) from the remaining columns.
Y_train = train_df['Points']
Y_test = test_df['Points']
X_train = train_df.drop(columns='Points')
X_test = test_df.drop(columns='Points')

In [19]:
# RaceCountry and Year are not used as predictors; remove them from both splits.
unused_cols = ['RaceCountry', 'Year']
X_train = X_train.drop(columns=unused_cols)
X_test = X_test.drop(columns=unused_cols)




In [20]:
# Inputs for MERF.fit: cluster labels, random-effects matrix, fixed-effects matrix.
clusters_train = X_train['raceID']
# NOTE(review): Z is the raceID value itself (constant within each cluster),
# not a column of ones — confirm this is the intended random-effects design.
Z_train = X_train[['raceID']]
x_train = X_train.drop(columns=['Abbreviation', 'raceID'])

In [21]:
import xgboost as xgb

# Hyper-parameters of the gradient-boosted fixed-effects learner.
xgb_params = dict(
    objective='reg:squarederror',
    max_depth=8,
    learning_rate=0.08,
    n_estimators=300,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Wrap the learner in MERF and run at most 30 iterations on the training split.
mrf = MERF(fixed_effects_model=xgb_model, max_iterations=30)
mrf.fit(X=x_train, Z=Z_train, clusters=clusters_train, y=Y_train)

INFO     [merf.py:307] Training GLL is -1861.5024129543822 at iteration 1.
INFO     [merf.py:307] Training GLL is -3616.8404700081323 at iteration 2.
INFO     [merf.py:307] Training GLL is -4783.513279148445 at iteration 3.
INFO     [merf.py:307] Training GLL is -5870.646730385381 at iteration 4.
INFO     [merf.py:307] Training GLL is -5718.31015906151 at iteration 5.
INFO     [merf.py:307] Training GLL is -6037.305205687128 at iteration 6.
INFO     [merf.py:307] Training GLL is -5506.0883684281225 at iteration 7.
INFO     [merf.py:307] Training GLL is -5230.206236134884 at iteration 8.
INFO     [merf.py:307] Training GLL is -5460.877950088498 at iteration 9.
INFO     [merf.py:307] Training GLL is -5558.955313751267 at iteration 10.
INFO     [merf.py:307] Training GLL is -5321.082012432609 at iteration 11.
INFO     [merf.py:307] Training GLL is -5449.56085553815 at iteration 12.
INFO     [merf.py:307] Training GLL is -5394.182222631165 at iteration 13.
INFO     [merf.py:307] Training G

<merf.merf.MERF at 0x7fe02a6da220>

In [22]:
#regressor = mrf.fit(x_train, Z_train, clusters_train, Y_train)

In [23]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
train_preds = mrf.predict(X=x_train, Z=Z_train, clusters=clusters_train)

In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample fit quality. These are computed on the TRAINING data
# (Y_train vs. train_preds), so they are labelled "Train" rather than "Test"
# to keep them distinguishable from the held-out metrics.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)


Test MSE: 4.012061644473455e-05
Test RMSE: 0.006334083709956362
Test MAE: 0.004248404697208776
Test R-squared: 0.9999992284592496


### Test dataset

In [25]:
# Build the MERF inputs for the test split, mirroring the training ones.
clusters_test = X_test['raceID']
Z_test = X_test[['raceID']]
x_test = X_test.drop(columns=['Abbreviation', 'raceID'])

In [26]:
# Display the test-set feature matrix (X_test without Abbreviation and raceID).
x_test

Unnamed: 0,GridPosition,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,PitstopNo,HARD,INTERMEDIATE,MEDIUM,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
568,9.0,24.0,93.000,0.0,4.414441,29.5895,2.0,24.0,0.0,11.0,...,0,0,0,0,0,0,0,0,0,1
569,4.0,30.0,92.317,0.0,3.598099,29.6020,1.0,33.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
570,10.0,22.0,93.239,0.0,3.308658,30.2610,1.0,25.0,0.0,28.0,...,0,0,0,0,0,0,0,0,0,1
571,18.0,28.0,94.681,0.0,3.643243,30.1880,1.0,33.0,0.0,19.0,...,0,0,0,0,0,0,0,0,0,1
572,15.0,24.0,93.008,0.0,3.737778,30.1610,2.0,26.0,0.0,26.0,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,18.0,25.0,98.443,0.0,7.947119,21.8705,2.0,24.0,0.0,30.0,...,0,0,0,0,1,0,1,0,0,0
718,16.0,21.0,98.045,0.0,8.040811,22.0850,1.0,44.0,0.0,10.0,...,0,0,0,0,1,0,1,0,0,0
719,15.0,41.0,97.555,0.0,8.160983,22.0120,1.0,44.0,0.0,10.0,...,0,0,0,0,0,0,0,0,0,0
720,2.0,31.0,95.271,0.0,3.679513,21.5870,1.0,45.0,0.0,10.0,...,0,0,0,0,0,0,1,0,0,0


In [27]:
# Predictions for the test split.
y_pred = mrf.predict(X=x_test, Z=Z_test, clusters=clusters_test)

In [28]:
# Error metrics on the test split (Y_test vs. y_pred).
mse = mean_squared_error(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)
rmse = np.sqrt(mse)

# Report each metric on its own line.
for label, value in (
    ("Test MSE:", mse),
    ("Test RMSE:", rmse),
    ("Test MAE:", mae),
    ("Test R-squared:", r2),
):
    print(label, value)


Test MSE: 22.223789884044084
Test RMSE: 4.7142114806236775
Test MAE: 3.01737275980292
Test R-squared: 0.5760407140938191


In [29]:
# Pair each test row's actual Points with the corresponding prediction.
data = dict(Y_test=Y_test, y_pred=y_pred)
dfplot = pd.DataFrame(data)

# Show the resulting table as plain text.
print(dfplot)

     Y_test     y_pred
568     2.0   2.136305
569    12.0   9.310695
570     8.0   2.411396
571     0.0   0.183204
572     1.0   8.703220
..      ...        ...
717     0.0  -0.195822
718     0.0  -0.119904
719     0.0  -0.185277
720    18.0  20.447754
721     0.0  -0.438783

[154 rows x 2 columns]


In [30]:
import plotly.express as px

# Actual vs. predicted Points for the test split of the "before" data, with an
# OLS trend line. A title and readable axis labels are set so the figure can
# be told apart from the corresponding "after" plot.
fig = px.scatter(
    dfplot, x='Y_test', y='y_pred', opacity=0.65,
    trendline='ols', trendline_color_override='darkblue',
    title='Before model: actual vs. predicted Points (test set)',
    labels={'Y_test': 'Actual Points', 'y_pred': 'Predicted Points'},
    width=550, height=400,
)
fig.show()

In [31]:
# Feature importances reported by the fitted fixed-effects model, ranked from
# most to least important.
importance = mrf.trained_fe_model.feature_importances_
importance_df_before = (
    pd.DataFrame({'Feature': x_train.columns, 'Importance': importance})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)
print(importance_df_before)

                     Feature  Importance
0                   carIssue    0.292619
1                driverIssue    0.265181
2               GridPosition    0.067130
3     TeamName_RedBullRacing    0.053025
4          TeamName_Mercedes    0.044983
5        TeamName_HaasF1Team    0.040312
6               INTERMEDIATE    0.035969
7       TeamName_RacingPoint    0.023159
8             Engine_Renault    0.022722
9                      Brake    0.018041
10          TeamName_McLaren    0.017095
11                 PitstopNo    0.016563
12  TeamName_AlfaRomeoRacing    0.013401
13                      SOFT    0.013337
14          TeamName_Ferrari    0.012296
15                  MaxSpeed    0.011471
16                    MEDIUM    0.006857
17            MaxThrottlePct    0.005848
18                 SDLapTime    0.005528
19                AvgPitTime    0.004783
20                    MaxRPM    0.004552
21              AverageSpeed    0.003892
22                      HARD    0.003826
23             B

## After model

In [32]:
# Show the raceID column of the 2021-2022 subset.
df_after["raceID"]

722     38
723     38
724     38
725     38
726     38
        ..
1542    80
1543    80
1544    80
1545    80
1546    80
Name: raceID, Length: 825, dtype: int64

In [33]:
# Display the 2021-2022 subset.
df_after

Unnamed: 0,Abbreviation,GridPosition,Points,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,...,TeamName_RacingPoint,TeamName_RedBullRacing,TeamName_Renault,TeamName_ToroRosso,TeamName_Williams,Engine_Honda,Engine_Mercedes,Engine_RedBull,Engine_Renault,CircuitType_street
722,GAS,5.0,0.0,Bahrain,2021.0,24.0,89.809,0.0,9.870618,29.212667,...,0,0,0,0,0,0,0,0,0,0
723,PER,20.0,10.0,Bahrain,2021.0,31.0,90.659,0.0,9.238201,24.096333,...,0,1,0,0,0,1,0,0,0,0
724,ALO,9.0,0.0,Bahrain,2021.0,39.0,90.249,0.0,9.050427,24.574000,...,0,0,0,0,0,0,0,0,1,0
725,LEC,4.0,8.0,Bahrain,2021.0,23.0,89.678,0.0,7.326363,24.550500,...,0,0,0,0,0,0,0,0,0,0
726,STR,10.0,1.0,Bahrain,2021.0,22.0,90.601,0.0,9.904268,25.204500,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,VET,9.0,1.0,AbuDhabi,2022.0,35.0,84.961,0.0,3.057131,23.186000,...,0,0,0,0,0,0,1,0,0,0
1543,SAI,4.0,12.0,AbuDhabi,2022.0,28.0,84.242,0.0,3.799668,21.752500,...,0,0,0,0,0,0,0,0,0,0
1544,LAT,20.0,0.0,AbuDhabi,2022.0,27.0,86.054,0.0,6.817590,23.798500,...,0,0,0,0,1,0,1,0,0,0
1545,RUS,6.0,10.0,AbuDhabi,2022.0,23.0,84.511,0.0,4.489247,26.087000,...,0,0,0,0,0,0,1,0,0,0


In [34]:
# Split by raceID (bounds inclusive): races 38-72 form the training set,
# races 73-80 the test set.
train_df = df_after[df_after['raceID'].between(38, 72)]
test_df = df_after[df_after['raceID'].between(73, 80)]

In [35]:
# Separate the target column (Points) from the remaining columns.
Y_train = train_df['Points']
Y_test = test_df['Points']
X_train = train_df.drop(columns='Points')
X_test = test_df.drop(columns='Points')

In [36]:
# RaceCountry and Year are not used as predictors; remove them from both splits.
unused_cols = ['RaceCountry', 'Year']
X_train = X_train.drop(columns=unused_cols)
X_test = X_test.drop(columns=unused_cols)

In [37]:
# Inputs for MERF.fit: cluster labels, random-effects matrix, fixed-effects matrix.
clusters_train = X_train['raceID']
# NOTE(review): Z is the raceID value itself (constant within each cluster),
# not a column of ones — confirm this is the intended random-effects design.
Z_train = X_train[['raceID']]
x_train = X_train.drop(columns=['Abbreviation', 'raceID'])

In [38]:
# Fit the "after" model on a fresh MERF wrapper instead of re-fitting the
# instance already used for the "before" model, so this fit does not share
# per-instance EM bookkeeping with the earlier one. The same learner
# configuration is reused through `xgb_model`, which fit() trains again on
# the new data.
mrf = MERF(fixed_effects_model=xgb_model, max_iterations=30)
mrf.fit(x_train, Z_train, clusters_train, Y_train)

INFO     [merf.py:307] Training GLL is -2347.464631674962 at iteration 1.
INFO     [merf.py:307] Training GLL is -4375.735160031037 at iteration 2.
INFO     [merf.py:307] Training GLL is -5269.977615079012 at iteration 3.
INFO     [merf.py:307] Training GLL is -5455.436100938193 at iteration 4.
INFO     [merf.py:307] Training GLL is -5347.266894456135 at iteration 5.
INFO     [merf.py:307] Training GLL is -5275.112014669853 at iteration 6.
INFO     [merf.py:307] Training GLL is -5129.744678084023 at iteration 7.
INFO     [merf.py:307] Training GLL is -5097.595012798317 at iteration 8.
INFO     [merf.py:307] Training GLL is -5033.559568479897 at iteration 9.
INFO     [merf.py:307] Training GLL is -5437.538234341727 at iteration 10.
INFO     [merf.py:307] Training GLL is -5293.973506286812 at iteration 11.
INFO     [merf.py:307] Training GLL is -5286.794836054401 at iteration 12.
INFO     [merf.py:307] Training GLL is -5916.858921943049 at iteration 13.
INFO     [merf.py:307] Training GL

<merf.merf.MERF at 0x7fe02a6da220>

In [39]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
train_preds = mrf.predict(X=x_train, Z=Z_train, clusters=clusters_train)

In [40]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample fit quality. These are computed on the TRAINING data
# (Y_train vs. train_preds), so they are labelled "Train" rather than "Test"
# to keep them distinguishable from the held-out metrics.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)

Test MSE: 0.00038119542615666185
Test RMSE: 0.019524226646826806
Test MAE: 0.012267921363843979
Test R-squared: 0.9999927536288657


### Test dataset

In [41]:
# Build the MERF inputs for the test split, mirroring the training ones.
clusters_test = X_test['raceID']
Z_test = X_test[['raceID']]
x_test = X_test.drop(columns=['Abbreviation', 'raceID'])

In [42]:
# Predictions for the test split.
y_pred = mrf.predict(X=x_test, Z=Z_test, clusters=clusters_test)

In [43]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Error metrics on the test split (Y_test vs. y_pred).
mse = mean_squared_error(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)
rmse = np.sqrt(mse)

# Report each metric on its own line.
for label, value in (
    ("Test MSE:", mse),
    ("Test RMSE:", rmse),
    ("Test MAE:", mae),
    ("Test R-squared:", r2),
):
    print(label, value)


Test MSE: 14.233136537110266
Test RMSE: 3.7726829362020693
Test MAE: 2.4835178769043376
Test R-squared: 0.7276366429649525


In [44]:
# Pair each test row's actual Points with the corresponding prediction.
data = dict(Y_test=Y_test, y_pred=y_pred)
dfplot = pd.DataFrame(data)

# Show the resulting table as plain text.
print(dfplot)



      Y_test     y_pred
1392    25.0  18.426392
1393     0.0   1.014280
1394    10.0  14.164695
1395     8.0   2.714829
1396    15.0  14.304780
...      ...        ...
1542     1.0   5.086267
1543    12.0  10.072722
1544     0.0   0.104793
1545    10.0  10.697105
1546     0.0   0.021462

[155 rows x 2 columns]


In [45]:
# Actual vs. predicted Points for the test split of the "after" data, with an
# OLS trend line. A title and readable axis labels are set so the figure can
# be told apart from the corresponding "before" plot.
fig = px.scatter(
    dfplot, x='Y_test', y='y_pred', opacity=0.65,
    trendline='ols', trendline_color_override='darkblue',
    color_discrete_sequence=['red'],
    title='After model: actual vs. predicted Points (test set)',
    labels={'Y_test': 'Actual Points', 'y_pred': 'Predicted Points'},
    width=550, height=400,
)
fig.show()


In [46]:
# List the columns of the full encoded frame `df` (not of the train/test splits).
df.columns

Index(['Abbreviation', 'GridPosition', 'Points', 'RaceCountry', 'Year',
       'AgeAtGP', 'BestQualiTime', 'FLap', 'SDLapTime', 'AvgPitTime',
       'PitstopNo', 'HARD', 'INTERMEDIATE', 'MEDIUM', 'SOFT', 'WET', 'Rain',
       'AverageSpeed', 'MaxSpeed', 'MaxRPM', 'AverageThrottle',
       'MaxThrottlePct', 'Brake', 'raceID', 'carIssue', 'driverIssue',
       'TeamName_AlfaRomeoRacing', 'TeamName_AlphaTauri', 'TeamName_Alpine',
       'TeamName_AstonMartin', 'TeamName_Ferrari', 'TeamName_HaasF1Team',
       'TeamName_McLaren', 'TeamName_Mercedes', 'TeamName_RacingPoint',
       'TeamName_RedBullRacing', 'TeamName_Renault', 'TeamName_ToroRosso',
       'TeamName_Williams', 'Engine_Honda', 'Engine_Mercedes',
       'Engine_RedBull', 'Engine_Renault', 'CircuitType_street'],
      dtype='object')

In [49]:
# Feature importances reported by the fitted fixed-effects model, ranked from
# most to least important.
importance = mrf.trained_fe_model.feature_importances_
importance_df_after = (
    pd.DataFrame({'Feature': x_train.columns, 'Importance': importance})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)
print(importance_df_after)


                     Feature  Importance
0                   carIssue    0.387701
1     TeamName_RedBullRacing    0.148407
2          TeamName_Mercedes    0.102272
3           TeamName_Ferrari    0.060412
4               GridPosition    0.058019
5             Engine_RedBull    0.041010
6        TeamName_AlphaTauri    0.021501
7                  PitstopNo    0.016065
8   TeamName_AlfaRomeoRacing    0.012223
9                      Brake    0.011801
10               driverIssue    0.011037
11                    MEDIUM    0.010452
12           TeamName_Alpine    0.010399
13            MaxThrottlePct    0.009881
14                      FLap    0.009536
15              AverageSpeed    0.008609
16         TeamName_Williams    0.008059
17                  MaxSpeed    0.008009
18           Engine_Mercedes    0.007417
19                   AgeAtGP    0.007348
20                    MaxRPM    0.006399
21              Engine_Honda    0.006370
22                AvgPitTime    0.005413
23              

In [50]:
# Join the two importance tables on the feature name. With pandas' default
# suffixes, Importance_x comes from the "before" table (left) and
# Importance_y from the "after" table (right).
importance = importance_df_before.merge(importance_df_after, on="Feature")
importance

Unnamed: 0,Feature,Importance_x,Importance_y
0,carIssue,0.292619,0.387701
1,driverIssue,0.265181,0.011037
2,GridPosition,0.06713,0.058019
3,TeamName_RedBullRacing,0.053025,0.148407
4,TeamName_Mercedes,0.044983,0.102272
5,TeamName_HaasF1Team,0.040312,0.000236
6,INTERMEDIATE,0.035969,0.004489
7,TeamName_RacingPoint,0.023159,0.0
8,Engine_Renault,0.022722,0.0
9,Brake,0.018041,0.011801
