In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

Import the dataset and do the required pre-processing

In [2]:
df = pd.read_csv("mergedData.csv")

In [3]:
df = df.drop("Unnamed: 0", axis = 1)

In [4]:
df.columns

Index(['Abbreviation', 'DriverNumber', 'TeamName', 'Position', 'GridPosition',
       'Status', 'Points', 'RaceCountry', 'Year', 'AgeAtGP', 'BestQualiTime',
       'FLap', 'AvgLapTime', 'SDLapTime', 'AvgSplitTime', 'AvgPitTime',
       'PitstopNo', 'HARD', 'INTERMEDIATE', 'MEDIUM', 'SOFT', 'WET', 'Engine',
       'Rain', 'Driver', 'AverageSpeed', 'MaxSpeed', 'AverageRPM', 'MaxRPM',
       'AverageThrottle', 'MaxThrottlePct', 'Brake', 'raceID', 'CircuitType'],
      dtype='object')

In [5]:
df = df.drop(["DriverNumber", "Driver", "Points"], axis = 1)
aux = df

In [6]:
# create new columns based on Status values
df['carIssue'] = (df['Status'] == 'carIssue').astype(int)
df['driverIssue'] = (df['Status'] == 'driverIssue').astype(int)
df = df.drop('Status', axis=1)

In [9]:
df = df.drop(["AverageRPM"], axis = 1)
df = df.drop(["AvgLapTime"], axis = 1)
df = df.drop(["AvgSplitTime"], axis = 1)

In [10]:
df["RaceCountry"] = df["RaceCountry"].str.replace(" ", "")
df["TeamName"] = df["TeamName"].str.replace(" ", "")
df["Engine"] = df["Engine"].str.replace(" ", "")

In [None]:
#df.to_csv(r'df_points.csv', index=True, header=True)
#df.to_csv(r'df_position.csv', index=True, header=True)
#df.to_csv(r'plotdata_points.csv', index=True, header=True)
#df.to_csv(r'plotdata_position.csv', index=True, header=True)

In [11]:
# get a list of all categorical variables except "Abbreviation" and "raceCountry"
cat_vars = df.select_dtypes(include=['object']).columns.tolist()
cat_vars.remove("Abbreviation")
cat_vars.remove("RaceCountry")

# create dummy variables
dummies = pd.get_dummies(df[cat_vars], drop_first=True)

# drop the original categorical variables from the DataFrame
df = df.drop(cat_vars, axis=1)

# concatenate the dummy variables with the remaining variables in the DataFrame
df = pd.concat([df, dummies], axis=1)


## MERF

In [15]:
from merf import MERF

In [17]:
# Split the data into two datasets based on the Year column
df_before = df[df['Year'].isin([2019, 2020])]
df_after = df[df['Year'].isin([2021, 2022])]

## Before model

## Train

In [19]:
# Split the data into train and test sets based on the raceID column
train_df = df_before[(df_before['raceID'] >= 1) & (df_before['raceID'] <= 29)]
test_df = df_before[(df_before['raceID'] >= 30) & (df_before['raceID'] <= 37)]

In [20]:
X_train = train_df.drop('Position', axis=1)
Y_train = train_df['Position']
X_test = test_df.drop('Position', axis=1)
Y_test = test_df['Position']

In [21]:
X_train = X_train.drop('RaceCountry', axis=1)
X_train = X_train.drop('Year', axis=1)
X_test = X_test.drop('RaceCountry', axis=1)
X_test = X_test.drop('Year', axis=1)

In [22]:
x_train = X_train.drop(['Abbreviation', 'raceID'], axis = 1)
Z_train = X_train[['raceID']]
clusters_train = X_train['raceID']

In [23]:
from sklearn.ensemble import RandomForestRegressor

mrf = MERF(fixed_effects_model=RandomForestRegressor(max_depth = 3), max_iterations=30)
#mrf = MERF(max_iterations=30)
#mrf = MERF(fixed_effects_model = LinearRegression(), max_iterations=30)
mrf.fit(x_train, Z_train, clusters_train, Y_train)


INFO     [merf.py:307] Training GLL is 1667.7207546822453 at iteration 1.
INFO     [merf.py:307] Training GLL is 1644.492180628175 at iteration 2.
INFO     [merf.py:307] Training GLL is 1641.503739236482 at iteration 3.
INFO     [merf.py:307] Training GLL is 1643.1927281405087 at iteration 4.
INFO     [merf.py:307] Training GLL is 1634.7337215036828 at iteration 5.
INFO     [merf.py:307] Training GLL is 1633.29006564651 at iteration 6.
INFO     [merf.py:307] Training GLL is 1630.438688044807 at iteration 7.
INFO     [merf.py:307] Training GLL is 1634.1952936398025 at iteration 8.
INFO     [merf.py:307] Training GLL is 1626.253326111924 at iteration 9.
INFO     [merf.py:307] Training GLL is 1626.6359900715845 at iteration 10.
INFO     [merf.py:307] Training GLL is 1623.7718507618831 at iteration 11.
INFO     [merf.py:307] Training GLL is 1622.434981957772 at iteration 12.
INFO     [merf.py:307] Training GLL is 1626.2529652467033 at iteration 13.
INFO     [merf.py:307] Training GLL is 16

<merf.merf.MERF at 0x7f9333ea76d0>

In [24]:
train_preds = mrf.predict(x_train, Z_train, clusters_train)

In [25]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)


Test MSE: 8.93061313801887
Test RMSE: 2.9884131471432913
Test MAE: 2.32267877166
Test R-squared: 0.7257863898088434


## Test

In [26]:
x_test = X_test.drop(['Abbreviation', 'raceID'], axis = 1)
Z_test = X_test[['raceID']]
clusters_test = X_test['raceID']

In [28]:
y_pred = mrf.predict(x_test, Z_test, clusters_test)

In [29]:
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# Print the performance metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)


Test MSE: 12.08399345924616
Test RMSE: 3.476203886317107
Test MAE: 2.809712983421306
Test R-squared: 0.6133977586616259


## After model

## Train

In [34]:
# Split the data into train and test sets based on the raceID column
train_df = df_after[(df_after['raceID'] >= 38) & (df_after['raceID'] <= 72)]
test_df = df_after[(df_after['raceID'] >= 73) & (df_after['raceID'] <= 80)]

In [35]:
X_train = train_df.drop('Position', axis=1)
Y_train = train_df['Position']
X_test = test_df.drop('Position', axis=1)
Y_test = test_df['Position']

In [36]:
X_train = X_train.drop('RaceCountry', axis=1)
X_train = X_train.drop('Year', axis=1)
X_test = X_test.drop('RaceCountry', axis=1)
X_test = X_test.drop('Year', axis=1)

In [37]:
x_train = X_train.drop(['Abbreviation', 'raceID'], axis = 1)
Z_train = X_train[['raceID']]
clusters_train = X_train['raceID']

In [38]:

mrf.fit(x_train, Z_train, clusters_train, Y_train)


INFO     [merf.py:307] Training GLL is 1943.336622875812 at iteration 1.
INFO     [merf.py:307] Training GLL is 1918.5978471651279 at iteration 2.
INFO     [merf.py:307] Training GLL is 1912.228048140183 at iteration 3.
INFO     [merf.py:307] Training GLL is 1907.9358149604664 at iteration 4.
INFO     [merf.py:307] Training GLL is 1903.4509125730285 at iteration 5.
INFO     [merf.py:307] Training GLL is 1902.0050403753103 at iteration 6.
INFO     [merf.py:307] Training GLL is 1894.5348328439893 at iteration 7.
INFO     [merf.py:307] Training GLL is 1893.032564766561 at iteration 8.
INFO     [merf.py:307] Training GLL is 1894.5779495112515 at iteration 9.
INFO     [merf.py:307] Training GLL is 1890.091034926103 at iteration 10.
INFO     [merf.py:307] Training GLL is 1888.1100596779393 at iteration 11.
INFO     [merf.py:307] Training GLL is 1880.865080873833 at iteration 12.
INFO     [merf.py:307] Training GLL is 1885.0335846085184 at iteration 13.
INFO     [merf.py:307] Training GLL is 

<merf.merf.MERF at 0x7f9333ea76d0>

In [39]:
train_preds = mrf.predict(x_train, Z_train, clusters_train)

In [40]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# Print the performance metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)

Test MSE: 10.198891307305878
Test RMSE: 3.1935703072432706
Test MAE: 2.4295741487177502
Test R-squared: 0.6772899505737366


## Test

In [41]:
x_test = X_test.drop(['Abbreviation', 'raceID'], axis = 1)
Z_test = X_test[['raceID']]
clusters_test = X_test['raceID']

In [42]:
y_pred = mrf.predict(x_test, Z_test, clusters_test)

In [43]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# Print the performance metrics
print("Test MSE:", mse)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R-squared:", r2)


Test MSE: 8.893606771940293
Test RMSE: 2.9822150780821115
Test MAE: 2.3635600126175174
Test R-squared: 0.7181766226712019
