In [154]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

Import the dataset and do the required pre-processing

In [155]:
# Load the merged per-driver, per-race table that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer="mergedData.csv")

In [156]:
# Remove the "Unnamed: 0" column (presumably a row index written by an earlier
# to_csv call -- confirm against the script that produced mergedData.csv).
df = df.drop(columns=["Unnamed: 0"])

In [157]:
# Drop the driver identifiers and the Points column before modelling.
df = df.drop(columns=["DriverNumber", "Driver", "Points"])
# NOTE(review): `aux` is a second name for the same DataFrame object, not a copy.
# The in-place column assignments in the next cell will therefore also show up
# in `aux`. If an untouched snapshot was intended, this should be `df.copy()`.
aux = df

In [158]:
# Turn the two Status values of interest into 0/1 indicator columns; any other
# Status value yields 0 in both. The raw Status column is dropped afterwards.
df['carIssue'] = df['Status'].eq('carIssue').astype(int)
df['driverIssue'] = df['Status'].eq('driverIssue').astype(int)
df = df.drop(columns='Status')

In [160]:
# Remove three aggregate columns that are not used as model inputs.
df = df.drop(columns=["AverageRPM", "AvgLapTime", "AvgSplitTime"])

In [161]:
# Strip spaces from the categorical labels so the dummy-column names built
# from them later contain no whitespace (e.g. "TeamName_RedBullRacing").
for _label_col in ("RaceCountry", "TeamName", "Engine"):
    df[_label_col] = df[_label_col].str.replace(" ", "")

In [162]:
#df.to_csv(r'df_points.csv', index=True, header=True)
#df.to_csv(r'df_position.csv', index=True, header=True)
#df.to_csv(r'plotdata_points.csv', index=True, header=True)
#df.to_csv(r'plotdata_position.csv', index=True, header=True)

In [163]:
# Categorical (object-dtype) columns to one-hot encode. "Abbreviation" and
# "RaceCountry" are kept out of the encoding; they are dropped from the
# feature matrix later instead.
cat_vars = (
    df.select_dtypes(include=['object'])
    .columns
    .drop(["Abbreviation", "RaceCountry"])
    .tolist()
)

# One-hot encode, dropping the first level of every variable as the reference level.
dummies = pd.get_dummies(df[cat_vars], drop_first=True)

# Replace the original categorical columns with their dummy columns
# (dummies are appended to the right of the remaining columns).
df = pd.concat([df.drop(columns=cat_vars), dummies], axis=1)

In [164]:
# Two modelling periods, selected on the Year column:
# "before" = 2019 and 2020, "after" = 2021 and 2022.
df_before = df.loc[df['Year'].isin([2019, 2020])]
df_after = df.loc[df['Year'].isin([2021, 2022])]

## MERF (mixed-effects model with raceID as the cluster)

In [165]:
from merf import MERF

## Before model (Year 2019 and 2020)

## Train

In [166]:
# Hold out whole races rather than random rows: raceID 1-29 are used for
# training and raceID 30-37 for testing (both bounds inclusive).
train_df = df_before[df_before['raceID'].between(1, 29)]
test_df = df_before[df_before['raceID'].between(30, 37)]

In [167]:
# Target is the Position column; every other column stays on the X side for now.
Y_train, Y_test = train_df['Position'], test_df['Position']
X_train = train_df.drop(columns='Position')
X_test = test_df.drop(columns='Position')

In [168]:
# RaceCountry and Year are not model inputs. Note that this cell is not
# re-runnable on its own: a second run raises KeyError because the columns
# are already gone.
X_train = X_train.drop(columns=['RaceCountry', 'Year'])
X_test = X_test.drop(columns=['RaceCountry', 'Year'])




In [169]:
# MERF takes three aligned inputs: the fixed-effects features, the
# random-effects design matrix, and the cluster label of every row.
clusters_train = X_train['raceID']
# NOTE(review): Z holds the raceID value itself as its single column; a plain
# random intercept per race would normally use a column of ones -- confirm
# this is intended.
Z_train = X_train[['raceID']]
x_train = X_train.drop(columns=['Abbreviation', 'raceID'])

In [170]:
import xgboost as xgb

# Gradient-boosted trees act as the fixed-effects learner inside MERF.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=300,
    learning_rate=0.08,
    max_depth=8,
)

# Run 30 MERF iterations; each iteration logs the training GLL.
mrf = MERF(fixed_effects_model=xgb_model, max_iterations=30)
mrf.fit(X=x_train, Z=Z_train, clusters=clusters_train, y=Y_train)

INFO     [merf.py:307] Training GLL is -1862.2847864719645 at iteration 1.
INFO     [merf.py:307] Training GLL is -3647.630655191136 at iteration 2.
INFO     [merf.py:307] Training GLL is -5272.782182554975 at iteration 3.
INFO     [merf.py:307] Training GLL is -5711.852787234318 at iteration 4.
INFO     [merf.py:307] Training GLL is -5326.753129410458 at iteration 5.
INFO     [merf.py:307] Training GLL is -5275.457197358164 at iteration 6.
INFO     [merf.py:307] Training GLL is -5892.664099150863 at iteration 7.
INFO     [merf.py:307] Training GLL is -5477.966383848667 at iteration 8.
INFO     [merf.py:307] Training GLL is -5657.643075471162 at iteration 9.
INFO     [merf.py:307] Training GLL is -5617.1689197573805 at iteration 10.
INFO     [merf.py:307] Training GLL is -5165.792099773413 at iteration 11.
INFO     [merf.py:307] Training GLL is -5488.354094190334 at iteration 12.
INFO     [merf.py:307] Training GLL is -5696.382980351925 at iteration 13.
INFO     [merf.py:307] Training 

<merf.merf.MERF at 0x7f7c9ad3b580>

In [171]:
#regressor = mrf.fit(x_train, Z_train, clusters_train, Y_train)

In [172]:
# In-sample predictions on the same rows the model was fitted on.
train_preds = mrf.predict(X=x_train, Z=Z_train, clusters=clusters_train)

In [173]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample fit of the "before" model: the training targets are compared with
# the predictions made on those same training rows.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# These are training-set figures, so they are labelled "Train" (they were
# previously printed as "Test", which made them easy to confuse with the
# held-out metrics reported further down).
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)


Test MSE: 5.8217849457097465e-05
Test RMSE: 0.007630062218428987
Test MAE: 0.005132014482793674
Test R-squared: 0.9999982124265792


## Test

In [174]:
# Same three MERF inputs as for training, built from the held-out races.
clusters_test = X_test['raceID']
Z_test = X_test[['raceID']]
x_test = X_test.drop(columns=['Abbreviation', 'raceID'])

In [176]:
# Predictions for the held-out races of the "before" period.
y_pred = mrf.predict(X=x_test, Z=Z_test, clusters=clusters_test)

In [177]:
# Held-out performance of the "before" model.
mse = mean_squared_error(y_true=Y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
r2 = r2_score(y_true=Y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# Report the four metrics in a fixed order.
for _metric_name, _metric_value in (
    ("MSE", mse),
    ("RMSE", rmse),
    ("MAE", mae),
    ("R-squared", r2),
):
    print(f"Test {_metric_name}:", _metric_value)


Test MSE: 10.222745296459669
Test RMSE: 3.1973028158839862
Test MAE: 2.4326430141151727
Test R-squared: 0.6729445230526319


In [178]:
# Hand-computed R-squared as a cross-check of sklearn's r2_score:
# R^2 = 1 - SSE / SST.
actual_values, predictions = Y_test, y_pred

# Mean of the observed targets
mean_actual = sum(actual_values) / len(actual_values)

# SSE: squared residuals of the model; SST: squared deviations from the mean
squared_residuals = [(actual - pred) ** 2 for actual, pred in zip(actual_values, predictions)]
squared_deviations = [(actual - mean_actual) ** 2 for actual in actual_values]
sse = sum(squared_residuals)
sst = sum(squared_deviations)

r2 = 1 - (sse / sst)
r2

0.6729445230526319

In [179]:
# Display the R-squared value again; it still holds the hand-computed result
# assigned in the previous cell.
r2

0.6729445230526319

In [180]:
# Predicted vs. actual Position on the held-out races of the "before" period
import plotly.express as px

data = {"Y_test": Y_test, "y_pred": y_pred}

# Two-column frame (indexed like Y_test) that feeds the scatter plot
dfplot = pd.DataFrame(data)

# Print the DataFrame
print(dfplot)

color_palette = ['#171C54']  # Specify your desired colors

# The marker colour is supplied through color_discrete_sequence so that it
# applies to every point. Passing the one-element list to `marker.color`
# via update_traces is interpreted by plotly as a per-point colour array,
# which only covers the first marker, and the unfiltered update also touched
# the trendline trace.
fig = px.scatter(
    dfplot, x='Y_test', y='y_pred', opacity=0.65,
    trendline='ols', trendline_color_override='red',
    color_discrete_sequence=color_palette,
    width=550, height=400
)

fig.show()

     Y_test  y_pred
568  9.0000  8.9324
569  4.0000  3.8714
570  6.0000 10.0012
571 12.0000 15.2203
572 10.0000  8.2199
..      ...     ...
717 17.0000 16.5710
718 15.0000 15.9106
719 12.0000 13.7244
720  2.0000  1.2770
721 16.0000 13.9127

[154 rows x 2 columns]


In [181]:
# Built-in feature importances of the fitted fixed-effects (XGBoost) model,
# listed from most to least important.
importance = mrf.trained_fe_model.feature_importances_

importance_df_before = (
    pd.DataFrame({'Feature': x_train.columns, 'Importance': importance})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)
print(importance_df_before)


                     Feature  Importance
0                   carIssue      0.5337
1          TeamName_Williams      0.0932
2          TeamName_Mercedes      0.0496
3               GridPosition      0.0419
4                driverIssue      0.0351
5               INTERMEDIATE      0.0349
6        TeamName_HaasF1Team      0.0332
7     TeamName_RedBullRacing      0.0264
8   TeamName_AlfaRomeoRacing      0.0258
9           TeamName_McLaren      0.0147
10                 PitstopNo      0.0120
11                      FLap      0.0120
12                     Brake      0.0115
13              AverageSpeed      0.0108
14              Engine_Honda      0.0097
15      TeamName_RacingPoint      0.0075
16                      SOFT      0.0053
17                  MaxSpeed      0.0048
18            Engine_Renault      0.0045
19          TeamName_Ferrari      0.0040
20           AverageThrottle      0.0039
21                 SDLapTime      0.0035
22                    MaxRPM      0.0034
23              

In [182]:
# Permutation importance on the held-out races. Only the fixed-effects model
# is scored here (mrf.trained_fe_model), not the full MERF predictor.
from sklearn.inspection import permutation_importance

result = permutation_importance(
    estimator=mrf.trained_fe_model,
    X=x_test,
    y=Y_test,
    n_repeats=10,
    random_state=42,
)

# Mean score drop per feature across the 10 shuffles
importance_perm = result.importances_mean

# Pair every score with its feature name and rank from highest to lowest
importance_df_before_perm = (
    pd.DataFrame({'Feature': x_train.columns, 'Importance': importance_perm})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)

print(importance_df_before_perm)


                     Feature  Importance
0               GridPosition      0.5293
1                   carIssue      0.2336
2               AverageSpeed      0.0527
3                  SDLapTime      0.0425
4                 AvgPitTime      0.0245
5        TeamName_HaasF1Team      0.0189
6                  PitstopNo      0.0173
7          TeamName_Williams      0.0171
8     TeamName_RedBullRacing      0.0162
9          TeamName_Mercedes      0.0150
10                     Brake      0.0116
11               driverIssue      0.0094
12          TeamName_McLaren      0.0087
13           AverageThrottle      0.0083
14                    MEDIUM      0.0071
15                      SOFT      0.0067
16            MaxThrottlePct      0.0038
17              INTERMEDIATE      0.0036
18                      HARD      0.0035
19                   AgeAtGP      0.0026
20      TeamName_RacingPoint      0.0022
21                      FLap      0.0018
22            Engine_Renault      0.0005
23          Team

## After model (Year 2021 and 2022)

## Train

In [185]:
# Hold out whole races again: raceID 38-72 are used for training and
# raceID 73-80 for testing (both bounds inclusive).
train_df = df_after[df_after['raceID'].between(38, 72)]
test_df = df_after[df_after['raceID'].between(73, 80)]

In [186]:
# Target is the Position column; every other column stays on the X side for now.
Y_train, Y_test = train_df['Position'], test_df['Position']
X_train = train_df.drop(columns='Position')
X_test = test_df.drop(columns='Position')

In [187]:
# RaceCountry and Year are not model inputs (a second run of this cell raises
# KeyError because the columns are already gone).
X_train = X_train.drop(columns=['RaceCountry', 'Year'])
X_test = X_test.drop(columns=['RaceCountry', 'Year'])

In [188]:
# Fixed-effects features, random-effects design matrix and cluster labels
# for the "after" training races.
clusters_train = X_train['raceID']
Z_train = X_train[['raceID']]
x_train = X_train.drop(columns=['Abbreviation', 'raceID'])

In [189]:
# Refit on the "after" training races.
# NOTE(review): this reuses the very same MERF object (and the XGBoost
# regressor inside it) that was fitted on the "before" period, so from here on
# `mrf` is the "after" model and the "before" fit is no longer available.
# Re-running any earlier "before" cell after this one would silently use the
# "after" model.
mrf.fit(X=x_train, Z=Z_train, clusters=clusters_train, y=Y_train)

INFO     [merf.py:307] Training GLL is -2351.47307003303 at iteration 1.
INFO     [merf.py:307] Training GLL is -4361.795891854094 at iteration 2.
INFO     [merf.py:307] Training GLL is -5406.22141019901 at iteration 3.
INFO     [merf.py:307] Training GLL is -5429.750043488067 at iteration 4.
INFO     [merf.py:307] Training GLL is -5440.340304925421 at iteration 5.
INFO     [merf.py:307] Training GLL is -6000.183006218617 at iteration 6.
INFO     [merf.py:307] Training GLL is -5318.946551047956 at iteration 7.
INFO     [merf.py:307] Training GLL is -5325.136465958087 at iteration 8.
INFO     [merf.py:307] Training GLL is -5537.893274981799 at iteration 9.
INFO     [merf.py:307] Training GLL is -5563.921702170532 at iteration 10.
INFO     [merf.py:307] Training GLL is -5281.973155850868 at iteration 11.
INFO     [merf.py:307] Training GLL is -5614.690147888443 at iteration 12.
INFO     [merf.py:307] Training GLL is -5367.6349086815735 at iteration 13.
INFO     [merf.py:307] Training GLL

<merf.merf.MERF at 0x7f7c9ad3b580>

In [190]:
# In-sample predictions of the "after" model on its own training rows.
train_preds = mrf.predict(X=x_train, Z=Z_train, clusters=clusters_train)

In [191]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# In-sample fit of the "after" model: the training targets are compared with
# the predictions made on those same training rows.
mse = mean_squared_error(Y_train, train_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_train, train_preds)
r2 = r2_score(Y_train, train_preds)

# These are training-set figures, so they are labelled "Train" (they were
# previously printed as "Test", which made them easy to confuse with the
# held-out metrics reported further down).
print("Train MSE:", mse)
print("Train RMSE:", rmse)
print("Train MAE:", mae)
print("Train R-squared:", r2)

Test MSE: 0.0002888596563726572
Test RMSE: 0.01699587174500494
Test MAE: 0.01176123859277412
Test R-squared: 0.9999908599953489


## Test

In [192]:
# Same three MERF inputs, built from the held-out "after" races.
clusters_test = X_test['raceID']
Z_test = X_test[['raceID']]
x_test = X_test.drop(columns=['Abbreviation', 'raceID'])

In [193]:
# Predictions for the held-out races of the "after" period.
y_pred = mrf.predict(X=x_test, Z=Z_test, clusters=clusters_test)

In [194]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Held-out performance of the "after" model.
mse = mean_squared_error(y_true=Y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
r2 = r2_score(y_true=Y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# Report the four metrics in a fixed order.
for _metric_name, _metric_value in (
    ("MSE", mse),
    ("RMSE", rmse),
    ("MAE", mae),
    ("R-squared", r2),
):
    print(f"Test {_metric_name}:", _metric_value)


Test MSE: 8.259772518771722
Test RMSE: 2.8739819969463487
Test MAE: 2.180150468503275
Test R-squared: 0.7382617596100449


In [195]:
# Predicted vs. actual Position on the held-out races of the "after" period
import plotly.express as px

data = {"Y_test": Y_test, "y_pred": y_pred}

# Two-column frame (indexed like Y_test) that feeds the scatter plot
dfplot = pd.DataFrame(data)

# Print the DataFrame
print(dfplot)

color_palette = ['#2871AD']  # Specify your desired colors

# The marker colour is supplied through color_discrete_sequence so that it
# applies to every point. Passing the one-element list to `marker.color`
# via update_traces is interpreted by plotly as a per-point colour array,
# which only covers the first marker, and the unfiltered update also touched
# the trendline trace.
fig = px.scatter(
    dfplot, x='Y_test', y='y_pred', opacity=0.65,
    trendline='ols', trendline_color_override='red',
    color_discrete_sequence=color_palette,
    width=550, height=400
)

fig.show()

      Y_test  y_pred
1392  1.0000  2.3189
1393 11.0000 11.0738
1394  5.0000  3.1241
1395  6.0000 10.7640
1396  3.0000  3.4565
...      ...     ...
1542 10.0000  7.5423
1543  4.0000  3.9166
1544 19.0000 15.1147
1545  5.0000  3.2889
1546 15.0000 13.9034

[155 rows x 2 columns]


In [196]:
# Built-in feature importances of the fixed-effects model after the refit on
# the "after" period, listed from most to least important.
importance = mrf.trained_fe_model.feature_importances_
importance_df_after = (
    pd.DataFrame({'Feature': x_train.columns, 'Importance': importance})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)
print(importance_df_after)

                     Feature  Importance
0                   carIssue      0.5997
1        TeamName_HaasF1Team      0.0628
2                driverIssue      0.0543
3     TeamName_RedBullRacing      0.0539
4          TeamName_Mercedes      0.0459
5               GridPosition      0.0284
6           TeamName_Ferrari      0.0272
7          TeamName_Williams      0.0270
8             Engine_RedBull      0.0206
9                      Brake      0.0074
10              AverageSpeed      0.0067
11  TeamName_AlfaRomeoRacing      0.0061
12                 PitstopNo      0.0058
13              INTERMEDIATE      0.0052
14           TeamName_Alpine      0.0046
15            MaxThrottlePct      0.0044
16                  MaxSpeed      0.0044
17                AvgPitTime      0.0042
18           AverageThrottle      0.0039
19                      SOFT      0.0033
20                    MEDIUM      0.0025
21      TeamName_AstonMartin      0.0025
22                      FLap      0.0023
23          Team

In [None]:
# Show floats with four decimals from here on
pd.set_option('display.float_format', '{:.4f}'.format)

# One row per feature with its "Before" and "After" importance side by side.
# The columns are named up front, so no suffix clean-up is needed after the
# (inner) merge on Feature. Note that `importance` is rebound here from the
# raw importance array to this DataFrame.
importance = importance_df_before.rename(columns={'Importance': "Before"}).merge(
    importance_df_after.rename(columns={'Importance': "After"}),
    on="Feature",
)

importance

Unnamed: 0,Feature,Before,After
0,carIssue,0.5337,0.5997
1,TeamName_Williams,0.0932,0.027
2,TeamName_Mercedes,0.0496,0.0459
3,GridPosition,0.0419,0.0284
4,driverIssue,0.0351,0.0543
5,INTERMEDIATE,0.0349,0.0052
6,TeamName_HaasF1Team,0.0332,0.0628
7,TeamName_RedBullRacing,0.0264,0.0539
8,TeamName_AlfaRomeoRacing,0.0258,0.0061
9,TeamName_McLaren,0.0147,0.0022


## Create plots for visualization of feature importances

In [198]:
import plotly.graph_objects as go

# Columns feeding the grouped bar chart
features = importance['Feature']
before_values = importance['Before']
after_values = importance['After']

# One bar series per period, sharing the same feature axis
trace_before = go.Bar(name='Before', x=features, y=before_values)
trace_after = go.Bar(name='After', x=features, y=after_values)

layout = go.Layout(
    title='Comparison of Before and After',
    xaxis={'title': 'Feature'},
    yaxis={'title': 'Value'},
)

# Start from the layout, then attach both series in display order
fig = go.Figure(layout=layout)
fig.add_traces([trace_before, trace_after])

fig.show()


In [199]:
# Top 10 features - before and Top 10 features - after

import plotly.graph_objects as go


def plot_top_features(top_features, title, bar_colors):
    """Show a grouped Before/After bar chart for the given features.

    Parameters
    ----------
    top_features : pandas.DataFrame
        Rows to plot, in display order. Must contain the columns
        'Feature', 'Before' and 'After'.
    title : str
        Chart title.
    bar_colors : sequence of str
        Two colours: the first for the 'Before' bars, the second for the
        'After' bars.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, after it has been shown.
    """
    bars = [
        go.Bar(
            x=top_features['Feature'],
            y=top_features[period],
            name=period,
            marker=dict(color=bar_color),
            textposition='auto',
        )
        for period, bar_color in zip(('Before', 'After'), bar_colors)
    ]
    figure = go.Figure(
        data=bars,
        layout=go.Layout(
            title=title,
            xaxis=dict(title='Feature'),
            yaxis=dict(title='Value'),
        ),
    )
    figure.show()
    return figure


colors = ['#171C54', '#93B8D6']

# Ten features with the highest importance in each period
top_10_before = importance.sort_values(by='Before', ascending=False).head(10)
top_10_after = importance.sort_values(by='After', ascending=False).head(10)

# Both charts show the Before and After bars side by side; they differ only
# in which period's ranking selects and orders the features.
fig_before = plot_top_features(top_10_before, 'Top 10 Features - Before', colors)
fig_after = plot_top_features(top_10_after, 'Top 10 Features - After', colors)


In [200]:
# Create the same plots but without the carIssue feature

# Exclude carIssue by name. Dropping "the first row" only works while
# carIssue happens to be ranked first in the merged frame and would silently
# remove a different feature otherwise.
importance_aux = importance[importance['Feature'] != 'carIssue']

colors = ['#171C54', '#93B8D6']

# Ten features with the highest importance in each period
top_10_before = importance_aux.sort_values(by='Before', ascending=False).head(10)
top_10_after = importance_aux.sort_values(by='After', ascending=False).head(10)

# Both charts show the Before and After bars side by side; they differ only
# in which period's ranking selects and orders the features.
figures = {}
for ranked_by, top_10 in (('Before', top_10_before), ('After', top_10_after)):
    figures[ranked_by] = go.Figure(
        data=[
            go.Bar(
                x=top_10['Feature'],
                y=top_10[period],
                name=period,
                marker=dict(color=bar_color),
                textposition='auto',
            )
            for period, bar_color in zip(('Before', 'After'), colors)
        ],
        layout=go.Layout(
            title=f'Top 10 Features - {ranked_by} (without carIssue)',
            xaxis=dict(title='Feature'),
            yaxis=dict(title='Value'),
        ),
    )
    figures[ranked_by].show()

fig_before, fig_after = figures['Before'], figures['After']


In [201]:
# Change in importance between the two periods (After minus Before);
# positive values mean the feature gained importance.
importance["Difference"] = importance["After"].sub(importance["Before"])
importance

Unnamed: 0,Feature,Before,After,Difference
0,carIssue,0.5337,0.5997,0.066
1,TeamName_Williams,0.0932,0.027,-0.0662
2,TeamName_Mercedes,0.0496,0.0459,-0.0037
3,GridPosition,0.0419,0.0284,-0.0135
4,driverIssue,0.0351,0.0543,0.0192
5,INTERMEDIATE,0.0349,0.0052,-0.0298
6,TeamName_HaasF1Team,0.0332,0.0628,0.0296
7,TeamName_RedBullRacing,0.0264,0.0539,0.0275
8,TeamName_AlfaRomeoRacing,0.0258,0.0061,-0.0196
9,TeamName_McLaren,0.0147,0.0022,-0.0125


In [202]:
# The four largest and the two smallest values of "Difference"
top_highest = importance.nlargest(4, 'Difference')
top_lowest = importance.nsmallest(2, 'Difference')

# Stack both selections (largest first) and keep only Feature and Difference
result = (
    pd.concat([top_highest, top_lowest], ignore_index=True)
    .drop(columns=["Before", "After"])
)

In [203]:
import plotly.graph_objects as go

# Order the bars from the most negative to the most positive difference
df_sorted = result.sort_values('Difference')

# Dark bars for differences >= 0, light bars for negative ones
colors = (
    df_sorted['Difference']
    .ge(0)
    .map({True: '#171C54', False: '#93B8D6'})
    .tolist()
)

# Single bar series, one bar per feature
fig = go.Figure(
    data=[
        go.Bar(
            x=df_sorted['Feature'],
            y=df_sorted['Difference'],
            marker_color=colors,
        )
    ]
)

# Axis titles
fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="Difference",
)

fig.show()


In [204]:
# Permutation importance of the "after" model on its held-out races. Only the
# fixed-effects model is scored. Note that `result` is rebound here from the
# difference table to the permutation-importance output.
result = permutation_importance(
    estimator=mrf.trained_fe_model,
    X=x_test,
    y=Y_test,
    n_repeats=10,
    random_state=42,
)

# Mean score drop per feature across the 10 shuffles
importance_perm = result.importances_mean

# Pair every score with its feature name and rank from highest to lowest
importance_df_after_perm = (
    pd.DataFrame({'Feature': x_train.columns, 'Importance': importance_perm})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)

print(importance_df_after_perm)

                     Feature  Importance
0               GridPosition      0.4453
1                   carIssue      0.4452
2     TeamName_RedBullRacing      0.0419
3        TeamName_HaasF1Team      0.0355
4                   MaxSpeed      0.0320
5           TeamName_Ferrari      0.0206
6          TeamName_Mercedes      0.0184
7               AverageSpeed      0.0167
8              BestQualiTime      0.0142
9                  PitstopNo      0.0130
10         TeamName_Williams      0.0112
11           AverageThrottle      0.0109
12                   AgeAtGP      0.0109
13                    MaxRPM      0.0103
14               driverIssue      0.0049
15                AvgPitTime      0.0045
16           TeamName_Alpine      0.0030
17                    MEDIUM      0.0014
18       TeamName_AlphaTauri      0.0014
19              INTERMEDIATE      0.0013
20            Engine_RedBull      0.0006
21          TeamName_McLaren      0.0005
22      TeamName_AstonMartin      0.0004
23           Eng

In [205]:
# Show floats with four decimals
pd.set_option('display.float_format', '{:.4f}'.format)

# Side-by-side permutation scores per feature. The default merge suffixes are
# kept, so Importance_x is the "before" score and Importance_y the "after" score.
importance_perm = importance_df_before_perm.merge(importance_df_after_perm, on="Feature")
importance_perm

Unnamed: 0,Feature,Importance_x,Importance_y
0,GridPosition,0.5293,0.4453
1,carIssue,0.2336,0.4452
2,AverageSpeed,0.0527,0.0167
3,SDLapTime,0.0425,-0.0004
4,AvgPitTime,0.0245,0.0045
5,TeamName_HaasF1Team,0.0189,0.0355
6,PitstopNo,0.0173,0.013
7,TeamName_Williams,0.0171,0.0112
8,TeamName_RedBullRacing,0.0162,0.0419
9,TeamName_Mercedes,0.015,0.0184
