In [1]:
from sklearn.ensemble import RandomForestRegressor

In [2]:
def read_and_concat(file_list):
    dfs = [pd.read_csv(f) for f in file_list]
    return pd.concat(dfs, ignore_index=True)

In [3]:
import glob
import pandas as pd

file_list_2022 = glob.glob("2022/*.csv")
file_list_2023 = glob.glob("2023/*.csv")
file_list_2024 = glob.glob("2024/*.csv")
file_list_2025 = glob.glob("2025/*.csv")

In [4]:
df_2022 = read_and_concat(file_list_2022)
df_2023 = read_and_concat(file_list_2023)
df_2024 = read_and_concat(file_list_2024)

big_df = pd.concat([df_2022, df_2023, df_2024], ignore_index=True)
big_df = big_df.dropna(subset=["FinishingPos"])
big_df["DriverId"] = big_df["DriverId"].astype("category")

In [5]:
X_train = big_df.drop(["FinishingPos"], axis=1)
y_train = big_df["FinishingPos"]

In [6]:
test_df = read_and_concat(file_list_2025)
test_df = test_df.dropna(subset=["FinishingPos"])
test_df["DriverId"] = test_df["DriverId"].astype("category")

In [7]:
X_test = test_df.drop(["FinishingPos"], axis=1)
Y_test = test_df["FinishingPos"]

In [8]:
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [9]:
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    "n_estimators" : [i for i in range(1, 1001, 10)],
    "max_depth" : [i for i in range(1,101)],
    "min_samples_split" : [i for i in range(2,101)],
    "min_samples_leaf" : [i for i in range(1,101)],
    "max_features" : ["sqrt", "log2", None],
    "bootstrap" : [True],
    "max_samples" : [0.5, 0.6, 0.7, 0.8, 0.9],
    "random_state" : [30]
}

rfr = RandomForestRegressor(random_state=30)
rfr = RandomizedSearchCV(
    estimator=rfr,
    param_distributions=param_dist,
    n_iter=200,
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring="neg_mean_squared_error"
)

In [10]:
rfr.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


0,1,2
,estimator,RandomForestR...ndom_state=30)
,param_distributions,"{'bootstrap': [True], 'max_depth': [1, 2, ...], 'max_features': ['sqrt', 'log2', ...], 'max_samples': [0.5, 0.6, ...], ...}"
,n_iter,200
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,841
,criterion,'squared_error'
,max_depth,43
,min_samples_split,33
,min_samples_leaf,7
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
def get_df_preds(X_test, Y_pred):
    df_preds = pd.DataFrame({
        "RaceId" : X_test["RacesInGEEra"],
        "PredictedValue" : Y_pred
    })

    # argsort the race predictions made by the model
    # each race is grouped by RacesInGEEra
    df_preds["PredictedRank"] = df_preds.groupby("RaceId")["PredictedValue"].rank(method="first", ascending=True).astype(int)

    return df_preds

In [12]:
Y_pred = rfr.predict(X_test)
df_preds = get_df_preds(X_test, Y_pred)

In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, median_absolute_error
from sklearn.inspection import permutation_importance
from scipy.stats import spearmanr, kendalltau

r2 = r2_score(Y_test, df_preds["PredictedRank"])
print("MAE:", mean_absolute_error(Y_test, df_preds["PredictedRank"]))
print("MSE:", mean_squared_error(Y_test, df_preds["PredictedRank"]))
print("r2:", r2)
print("Adjusted r2:", 1 - ((1 - r2) * (len(df_preds) - 1) / (len(df_preds) - 15 - 1)))
print("RMSE:", root_mean_squared_error(Y_test, df_preds["PredictedRank"]))
print("MEAE:", median_absolute_error(Y_test, df_preds["PredictedRank"]))
print("Spearmanr:", spearmanr(Y_test, df_preds["PredictedRank"])[0])
print("Kendalltau:", kendalltau(Y_test, df_preds["PredictedRank"])[0])
pi = permutation_importance(rfr, X_test, Y_test, n_repeats=10, random_state=30)
for i in pi.importances_mean.argsort()[::-1]:
    print(f"{X_test.columns[i]}: {pi.importances_mean[i]:.4f}")

MAE: 2.16988416988417
MSE: 10.069498069498069
r2: 0.6951277086464553
Adjusted r2: 0.6763084314024094
RMSE: 3.173247243676116
MEAE: 1.0
Spearmanr: 0.847532109272273
Kendalltau: 0.7011580830430278
Retired: 18.6385
QualifyingPos: 9.6490
TeamId: 2.5161
GapToPole: 0.8467
StartingPos: 0.6615
RelativeHumidity: 0.0246
RainBefore: 0.0083
RainDuring: 0.0048
WindSpeed: 0.0017
RacesInGEEra: 0.0000
Season: 0.0000
LocationId: -0.0068
DriverId: -0.0097
RoundNumber: -0.0112
ApparentTemp: -0.0188
TeammateQualifyingPos: -0.0827


In [14]:
for file in file_list_2025:
    spa = pd.read_csv(file)
    spa = spa.dropna(subset=["FinishingPos"])
    spa_X = spa.drop(["FinishingPos"], axis=1)
    spa_Y = spa["FinishingPos"]

    X_train, spa_X = X_train.align(spa_X, join='left', axis=1, fill_value=0)
    spa_Y_pred = rfr.predict(spa_X)

    print(mean_absolute_error(spa_Y, get_df_preds(spa_X, spa_Y_pred)["PredictedRank"]), file)

2.0 2025\Barcelona_race.csv


KeyError: ['FinishingPos']

In [None]:
from sklearn.inspection import permutation_importance

race = pd.read_csv("2025/Silverstone_race.csv")
race = race.dropna(subset=["FinishingPos"])
race_X = race.drop(["FinishingPos"], axis=1)
race_Y = race["FinishingPos"]

X_train, race_X = X_train.align(race_X, join='left', axis=1, fill_value=0)
race_Y_pred = rfr.predict(race_X)

df_preds = get_df_preds(race_X, race_Y_pred)

r2 = r2_score(race_Y, df_preds["PredictedRank"])
print("MAE:", mean_absolute_error(race_Y, df_preds["PredictedRank"]))
print("MSE:", mean_squared_error(race_Y, df_preds["PredictedRank"]))
print("r2:", r2)
print("Adjusted r2:", 1 - ((1 - r2) * (len(df_preds) - 1) / (len(df_preds) - 13 - 1)))
print("RMSE:", root_mean_squared_error(race_Y, df_preds["PredictedRank"]))
print("MEAE:", median_absolute_error(race_Y, df_preds["PredictedRank"]))
print("Spearmanr:", spearmanr(race_Y, df_preds["PredictedRank"])[0])
print("Kendalltau:", kendalltau(race_Y, df_preds["PredictedRank"])[0])

MAE: 3.1
MSE: 20.6
r2: 0.3804511278195488
Adjusted r2: -0.961904761904762
RMSE: 4.538722287164087
MEAE: 1.5
Spearmanr: 0.6902255639097743
Kendalltau: 0.5157894736842106


In [None]:

result = permutation_importance(rfr, race_X, race_Y, n_repeats=10, random_state=30)

for i in result.importances_mean.argsort()[::-1]:
    print(f"{X_test.columns[i]}: {result.importances_mean[i]:.4f}")

Retired: 26.7679
DriverId: 0.0902
ApparentTemp: 0.0000
WindSpeed: 0.0000
RainBefore: 0.0000
LocationId: 0.0000
RainDuring: 0.0000
RelativeHumidity: 0.0000
RacesInGEEra: 0.0000
RoundNumber: 0.0000
Season: 0.0000
QualifyingPos: -0.7335
TeammateQualifyingPos: -0.7761
GapToPole: -1.7443
StartingPos: -2.4078
TeamId: -4.1854


In [None]:
df_pred = get_df_preds(spa_X, spa_Y_pred)
for i, pred in df_pred.iterrows():
    print(f"{driver_numbers[spa_X["DriverId"][i]]} - {spa_X["QualifyingPos"]} - {pred["PredictedRank"]} - {spa_Y[i]}")

NameError: name 'driver_numbers' is not defined

In [15]:
driver_numbers = {
    1: 'Charles Leclerc',
    2: 'Carlos Sainz',
    3: 'Lewis Hamilton',
    4: 'George Russell',
    5: 'Kevin Magnussen',
    6: 'Valtteri Bottas',
    7: 'Esteban Ocon',
    8: 'Yuki Tsunoda',
    9: 'Fernando Alonso',
    10: 'Guanyu Zhou',
    11: 'Mick Schumacher',
    12: 'Lance Stroll',
    13: 'Alexander Albon',
    14: 'Daniel Ricciardo',
    15: 'Lando Norris',
    16: 'Nicholas Latifi',
    17: 'Nico Hulkenberg',
    18: 'Sergio Perez',
    19: 'Max Verstappen',
    20: 'Pierre Gasly',
    21: 'Sebastian Vettel',
    22: 'Nyck De Vries',
    23: 'Logan Sargeant',
    24: 'Oscar Piastri',
    25: 'Liam Lawson',
    26: 'Oliver Bearman',
    27: 'Franco Colapinto',
    28: 'Jack Doohan',
    29: 'Andrea Kimi Antonelli',
    30: 'Gabriel Bortoleto',
    31: 'Isack Hadjar',
    32: 'Kimi Antonelli'
}

In [16]:
best_rf = rfr.best_estimator_

importances = best_rf.feature_importances_
feature_names = X_train.columns

importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(importances_df)

                  Feature  Importance
12          QualifyingPos    0.415062
15                Retired    0.315450
1                  TeamId    0.128333
13              GapToPole    0.034338
11            StartingPos    0.033219
4            RacesInGEEra    0.025754
14  TeammateQualifyingPos    0.009461
10       RelativeHumidity    0.008040
9            ApparentTemp    0.006873
8               WindSpeed    0.006330
0                DriverId    0.005961
2                  Season    0.004086
5              LocationId    0.003830
3             RoundNumber    0.002977
6              RainBefore    0.000169
7              RainDuring    0.000115


In [19]:
hungary_X = pd.read_csv("2025/Hungary_race.csv")
preds = rfr.predict(hungary_X)

df_preds = get_df_preds(hungary_X, preds)

for i, pred in df_preds.iterrows():
    print(f"{driver_numbers[hungary_X["DriverId"][i]]} - {hungary_X["QualifyingPos"][i]} - {pred["PredictedRank"]}")

Charles Leclerc - 1.0 - 2.0
Oscar Piastri - 2.0 - 1.0
Lando Norris - 3.0 - 3.0
George Russell - 4.0 - 4.0
Fernando Alonso - 5.0 - 6.0
Lance Stroll - 6.0 - 7.0
Gabriel Bortoleto - 7.0 - 9.0
Max Verstappen - 8.0 - 5.0
Liam Lawson - 9.0 - 12.0
Isack Hadjar - 10.0 - 13.0
Oliver Bearman - 11.0 - 14.0
Lewis Hamilton - 12.0 - 8.0
Carlos Sainz - 13.0 - 15.0
Franco Colapinto - 14.0 - 16.0
Kimi Antonelli - 15.0 - 10.0
Yuki Tsunoda - 16.0 - 11.0
Pierre Gasly - 17.0 - 17.0
Esteban Ocon - 18.0 - 19.0
Nico Hulkenberg - 19.0 - 18.0
Alexander Albon - 20.0 - 20.0
