In [264]:
from sklearn.ensemble import RandomForestRegressor

In [318]:
def read_and_concat(file_list):
    """Read every CSV in ``file_list`` and stack the rows into one DataFrame.

    Parameters
    ----------
    file_list : iterable of str or path-like
        Paths of the CSV files to load. Files are read in the order given.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``file_list`` is empty, e.g. because a glob pattern matched nothing.
    """
    # Materialise once so generators work and emptiness can be checked up front.
    paths = list(file_list)
    if not paths:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail early with a message that points at the likely cause instead.
        raise ValueError(
            "read_and_concat: file_list is empty; "
            "check that the glob pattern matched any CSV files"
        )

    frames = [pd.read_csv(path) for path in paths]
    return pd.concat(frames, ignore_index=True)

In [567]:
import glob
import pandas as pd

# One list of race CSV paths per season.
# glob.glob returns matches in file-system order, which is not guaranteed to be
# the same across machines. The order determines the row order of the
# concatenated frames, so sort the paths to keep runs reproducible.
file_list_2022 = sorted(glob.glob("2022/*.csv"))
file_list_2023 = sorted(glob.glob("2023/*.csv"))
file_list_2024 = sorted(glob.glob("2024/*.csv"))
file_list_2025 = sorted(glob.glob("2025/*.csv"))

In [568]:
# Load one frame per training season (2022-2024).
df_2022, df_2023, df_2024 = (
    read_and_concat(paths)
    for paths in (file_list_2022, file_list_2023, file_list_2024)
)

# Stack the seasons and keep only rows that have a finishing position.
big_df = pd.concat(
    [df_2022, df_2023, df_2024], ignore_index=True
).dropna(subset=["FinishingPos"])

In [569]:
# Target is the finishing position; every other column is used as a feature.
y_train = big_df["FinishingPos"]
X_train = big_df.drop(columns=["FinishingPos"])

In [570]:
# The 2025 files form the test set; rows with no finishing position are dropped.
test_df = read_and_concat(file_list_2025).dropna(subset=["FinishingPos"])

In [571]:
# Same feature/target split as for the training frame.
Y_test = test_df["FinishingPos"]
X_test = test_df.drop(columns=["FinishingPos"])

In [572]:
from sklearn.model_selection import RandomizedSearchCV

# Single seed shared by the forest and by the search's candidate sampling.
RANDOM_SEED = 30

# Search space for the random forest. Each key is a RandomForestRegressor
# constructor argument; each value is the list of settings to sample from.
param_dist = {
    "n_estimators": list(range(1, 1001, 10)),
    "max_depth": list(range(1, 101)),
    "min_samples_split": list(range(2, 101)),
    "min_samples_leaf": list(range(1, 101)),
    "max_features": ["sqrt", "log2", None],
    # max_samples is only valid when bootstrap is enabled, so bootstrap is fixed.
    "bootstrap": [True],
    "max_samples": [0.5, 0.6, 0.7, 0.8, 0.9],
    "random_state": [RANDOM_SEED],
}

# `rfr` is the search object itself; after fitting it predicts with the best
# forest found and exposes it as `rfr.best_estimator_`.
rfr = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=RANDOM_SEED),
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring="r2",
    # Without a seed here the 100 sampled candidates differ on every run,
    # even though each individual forest is seeded.
    random_state=RANDOM_SEED,
)

In [None]:
# Run the randomized hyper-parameter search on the training data
# (100 candidates x 5 folds = 500 fits, as configured on `rfr`).
rfr.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
def get_df_preds(X_test, Y_pred):
    """Turn raw model predictions into a predicted rank within each race.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature rows that were predicted on. Must contain a ``RacesInGEEra``
        column, which is used as the race identifier for grouping.
    Y_pred : array-like
        One predicted value per row of ``X_test``, in the same row order.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``X_test`` with columns ``RaceId``, ``PredictedValue`` and
        ``PredictedRank`` (integer, 1 = lowest predicted value in that race).
    """
    # Take the predictions positionally. If Y_pred were a Series with a
    # different index than X_test, the DataFrame constructor would align on
    # labels and silently fill both columns with NaN.
    predicted_values = pd.Series(Y_pred).to_numpy()

    df_preds = pd.DataFrame({
        "RaceId" : X_test["RacesInGEEra"],
        "PredictedValue" : predicted_values
    })

    # Rank the predictions inside each race (rows sharing a RacesInGEEra value).
    # method="first" breaks ties by row order, so ranks within a race are unique.
    ranks = df_preds.groupby("RaceId")["PredictedValue"].rank(method="first", ascending=True)
    df_preds["PredictedRank"] = ranks.astype(int)

    return df_preds

In [564]:
# Predict on the held-out test rows with the fitted search object.
Y_pred = rfr.predict(X_test)
# Convert the raw predictions into a predicted rank within each race.
df_preds = get_df_preds(X_test, Y_pred)

In [565]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the predicted within-race ranks against the true finishing positions.
# Printed one per line in this order: MAE, MSE, R^2.
print(
    *(
        score(Y_test, df_preds["PredictedRank"])
        for score in (mean_absolute_error, mean_squared_error, r2_score)
    ),
    sep="\n",
)

2.2393822393822393
10.447876447876448
0.6836716179437532


In [533]:
# Report the R^2 of the predicted ranks separately for each 2025 race file.
# The spa* names are kept because later cells read these variables.
for file in file_list_2025:
    spa = pd.read_csv(file)
    # errors="ignore": tolerate CSVs that do not have a FullName column.
    spa = spa.drop(columns=["FullName"], errors="ignore")
    spa = spa.dropna(subset=["FinishingPos"])
    spa = pd.get_dummies(spa)
    spa_X = spa.drop(columns=["FinishingPos"])
    spa_Y = spa["FinishingPos"]

    # Give the race frame exactly the training columns, in training order:
    # columns missing from this race are added as 0, extra ones are dropped.
    # reindex only touches spa_X, so the global X_train is no longer rebound
    # on every iteration as it was with DataFrame.align.
    spa_X = spa_X.reindex(columns=X_train.columns, fill_value=0)
    spa_Y_pred = rfr.predict(spa_X)

    print(r2_score(spa_Y, get_df_preds(spa_X, spa_Y_pred)["PredictedRank"]), file)

0.6666666666666667 2025\Barcelona_race.csv
0.8045112781954887 2025\Imola_race.csv
0.9157894736842105 2025\Jeddah_race.csv
0.6812030075187969 2025\Melbourne_race.csv
0.8796992481203008 2025\Miami_race.csv
0.5308270676691729 2025\Monaco_race.csv
0.7804511278195488 2025\Montreal_race.csv
0.6330827067669174 2025\Sakhir_race.csv
0.6451127819548872 2025\Shanghai_race.csv
0.33834586466165417 2025\Silverstone_race.csv
0.4676691729323308 2025\Spa-Francorchamps_race.csv
0.6691729323308271 2025\Spielberg_race.csv
0.924812030075188 2025\Suzuka_race.csv


In [534]:
# Evaluate a single race (Silverstone 2025) so its rows can be inspected below.
spa = pd.read_csv("2025/Silverstone_race.csv")
# errors="ignore": tolerate a CSV that does not have a FullName column.
spa = spa.drop(columns=["FullName"], errors="ignore")
spa = spa.dropna(subset=["FinishingPos"])
spa = pd.get_dummies(spa)
spa_X = spa.drop(columns=["FinishingPos"])
spa_Y = spa["FinishingPos"]

# Give the race frame exactly the training columns, in training order:
# columns missing from this race are added as 0, extra ones are dropped.
# reindex only touches spa_X, so the global X_train is left untouched.
spa_X = spa_X.reindex(columns=X_train.columns, fill_value=0)
spa_Y_pred = rfr.predict(spa_X)

print(r2_score(spa_Y, get_df_preds(spa_X, spa_Y_pred)["PredictedRank"]))

0.33834586466165417


In [535]:
# Print "driver - predicted rank - actual finishing position" for the race
# currently held in spa_X / spa_Y.
# NOTE(review): driver_names is defined in a later cell of this notebook, so
# this cell fails on a fresh kernel unless that cell is run first.
df_pred = get_df_preds(spa_X, spa_Y_pred)
for i, pred in df_pred.iterrows():
    # i is the row's index label, shared by df_pred, spa_X and spa_Y.
    # iterrows yields each row as a float Series, so the rank prints as e.g. 3.0.
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"{driver_names[spa_X['DriverNumber'][i]]} - {pred['PredictedRank']} - {spa_Y[i]}")

Lando Norris - 3.0 - 1.0
Oscar Piastri - 2.0 - 2.0
Nico Hulkenberg - 15.0 - 3.0
Lewis Hamilton - 5.0 - 4.0
Max Verstappen - 1.0 - 5.0
Pierre Gasly - 9.0 - 6.0
Lance Stroll - 14.0 - 7.0
Alex Albon - 12.0 - 8.0
Fernando Alonso - 8.0 - 9.0
George Russell - 6.0 - 10.0
Oliver Bearman - 10.0 - 11.0
Carlos Sainz - 11.0 - 12.0
Estaban Ocon - 13.0 - 13.0
Charles Leclerc - 4.0 - 14.0
Yuki Tsunoda - 7.0 - 15.0
Kimi Antonelli - 16.0 - 16.0
Isack Hadjar - 20.0 - 17.0
Gabriel Bortoleto - 19.0 - 18.0
Liam Lawson - 17.0 - 19.0
Franco Colapinto - 18.0 - 20.0


In [221]:
# Lookup from the DriverNumber column to a display name, used when printing
# per-driver predictions.
driver_names = {
    4 : "Lando Norris",
    1 : "Max Verstappen",
    81 : "Oscar Piastri",
    16 : "Charles Leclerc",
    22 : "Yuki Tsunoda",
    23 : "Alex Albon",
    63 : "George Russell",
    6 : "Isack Hadjar",
    5 : "Gabriel Bortoleto",
    10 : "Pierre Gasly",
    30 : "Liam Lawson",
    31 : "Esteban Ocon",  # spelling corrected from "Estaban"
    87 : "Oliver Bearman",
    12 : "Kimi Antonelli",
    44 : "Lewis Hamilton",
    14 : "Fernando Alonso",
    18 : "Lance Stroll",
    43 : "Franco Colapinto",
    55 : "Carlos Sainz",
    27 : "Nico Hulkenberg",
    7 : "Jack Doohan"
}

In [566]:
# Pull the best forest out of the finished search.
best_rf = rfr.best_estimator_

feature_names = X_train.columns
importances = best_rf.feature_importances_

# One row per feature, most important first.
importances_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
).sort_values("Importance", ascending=False)

print(importances_df)

                  Feature  Importance
13                Retired    0.323341
10            StartingPos    0.264068
1                  TeamId    0.251296
11              GapToPole    0.056659
4            RacesInGEEra    0.032863
12  TeammateQualifyingPos    0.013225
0            DriverNumber    0.012576
7               WindSpeed    0.009195
8                 AirTemp    0.008474
9               TrackTemp    0.007425
5              LocationId    0.007299
3             RoundNumber    0.006070
2                  Season    0.005034
6                    Rain    0.002476


Ways to improve
- Swap the qualifying-position and starting-position columns in the CSV files.
- Give each circuit a single ID instead of boolean columns. The circuit is not taken into account for now.