In [None]:
import pandas as pd

df = pd.read_csv('/content/F1 data - f1_data_verified.csv')
display(df.head())

Unnamed: 0,teamname,drivername,round,year,starting_grid_position,location,laps,dnf,race_finish_position,no_of_pitstops,rainfall,constructor_points,points
0,Ferrari,Charles Leclerc,1,2022,1,Sakhir,57,0,1,3,0,43,25
1,Ferrari,Carlos Sainz,1,2022,3,Sakhir,57,0,2,3,0,43,18
2,Mercedes,Lewis Hamilton,1,2022,5,Sakhir,57,0,3,3,0,27,15
3,Mercedes,George Russell,1,2022,9,Sakhir,57,0,4,2,0,27,12
4,Haas F1 Team,Kevin Magnussen,1,2022,7,Sakhir,57,0,5,2,0,10,10


In [None]:
# Block: Load

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


# ---- Define target and inputs ----
# Target
y = df["points"]

# Drop leakage column
df = df.drop(columns=["race_finish_position"])

# Feature groups
categorical_features = ["teamname", "drivername", "location", "year"]
numeric_features = ["starting_grid_position", "laps", "no_of_pitstops", "rainfall", "round"]

# Final input matrix
X = df[categorical_features + numeric_features]

# ---- Preprocessing pipeline ----
# One-hot encode categoricals, scale numerics
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features)
    ]
)

print("Data loaded and preprocessing pipeline created.")
print("Features:", X.shape[1])
print("Samples:", X.shape[0])


Data loaded and preprocessing pipeline created.
Features: 9
Samples: 1658


In [None]:
# Block: Aim

df_feat = df.copy()

# ---- Grid advantage ----
df_feat["grid_advantage"] = df_feat["starting_grid_position"].max() - df_feat["starting_grid_position"]

# ---- Driver form: exponentially decaying average of points ----
df_feat["driver_form_decay"] = (
    df_feat.groupby("drivername")["points"]
    .transform(lambda x: x.shift().ewm(span=5, adjust=False).mean())
)
# shift() ensures we only use *past races* for the average, not the current one

# ---- Team form: exponentially decaying average of constructor points ----
df_feat["team_form_decay"] = (
    df_feat.groupby("teamname")["constructor_points"]
    .transform(lambda x: x.shift().ewm(span=8, adjust=False).mean())
)

# ---- Season progress ----
df_feat["season_progress"] = df_feat["round"] / df_feat.groupby("year")["round"].transform("max")

# ---- Update features ----
categorical_features = ["teamname", "drivername", "location", "year"]
numeric_features = [
    "starting_grid_position", "rainfall", "round",
    "grid_advantage", "driver_form_decay", "team_form_decay", "season_progress"
]

X = df_feat[categorical_features + numeric_features]
y = df_feat["points"]

print("Feature engineering with decaying averages complete.")
print("Features now:", X.shape[1])


Feature engineering with decaying averages complete.
Features now: 11


In [None]:
# Block: Baseline

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# ---- Chronological sort ----
df_sorted = df_feat.sort_values(by=["year", "round"]).reset_index(drop=True)

y_true_all, y_pred_baseline_all = [], []

# Walk forward round by round
for i in range(len(df_sorted)):
    row = df_sorted.iloc[i]
    driver = row["drivername"]
    points = row["points"]

    # Use all past races of this driver to compute mean points
    past_points = df_sorted.loc[:i-1].query("drivername == @driver")["points"]

    if len(past_points) > 0:
        pred = past_points.mean()
    else:
        pred = df_sorted.loc[:i-1]["points"].mean() if i > 0 else 0  # global mean as fallback

    y_true_all.append(points)
    y_pred_baseline_all.append(pred)

# ---- Metrics ----
mae = mean_absolute_error(y_true_all, y_pred_baseline_all)
rmse = np.sqrt(mean_squared_error(y_true_all, y_pred_baseline_all))
r2 = r2_score(y_true_all, y_pred_baseline_all)

print("Baseline Model (Driver Historical Average):")
print(f"  MAE : {mae:.3f}")
print(f"  RMSE: {rmse:.3f}")
print(f"  R²  : {r2:.3f}")


Baseline Model (Driver Historical Average):
  MAE : 3.519
  RMSE: 5.277
  R²  : 0.459


# Light GBM

In [None]:
# Block: Shoot

import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# ---- Preprocessing pipeline ----
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features)
    ]
)

# ---- LightGBM regressor ----
model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# ---- Rolling-window backtesting ----
df_sorted = df_feat.sort_values(by=["year", "round"]).reset_index(drop=True)
X_sorted = df_sorted[categorical_features + numeric_features]
y_sorted = df_sorted["points"]

tscv = TimeSeriesSplit(n_splits=8)
fold_results = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_sorted)):
    X_train, X_test = X_sorted.iloc[train_idx], X_sorted.iloc[test_idx]
    y_train, y_test = y_sorted.iloc[train_idx], y_sorted.iloc[test_idx]

    # Preprocess + fit
    X_train_p = preprocessor.fit_transform(X_train)
    X_test_p = preprocessor.transform(X_test)

    model.fit(X_train_p, y_train)
    y_pred = model.predict(X_test_p)

    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    fold_results.append((mae, rmse, r2))

    print(f"Fold {fold+1}")
    print(f"  MAE : {mae:.3f}")
    print(f"  RMSE: {rmse:.3f}")
    print(f"  R²  : {r2:.3f}")
    print("-"*40)

# ---- Average results ----
mae_avg = np.mean([r[0] for r in fold_results])
rmse_avg = np.mean([r[1] for r in fold_results])
r2_avg = np.mean([r[2] for r in fold_results])

print("Average Backtest Performance:")
print(f"  MAE : {mae_avg:.3f}")
print(f"  RMSE: {rmse_avg:.3f}")
print(f"  R²  : {r2_avg:.3f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 193
[LightGBM] [Info] Number of data points in the train set: 186, number of used features: 17
[LightGBM] [Info] Start training from score 5.360215
Fold 1
  MAE : 3.781
  RMSE: 5.533
  R²  : 0.393
----------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 383
[LightGBM] [Info] Number of data points in the train set: 370, number of used features: 35
[LightGBM] [Info] Start training from score 5.186486




Fold 2
  MAE : 2.935
  RMSE: 4.320
  R²  : 0.633
----------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 561
[LightGBM] [Info] Number of data points in the train set: 554, number of used features: 61
[LightGBM] [Info] Start training from score 5.104693




Fold 3
  MAE : 3.013
  RMSE: 4.506
  R²  : 0.601
----------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 684
[LightGBM] [Info] Number of data points in the train set: 738, number of used features: 61
[LightGBM] [Info] Start training from score 5.063686
Fold 4
  MAE : 3.102
  RMSE: 4.664
  R²  : 0.595
----------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 725
[LightGBM] [Info] Number of data points in the train set: 922, number of used features: 67
[LightGBM] [Info] Start training from score 5.085683




Fold 5
  MAE : 2.967
  RMSE: 4.602
  R²  : 0.586
----------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 741
[LightGBM] [Info] Number of data points in the train set: 1106, number of used features: 70
[LightGBM] [Info] Start training from score 5.107595




Fold 6
  MAE : 3.102
  RMSE: 4.784
  R²  : 0.548
----------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 750
[LightGBM] [Info] Number of data points in the train set: 1290, number of used features: 70
[LightGBM] [Info] Start training from score 5.089147




Fold 7
  MAE : 2.954
  RMSE: 4.385
  R²  : 0.622
----------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 761
[LightGBM] [Info] Number of data points in the train set: 1474, number of used features: 71
[LightGBM] [Info] Start training from score 5.070556
Fold 8
  MAE : 3.143
  RMSE: 4.496
  R²  : 0.603
----------------------------------------
Average Backtest Performance:
  MAE : 3.125
  RMSE: 4.661
  R²  : 0.573




In [None]:
# Block: Kill (Future Race Simulation with Manual Starting Grid)

import numpy as np
import pandas as pd

# ---- Step 1: Manually define starting grid ----
grid_data = [
    ("Max Verstappen", "Red Bull Racing"),
    ("Carlos Sainz", "Williams"),
    ("Liam Lawson", "Racing Bulls"),
    ("Kimi Antonelli", "Mercedes"),
    ("George Russell", "Mercedes"),
    ("Yuki Tsunoda", "Red Bull Racing"),
    ("Lando Norris", "McLaren"),
    ("Isack Hadjar", "Racing Bulls"),
    ("Oscar Piastri", "McLaren"),
    ("Charles Leclerc", "Ferrari"),
    ("Fernando Alonso", "Aston Martin"),
    ("Lewis Hamilton", "Ferrari"),
    ("Gabriel Bortoleto", "Kick Sauber"),
    ("Lance Stroll", "Aston Martin"),
    ("Oliver Bearman", "Haas F1 Team"),
    ("Franco Colapinto", "Alpine"),
    ("Nico Hulkenberg", "Kick Sauber"),
    ("Pierre Gasly", "Alpine"),
    ("Alexander Albon", "Williams"),
    ("Esteban Ocon", "Haas F1 Team"),
]

future_race = pd.DataFrame(grid_data, columns=["drivername", "teamname"])
future_race["starting_grid_position"] = np.arange(1, len(future_race) + 1)
future_race["year"] = 2025
future_race["round"] = 17
future_race["location"] = "Baku"

# ---- Step 2: Attach pre-race features from history ----
driver_last_form = df_feat.groupby("drivername")["driver_form_decay"].last()
team_last_form = df_feat.groupby("teamname")["team_form_decay"].last()

future_race["driver_form_decay"] = future_race["drivername"].map(driver_last_form)
future_race["team_form_decay"] = future_race["teamname"].map(team_last_form)

future_race["grid_advantage"] = future_race["starting_grid_position"].max() - future_race["starting_grid_position"]
future_race["season_progress"] = 17 / 22  # assume 22 rounds in 2025
future_race["rainfall"] = 0  # dry forecast

# ---- Step 3: Features ----
X_future = future_race[categorical_features + numeric_features]
X_future_p = preprocessor.transform(X_future)

# ---- Step 4: Monte Carlo simulation ----
n_sim = 1000
all_preds = []

for i in range(n_sim):
    preds = model.predict(X_future_p) + np.random.normal(0, 2, size=len(X_future))
    all_preds.append(preds)

all_preds = np.array(all_preds)

# ---- Step 5: Expected points ----
future_race["pred_points_mean"] = all_preds.mean(axis=0)
future_race["pred_points_std"] = all_preds.std(axis=0)
future_race["pred_points_5pct"] = np.percentile(all_preds, 5, axis=0)
future_race["pred_points_95pct"] = np.percentile(all_preds, 95, axis=0)

# ---- Step 6: Finishing probabilities ----
sim_orders = np.argsort(-all_preds, axis=1)  # descending by points per sim
n_drivers = len(future_race)
probs = pd.DataFrame(0.0, index=future_race["drivername"],
                     columns=["P1_prob", "Podium_prob", "Top10_prob"])

for d in range(n_drivers):
    ranks = (sim_orders == d).nonzero()[1]
    probs.iloc[d, 0] = np.mean(ranks == 0)     # win probability
    probs.iloc[d, 1] = np.mean(ranks < 3)      # podium probability
    probs.iloc[d, 2] = np.mean(ranks < 10)     # top-10 probability

# ---- Step 7: Merge results ----
results = future_race[[
    "drivername", "teamname", "starting_grid_position",
    "pred_points_mean", "pred_points_std",
    "pred_points_5pct", "pred_points_95pct"
]].copy()

results = results.merge(probs, left_on="drivername", right_index=True)
results = results.sort_values("P1_prob", ascending=False).reset_index(drop=True)

print(results)




           drivername         teamname  starting_grid_position  \
0      Max Verstappen  Red Bull Racing                       1   
1        Lando Norris          McLaren                       7   
2       Oscar Piastri          McLaren                       9   
3        Yuki Tsunoda  Red Bull Racing                       6   
4      Kimi Antonelli         Mercedes                       4   
5        Carlos Sainz         Williams                       2   
6         Liam Lawson     Racing Bulls                       3   
7      George Russell         Mercedes                       5   
8        Isack Hadjar     Racing Bulls                       8   
9     Charles Leclerc          Ferrari                      10   
10    Fernando Alonso     Aston Martin                      11   
11     Lewis Hamilton          Ferrari                      12   
12  Gabriel Bortoleto      Kick Sauber                      13   
13       Lance Stroll     Aston Martin                      14   
14     Oli

# XG Boost

In [None]:
# Block: ShootXGB (XGBoost + Rolling Window Backtesting)

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# ---- Preprocessing pipeline ----
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features)
    ]
)

# ---- XGBoost regressor ----
model_xgb = xgb.XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.0,
    random_state=42,
    tree_method="hist",
    n_jobs=-1
)

# ---- Rolling-window backtesting ----
df_sorted = df_feat.sort_values(by=["year", "round"]).reset_index(drop=True)
X_sorted = df_sorted[categorical_features + numeric_features]
y_sorted = df_sorted["points"]

tscv = TimeSeriesSplit(n_splits=8)
fold_results = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_sorted)):
    X_train, X_test = X_sorted.iloc[train_idx], X_sorted.iloc[test_idx]
    y_train, y_test = y_sorted.iloc[train_idx], y_sorted.iloc[test_idx]

    # Preprocess
    X_train_p = preprocessor.fit_transform(X_train)
    X_test_p = preprocessor.transform(X_test)

    # Fit model
    model_xgb.fit(X_train_p, y_train)
    y_pred = model_xgb.predict(X_test_p)

    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    fold_results.append((mae, rmse, r2))
    print(f"Fold {fold+1}")
    print(f"  MAE : {mae:.3f}")
    print(f"  RMSE: {rmse:.3f}")
    print(f"  R²  : {r2:.3f}")
    print("-"*40)

# ---- Average results ----
mae_avg = np.mean([r[0] for r in fold_results])
rmse_avg = np.mean([r[1] for r in fold_results])
r2_avg = np.mean([r[2] for r in fold_results])

print("Average Backtest Performance (XGBoost):")
print(f"  MAE : {mae_avg:.3f}")
print(f"  RMSE: {rmse_avg:.3f}")
print(f"  R²  : {r2_avg:.3f}")


Fold 1
  MAE : 3.481
  RMSE: 5.395
  R²  : 0.423
----------------------------------------
Fold 2
  MAE : 2.805
  RMSE: 4.279
  R²  : 0.640
----------------------------------------
Fold 3
  MAE : 2.900
  RMSE: 4.463
  R²  : 0.608
----------------------------------------
Fold 4
  MAE : 2.992
  RMSE: 4.617
  R²  : 0.603
----------------------------------------
Fold 5
  MAE : 2.819
  RMSE: 4.475
  R²  : 0.608
----------------------------------------
Fold 6
  MAE : 3.023
  RMSE: 4.766
  R²  : 0.551
----------------------------------------
Fold 7
  MAE : 2.827
  RMSE: 4.239
  R²  : 0.647
----------------------------------------
Fold 8
  MAE : 3.017
  RMSE: 4.323
  R²  : 0.633
----------------------------------------
Average Backtest Performance (XGBoost):
  MAE : 2.983
  RMSE: 4.570
  R²  : 0.589


In [None]:
# Block: Kill (Future Race Simulation with Manual Starting Grid)

import numpy as np
import pandas as pd

# ---- Step 1: Manually define starting grid ----
grid_data = [
    ("Max Verstappen", "Red Bull Racing"),
    ("Carlos Sainz", "Williams"),
    ("Liam Lawson", "Racing Bulls"),
    ("Kimi Antonelli", "Mercedes"),
    ("George Russell", "Mercedes"),
    ("Yuki Tsunoda", "Red Bull Racing"),
    ("Lando Norris", "McLaren"),
    ("Isack Hadjar", "Racing Bulls"),
    ("Oscar Piastri", "McLaren"),
    ("Charles Leclerc", "Ferrari"),
    ("Fernando Alonso", "Aston Martin"),
    ("Lewis Hamilton", "Ferrari"),
    ("Gabriel Bortoleto", "Kick Sauber"),
    ("Lance Stroll", "Aston Martin"),
    ("Oliver Bearman", "Haas F1 Team"),
    ("Franco Colapinto", "Alpine"),
    ("Nico Hulkenberg", "Kick Sauber"),
    ("Pierre Gasly", "Alpine"),
    ("Alexander Albon", "Williams"),
    ("Esteban Ocon", "Haas F1 Team"),
]

future_race = pd.DataFrame(grid_data, columns=["drivername", "teamname"])
future_race["starting_grid_position"] = np.arange(1, len(future_race) + 1)
future_race["year"] = 2025
future_race["round"] = 17
future_race["location"] = "Baku"

# ---- Step 2: Attach pre-race features from history ----
driver_last_form = df_feat.groupby("drivername")["driver_form_decay"].last()
team_last_form = df_feat.groupby("teamname")["team_form_decay"].last()

future_race["driver_form_decay"] = future_race["drivername"].map(driver_last_form)
future_race["team_form_decay"] = future_race["teamname"].map(team_last_form)

future_race["grid_advantage"] = future_race["starting_grid_position"].max() - future_race["starting_grid_position"]
future_race["season_progress"] = 17 / 22  # assume 22 rounds in 2025
future_race["rainfall"] = 0  # dry forecast

# ---- Step 3: Features ----
X_future = future_race[categorical_features + numeric_features]
X_future_p = preprocessor.transform(X_future)

# ---- Step 4: Monte Carlo simulation ----
n_sim = 1000
all_preds = []

for i in range(n_sim):
    preds = model_xgb.predict(X_future_p) + np.random.normal(0, 2, size=len(X_future))
    all_preds.append(preds)

all_preds = np.array(all_preds)

# ---- Step 5: Expected points ----
future_race["pred_points_mean"] = all_preds.mean(axis=0)
future_race["pred_points_std"] = all_preds.std(axis=0)
future_race["pred_points_5pct"] = np.percentile(all_preds, 5, axis=0)
future_race["pred_points_95pct"] = np.percentile(all_preds, 95, axis=0)

# ---- Step 6: Finishing probabilities ----
sim_orders = np.argsort(-all_preds, axis=1)  # descending by points per sim
n_drivers = len(future_race)
probs = pd.DataFrame(0.0, index=future_race["drivername"],
                     columns=["P1_prob", "Podium_prob", "Top10_prob"])

for d in range(n_drivers):
    ranks = (sim_orders == d).nonzero()[1]
    probs.iloc[d, 0] = np.mean(ranks == 0)     # win probability
    probs.iloc[d, 1] = np.mean(ranks < 3)      # podium probability
    probs.iloc[d, 2] = np.mean(ranks < 10)     # top-10 probability

# ---- Step 7: Merge results ----
results = future_race[[
    "drivername", "teamname", "starting_grid_position",
    "pred_points_mean", "pred_points_std",
    "pred_points_5pct", "pred_points_95pct"
]].copy()

results = results.merge(probs, left_on="drivername", right_index=True)
results = results.sort_values("P1_prob", ascending=False).reset_index(drop=True)

print(results)


           drivername         teamname  starting_grid_position  \
0      Max Verstappen  Red Bull Racing                       1   
1        Lando Norris          McLaren                       7   
2       Oscar Piastri          McLaren                       9   
3        Carlos Sainz         Williams                       2   
4      Kimi Antonelli         Mercedes                       4   
5         Liam Lawson     Racing Bulls                       3   
6        Yuki Tsunoda  Red Bull Racing                       6   
7      George Russell         Mercedes                       5   
8        Isack Hadjar     Racing Bulls                       8   
9     Charles Leclerc          Ferrari                      10   
10    Fernando Alonso     Aston Martin                      11   
11     Lewis Hamilton          Ferrari                      12   
12  Gabriel Bortoleto      Kick Sauber                      13   
13       Lance Stroll     Aston Martin                      14   
14     Oli