In [None]:
!pip install kagglehub[pandas-datasets]



In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jtrotman/formula-1-race-data")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/formula-1-race-data


In [None]:
# =========================================================
# Formula-1 dataset → feature table (per driver–race row)
# =========================================================
import pandas as pd

# ---- CONFIG -------------------------------------------------------------
PATH  = "/kaggle/input/formula-1-race-data/"   # change if needed
NULLS = ['\\N', '']                            # Ergast NA tokens

# ---- 1. Load CSVs --------------------------------------------------------
races       = pd.read_csv(f"{PATH}races.csv",       na_values=NULLS, parse_dates=['date'])
drivers     = pd.read_csv(f"{PATH}drivers.csv",     na_values=NULLS, parse_dates=['dob'])
results     = pd.read_csv(f"{PATH}results.csv",     na_values=NULLS)
qualifying  = pd.read_csv(f"{PATH}qualifying.csv",  na_values=NULLS)
constructors = pd.read_csv(f"{PATH}constructors.csv", na_values=NULLS)
d_standings = pd.read_csv(f"{PATH}driver_standings.csv", na_values=NULLS)

# ---- 2. Clean numeric columns & create winner flag ----------------------
results['position'] = pd.to_numeric(results['position'], errors='coerce')
results['grid']     = pd.to_numeric(results['grid'],     errors='coerce')
results['winner']   = (results['position'] == 1).astype(int)

# ---- 3. Merge qualifying lap times (q1–q3) + qual position -------------
q_cols = ['raceId', 'driverId', 'q1', 'q2', 'q3', 'position']  # 'position' = qual rank
df = results.merge(qualifying[q_cols], on=['raceId', 'driverId'], how='left') \
            .rename(columns={'position': 'qual_pos'})

# --- 3. Merge qualifying lap times + quali-position --------------------
q_cols = ['raceId', 'driverId', 'q1', 'q2', 'q3', 'position']  # 'position' = qual rank
df = results.merge(
        qualifying[q_cols],
        on=['raceId', 'driverId'],
        how='left',
        suffixes=('', '_qual')       # <-- keeps results columns unchanged
)

df = df.rename(columns={'position_qual': 'qual_pos',   # qualifying P-number
                        'position':      'finish_pos'}) # race result P-number

# ---- 4. Add race & driver metadata --------------------------------------
df = df.merge(races[['raceId', 'date', 'year', 'round', 'circuitId']],
              on='raceId', how='left') \
       .merge(drivers[['driverId', 'dob', 'nationality']],
              on='driverId', how='left')

df['age'] = ((df['date'] - df['dob']).dt.days // 365).astype('Int16')

# ---- 5. Add season-to-date driver points --------------------------------
d_stand_prev = (
    d_standings
      .merge(races[['raceId', 'year', 'round']], on='raceId')
      .rename(columns={'points': 'season_pts'})
)
df = df.merge(d_stand_prev[['raceId', 'driverId', 'season_pts']],
              on=['raceId', 'driverId'], how='left')

In [None]:
print(df.columns)

Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'finish_pos', 'positionText', 'positionOrder', 'points', 'laps', 'time',
       'milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId', 'winner', 'q1', 'q2', 'q3', 'qual_pos',
       'date', 'year', 'round', 'circuitId', 'dob', 'nationality', 'age',
       'season_pts', 'is_winner'],
      dtype='object')


In [None]:
# 1️⃣ -- Build modelling table (leak-free) ------------------------

target_col = 'is_winner'
group_col  = 'raceId'
leakage_cols = [
    # any column known only after the chequered flag:
    'positionOrder', 'positionText', 'points', 'laps', 'milliseconds',
    'time', 'rank', 'fastestLap', 'fastestLapTime', 'fastestLapSpeed', 'statusId',
    'winner','finish_pos', # Add 'winner' to the list of leakage columns
]

id_cols   = ['driverId', 'constructorId', group_col]
drop_cols = leakage_cols + [target_col]

X = df.drop(columns=drop_cols).copy()
y      = df[target_col]
groups = df[group_col]

# 👉 NOW derive column-type lists
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols     = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
print("Columns in X (features):")
print(X.columns)

Columns in X (features):
Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'q1', 'q2', 'q3', 'qual_pos', 'date', 'year', 'round', 'circuitId',
       'dob', 'nationality', 'age', 'season_pts'],
      dtype='object')


In [None]:
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 2. GROUP-BASED TRAIN / TEST SPLIT                             ║
# ╚═══════════════════════════════════════════════════════════════╝
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train    = groups.iloc[train_idx]


In [None]:
print("Columns in X (features):")
print(X.columns)

# Select only numeric columns for correlation calculation
numeric_df = df.select_dtypes(include=['number'])

# Calculate correlations with the target variable 'is_winner'
correlations = numeric_df.corr()['is_winner'].sort_values(ascending=False)
print("\nCorrelations with 'is_winner':")
print(correlations)

Columns in X (features):
Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'q1', 'q2', 'q3', 'qual_pos', 'date', 'year', 'round', 'circuitId',
       'dob', 'nationality', 'age', 'season_pts'],
      dtype='object')

Correlations with 'is_winner':
is_winner          1.000000
winner             1.000000
points             0.557565
season_pts         0.272961
laps               0.130351
fastestLapSpeed    0.047744
milliseconds       0.046882
age                0.031231
fastestLap         0.027696
year               0.012976
raceId             0.010153
resultId           0.010052
round              0.002058
circuitId          0.001402
constructorId     -0.039201
driverId          -0.042486
number            -0.078379
statusId          -0.130393
grid              -0.246271
rank              -0.278670
qual_pos          -0.318008
positionOrder     -0.322919
finish_pos        -0.401425
Name: is_winner, dtype: float64


In [None]:
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 3. PRE-PROCESSING + BASELINE (LOGIT)                          ║
# ╚═══════════════════════════════════════════════════════════════╝
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


preprocess = ColumnTransformer(
    [
        # Numeric: median impute → scale
        ('num',
         Pipeline([
             ('impute', SimpleImputer(strategy='median')),
             ('scale',  StandardScaler())
         ]),
         numeric_cols),

        # Categorical: most-frequent impute → one-hot
        ('cat',
         Pipeline([
             ('impute', SimpleImputer(strategy='most_frequent')),
             ('ohe',    OneHotEncoder(handle_unknown='ignore', sparse_output=True))
         ]),
         cat_cols)
    ],
    remainder='drop'
)

logit_clf = Pipeline([
    ('prep', preprocess),
    ('model', LogisticRegression(max_iter=500, class_weight='balanced'))
])

logit_clf.fit(X_train, y_train)
logit_pred = logit_clf.predict_proba(X_test)[:,1]
print("Baseline ROC-AUC:", roc_auc_score(y_test, logit_pred).round(4))


Baseline ROC-AUC: 0.9159


In [None]:
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 4. LIGHTGBM + RANDOMISED SEARCH                               ║
# ╚═══════════════════════════════════════════════════════════════╝
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

lgbm = LGBMClassifier(objective='binary', n_estimators=800, class_weight='balanced',
                      random_state=42, n_jobs=-1)

param_dist = {
    'model__num_leaves':       [31, 50, 75, 100],
    'model__learning_rate':    [0.01, 0.03, 0.05],
    'model__max_depth':        [-1, 10, 20, 40],
    'model__subsample':        [0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

lgbm_pipe = Pipeline([
    ('prep', preprocess),
    ('model', lgbm)
])

cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
search = RandomizedSearchCV(lgbm_pipe, param_dist,
                            n_iter=25, scoring='roc_auc',
                            cv=cv, verbose=1, n_jobs=-1, refit=True)

search.fit(X_train, y_train)
print("Best CV ROC-AUC:", search.best_score_.round(4))
print("Best params:", search.best_params_)


Fitting 4 folds for each of 25 candidates, totalling 100 fits




[LightGBM] [Info] Number of positive: 909, number of negative: 20512
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1633
[LightGBM] [Info] Number of data points in the train set: 21421, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Best CV ROC-AUC: 0.9605
Best params: {'model__subsample': 0.7, 'model__num_leaves': 31, 'model__max_depth': -1, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8}


In [None]:
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 5. PER-RACE TOP-1 ACCURACY & ROC-AUC                          ║
# ╚═══════════════════════════════════════════════════════════════╝
from sklearn.metrics import roc_auc_score

def per_race_metrics(model, X, y, race_ids):
    """
    Returns Top-1 accuracy (did we pick the real winner per race?)
    and overall ROC-AUC.
    """
    proba = model.predict_proba(X)[:,1]
    df_eval = pd.DataFrame({'raceId': race_ids, 'y': y.values, 'p': proba})

    # Top-1 accuracy
    top1_correct = []
    for rid, grp in df_eval.groupby('raceId'):
        winner_pred = grp.loc[grp['p'].idxmax(), 'y']
        top1_correct.append(winner_pred == 1)
    top1 = np.mean(top1_correct)

    # ROC-AUC (global, not per-race)
    auc = roc_auc_score(y, proba)

    return top1, auc

In [None]:
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 6. FINAL EVALUATION & FEATURE IMPORTANCE                      ║
# ╚═══════════════════════════════════════════════════════════════╝
best_model = search.best_estimator_

top1, auc  = per_race_metrics(best_model, X_test, y_test, race_ids=groups.iloc[test_idx])
print(f"Top-1 accuracy : {top1:.3f}")
print(f"ROC-AUC        : {auc:.3f}")

# -- LightGBM feature importances ----------------------------------------
importances = best_model.named_steps['model'].feature_importances_
feature_names = best_model.named_steps['prep'].get_feature_names_out()
fi = (pd.Series(importances, index=feature_names)
        .sort_values(ascending=False).head(25))
display(fi.to_frame('importance'))




Top-1 accuracy : 0.524
ROC-AUC        : 0.963


Unnamed: 0,importance
num__season_pts,4881
num__round,2496
num__grid,2417
num__number,1960
num__resultId,1945
num__circuitId,1675
num__age,1478
num__driverId,1366
num__year,1322
num__constructorId,1136


In [86]:
import joblib
joblib.dump(best_model, "model.pkl")

['model.pkl']