**Research Question 1 (Regression):** Can we accurately predict a driver's fastest lap time using only *pre-race* data?
- **Methods:** Ridge Regression and LASSO (to handle multicollinearity).
- **Goal:** Identify the most critical pre-race factors (e.g., qualifying performance, grid position) while avoiding "data leakage" (i.e., never using post-race information, such as average lap time, as a predictor).

Importing Libraries and Loading the Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import os
from scipy import stats
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, RocCurveDisplay, confusion_matrix

# Discover every CSV under f1datasets/ and load each one into a DataFrame.
# Frames are keyed by the bare file name (directory and extension stripped),
# so a file named results.csv is reachable as f1_dfs['results'].
filepath = glob.glob('f1datasets/*.csv')  # 14 datasets
f1_dfs = {
    os.path.splitext(os.path.basename(csv_path))[0]: pd.read_csv(csv_path)
    for csv_path in filepath
}
print("Datasets loaded.")

# Shared 5-fold cross-validation splitter: rows are shuffled before splitting
# and the seed is fixed so every model is evaluated on the same folds.
cv5 = KFold(n_splits=5, shuffle=True, random_state=7604)

Datasets loaded.


Data Cleaning and Feature Engineering

In [None]:
# Turn the CSV null sentinel into real NaN values in every loaded table.
# DataFrame.replace matches strings literally by default (regex=False), so the
# target must be the exact two-character token: a backslash followed by 'N'.
# The previous pattern r'\\N' denoted TWO backslashes + 'N' and therefore never
# matched such cells.
# NOTE(review): assumes the source CSVs mark missing values as \N (Ergast-style
# export) - confirm against the raw files.
for table in f1_dfs.values():
    # In-place so that any other reference to the same DataFrame sees the change.
    table.replace('\\N', np.nan, inplace=True)
print("Missing values replaced.")

def time_to_ms(time_str):
    """Convert a lap-time string to a whole number of milliseconds.

    Accepted layouts are ``M:SS.mmm`` and ``H:MM:SS.mmm``.

    Args:
        time_str: The time value to convert. Missing values (anything for
            which ``pd.isna`` is true) are passed through as NaN.

    Returns:
        An ``int`` number of milliseconds, or ``np.nan`` when the input is
        missing, has an unexpected number of ``:``-separated fields, or has
        fields that cannot be parsed as numbers.
    """
    if pd.isna(time_str):
        return np.nan
    parts = str(time_str).split(':')
    try:
        if len(parts) == 2:  # M:SS.mmm
            seconds = int(parts[0]) * 60 + float(parts[1])
        elif len(parts) == 3:  # H:MM:SS.mmm
            seconds = int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
        else:
            return np.nan
        # round() rather than int(): binary floats cannot represent most
        # decimal fractions exactly, so e.g. 1.001 * 1000 evaluates to
        # 1000.9999999999999 and truncation would silently lose a millisecond.
        return round(seconds * 1000)
    except (ValueError, OverflowError):
        # ValueError: non-numeric field or NaN seconds; OverflowError: infinite
        # seconds. Either way the value is unusable, so report it as missing.
        return np.nan

# --- Response variables: race results -------------------------------------
# .assign() operates on a copy, so the raw frame stored in f1_dfs is untouched.
# Values that cannot be parsed as numbers are coerced to NaN.
df_results = f1_dfs['results'].assign(
    fastestLapTime_ms=lambda frame: frame['fastestLapTime'].apply(time_to_ms),
    positionOrder=lambda frame: pd.to_numeric(frame['positionOrder'], errors='coerce'),
    grid=lambda frame: pd.to_numeric(frame['grid'], errors='coerce'),
)

# --- Predictors: qualifying sessions --------------------------------------
# Add a millisecond version of each qualifying session time (q1/q2/q3).
df_quali = f1_dfs['qualifying'].assign(**{
    f'{session}_ms': f1_dfs['qualifying'][session].apply(time_to_ms)
    for session in ('q1', 'q2', 'q3')
})
# Keep only the merge keys and predictors; 'position' is renamed so it stays
# distinguishable from the columns coming from the results table.
df_quali_short = df_quali.rename(columns={'position': 'qualifying_position'})[
    ['raceId', 'driverId', 'constructorId', 'qualifying_position', 'q1_ms', 'q2_ms', 'q3_ms']
]

# --- Context: races and drivers -------------------------------------------
df_races = f1_dfs['races'].assign(date=lambda frame: pd.to_datetime(frame['date']))
df_races_short = df_races.loc[:, ['raceId', 'year', 'round', 'circuitId', 'date']]

df_drivers = f1_dfs['drivers'].assign(dob=lambda frame: pd.to_datetime(frame['dob']))
df_drivers_short = df_drivers.loc[:, ['driverId', 'driverRef', 'nationality', 'dob']]

# --- Master DataFrame ------------------------------------------------------
# Left joins throughout: every results row is kept, and race / qualifying /
# driver columns are NaN where no matching record exists.
master_f1 = (
    df_results
    .merge(df_races_short, on='raceId', how='left')
    .merge(df_quali_short, on=['raceId', 'driverId', 'constructorId'], how='left')
    .merge(df_drivers_short, on='driverId', how='left')
)
# Age in years on race day (365.25 days per year to average out leap years).
master_f1['driver_age_at_race'] = (
    master_f1['date'].sub(master_f1['dob']).dt.days.div(365.25)
)

print("Master DataFrame created.")