In [1]:
import fastf1
import numpy as np
import pandas as pd 
import os 
import matplotlib.pyplot as plt 
import sklearn 
import time 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [44]:
# Load the two inputs: the combined session data and the race-info file
# (the latter is ';'-separated).
df1 = pd.read_csv("dataset/f1_data_combined.csv")
df2 = pd.read_csv("dataset/Race_Info.csv", sep=";")

# Give both frames the same "<RaceName>_<Year>" join key; the race name is
# stripped so stray surrounding whitespace cannot break the match.
for frame in (df1, df2):
    frame["RaceKey"] = frame["RaceName"].str.strip() + "_" + frame["Year"].astype(str)


In [45]:
# RaceName and Year already exist in df1, so leave them out of the right-hand
# frame; otherwise the merged result would carry suffixed copies of both.
race_info = df2.drop(columns=["RaceName", "Year"])

# Left join keeps every row of df1; validate="m:1" makes pandas raise if
# RaceKey is not unique on the race-info side.
df = df1.merge(race_info, on="RaceKey", how="left", validate="m:1")

In [18]:
# Show the first five rows of the merged frame as a quick sanity check.
df.head(n=5)

Unnamed: 0,Driver,LapTime_FP1,Compound_FP1,SpeedST_FP1,SpeedFL_FP1,SpeedI1_FP1,SpeedI2_FP1,LongestStintCompound_FP1,LongestStintLaps_FP1,AvgLapTime_LongestStint_FP1,...,FastestPracticeTime,FastestQualifyingTime,FastestPracticeCompound,IsImputed_FP1,IsImputed_FP2,IsImputed_FP3,RaceKey,Laps,Laps_Done,Type of Track
0,ALO,90.842,2.0,5.0,1.0,1.0,7.0,0.0,11.0,94.212571,...,89.418,88.92,0.0,False,False,False,Saudi Arabian Grand Prix_2021,50.0,50.0,High
1,BOT,90.009,0.0,5.0,1.0,6.0,8.0,0.0,9.0,91.891667,...,89.019,87.622,0.0,False,False,False,Saudi Arabian Grand Prix_2021,50.0,50.0,High
2,GAS,90.263,0.0,2.0,4.0,2.0,5.0,0.0,12.0,92.1928,...,88.715,88.125,1.0,False,False,False,Saudi Arabian Grand Prix_2021,50.0,50.0,High
3,GIO,90.318,0.0,8.0,6.0,2.0,10.0,1.0,10.0,93.5195,...,89.59,88.616,0.0,False,False,False,Saudi Arabian Grand Prix_2021,50.0,50.0,High
4,HAM,89.786,0.0,4.0,1.0,6.0,10.0,0.0,10.0,91.68625,...,88.314,87.511,2.0,False,False,False,Saudi Arabian Grand Prix_2021,50.0,50.0,High


In [46]:
# Replace the space-containing header with an identifier-style column name.
df = df.rename(columns={"Type of Track": "TrackType"})

In [47]:
# Encode the track-type label as an ordinal code (Low=0 ... High=4).
type_track = {"Low": 0, "LowMedium": 1, "Medium": 2, "MediumHigh": 3,"High": 4}

# Only map while the column still holds text labels. Re-running this cell on
# an already-encoded (numeric) column would look integers up in a dict keyed
# by strings and silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["TrackType"]):
    df["TrackType"] = df["TrackType"].map(type_track)

# Lap counts are read as floats; store them as integers.
# NOTE: astype(int) raises if a row has no lap information (NaN).
df["Laps"] = df["Laps"].astype(int)
df["Laps_Done"] = df["Laps_Done"].astype(int)

# REMOVE UNNECESSARY COLUMNS

# Qualifying: per-segment speed-trap and weather columns.
qualifying_cols = [
    "Q1_TopSpeedST", "Q1_AirTemp", "Q1_TrackTemp", "Q1_WindSpeed",
    "Q1_WindDirection", "Q1_Humidity", "Q1_Pressure",
    "Q2_TopSpeedST", "Q2_AirTemp", "Q2_TrackTemp", "Q2_WindSpeed",
    "Q2_WindDirection", "Q2_Humidity", "Q2_Pressure",
    "Q3_TopSpeedST", "Q3_AirTemp", "Q3_TrackTemp", "Q3_WindSpeed",
    "Q3_WindDirection", "Q3_Humidity", "Q3_Pressure",
]

# Practice sessions: SpeedFL / SpeedI1 / SpeedI2 columns
# (SpeedST_* is not listed here and is therefore kept).
practice_speed_cols = [
    "SpeedFL_FP1", "SpeedI1_FP1", "SpeedI2_FP1",
    "SpeedFL_FP2", "SpeedI1_FP2", "SpeedI2_FP2",
    "SpeedFL_FP3", "SpeedI1_FP3", "SpeedI2_FP3",
]

practice_weather_cols = [
    # Fastest Lap Weather (Practice Sessions)
    "AirTemp_FastestLap_FP1", "TrackTemp_FastestLap_FP1", "WindSpeed_FastestLap_FP1",
    "WindDirection_FastestLap_FP1", "Humidity_FastestLap_FP1", "Pressure_FastestLap_FP1",
    "AirTemp_FastestLap_FP2", "TrackTemp_FastestLap_FP2", "WindSpeed_FastestLap_FP2",
    "WindDirection_FastestLap_FP2", "Humidity_FastestLap_FP2", "Pressure_FastestLap_FP2",
    "AirTemp_FastestLap_FP3", "TrackTemp_FastestLap_FP3", "WindSpeed_FastestLap_FP3",
    "WindDirection_FastestLap_FP3", "Humidity_FastestLap_FP3", "Pressure_FastestLap_FP3",

    # Average Weather During Longest Stint
    "AvgAirTemp_LongestStint_FP1", "AvgTrackTemp_LongestStint_FP1", "AvgWindSpeed_LongestStint_FP1",
    "AvgWindDirection_LongestStint_FP1", "AvgHumidity_LongestStint_FP1", "AvgPressure_LongestStint_FP1",
    "AvgAirTemp_LongestStint_FP2", "AvgTrackTemp_LongestStint_FP2", "AvgWindSpeed_LongestStint_FP2",
    "AvgWindDirection_LongestStint_FP2", "AvgHumidity_LongestStint_FP2", "AvgPressure_LongestStint_FP2",
    "AvgAirTemp_LongestStint_FP3", "AvgTrackTemp_LongestStint_FP3", "AvgWindSpeed_LongestStint_FP3",
    "AvgWindDirection_LongestStint_FP3", "AvgHumidity_LongestStint_FP3", "AvgPressure_LongestStint_FP3",
]

colsToDrop = qualifying_cols + practice_speed_cols + practice_weather_cols

# errors="ignore" keeps the cell re-runnable: columns removed by a previous
# execution are skipped instead of raising KeyError. Trade-off: a misspelled
# column name in the lists above is skipped silently as well.
df = df.drop(columns=colsToDrop, errors="ignore")
   

In [96]:
def predict_qualy(data=None, random_state=None):
    """Hold out one randomly chosen race and predict its qualifying times.

    A linear regression is fitted on all other races, using the fastest
    practice time, the track type and the fastest-practice compound as
    features, and is then evaluated on the held-out race. The function
    prints the test MSE, the number of exact grid-position matches and a
    per-driver comparison table. It returns nothing.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to evaluate on; defaults to the notebook-level ``df``. Columns
        read: ``FastestQualifyingTime``, ``FastestPracticeTime``,
        ``TrackType``, ``FastestPracticeCompound``, ``RaceName``, ``Year``,
        ``QualifyingPosition`` and ``Driver``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` when picking the held-out
        race. ``None`` (the default) picks a different race on every call;
        pass an integer to make a run reproducible.

    Raises
    ------
    ValueError
        If fewer than two races have qualifying times (nothing would be left
        to train on), or if a feature column still contains missing values.
    """
    source = df if data is None else data
    feature_cols = ["FastestPracticeTime", "TrackType", "FastestPracticeCompound"]

    # Rows without a qualifying time have no target to learn from or score against.
    df_qualy = source.dropna(subset=["FastestQualifyingTime"])
    # A missing practice compound is filled with code 0.
    df_qualy = df_qualy.fillna({"FastestPracticeCompound": 0})

    # The estimator rejects NaN inputs; fail early and name the offending columns.
    nan_counts = df_qualy[feature_cols].isna().sum()
    if nan_counts.any():
        raise ValueError(
            f"Feature columns contain missing values: {nan_counts[nan_counts > 0].to_dict()}"
        )

    # ── 1) Features & target
    X = df_qualy[feature_cols]
    y = df_qualy["FastestQualifyingTime"]

    # ── 2) Every distinct (RaceName, Year) pair is a candidate test race
    unique_races = df_qualy[["RaceName", "Year"]].drop_duplicates().reset_index(drop=True)
    if len(unique_races) < 2:
        raise ValueError(
            "Need at least two races with qualifying times: one to hold out and one to train on."
        )

    # ── 3) Pick the held-out race (seedable for reproducibility)
    test_race = unique_races.sample(n=1, random_state=random_state).iloc[0]
    print("Test set will be:", test_race['RaceName'], test_race['Year'])

    # ── 4) Split: the sampled race is the test set, all other races train
    mask_test = (
        (df_qualy["RaceName"] == test_race["RaceName"])
        & (df_qualy["Year"] == test_race["Year"])
    )
    X_train, y_train = X[~mask_test], y[~mask_test]
    X_test, y_test = X[mask_test], y[mask_test]

    # ── 5) Fit the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # ── 6) Predict & evaluate
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE on {test_race['RaceName']} {test_race['Year']}: {mse:.4f}")

    # ── 7) Bias-corrected predictions.
    # CAUTION: the bias is the mean error on the held-out race itself, i.e. it
    # is computed from the test labels. CalPred is therefore an optimistic
    # diagnostic (the error left once the per-race offset is removed), not a
    # prediction that could be made before qualifying. Shifting every time by
    # the same constant also leaves the ordering unchanged, so CalP is expected
    # to equal RawP.
    bias = float(np.mean(y_pred - y_test.to_numpy()))
    y_pred_cal = y_pred - bias

    # ── 8) Build the comparison frame once
    test_df = df_qualy[mask_test].copy()
    test_df["PredTimeRaw"] = y_pred
    test_df["PredTimeCal"] = y_pred_cal
    test_df["ActualTime"] = test_df["FastestQualifyingTime"]
    # method='first' breaks ties by row order so every driver gets a distinct slot.
    test_df["PosRaw"] = test_df["PredTimeRaw"].rank(method="first", ascending=True).astype(int)
    test_df["PosCal"] = test_df["PredTimeCal"].rank(method="first", ascending=True).astype(int)
    test_df["ActPos"] = test_df["QualifyingPosition"]

    # ── 9) Exact-position accuracy of the raw predicted grid
    correct = (test_df["PosRaw"] == test_df["ActPos"]).sum()
    total = len(test_df)
    print(f"Exact grid‐position matches: {correct}/{total} ({correct/total:.0%})\n")

    # ── 10) Print a clean comparison, ordered by calibrated position
    print(f"{'P#':>3}  {'Driver':<6}  {'RawPred':>7}  {'CalPred':>7}  {'Actual':>7}  {'ActP':>4}  {'RawP':>4}  {'CalP':>4}")
    print("-"* 50)
    for _, r in test_df.sort_values('PosCal').iterrows():
        print(f"{r['PosCal']:>3}  {r['Driver']:<6}  {r['PredTimeRaw']:7.3f}  {r['PredTimeCal']:7.3f}  {r['ActualTime']:7.3f}  {int(r['ActPos']):>4}  {int(r['PosRaw']):>4}  {int(r['PosCal']):>4}")

In [99]:
# Run one hold-out evaluation. The held-out race is sampled at random inside
# predict_qualy, so the printed race, MSE and table differ between runs.
predict_qualy()

Test set will be: Mexican Grand Prix 2019
MSE on Mexican Grand Prix 2019: 0.6599
Exact grid‐position matches: 4/20 (20%)

 P#  Driver  RawPred  CalPred   Actual  ActP  RawP  CalP
--------------------------------------------------
  1  LEC      75.856   75.297   75.024     2     1     1
  2  VET      75.884   75.324   75.170     3     2     2
  3  BOT      75.974   75.414   75.338     6     3     3
  4  HAM      76.099   75.540   75.262     4     4     4
  5  SAI      76.364   75.805   76.014     7     5     5
  6  VER      76.451   75.892   74.758     1     6     6
  7  GAS      76.831   76.271   76.586    10     7     7
  8  ALB      76.835   76.275   75.336     5     8     8
  9  NOR      76.888   76.329   76.322     8     9     9
 10  PER      76.951   76.392   76.687    11    10    10
 11  KVY      77.283   76.724   76.469     9    11    11
 12  RAI      77.501   76.941   76.967    14    12    12
 13  STR      77.631   77.071   78.065    16    13    13
 14  GIO      77.646   77.087