In [1]:
import fastf1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Enable cache
fastf1.Cache.enable_cache('../data/cache')

done


In [2]:
# Load 2025 season schedule
print("Loading 2025 season schedule...")
season_2025 = fastf1.get_event_schedule(2025)

# Filter to actual races (remove testing)
races = season_2025[season_2025['EventFormat'] != 'testing'].copy()

print(f"\nTotal races in 2025: {len(races)}")
print("\nFirst 18 races:")
print(races[['RoundNumber', 'EventName', 'Country', 'EventDate']].head(18))

Loading 2025 season schedule...

Total races in 2025: 24

First 18 races:
    RoundNumber                  EventName         Country  EventDate
1             1      Australian Grand Prix       Australia 2025-03-16
2             2         Chinese Grand Prix           China 2025-03-23
3             3        Japanese Grand Prix           Japan 2025-04-06
4             4         Bahrain Grand Prix         Bahrain 2025-04-13
5             5   Saudi Arabian Grand Prix    Saudi Arabia 2025-04-20
6             6           Miami Grand Prix   United States 2025-05-04
7             7  Emilia Romagna Grand Prix           Italy 2025-05-18
8             8          Monaco Grand Prix          Monaco 2025-05-25
9             9         Spanish Grand Prix           Spain 2025-06-01
10           10        Canadian Grand Prix          Canada 2025-06-15
11           11        Austrian Grand Prix         Austria 2025-06-29
12           12         British Grand Prix  United Kingdom 2025-07-06
13           13 

In [3]:
# Load first 18 races
races_to_load = races.head(18)

print(f"\n{'='*60}")
print(f"Loading {len(races_to_load)} races...")
print(f"This will take 10-15 minutes ☕")
print(f"{'='*60}\n")

all_race_data = []
failed_races = []

for idx, race_event in races_to_load.iterrows():
    race_name = race_event['EventName']
    round_num = race_event['RoundNumber']
    
    try:
        print(f"[{round_num}/{len(races_to_load)}] Loading {race_name}...")
        
        session = fastf1.get_session(2025, race_name, 'R')
        session.load()
        
        results = session.results.copy()
        results['RaceName'] = race_name
        results['RoundNumber'] = round_num
        results['Country'] = race_event['Country']
        results['EventDate'] = race_event['EventDate']
        
        all_race_data.append(results)
        print(f"  ✓ Loaded {len(results)} drivers")
        
    except Exception as e:
        print(f"  ✗ Failed: {e}")
        failed_races.append(race_name)

print(f"\n{'='*60}")
print(f"✓ Successfully loaded {len(all_race_data)} races")
if failed_races:
    print(f"✗ Failed: {failed_races}")

# Combine all data
df_2025 = pd.concat(all_race_data, ignore_index=True)

print(f"\nDataset Summary:")
print(f"  Total results: {len(df_2025)}")
print(f"  Races: {df_2025['RaceName'].nunique()}")
print(f"  Drivers: {df_2025['Abbreviation'].nunique()}")
print(f"\nSample:")
print(df_2025[['RoundNumber', 'RaceName', 'Abbreviation', 'TeamName', 'GridPosition', 'Position']].head(10))

core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...



Loading 18 races...
This will take 10-15 minutes ☕

[1/18] Loading Australian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[2/18] Loading Chinese Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[3/18] Loading Japanese Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[4/18] Loading Bahrain Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[5/18] Loading Saudi Arabian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[6/18] Loading Miami Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[7/18] Loading Emilia Romagna Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[8/18] Loading Monaco Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[9/18] Loading Spanish Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 19 drivers
[10/18] Loading Canadian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[11/18] Loading Austrian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[12/18] Loading British Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[13/18] Loading Belgian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[14/18] Loading Hungarian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[15/18] Loading Dutch Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[16/18] Loading Italian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[17/18] Loading Azerbaijan Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers
[18/18] Loading Singapore Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded 20 drivers

✓ Successfully loaded 18 races

Dataset Summary:
  Total results: 359
  Races: 18
  Drivers: 21

Sample:
   RoundNumber               RaceName Abbreviation         TeamName  \
0            1  Australian Grand Prix          NOR          McLaren   
1            1  Australian Grand Prix          VER  Red Bull Racing   
2            1  Australian Grand Prix          RUS         Mercedes   
3            1  Australian Grand Prix          ANT         Mercedes   
4            1  Australian Grand Prix          ALB         Williams   
5            1  Australian Grand Prix          STR     Aston Martin   
6            1  Australian Grand Prix          HUL      Kick Sauber   
7            1  Australian Grand Prix          LEC          Ferrari   
8            1  Australian Grand Prix          PIA          McLaren   
9            1  Australian Grand Prix          HAM          Ferrari   

   GridPosition  Position  
0           1.0       1.0  
1           3.0       2.0  
2     

In [4]:
# Quick data check
print("Data Quality Check:")
print(f"Missing GridPositions: {df_2025['GridPosition'].isna().sum()}")
print(f"Missing Positions (DNFs): {df_2025['Position'].isna().sum()}")
print(f"\nUnique teams: {df_2025['TeamName'].nunique()}")
print(df_2025['TeamName'].unique())
print(f"\nUnique drivers: {df_2025['Abbreviation'].nunique()}")
print(df_2025['Abbreviation'].unique())

# Check for weird values
print(f"\nPosition range: {df_2025['Position'].min()} to {df_2025['Position'].max()}")
print(f"GridPosition range: {df_2025['GridPosition'].min()} to {df_2025['GridPosition'].max()}")

Data Quality Check:
Missing GridPositions: 0
Missing Positions (DNFs): 0

Unique teams: 10
['McLaren' 'Red Bull Racing' 'Mercedes' 'Williams' 'Aston Martin'
 'Kick Sauber' 'Ferrari' 'Alpine' 'Racing Bulls' 'Haas F1 Team']

Unique drivers: 21
['NOR' 'VER' 'RUS' 'ANT' 'ALB' 'STR' 'HUL' 'LEC' 'PIA' 'HAM' 'GAS' 'TSU'
 'OCO' 'BEA' 'LAW' 'BOR' 'ALO' 'SAI' 'DOO' 'HAD' 'COL']

Position range: 1.0 to 20.0
GridPosition range: 1.0 to 20.0


In [8]:
# Sort by driver and race order
df_sorted = df_2025.sort_values(['Abbreviation', 'RoundNumber']).copy()

# Calculate rolling average of last 3 race finishes per driver
df_sorted['Driver_Last3_AvgFinish'] = (
    df_sorted.groupby('Abbreviation')['Position']
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean().shift(1))
)

# Same for teams
df_sorted['Team_Last3_AvgFinish'] = (
    df_sorted.groupby('TeamName')['Position']
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean().shift(1))
)

# Check it worked
print("Sample with new features:")
print(df_sorted[['RoundNumber', 'Abbreviation', 'TeamName', 'Position', 
                 'Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish']].head(20))

Sample with new features:
     RoundNumber Abbreviation      TeamName  Position  Driver_Last3_AvgFinish  \
4              1          ALB      Williams       5.0                     NaN   
26             2          ALB      Williams       7.0                5.000000   
48             3          ALB      Williams       9.0                6.000000   
71             4          ALB      Williams      12.0                7.000000   
88             5          ALB      Williams       9.0                9.333333   
104            6          ALB      Williams       5.0               10.000000   
124            7          ALB      Williams       5.0                8.666667   
148            8          ALB      Williams       9.0                6.333333   
178            9          ALB      Williams      19.0                6.333333   
198           10          ALB      Williams      20.0               11.000000   
215           11          ALB      Williams      17.0               16.000000   
22

In [11]:
df_sorted['Podium'] = (df_sorted['Position'] <= 3.0).astype(int)

print(f"Podium column added!")
print(f"Total podiums: {df_sorted['Podium'].sum()}")
print(f"Total non-podiums: {len(df_sorted) - df_sorted['Podium'].sum()}")

Podium column added!
Total podiums: 54
Total non-podiums: 305


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Prepare the data
model_df = df_sorted.copy()

# Remove rows with NaN in our new features (first few races per driver)
model_df = model_df.dropna(subset=['Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish'])

print(f"Dataset after removing NaN: {len(model_df)} samples")
print(f"Podiums: {model_df['Podium'].sum()}, Non-podiums: {len(model_df) - model_df['Podium'].sum()}")

# Select features
features = ['GridPosition', 'Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish']

# Encode categorical features
team_encoder = LabelEncoder()
driver_encoder = LabelEncoder()

model_df['TeamName_encoded'] = team_encoder.fit_transform(model_df['TeamName'])
model_df['Driver_encoded'] = driver_encoder.fit_transform(model_df['Abbreviation'])

# Add encoded features
features.extend(['TeamName_encoded', 'Driver_encoded'])

# Prepare X and y
X = model_df[features]
y = model_df['Podium']

print(f"\nFeatures: {features}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Dataset after removing NaN: 338 samples
Podiums: 51, Non-podiums: 287

Features: ['GridPosition', 'Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish', 'TeamName_encoded', 'Driver_encoded']
X shape: (338, 5)
y shape: (338,)


In [13]:
# Split data: Use races 1-15 for training, 16-18 for testing
# This is more realistic than random split (we predict future races)
train_mask = model_df['RoundNumber'] <= 15
test_mask = model_df['RoundNumber'] > 15

X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

print(f"Training set: {len(X_train)} samples ({y_train.sum()} podiums)")
print(f"Test set: {len(X_test)} samples ({y_test.sum()} podiums)")

# Train Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'  # Handle class imbalance
)

rf_model.fit(X_train, y_train)

print("\n✓ Model trained!")

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)

print(f"\n{'='*60}")
print(f"MODEL PERFORMANCE")
print(f"{'='*60}")
print(f"Accuracy: {accuracy*100:.1f}%\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Podium', 'Podium']))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

Training set: 278 samples (42 podiums)
Test set: 60 samples (9 podiums)

✓ Model trained!

MODEL PERFORMANCE
Accuracy: 90.0%

Classification Report:
              precision    recall  f1-score   support

   No Podium       0.94      0.94      0.94        51
      Podium       0.67      0.67      0.67         9

    accuracy                           0.90        60
   macro avg       0.80      0.80      0.80        60
weighted avg       0.90      0.90      0.90        60


Feature Importance:
                  Feature  Importance
0            GridPosition    0.414178
2    Team_Last3_AvgFinish    0.245845
1  Driver_Last3_AvgFinish    0.208180
4          Driver_encoded    0.097777
3        TeamName_encoded    0.034021


In [14]:
# Load qualifying data for all 18 races
print("Loading qualifying sessions...")

quali_data = []

for round_num in range(1, 19):
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]
    
    try:
        print(f"Loading quali for Round {round_num}: {race_name}...")
        
        # Load qualifying session
        quali = fastf1.get_session(2025, race_name, 'Q')
        quali.load()
        
        # Get results
        quali_results = quali.results[['Abbreviation', 'Position', 'Q3', 'Q2', 'Q1']].copy()
        quali_results['RoundNumber'] = round_num
        quali_results['RaceName'] = race_name
        quali_results.rename(columns={'Position': 'Quali_Position'}, inplace=True)
        
        quali_data.append(quali_results)
        print(f"  ✓ Loaded")
        
    except Exception as e:
        print(f"  ✗ Failed: {e}")

# Combine all quali data
df_quali = pd.concat(quali_data, ignore_index=True)

print(f"\n✓ Loaded qualifying data for {len(df_quali)} entries")
print(f"\nSample:")
print(df_quali[['RoundNumber', 'RaceName', 'Abbreviation', 'Quali_Position']].head(10))

core           INFO 	Loading data for Australian Grand Prix - Qualifying [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Loading qualifying sessions...
Loading quali for Round 1: Australian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 2: Chinese Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 3: Japanese Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 4: Bahrain Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 5: Saudi Arabian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 6: Miami Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 7: Emilia Romagna Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 8: Monaco Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 9: Spanish Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 10: Canadian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 11: Austrian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 12: British Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 13: Belgian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 14: Hungarian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 15: Dutch Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 16: Italian Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 17: Azerbaijan Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 18: Singapore Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded

✓ Loaded qualifying data for 360 entries

Sample:
   RoundNumber               RaceName Abbreviation  Quali_Position
0            1  Australian Grand Prix          NOR             1.0
1            1  Australian Grand Prix          PIA             2.0
2            1  Australian Grand Prix          VER             3.0
3            1  Australian Grand Prix          RUS             4.0
4            1  Australian Grand Prix          TSU             5.0
5            1  Australian Grand Prix          ALB             6.0
6            1  Australian Grand Prix          LEC             7.0
7            1  Australian Grand Prix          HAM             8.0
8            1  Australian Grand Prix          GAS             9.0
9            1  Australian Grand Prix          SAI            10.0


In [15]:
# Merge qualifying data with our sorted race data
df_sorted = df_sorted.merge(
    df_quali[['RoundNumber', 'Abbreviation', 'Quali_Position']], 
    on=['RoundNumber', 'Abbreviation'], 
    how='left'
)

# Feature 1: Grid Penalty (boolean)
df_sorted['Grid_Penalty'] = (df_sorted['GridPosition'] != df_sorted['Quali_Position']).astype(int)

# Feature 2: Penalty Size (how many places)
df_sorted['Penalty_Places'] = df_sorted['GridPosition'] - df_sorted['Quali_Position']

# Feature 3: Gap to pole position (we'll use quali position as proxy for now)
df_sorted['Gap_To_Pole'] = df_sorted['Quali_Position'] - 1

# Check it worked
print("Qualifying features added!")
print(f"\nDrivers with grid penalties: {df_sorted['Grid_Penalty'].sum()}")
print(f"\nSample with new features:")
print(df_sorted[['RoundNumber', 'Abbreviation', 'Quali_Position', 'GridPosition', 
                 'Grid_Penalty', 'Penalty_Places']].head(20))

Qualifying features added!

Drivers with grid penalties: 75

Sample with new features:
    RoundNumber Abbreviation  Quali_Position  GridPosition  Grid_Penalty  \
0             1          ALB             6.0           6.0             0   
1             2          ALB            10.0          10.0             0   
2             3          ALB             9.0           9.0             0   
3             4          ALB            15.0          15.0             0   
4             5          ALB            11.0          11.0             0   
5             6          ALB             7.0           7.0             0   
6             7          ALB             7.0           7.0             0   
7             8          ALB            10.0          10.0             0   
8             9          ALB            11.0          11.0             0   
9            10          ALB            10.0           9.0             1   
10           11          ALB            12.0          12.0             0   
1

In [16]:
# Load weather data for all races
print("Loading weather data...")

weather_data = []

for round_num in range(1, 19):
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]
    
    try:
        print(f"Loading weather for Round {round_num}: {race_name}...")
        
        # Load race session (weather is tied to race)
        session = fastf1.get_session(2025, race_name, 'R')
        session.load()
        
        # Get weather at race start (first timestamp)
        weather = session.weather_data
        if len(weather) > 0:
            race_start_weather = weather.iloc[0]  # First weather reading
            
            weather_data.append({
                'RoundNumber': round_num,
                'RaceName': race_name,
                'AirTemp': race_start_weather['AirTemp'],
                'TrackTemp': race_start_weather['TrackTemp'],
                'Humidity': race_start_weather['Humidity'],
                'Rainfall': race_start_weather['Rainfall']
            })
            print(f"  ✓ Loaded")
        
    except Exception as e:
        print(f"  ✗ Failed: {e}")

# Create weather dataframe
df_weather = pd.DataFrame(weather_data)

print(f"\n✓ Loaded weather for {len(df_weather)} races")
print(f"\nSample:")
print(df_weather.head(10))

core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


Loading weather data...
Loading weather for Round 1: Australian Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
core           INFO 	Loading data for Chinese Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 2: Chinese Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['81', '4', '63', '1', '31', '12', '23', '87', '18', '55', '6', '30', '7', '5', '27', '22', '14', '16', '44', '10']
core           INFO 	Loading data for Japanese Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 3: Japanese Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '16', '63', '12', '44', '6', '23', '87', '14', '22', '10', '55', '7', '27', '30', '31', '5', '18']
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 4: Bahrain Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['81', '63', '4', '16', '44', '1', '10', '31', '22', '87', '12', '23', '6', '7', '14', '30', '18', '5', '55', '27']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 5: Saudi Arabian Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['81', '1', '16', '4', '63', '12', '44', '55', '23', '6', '14', '30', '87', '31', '27', '18', '7', '5', '22', '10']
core           INFO 	Loading data for Miami Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 6: Miami Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['81', '4', '63', '1', '23', '12', '16', '44', '55', '22', '6', '31', '10', '27', '14', '18', '30', '5', '87', '7']
core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 7: Emilia Romagna Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '44', '23', '16', '63', '55', '6', '22', '14', '27', '10', '30', '18', '43', '87', '5', '12', '31']
core           INFO 	Loading data for Monaco Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 8: Monaco Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '16', '81', '1', '44', '6', '31', '30', '23', '55', '63', '87', '43', '5', '18', '27', '22', '12', '14', '10']
core           INFO 	Loading data for Spanish Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 9: Spanish Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 19 drivers: ['81', '4', '16', '63', '27', '44', '6', '10', '14', '1', '30', '5', '22', '55', '43', '31', '87', '12', '23']
core           INFO 	Loading data for Canadian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 10: Canadian Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['63', '1', '12', '81', '16', '44', '14', '27', '31', '55', '87', '22', '43', '5', '10', '6', '18', '4', '30', '23']
core           INFO 	Loading data for Austrian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 11: Austrian Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '81', '16', '44', '63', '30', '14', '5', '27', '31', '87', '6', '10', '18', '43', '22', '23', '1', '12', '55']
core           INFO 	Loading data for British Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 12: British Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '81', '27', '44', '1', '10', '18', '23', '14', '63', '87', '55', '31', '16', '22', '12', '6', '5', '30', '43']
core           INFO 	Loading data for Belgian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 13: Belgian Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['81', '4', '16', '1', '63', '23', '44', '30', '5', '10', '87', '27', '22', '18', '31', '12', '14', '55', '43', '6']
core           INFO 	Loading data for Hungarian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 14: Hungarian Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '81', '63', '16', '14', '5', '18', '30', '1', '12', '6', '44', '27', '55', '23', '31', '22', '43', '10', '87']
core           INFO 	Loading data for Dutch Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 15: Dutch Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['81', '1', '6', '63', '23', '87', '18', '14', '22', '31', '43', '30', '55', '27', '5', '12', '10', '4', '16', '44']
core           INFO 	Loading data for Italian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 16: Italian Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '16', '63', '44', '23', '5', '12', '6', '55', '87', '22', '30', '31', '10', '43', '18', '14', '27']
core           INFO 	Loading data for Azerbaijan Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 17: Azerbaijan Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '63', '55', '12', '30', '22', '4', '44', '16', '6', '5', '87', '23', '31', '14', '27', '18', '10', '43', '81']
core           INFO 	Loading data for Singapore Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 18: Singapore Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['63', '1', '4', '81', '12', '16', '14', '44', '87', '55', '6', '22', '18', '23', '30', '43', '5', '31', '10', '27']


  ✓ Loaded

✓ Loaded weather for 18 races

Sample:
   RoundNumber                   RaceName  AirTemp  TrackTemp  Humidity  \
0            1      Australian Grand Prix     15.8       18.8      91.0   
1            2         Chinese Grand Prix     27.5       42.2      16.0   
2            3        Japanese Grand Prix     14.6       22.5      76.0   
3            4         Bahrain Grand Prix     27.9       34.5      45.0   
4            5   Saudi Arabian Grand Prix     28.7       39.4      71.0   
5            6           Miami Grand Prix     26.5       39.9      68.0   
6            7  Emilia Romagna Grand Prix     23.8       45.1      40.0   
7            8          Monaco Grand Prix     21.5       42.6      55.0   
8            9         Spanish Grand Prix     29.9       48.2      49.0   
9           10        Canadian Grand Prix     23.0       47.1      28.0   

   Rainfall  
0      True  
1     False  
2     False  
3     False  
4     False  
5     False  
6     False  
7     False

In [17]:
# Merge weather data
df_sorted = df_sorted.merge(
    df_weather[['RoundNumber', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall']], 
    on='RoundNumber', 
    how='left'
)

print("✓ Weather data merged")

# Now calculate gap to teammate in qualifying
# First, find who are teammates (same team)
df_quali_with_team = df_quali.merge(
    df_sorted[['RoundNumber', 'Abbreviation', 'TeamName']].drop_duplicates(),
    on=['RoundNumber', 'Abbreviation'],
    how='left'
)

# For each driver, find their teammate's quali position
teammate_quali = []

for idx, row in df_quali_with_team.iterrows():
    # Find teammate (same team, different driver, same round)
    teammate = df_quali_with_team[
        (df_quali_with_team['RoundNumber'] == row['RoundNumber']) &
        (df_quali_with_team['TeamName'] == row['TeamName']) &
        (df_quali_with_team['Abbreviation'] != row['Abbreviation'])
    ]
    
    if len(teammate) > 0:
        teammate_pos = teammate.iloc[0]['Quali_Position']
        gap = row['Quali_Position'] - teammate_pos
    else:
        gap = 0  # No teammate (shouldn't happen, but just in case)
    
    teammate_quali.append({
        'RoundNumber': row['RoundNumber'],
        'Abbreviation': row['Abbreviation'],
        'Gap_To_Teammate_Quali': gap,
        'Beat_Teammate': 1 if gap < 0 else 0
    })

df_teammate = pd.DataFrame(teammate_quali)

# Merge with main dataframe
df_sorted = df_sorted.merge(
    df_teammate[['RoundNumber', 'Abbreviation', 'Gap_To_Teammate_Quali', 'Beat_Teammate']],
    on=['RoundNumber', 'Abbreviation'],
    how='left'
)

print("✓ Teammate gap calculated")
print(f"\nNew features summary:")
print(f"Total features now: {len(df_sorted.columns)}")
print(f"\nSample with all new features:")
print(df_sorted[['RoundNumber', 'Abbreviation', 'TeamName', 'Quali_Position', 
                 'Gap_To_Teammate_Quali', 'Beat_Teammate', 'TrackTemp', 'Rainfall']].head(20))

✓ Weather data merged
✓ Teammate gap calculated

New features summary:
Total features now: 39

Sample with all new features:
    RoundNumber Abbreviation      TeamName  Quali_Position  \
0             1          ALB      Williams             6.0   
1             2          ALB      Williams            10.0   
2             3          ALB      Williams             9.0   
3             4          ALB      Williams            15.0   
4             5          ALB      Williams            11.0   
5             6          ALB      Williams             7.0   
6             7          ALB      Williams             7.0   
7             8          ALB      Williams            10.0   
8             9          ALB      Williams            11.0   
9            10          ALB      Williams            10.0   
10           11          ALB      Williams            12.0   
11           12          ALB      Williams            14.0   
12           13          ALB      Williams             5.0   
13     

In [18]:
# Prepare data with new features
model_df = df_sorted.copy()

# Remove rows with NaN
model_df = model_df.dropna(subset=['Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish', 
                                     'Gap_To_Teammate_Quali', 'TrackTemp'])

print(f"Dataset after removing NaN: {len(model_df)} samples")
print(f"Podiums: {model_df['Podium'].sum()}, Non-podiums: {len(model_df) - model_df['Podium'].sum()}")

# Select ALL features
features = [
    'GridPosition',
    'Quali_Position', 
    'Grid_Penalty',
    'Penalty_Places',
    'Gap_To_Pole',
    'Driver_Last3_AvgFinish',
    'Team_Last3_AvgFinish',
    'Gap_To_Teammate_Quali',
    'Beat_Teammate',
    'AirTemp',
    'TrackTemp',
    'Humidity',
    'Rainfall'
]

# Encode categorical
team_encoder = LabelEncoder()
driver_encoder = LabelEncoder()

model_df['TeamName_encoded'] = team_encoder.fit_transform(model_df['TeamName'])
model_df['Driver_encoded'] = driver_encoder.fit_transform(model_df['Abbreviation'])

features.extend(['TeamName_encoded', 'Driver_encoded'])

# Prepare X and y
X = model_df[features]
y = model_df['Podium']

print(f"\nTotal features: {len(features)}")
print(f"Features: {features}")
print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

Dataset after removing NaN: 338 samples
Podiums: 51, Non-podiums: 287

Total features: 15
Features: ['GridPosition', 'Quali_Position', 'Grid_Penalty', 'Penalty_Places', 'Gap_To_Pole', 'Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish', 'Gap_To_Teammate_Quali', 'Beat_Teammate', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall', 'TeamName_encoded', 'Driver_encoded']

X shape: (338, 15)
y shape: (338,)


In [19]:
# Split data: races 1-15 train, 16-18 test
train_mask = model_df['RoundNumber'] <= 15
test_mask = model_df['RoundNumber'] > 15

X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

print(f"Training set: {len(X_train)} samples ({y_train.sum()} podiums)")
print(f"Test set: {len(X_test)} samples ({y_test.sum()} podiums)")

# Train Random Forest with new features
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)

rf_model.fit(X_train, y_train)
print("\n✓ Model trained!")

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)

print(f"\n{'='*60}")
print(f"MODEL PERFORMANCE (With New Features)")
print(f"{'='*60}")
print(f"Accuracy: {accuracy*100:.1f}%")
print(f"Previous accuracy (5 features): 90.0%")
print(f"Improvement: {(accuracy - 0.90)*100:+.1f}%\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Podium', 'Podium']))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

Training set: 278 samples (42 podiums)
Test set: 60 samples (9 podiums)

✓ Model trained!

MODEL PERFORMANCE (With New Features)
Accuracy: 90.0%
Previous accuracy (5 features): 90.0%
Improvement: +0.0%

Classification Report:
              precision    recall  f1-score   support

   No Podium       0.92      0.96      0.94        51
      Podium       0.71      0.56      0.62         9

    accuracy                           0.90        60
   macro avg       0.82      0.76      0.78        60
weighted avg       0.89      0.90      0.89        60


Top 10 Most Important Features:
                   Feature  Importance
4              Gap_To_Pole    0.191417
1           Quali_Position    0.169725
0             GridPosition    0.164645
6     Team_Last3_AvgFinish    0.132328
5   Driver_Last3_AvgFinish    0.106076
14          Driver_encoded    0.064386
7    Gap_To_Teammate_Quali    0.043973
13        TeamName_encoded    0.031086
9                  AirTemp    0.025424
11                Humidi

In [22]:
from xgboost import XGBClassifier

# Train XGBoost model
print("Training XGBoost model...")

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum(),  # Handle imbalance
    eval_metric='logloss'
)

xgb_model.fit(X_train, y_train)
print("✓ XGBoost model trained!")

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"\n{'='*60}")
print(f"XGBoost PERFORMANCE")
print(f"{'='*60}")
print(f"XGBoost Accuracy: {accuracy_xgb*100:.1f}%")
print(f"Random Forest Accuracy: {accuracy*100:.1f}%")
print(f"Improvement: {(accuracy_xgb - accuracy)*100:+.1f}%\n")

print("Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['No Podium', 'Podium']))

# Feature importance for XGBoost
feature_importance_xgb = pd.DataFrame({
    'Feature': features,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features (XGBoost):")
print(feature_importance_xgb.head(10))

# Compare predictions
comparison = pd.DataFrame({
    'Actual': y_test.values,
    'RF_Pred': y_pred,
    'XGB_Pred': y_pred_xgb
})
comparison['RF_Correct'] = (comparison['Actual'] == comparison['RF_Pred']).astype(int)
comparison['XGB_Correct'] = (comparison['Actual'] == comparison['XGB_Pred']).astype(int)

print(f"\nPrediction Comparison:")
print(f"Random Forest correct: {comparison['RF_Correct'].sum()}/{len(comparison)}")
print(f"XGBoost correct: {comparison['XGB_Correct'].sum()}/{len(comparison)}")

Training XGBoost model...
✓ XGBoost model trained!

XGBoost PERFORMANCE
XGBoost Accuracy: 88.3%
Random Forest Accuracy: 90.0%
Improvement: -1.7%

Classification Report:
              precision    recall  f1-score   support

   No Podium       0.92      0.94      0.93        51
      Podium       0.62      0.56      0.59         9

    accuracy                           0.88        60
   macro avg       0.77      0.75      0.76        60
weighted avg       0.88      0.88      0.88        60


Top 10 Most Important Features (XGBoost):
                   Feature  Importance
1           Quali_Position    0.423590
0             GridPosition    0.300926
6     Team_Last3_AvgFinish    0.050206
5   Driver_Last3_AvgFinish    0.045536
2             Grid_Penalty    0.037245
9                  AirTemp    0.034261
7    Gap_To_Teammate_Quali    0.030672
10               TrackTemp    0.021895
14          Driver_encoded    0.021126
11                Humidity    0.020271

Prediction Comparison:
Random F

In [23]:
# Check what races are left in 2025
remaining_races = races[races['RoundNumber'] > 18]

print("Remaining races in 2025:")
print(remaining_races[['RoundNumber', 'EventName', 'Country', 'EventDate']])

Remaining races in 2025:
    RoundNumber                 EventName               Country  EventDate
19           19  United States Grand Prix         United States 2025-10-19
20           20    Mexico City Grand Prix                Mexico 2025-10-26
21           21      São Paulo Grand Prix                Brazil 2025-11-09
22           22      Las Vegas Grand Prix         United States 2025-11-22
23           23          Qatar Grand Prix                 Qatar 2025-11-30
24           24      Abu Dhabi Grand Prix  United Arab Emirates 2025-12-07


In [24]:
# test on races 19-21. getting the test data and features.
# Load races 19-21 (already happened)
print("Loading races 19-21 for validation...")

validation_races = []

for round_num in [19, 20, 21]:
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]
    
    try:
        print(f"Loading Round {round_num}: {race_name}...")
        session = fastf1.get_session(2025, race_name, 'R')
        session.load()
        
        results = session.results.copy()
        results['RaceName'] = race_name
        results['RoundNumber'] = round_num
        
        validation_races.append(results)
        print(f"  ✓ Loaded")
        
    except Exception as e:
        print(f"  ✗ Failed: {e}")

if validation_races:
    df_validation = pd.concat(validation_races, ignore_index=True)
    print(f"\n✓ Loaded {len(df_validation)} results from {len(validation_races)} races")
    print(f"\nSample:")
    print(df_validation[['RoundNumber', 'RaceName', 'Abbreviation', 'GridPosition', 'Position']].head(10))
else:
    print("\nNo validation races loaded")

core           INFO 	Loading data for United States Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Loading races 19-21 for validation...
Loading Round 19: United States Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded
Loading Round 20: Mexico City Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!

  ✓ Loaded
Loading Round 21: São Paulo Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written t

  ✓ Loaded

✓ Loaded 60 results from 3 races

Sample:
   RoundNumber                  RaceName Abbreviation  GridPosition  Position
0           19  United States Grand Prix          VER           1.0       1.0
1           19  United States Grand Prix          NOR           2.0       2.0
2           19  United States Grand Prix          LEC           3.0       3.0
3           19  United States Grand Prix          HAM           5.0       4.0
4           19  United States Grand Prix          PIA           6.0       5.0
5           19  United States Grand Prix          RUS           4.0       6.0
6           19  United States Grand Prix          TSU          13.0       7.0
7           19  United States Grand Prix          HUL          11.0       8.0
8           19  United States Grand Prix          BEA           8.0       9.0
9           19  United States Grand Prix          ALO          10.0      10.0


In [25]:
# We need to add all the features to validation data like we did for training data

# First, let's add the basic features we can calculate
df_validation['Podium'] = (df_validation['Position'] <= 3.0).astype(int)

# For form features, we need data from races 1-21
# Let's combine our training data (races 1-18) with validation (19-21)
df_all = pd.concat([df_sorted, df_validation], ignore_index=True).sort_values(['Abbreviation', 'RoundNumber'])

# Recalculate rolling features for ALL races
df_all['Driver_Last3_AvgFinish'] = (
    df_all.groupby('Abbreviation')['Position']
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean().shift(1))
)

df_all['Team_Last3_AvgFinish'] = (
    df_all.groupby('TeamName')['Position']
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean().shift(1))
)

# Filter to just validation races (19-21)
df_val_with_features = df_all[df_all['RoundNumber'].isin([19, 20, 21])].copy()

print(f"Validation data with form features: {len(df_val_with_features)} samples")
print(f"\nSample:")
print(df_val_with_features[['RoundNumber', 'Abbreviation', 'Position', 'Driver_Last3_AvgFinish']].head(10))

Validation data with form features: 60 samples

Sample:
     RoundNumber Abbreviation  Position  Driver_Last3_AvgFinish
372           19          ALB      14.0               11.333333
390           20          ALB      12.0               13.666667
409           21          ALB      11.0               13.333333
368           19          ALO      10.0               13.666667
396           20          ALO      18.0               10.666667
412           21          ALO      14.0               11.666667
371           19          ANT      13.0                6.000000
384           20          ANT       6.0                7.333333
400           21          ANT       2.0                8.000000
367           19          BEA       9.0               11.000000


In [26]:
# Load qualifying data for races 19-21
print("Loading qualifying data for validation races...")

val_quali_data = []

for round_num in [19, 20, 21]:
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]
    
    try:
        print(f"Loading quali for Round {round_num}: {race_name}...")
        quali = fastf1.get_session(2025, race_name, 'Q')
        quali.load()
        
        quali_results = quali.results[['Abbreviation', 'Position']].copy()
        quali_results['RoundNumber'] = round_num
        quali_results.rename(columns={'Position': 'Quali_Position'}, inplace=True)
        
        val_quali_data.append(quali_results)
        print(f"  ✓ Loaded")
        
    except Exception as e:
        print(f"  ✗ Failed: {e}")

df_val_quali = pd.concat(val_quali_data, ignore_index=True)

# Load weather data for races 19-21
print("\nLoading weather data for validation races...")

val_weather_data = []

for round_num in [19, 20, 21]:
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]
    
    try:
        print(f"Loading weather for Round {round_num}: {race_name}...")
        session = fastf1.get_session(2025, race_name, 'R')
        session.load()
        
        weather = session.weather_data
        if len(weather) > 0:
            race_start_weather = weather.iloc[0]
            
            val_weather_data.append({
                'RoundNumber': round_num,
                'AirTemp': race_start_weather['AirTemp'],
                'TrackTemp': race_start_weather['TrackTemp'],
                'Humidity': race_start_weather['Humidity'],
                'Rainfall': race_start_weather['Rainfall']
            })
            print(f"  ✓ Loaded")
        
    except Exception as e:
        print(f"  ✗ Failed: {e}")

df_val_weather = pd.DataFrame(val_weather_data)

print(f"\n✓ Quali data: {len(df_val_quali)} entries")
print(f"✓ Weather data: {len(df_val_weather)} races")

core           INFO 	Loading data for United States Grand Prix - Qualifying [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...


Loading qualifying data for validation races...
Loading quali for Round 19: United States Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 20: Mexico City Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for timing_app_data. Loading data...
_api           INFO 	Fetching timing app data...
req            INFO 	Data has been written to

  ✓ Loaded
Loading quali for Round 21: São Paulo Grand Prix...


req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
req            INFO 	No cached data found for car_data. Loading data...
_api           INFO 	Fetching car data...
req            INFO 	No cached data found for weather_data. Loading data...
_api           INFO 	Fetching weather data...
req            INFO 	No cached data found for race_control_messages. Loading data...
_api           INFO 	Fetching race control messages...
core           INFO 	Finished loading data for 0 driv

  ✓ Loaded

Loading weather data for validation races...
Loading weather for Round 19: United States Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '16', '44', '81', '63', '22', '27', '87', '14', '30', '18', '12', '23', '31', '6', '43', '5', '10', '55']
core           INFO 	Loading data for Mexico City Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 20: Mexico City Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '16', '1', '87', '81', '12', '63', '44', '31', '5', '22', '23', '6', '18', '10', '43', '55', '14', '27', '30']
core           INFO 	Loading data for São Paulo Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...


  ✓ Loaded
Loading weather for Round 21: São Paulo Grand Prix...


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '12', '1', '63', '81', '87', '30', '6', '27', '10', '23', '31', '55', '14', '43', '18', '22', '44', '16', '5']


  ✓ Loaded

✓ Quali data: 40 entries
✓ Weather data: 3 races


In [32]:
# Drop existing quali/weather columns if they exist
cols_to_drop = ['Quali_Position', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall', 
                'Grid_Penalty', 'Penalty_Places', 'Gap_To_Pole']

for col in cols_to_drop:
    if col in df_val_with_features.columns:
        df_val_with_features = df_val_with_features.drop(columns=[col])

# Now merge quali data
df_val_with_features = df_val_with_features.merge(
    df_val_quali[['RoundNumber', 'Abbreviation', 'Quali_Position']],
    on=['RoundNumber', 'Abbreviation'],
    how='left'
)

# Merge weather data
df_val_with_features = df_val_with_features.merge(
    df_val_weather[['RoundNumber', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall']],
    on='RoundNumber',
    how='left'
)

# Calculate penalty features
df_val_with_features['Grid_Penalty'] = (df_val_with_features['GridPosition'] != df_val_with_features['Quali_Position']).astype(int)
df_val_with_features['Penalty_Places'] = df_val_with_features['GridPosition'] - df_val_with_features['Quali_Position']
df_val_with_features['Gap_To_Pole'] = df_val_with_features['Quali_Position'] - 1
df_val_with_features['Gap_To_Teammate_Quali'] = 0  # Simplification
df_val_with_features['Beat_Teammate'] = 0

# Encode
df_val_with_features['TeamName_encoded'] = team_encoder.transform(df_val_with_features['TeamName'])
df_val_with_features['Driver_encoded'] = driver_encoder.transform(df_val_with_features['Abbreviation'])

# Prepare features
X_val = df_val_with_features[features].fillna(0)
y_val = df_val_with_features['Podium']

print(f"✓ Validation set prepared: {len(X_val)} samples")
print(f"Actual podiums: {y_val.sum()}")

# Predict
y_val_pred = rf_model.predict(X_val)
y_val_prob = rf_model.predict_proba(X_val)[:, 1]

print(f"Predicted podiums: {y_val_pred.sum()}")

✓ Validation set prepared: 60 samples
Actual podiums: 9
Predicted podiums: 8


In [34]:
# Better prediction: Pick top 3 per race
print("IMPROVED PREDICTIONS (Top 3 per race):")
print("="*60)

correct_predictions = 0
total_podiums = 0

for round_num in [19, 20, 21]:
    race_data = results_comparison[results_comparison['RoundNumber'] == round_num].copy()
    race_name = race_data['RaceName'].iloc[0]
    
    # Sort by podium probability and pick top 3
    top3_predicted = race_data.nlargest(3, 'Podium_Probability')
    actual_podium = race_data[race_data['Podium'] == 1]
    
    print(f"\nRound {round_num}: {race_name}")
    print("-" * 60)
    
    print("Predicted Podium (Top 3 by probability):")
    for i, row in top3_predicted.iterrows():
        actual_pos = row['Position']
        was_correct = "✓" if row['Podium'] == 1 else "✗"
        print(f"  {was_correct} {row['Abbreviation']:3s} ({row['TeamName']:20s}) - {row['Podium_Probability']*100:.1f}% prob, actual P{int(actual_pos)}")
    
    print("\nActual Podium:")
    for i, row in actual_podium.iterrows():
        print(f"  P{int(row['Position'])}: {row['Abbreviation']} ({row['TeamName']}) from P{int(row['GridPosition'])}")
    
    # Count correct predictions
    predicted_drivers = set(top3_predicted['Abbreviation'])
    actual_drivers = set(actual_podium['Abbreviation'])
    correct = len(predicted_drivers & actual_drivers)
    
    correct_predictions += correct
    total_podiums += 3
    
    print(f"\nCorrect: {correct}/3")

print("\n" + "="*60)
print(f"OVERALL: {correct_predictions}/{total_podiums} podiums predicted correctly ({correct_predictions/total_podiums*100:.1f}%)")

IMPROVED PREDICTIONS (Top 3 per race):

Round 19: United States Grand Prix
------------------------------------------------------------
Predicted Podium (Top 3 by probability):
  ✓ NOR (McLaren             ) - 93.0% prob, actual P2
  ✓ LEC (Ferrari             ) - 71.0% prob, actual P3
  ✓ VER (Red Bull Racing     ) - 48.0% prob, actual P1

Actual Podium:
  P3: LEC (Ferrari) from P3
  P2: NOR (McLaren) from P2
  P1: VER (Red Bull Racing) from P1

Correct: 3/3

Round 20: Mexico City Grand Prix
------------------------------------------------------------
Predicted Podium (Top 3 by probability):
  ✓ NOR (McLaren             ) - 90.0% prob, actual P1
  ✓ LEC (Ferrari             ) - 78.0% prob, actual P2
  ✗ HAM (Ferrari             ) - 42.0% prob, actual P8

Actual Podium:
  P2: LEC (Ferrari) from P2
  P1: NOR (McLaren) from P1
  P3: VER (Red Bull Racing) from P5

Correct: 2/3

Round 21: São Paulo Grand Prix
------------------------------------------------------------
Predicted Podium (To

In [35]:
# Replace rolling average with exponential weighted mean
# (More weight on recent races)

df_sorted['Driver_Last3_AvgFinish'] = (
    df_sorted.groupby('Abbreviation')['Position']
    .transform(lambda x: x.ewm(span=3, adjust=False).mean().shift(1))
)

df_sorted['Team_Last3_AvgFinish'] = (
    df_sorted.groupby('TeamName')['Position']
    .transform(lambda x: x.ewm(span=3, adjust=False).mean().shift(1))
)

print("✓ Exponential weighting applied to form features")
print("\nExample - Verstappen's form over season:")
ver_sample = df_sorted[df_sorted['Abbreviation'] == 'VER'][['RoundNumber', 'Position', 'Driver_Last3_AvgFinish']].head(15)
print(ver_sample)

✓ Exponential weighting applied to form features

Example - Verstappen's form over season:
     RoundNumber  Position  Driver_Last3_AvgFinish
341            1       2.0                     NaN
342            2       4.0                2.000000
343            3       1.0                3.000000
344            4       6.0                2.000000
345            5       2.0                4.000000
346            6       4.0                3.000000
347            7       1.0                3.500000
348            8       4.0                2.250000
349            9      10.0                3.125000
350           10       2.0                6.562500
351           11      18.0                4.281250
352           12       5.0               11.140625
353           13       4.0                8.070312
354           14       9.0                6.035156
355           15       2.0                7.517578


In [36]:
# Retrain model with exponentially weighted features
print("Retraining model with exponentially weighted form features...")

# Prepare data (same as before)
model_df = df_sorted.copy()
model_df = model_df.dropna(subset=['Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish', 
                                     'Gap_To_Teammate_Quali', 'TrackTemp'])

print(f"Dataset: {len(model_df)} samples")

# Same features as before
features = [
    'GridPosition',
    'Quali_Position', 
    'Grid_Penalty',
    'Penalty_Places',
    'Gap_To_Pole',
    'Driver_Last3_AvgFinish',  # Now exponentially weighted!
    'Team_Last3_AvgFinish',    # Now exponentially weighted!
    'Gap_To_Teammate_Quali',
    'Beat_Teammate',
    'AirTemp',
    'TrackTemp',
    'Humidity',
    'Rainfall'
]

# Encode
team_encoder = LabelEncoder()
driver_encoder = LabelEncoder()
model_df['TeamName_encoded'] = team_encoder.fit_transform(model_df['TeamName'])
model_df['Driver_encoded'] = driver_encoder.fit_transform(model_df['Abbreviation'])
features.extend(['TeamName_encoded', 'Driver_encoded'])

# Prepare X, y
X = model_df[features]
y = model_df['Podium']

# Split
train_mask = model_df['RoundNumber'] <= 15
test_mask = model_df['RoundNumber'] > 15

X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

# Train Random Forest
rf_model_ewm = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)

rf_model_ewm.fit(X_train, y_train)
print("✓ Model retrained with exponential weighting!")

# Evaluate on test set (races 16-18)
y_pred = rf_model_ewm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n{'='*60}")
print(f"TEST SET PERFORMANCE (Races 16-18)")
print(f"{'='*60}")
print(f"New accuracy (exponential weighting): {accuracy*100:.1f}%")
print(f"Previous accuracy (simple average): 90.0%")
print(f"Change: {(accuracy - 0.90)*100:+.1f}%")

Retraining model with exponentially weighted form features...
Dataset: 338 samples
✓ Model retrained with exponential weighting!

TEST SET PERFORMANCE (Races 16-18)
New accuracy (exponential weighting): 90.0%
Previous accuracy (simple average): 90.0%
Change: +0.0%


In [38]:
# Test on validation races (19-21) with new exponential weighting

# First, recalculate form features for ALL races including validation
df_all_ewm = pd.concat([df_sorted, df_validation], ignore_index=True).sort_values(['Abbreviation', 'RoundNumber'])

df_all_ewm['Driver_Last3_AvgFinish'] = (
    df_all_ewm.groupby('Abbreviation')['Position']
    .transform(lambda x: x.ewm(span=3, adjust=False).mean().shift(1))
)

df_all_ewm['Team_Last3_AvgFinish'] = (
    df_all_ewm.groupby('TeamName')['Position']
    .transform(lambda x: x.ewm(span=3, adjust=False).mean().shift(1))
)

# Filter to validation races
df_val_ewm = df_all_ewm[df_all_ewm['RoundNumber'].isin([19, 20, 21])].copy()

# Drop existing quali/weather columns if they exist
cols_to_drop = ['Quali_Position', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall', 
                'Grid_Penalty', 'Penalty_Places', 'Gap_To_Pole', 'Gap_To_Teammate_Quali', 'Beat_Teammate']
for col in cols_to_drop:
    if col in df_val_ewm.columns:
        df_val_ewm = df_val_ewm.drop(columns=[col])

# Now merge quali and weather
df_val_ewm = df_val_ewm.merge(
    df_val_quali[['RoundNumber', 'Abbreviation', 'Quali_Position']],
    on=['RoundNumber', 'Abbreviation'],
    how='left'
)

df_val_ewm = df_val_ewm.merge(
    df_val_weather[['RoundNumber', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall']],
    on='RoundNumber',
    how='left'
)

# Calculate other features
df_val_ewm['Grid_Penalty'] = (df_val_ewm['GridPosition'] != df_val_ewm['Quali_Position']).astype(int)
df_val_ewm['Penalty_Places'] = df_val_ewm['GridPosition'] - df_val_ewm['Quali_Position']
df_val_ewm['Gap_To_Pole'] = df_val_ewm['Quali_Position'] - 1
df_val_ewm['Gap_To_Teammate_Quali'] = 0
df_val_ewm['Beat_Teammate'] = 0

# Encode
df_val_ewm['TeamName_encoded'] = team_encoder.transform(df_val_ewm['TeamName'])
df_val_ewm['Driver_encoded'] = driver_encoder.transform(df_val_ewm['Abbreviation'])

# Prepare features
X_val_ewm = df_val_ewm[features].fillna(0)
y_val_ewm = df_val_ewm['Podium']

# Predict with new model
y_val_pred_ewm = rf_model_ewm.predict(X_val_ewm)
y_val_prob_ewm = rf_model_ewm.predict_proba(X_val_ewm)[:, 1]

val_accuracy_ewm = accuracy_score(y_val_ewm, y_val_pred_ewm)

print(f"{'='*60}")
print(f"VALIDATION (Races 19-21) - Exponential Weighting")
print(f"{'='*60}")
print(f"New accuracy: {val_accuracy_ewm*100:.1f}%")
print(f"Previous accuracy: 88.3%")
print(f"Improvement: {(val_accuracy_ewm - 0.883)*100:+.1f}%")

# Race-by-race predictions
results_comparison_ewm = df_val_ewm[['RoundNumber', 'RaceName', 'Abbreviation', 
                                      'TeamName', 'GridPosition', 'Position', 'Podium']].copy()
results_comparison_ewm['Podium_Probability'] = y_val_prob_ewm

correct_predictions = 0
for round_num in [19, 20, 21]:
    race_data = results_comparison_ewm[results_comparison_ewm['RoundNumber'] == round_num].copy()
    race_name = race_data['RaceName'].iloc[0]
    
    top3_predicted = race_data.nlargest(3, 'Podium_Probability')
    actual_podium = race_data[race_data['Podium'] == 1]
    
    predicted_drivers = set(top3_predicted['Abbreviation'])
    actual_drivers = set(actual_podium['Abbreviation'])
    correct = len(predicted_drivers & actual_drivers)
    correct_predictions += correct
    
    print(f"\nRound {round_num}: {race_name} - {correct}/3 correct")

print(f"\n{'='*60}")
print(f"OVERALL: {correct_predictions}/9 podiums correct ({correct_predictions/9*100:.1f}%)")
print(f"Previous: 6/9 (66.7%)")

VALIDATION (Races 19-21) - Exponential Weighting
New accuracy: 90.0%
Previous accuracy: 88.3%
Improvement: +1.7%

Round 19: United States Grand Prix - 3/3 correct

Round 20: Mexico City Grand Prix - 2/3 correct

Round 21: São Paulo Grand Prix - 1/3 correct

OVERALL: 6/9 podiums correct (66.7%)
Previous: 6/9 (66.7%)
