In [None]:
import os

import fastf1
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Enable the FastF1 on-disk cache so repeated session loads are served locally.
# enable_cache() refuses a directory that does not exist, so create it first;
# this keeps "Restart Kernel -> Run All" working on a fresh checkout.
CACHE_DIR = '../data/cache'
os.makedirs(CACHE_DIR, exist_ok=True)
fastf1.Cache.enable_cache(CACHE_DIR)

In [None]:
# Fetch the complete 2025 calendar from FastF1.
print("Loading 2025 season schedule...")
season_2025 = fastf1.get_event_schedule(2025)

# Keep race weekends only: every event whose format is 'testing' is dropped.
is_race_weekend = season_2025['EventFormat'] != 'testing'
races = season_2025.loc[is_race_weekend].copy()

# Report the calendar size and preview the first 18 rounds.
schedule_columns = ['RoundNumber', 'EventName', 'Country', 'EventDate']
print(f"\nTotal races in 2025: {len(races)}")
print("\nFirst 18 races:")
print(races[schedule_columns].head(18))

In [None]:
# Load the race ('R') session results for the first 18 rounds and stack them
# into a single frame, one row per driver per race.
races_to_load = races.head(18)

print(f"\n{'='*60}")
print(f"Loading {len(races_to_load)} races...")
print(f"This will take 10-15 minutes ☕")
print(f"{'='*60}\n")

all_race_data = []   # one results frame per successfully loaded race
failed_races = []    # event names whose session could not be loaded

for idx, race_event in races_to_load.iterrows():
    race_name = race_event['EventName']
    round_num = race_event['RoundNumber']

    try:
        print(f"[{round_num}/{len(races_to_load)}] Loading {race_name}...")

        session = fastf1.get_session(2025, race_name, 'R')
        session.load()

        # Tag each driver row with the event it belongs to.
        results = session.results.copy()
        results['RaceName'] = race_name
        results['RoundNumber'] = round_num
        results['Country'] = race_event['Country']
        results['EventDate'] = race_event['EventDate']

        all_race_data.append(results)
        print(f"  ✓ Loaded {len(results)} drivers")

    except Exception as e:
        # Best effort: record the failure and carry on with the next race.
        print(f"  ✗ Failed: {e}")
        failed_races.append(race_name)

print(f"\n{'='*60}")
print(f"✓ Successfully loaded {len(all_race_data)} races")
if failed_races:
    print(f"✗ Failed: {failed_races}")

# pd.concat([]) raises an opaque "No objects to concatenate"; fail with a
# message that points at the real cause instead.
if not all_race_data:
    raise RuntimeError(
        "No race sessions could be loaded; check network access and the FastF1 cache."
    )

# Combine all data
df_2025 = pd.concat(all_race_data, ignore_index=True)

print(f"\nDataset Summary:")
print(f"  Total results: {len(df_2025)}")
print(f"  Races: {df_2025['RaceName'].nunique()}")
print(f"  Drivers: {df_2025['Abbreviation'].nunique()}")
print(f"\nSample:")
print(df_2025[['RoundNumber', 'RaceName', 'Abbreviation', 'TeamName', 'GridPosition', 'Position']].head(10))

In [None]:
# Sanity-check the combined results: missing values, category counts, ranges.
grid_col = df_2025['GridPosition']
finish_col = df_2025['Position']
team_col = df_2025['TeamName']
driver_col = df_2025['Abbreviation']

print("Data Quality Check:")
print(f"Missing GridPositions: {grid_col.isna().sum()}")
print(f"Missing Positions (DNFs): {finish_col.isna().sum()}")
print(f"\nUnique teams: {team_col.nunique()}")
print(team_col.unique())
print(f"\nUnique drivers: {driver_col.nunique()}")
print(driver_col.unique())

# Out-of-range minima/maxima would point at bad rows.
print(f"\nPosition range: {finish_col.min()} to {finish_col.max()}")
print(f"GridPosition range: {grid_col.min()} to {grid_col.max()}")

In [None]:
# Sort by driver and race order so each driver's rows are chronological.
df_sorted = df_2025.sort_values(['Abbreviation', 'RoundNumber']).copy()

# Driver form: mean finish over the driver's previous (up to) 3 races.
# shift(1) excludes the current race, so the feature only uses past results.
df_sorted['Driver_Last3_AvgFinish'] = (
    df_sorted.groupby('Abbreviation')['Position']
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean().shift(1))
)

# Team form: mean finish of the team's cars over its previous (up to) 3 races.
# The frame is ordered by driver, not by round, so rolling directly over the
# team's rows would walk through one driver's whole season before the next
# driver's and leak late-season results into early rounds. Collapse to one
# value per (team, round) first; groupby sorts those keys, so the window
# below runs in round order.
team_round_mean = df_sorted.groupby(['TeamName', 'RoundNumber'])['Position'].mean()
team_form = team_round_mean.groupby(level='TeamName').transform(
    lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
)

# Look the per-(team, round) value up for every row; rows whose key is not
# present in team_form receive NaN.
team_keys = pd.MultiIndex.from_frame(df_sorted[['TeamName', 'RoundNumber']])
df_sorted['Team_Last3_AvgFinish'] = team_form.reindex(team_keys).to_numpy()

# Check it worked
print("Sample with new features:")
print(df_sorted[['RoundNumber', 'Abbreviation', 'TeamName', 'Position', 
                 'Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish']].head(20))

In [None]:
# Binary target: 1 when the finishing position is 3 or better, otherwise 0.
# A missing Position compares False and therefore maps to 0.
df_sorted['Podium'] = df_sorted['Position'].le(3.0).astype(int)

podium_count = df_sorted['Podium'].sum()
print(f"Podium column added!")
print(f"Total podiums: {podium_count}")
print(f"Total non-podiums: {len(df_sorted) - podium_count}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build the modelling table from the feature-enriched results.
# Rows without form history (NaN in either rolling average) are removed.
form_columns = ['Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish']
model_df = df_sorted.dropna(subset=form_columns).copy()

n_podiums = model_df['Podium'].sum()
print(f"Dataset after removing NaN: {len(model_df)} samples")
print(f"Podiums: {n_podiums}, Non-podiums: {len(model_df) - n_podiums}")

# Integer-encode the two categorical columns. The fitted encoders stay in
# scope so that later cells can reuse the same mapping.
team_encoder = LabelEncoder()
driver_encoder = LabelEncoder()
model_df['TeamName_encoded'] = team_encoder.fit_transform(model_df['TeamName'])
model_df['Driver_encoded'] = driver_encoder.fit_transform(model_df['Abbreviation'])

# Numeric features first, encoded categoricals last.
features = [
    'GridPosition',
    'Driver_Last3_AvgFinish',
    'Team_Last3_AvgFinish',
    'TeamName_encoded',
    'Driver_encoded',
]

# Design matrix and target.
X = model_df[features]
y = model_df['Podium']

print(f"\nFeatures: {features}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Chronological hold-out: rounds 1-15 train the model, rounds 16-18 test it.
# A time-ordered split mirrors real use, where only past races are known.
train_mask = model_df['RoundNumber'].le(15)
test_mask = model_df['RoundNumber'].gt(15)

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

print(f"Training set: {len(X_train)} samples ({y_train.sum()} podiums)")
print(f"Test set: {len(X_test)} samples ({y_test.sum()} podiums)")

# Random forest; 'balanced' class weights offset the scarcity of podiums.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

print("\n✓ Model trained!")

# Score the held-out rounds.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

separator = '=' * 60
print(f"\n{separator}")
print(f"MODEL PERFORMANCE")
print(f"{separator}")
print(f"Accuracy: {accuracy*100:.1f}%\n")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Podium', 'Podium']))

# Rank the features by the forest's importance scores, highest first.
feature_importance = (
    pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

print("\nFeature Importance:")
print(feature_importance)

In [None]:
# Collect qualifying ('Q') results for rounds 1-18, one frame per round.
print("Loading qualifying sessions...")

quali_data = []

for round_num in range(1, 19):
    # Resolve the event name for this round from the schedule.
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]

    try:
        print(f"Loading quali for Round {round_num}: {race_name}...")

        quali = fastf1.get_session(2025, race_name, 'Q')
        quali.load()

        # Keep the classification and segment times; rename Position so it
        # cannot be confused with the race finishing position.
        quali_results = (
            quali.results[['Abbreviation', 'Position', 'Q3', 'Q2', 'Q1']]
            .rename(columns={'Position': 'Quali_Position'})
            .assign(RoundNumber=round_num, RaceName=race_name)
        )

        quali_data.append(quali_results)
        print(f"  ✓ Loaded")

    except Exception as e:
        # Best effort: report and move on to the next round.
        print(f"  ✗ Failed: {e}")

# Stack every round into one qualifying table.
df_quali = pd.concat(quali_data, ignore_index=True)

print(f"\n✓ Loaded qualifying data for {len(df_quali)} entries")
print(f"\nSample:")
print(df_quali[['RoundNumber', 'RaceName', 'Abbreviation', 'Quali_Position']].head(10))

In [None]:
# Attach each driver's qualifying position to the race rows.
# A Quali_Position column left over from a previous run of this cell is
# dropped first; otherwise the merge would produce Quali_Position_x/_y and
# the feature code below would raise KeyError.
df_sorted = (
    df_sorted
    .drop(columns=['Quali_Position'], errors='ignore')
    .merge(
        df_quali[['RoundNumber', 'Abbreviation', 'Quali_Position']],
        on=['RoundNumber', 'Abbreviation'],
        how='left'
    )
)

# Feature 1: Grid Penalty (boolean)
# NaN != value evaluates True, so a driver with no qualifying record would be
# flagged as penalised. Only flag a penalty when both positions are known.
both_known = df_sorted['GridPosition'].notna() & df_sorted['Quali_Position'].notna()
positions_differ = df_sorted['GridPosition'] != df_sorted['Quali_Position']
df_sorted['Grid_Penalty'] = (both_known & positions_differ).astype(int)

# Feature 2: Penalty Size (how many places); NaN when either position is missing
df_sorted['Penalty_Places'] = df_sorted['GridPosition'] - df_sorted['Quali_Position']

# Feature 3: Gap to pole position (qualifying position used as a proxy for now)
df_sorted['Gap_To_Pole'] = df_sorted['Quali_Position'] - 1

# Check it worked
print("Qualifying features added!")
print(f"\nDrivers with grid penalties: {df_sorted['Grid_Penalty'].sum()}")
print(f"\nSample with new features:")
print(df_sorted[['RoundNumber', 'Abbreviation', 'Quali_Position', 'GridPosition', 
                 'Grid_Penalty', 'Penalty_Places']].head(20))

In [None]:
# Collect one weather snapshot per race for rounds 1-18.
print("Loading weather data...")

weather_data = []

for round_num in range(1, 19):
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]

    try:
        print(f"Loading weather for Round {round_num}: {race_name}...")

        # Weather samples are attached to the race session.
        session = fastf1.get_session(2025, race_name, 'R')
        session.load()

        weather = session.weather_data
        if len(weather) == 0:
            # No readings for this race: nothing to record.
            continue

        # The first reading in the session stands in for race-start conditions.
        race_start_weather = weather.iloc[0]

        record = {'RoundNumber': round_num, 'RaceName': race_name}
        for field in ('AirTemp', 'TrackTemp', 'Humidity', 'Rainfall'):
            record[field] = race_start_weather[field]

        weather_data.append(record)
        print(f"  ✓ Loaded")

    except Exception as e:
        # Best effort: report and move on to the next round.
        print(f"  ✗ Failed: {e}")

# One row per race with weather available.
df_weather = pd.DataFrame(weather_data)

print(f"\n✓ Loaded weather for {len(df_weather)} races")
print(f"\nSample:")
print(df_weather.head(10))

In [None]:
# Attach per-race weather to every result row.
# Weather columns left over from a previous run of this cell are dropped
# first: otherwise the merge would emit AirTemp_x/AirTemp_y-style duplicates
# and the 'TrackTemp' lookup at the end of the cell would raise KeyError.
weather_cols = ['AirTemp', 'TrackTemp', 'Humidity', 'Rainfall']
df_sorted = (
    df_sorted
    .drop(columns=weather_cols, errors='ignore')
    .merge(
        df_weather[['RoundNumber'] + weather_cols],
        on='RoundNumber',
        how='left'
    )
)

print("✓ Weather data merged")

# Qualifying gap to the teammate.
# Team membership comes from the race results, joined per (round, driver).
df_quali_with_team = df_quali.merge(
    df_sorted[['RoundNumber', 'Abbreviation', 'TeamName']].drop_duplicates(),
    on=['RoundNumber', 'Abbreviation'],
    how='left'
)

# For each qualifying entry, find the other driver of the same team in the
# same round and take the difference in qualifying position.
teammate_quali = []

for idx, row in df_quali_with_team.iterrows():
    teammate = df_quali_with_team[
        (df_quali_with_team['RoundNumber'] == row['RoundNumber']) &
        (df_quali_with_team['TeamName'] == row['TeamName']) &
        (df_quali_with_team['Abbreviation'] != row['Abbreviation'])
    ]

    if len(teammate) > 0:
        # Negative gap = this driver qualified ahead of the teammate.
        teammate_pos = teammate.iloc[0]['Quali_Position']
        gap = row['Quali_Position'] - teammate_pos
    else:
        # No teammate row found (also the case when TeamName is missing,
        # because NaN never compares equal): fall back to a neutral 0.
        gap = 0

    teammate_quali.append({
        'RoundNumber': row['RoundNumber'],
        'Abbreviation': row['Abbreviation'],
        'Gap_To_Teammate_Quali': gap,
        'Beat_Teammate': 1 if gap < 0 else 0
    })

df_teammate = pd.DataFrame(teammate_quali)

# Merge with the main dataframe; as above, drop stale copies first so that a
# re-run does not create _x/_y duplicates.
teammate_cols = ['Gap_To_Teammate_Quali', 'Beat_Teammate']
df_sorted = (
    df_sorted
    .drop(columns=teammate_cols, errors='ignore')
    .merge(
        df_teammate[['RoundNumber', 'Abbreviation'] + teammate_cols],
        on=['RoundNumber', 'Abbreviation'],
        how='left'
    )
)

print("✓ Teammate gap calculated")
print(f"\nNew features summary:")
print(f"Total features now: {len(df_sorted.columns)}")
print(f"\nSample with all new features:")
print(df_sorted[['RoundNumber', 'Abbreviation', 'TeamName', 'Quali_Position', 
                 'Gap_To_Teammate_Quali', 'Beat_Teammate', 'TrackTemp', 'Rainfall']].head(20))

In [None]:
# Rebuild the modelling table, this time with qualifying, teammate and
# weather features. Rows missing any of the required columns are removed.
required_columns = ['Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish',
                    'Gap_To_Teammate_Quali', 'TrackTemp']
model_df = df_sorted.dropna(subset=required_columns).copy()

n_podiums = model_df['Podium'].sum()
print(f"Dataset after removing NaN: {len(model_df)} samples")
print(f"Podiums: {n_podiums}, Non-podiums: {len(model_df) - n_podiums}")

# Fresh encoders fitted on the filtered table.
team_encoder = LabelEncoder()
driver_encoder = LabelEncoder()
model_df['TeamName_encoded'] = team_encoder.fit_transform(model_df['TeamName'])
model_df['Driver_encoded'] = driver_encoder.fit_transform(model_df['Abbreviation'])

# Full feature set: grid/qualifying, form, teammate, weather, then the
# encoded categoricals.
features = [
    'GridPosition',
    'Quali_Position',
    'Grid_Penalty',
    'Penalty_Places',
    'Gap_To_Pole',
    'Driver_Last3_AvgFinish',
    'Team_Last3_AvgFinish',
    'Gap_To_Teammate_Quali',
    'Beat_Teammate',
    'AirTemp',
    'TrackTemp',
    'Humidity',
    'Rainfall',
    'TeamName_encoded',
    'Driver_encoded',
]

# Design matrix and target.
X = model_df[features]
y = model_df['Podium']

print(f"\nTotal features: {len(features)}")
print(f"Features: {features}")
print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Chronological hold-out: rounds 1-15 for training, rounds 16-18 for testing.
train_mask = model_df['RoundNumber'].le(15)
test_mask = model_df['RoundNumber'].gt(15)

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

print(f"Training set: {len(X_train)} samples ({y_train.sum()} podiums)")
print(f"Test set: {len(X_test)} samples ({y_test.sum()} podiums)")

# Same forest configuration as before, now fitted on the extended features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
print("\n✓ Model trained!")

# Score the held-out rounds.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# NOTE: the 90.0% baseline below is a hard-coded figure, not read from the
# earlier model object.
separator = '=' * 60
print(f"\n{separator}")
print(f"MODEL PERFORMANCE (With New Features)")
print(f"{separator}")
print(f"Accuracy: {accuracy*100:.1f}%")
print(f"Previous accuracy (5 features): 90.0%")
print(f"Improvement: {(accuracy - 0.90)*100:+.1f}%\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Podium', 'Podium']))

# Rank the features by the forest's importance scores, highest first.
feature_importance = (
    pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
from xgboost import XGBClassifier

# Fit a gradient-boosted model on the same split for comparison.
print("Training XGBoost model...")

# Weight the positive class by the negative/positive ratio of the training
# labels to compensate for the class imbalance.
n_positive = y_train.sum()
n_negative = len(y_train) - n_positive

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    scale_pos_weight=n_negative / n_positive,
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)
print("✓ XGBoost model trained!")

# Score the held-out rounds.
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

separator = '=' * 60
print(f"\n{separator}")
print(f"XGBoost PERFORMANCE")
print(f"{separator}")
print(f"XGBoost Accuracy: {accuracy_xgb*100:.1f}%")
print(f"Random Forest Accuracy: {accuracy*100:.1f}%")
print(f"Improvement: {(accuracy_xgb - accuracy)*100:+.1f}%\n")

print("Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['No Podium', 'Podium']))

# Rank the features by the booster's importance scores, highest first.
feature_importance_xgb = (
    pd.DataFrame({'Feature': features, 'Importance': xgb_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

print("\nTop 10 Most Important Features (XGBoost):")
print(feature_importance_xgb.head(10))

# Side-by-side view of both models' predictions on the test rows.
comparison = pd.DataFrame({
    'Actual': y_test.values,
    'RF_Pred': y_pred,
    'XGB_Pred': y_pred_xgb,
})
comparison['RF_Correct'] = comparison['Actual'].eq(comparison['RF_Pred']).astype(int)
comparison['XGB_Correct'] = comparison['Actual'].eq(comparison['XGB_Pred']).astype(int)

n_rows = len(comparison)
print(f"\nPrediction Comparison:")
print(f"Random Forest correct: {comparison['RF_Correct'].sum()}/{n_rows}")
print(f"XGBoost correct: {comparison['XGB_Correct'].sum()}/{n_rows}")

In [None]:
# Rounds after 18 are the ones not used for training/testing above.
after_round_18 = races['RoundNumber'] > 18
remaining_races = races[after_round_18]

print("Remaining races in 2025:")
print(remaining_races[['RoundNumber', 'EventName', 'Country', 'EventDate']])

In [None]:
# Out-of-sample validation: load race results for rounds 19-21.
print("Loading races 19-21 for validation...")

validation_races = []

for round_num in [19, 20, 21]:
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]

    try:
        print(f"Loading Round {round_num}: {race_name}...")
        session = fastf1.get_session(2025, race_name, 'R')
        session.load()

        # Tag each driver row with the event it belongs to.
        results = session.results.copy()
        results['RaceName'] = race_name
        results['RoundNumber'] = round_num

        validation_races.append(results)
        print(f"  ✓ Loaded")

    except Exception as e:
        # Best effort: report and move on to the next round.
        print(f"  ✗ Failed: {e}")

# df_validation is only created when at least one race loaded.
if not validation_races:
    print("\nNo validation races loaded")
else:
    df_validation = pd.concat(validation_races, ignore_index=True)
    print(f"\n✓ Loaded {len(df_validation)} results from {len(validation_races)} races")
    print(f"\nSample:")
    print(df_validation[['RoundNumber', 'RaceName', 'Abbreviation', 'GridPosition', 'Position']].head(10))

In [None]:
# Give the validation rows (rounds 19-21) the same form features as the
# training data.

# Target column for the validation races.
df_validation['Podium'] = (df_validation['Position'] <= 3.0).astype(int)

# Form features for rounds 19-21 need the history from rounds 1-18, so stack
# both tables and order each driver's rows chronologically.
df_all = pd.concat([df_sorted, df_validation], ignore_index=True).sort_values(['Abbreviation', 'RoundNumber'])

# Driver form: mean finish over the driver's previous (up to) 3 races.
df_all['Driver_Last3_AvgFinish'] = (
    df_all.groupby('Abbreviation')['Position']
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean().shift(1))
)

# Team form: mean finish of the team's cars over its previous (up to) 3 races.
# The frame is ordered by driver, not by round, so rolling directly over the
# team's rows would mix one driver's late rounds into another driver's early
# rounds. Collapse to one value per (team, round) first; groupby sorts those
# keys, so the window below runs in round order.
team_round_mean = df_all.groupby(['TeamName', 'RoundNumber'])['Position'].mean()
team_form = team_round_mean.groupby(level='TeamName').transform(
    lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
)
team_keys = pd.MultiIndex.from_frame(df_all[['TeamName', 'RoundNumber']])
df_all['Team_Last3_AvgFinish'] = team_form.reindex(team_keys).to_numpy()

# Filter to just validation races (19-21)
df_val_with_features = df_all[df_all['RoundNumber'].isin([19, 20, 21])].copy()

print(f"Validation data with form features: {len(df_val_with_features)} samples")
print(f"\nSample:")
print(df_val_with_features[['RoundNumber', 'Abbreviation', 'Position', 'Driver_Last3_AvgFinish']].head(10))

In [None]:
# Qualifying positions for the validation rounds (19-21).
print("Loading qualifying data for validation races...")

val_quali_data = []

for round_num in [19, 20, 21]:
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]

    try:
        print(f"Loading quali for Round {round_num}: {race_name}...")
        quali = fastf1.get_session(2025, race_name, 'Q')
        quali.load()

        # Rename Position so it cannot be confused with the race result.
        quali_results = (
            quali.results[['Abbreviation', 'Position']]
            .rename(columns={'Position': 'Quali_Position'})
            .assign(RoundNumber=round_num)
        )

        val_quali_data.append(quali_results)
        print(f"  ✓ Loaded")

    except Exception as e:
        # Best effort: report and move on to the next round.
        print(f"  ✗ Failed: {e}")

df_val_quali = pd.concat(val_quali_data, ignore_index=True)

# Weather snapshot for the validation rounds (19-21).
print("\nLoading weather data for validation races...")

val_weather_data = []

for round_num in [19, 20, 21]:
    race_name = races.loc[races['RoundNumber'] == round_num, 'EventName'].values[0]

    try:
        print(f"Loading weather for Round {round_num}: {race_name}...")
        session = fastf1.get_session(2025, race_name, 'R')
        session.load()

        weather = session.weather_data
        if len(weather) == 0:
            # No readings for this race: nothing to record.
            continue

        # The first reading in the session is used as the race's weather.
        race_start_weather = weather.iloc[0]

        record = {'RoundNumber': round_num}
        for field in ('AirTemp', 'TrackTemp', 'Humidity', 'Rainfall'):
            record[field] = race_start_weather[field]

        val_weather_data.append(record)
        print(f"  ✓ Loaded")

    except Exception as e:
        # Best effort: report and move on to the next round.
        print(f"  ✗ Failed: {e}")

df_val_weather = pd.DataFrame(val_weather_data)

print(f"\n✓ Quali data: {len(df_val_quali)} entries")
print(f"✓ Weather data: {len(df_val_weather)} races")

In [None]:
# Build the full feature set for the validation rows and score them with the
# random forest.

# The validation rows inherited empty quali/weather columns from the stacked
# table; remove them so the merges below do not create _x/_y duplicates.
cols_to_drop = ['Quali_Position', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall', 
                'Grid_Penalty', 'Penalty_Places', 'Gap_To_Pole']
df_val_with_features = df_val_with_features.drop(columns=cols_to_drop, errors='ignore')

# Now merge quali data
df_val_with_features = df_val_with_features.merge(
    df_val_quali[['RoundNumber', 'Abbreviation', 'Quali_Position']],
    on=['RoundNumber', 'Abbreviation'],
    how='left'
)

# Merge weather data
df_val_with_features = df_val_with_features.merge(
    df_val_weather[['RoundNumber', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall']],
    on='RoundNumber',
    how='left'
)

# Calculate penalty features
df_val_with_features['Grid_Penalty'] = (df_val_with_features['GridPosition'] != df_val_with_features['Quali_Position']).astype(int)
df_val_with_features['Penalty_Places'] = df_val_with_features['GridPosition'] - df_val_with_features['Quali_Position']
df_val_with_features['Gap_To_Pole'] = df_val_with_features['Quali_Position'] - 1

# Teammate features. The model was trained on real teammate gaps, so feeding
# constant zeros here would present it with inputs unlike anything it was
# fitted on. Use the same rule as in training: the teammate is the other
# driver of the same team in the same round; with no teammate the gap is 0.
# (Teammates are looked up among the race rows of this table.)
teammate_gaps = []
for _, row in df_val_with_features.iterrows():
    teammate = df_val_with_features[
        (df_val_with_features['RoundNumber'] == row['RoundNumber']) &
        (df_val_with_features['TeamName'] == row['TeamName']) &
        (df_val_with_features['Abbreviation'] != row['Abbreviation'])
    ]
    if len(teammate) > 0:
        teammate_gaps.append(row['Quali_Position'] - teammate.iloc[0]['Quali_Position'])
    else:
        teammate_gaps.append(0)

# The merges above leave a fresh 0..n-1 index, so a positional list lines up.
df_val_with_features['Gap_To_Teammate_Quali'] = teammate_gaps
# A NaN gap compares False and therefore maps to 0, as in training.
df_val_with_features['Beat_Teammate'] = (df_val_with_features['Gap_To_Teammate_Quali'] < 0).astype(int)

# Encode with the encoders fitted on the training table. transform() raises
# for a team or driver that was not present when the encoder was fitted.
df_val_with_features['TeamName_encoded'] = team_encoder.transform(df_val_with_features['TeamName'])
df_val_with_features['Driver_encoded'] = driver_encoder.transform(df_val_with_features['Abbreviation'])

# Prepare features; any remaining gaps are filled with 0.
X_val = df_val_with_features[features].fillna(0)
y_val = df_val_with_features['Podium']

print(f"✓ Validation set prepared: {len(X_val)} samples")
print(f"Actual podiums: {y_val.sum()}")

# Predict: hard labels plus the probability of the podium class.
y_val_pred = rf_model.predict(X_val)
y_val_prob = rf_model.predict_proba(X_val)[:, 1]

print(f"Predicted podiums: {y_val_pred.sum()}")

In [None]:
# Better prediction: instead of thresholding each driver independently, pick
# the three drivers with the highest podium probability in every race.

# Table of validation rows with the predicted podium probability. It is built
# here because no other cell defines `results_comparison`; without this the
# loop below fails with NameError on a fresh kernel.
results_comparison = df_val_with_features[['RoundNumber', 'RaceName', 'Abbreviation',
                                           'TeamName', 'GridPosition', 'Position', 'Podium']].copy()
results_comparison['Podium_Probability'] = y_val_prob

print("IMPROVED PREDICTIONS (Top 3 per race):")
print("="*60)

correct_predictions = 0
total_podiums = 0

for round_num in [19, 20, 21]:
    race_data = results_comparison[results_comparison['RoundNumber'] == round_num].copy()
    race_name = race_data['RaceName'].iloc[0]
    
    # Sort by podium probability and pick top 3
    top3_predicted = race_data.nlargest(3, 'Podium_Probability')
    actual_podium = race_data[race_data['Podium'] == 1]
    
    print(f"\nRound {round_num}: {race_name}")
    print("-" * 60)
    
    print("Predicted Podium (Top 3 by probability):")
    for i, row in top3_predicted.iterrows():
        actual_pos = row['Position']
        was_correct = "✓" if row['Podium'] == 1 else "✗"
        print(f"  {was_correct} {row['Abbreviation']:3s} ({row['TeamName']:20s}) - {row['Podium_Probability']*100:.1f}% prob, actual P{int(actual_pos)}")
    
    print("\nActual Podium:")
    for i, row in actual_podium.iterrows():
        print(f"  P{int(row['Position'])}: {row['Abbreviation']} ({row['TeamName']}) from P{int(row['GridPosition'])}")
    
    # Count correct predictions: drivers present in both sets
    predicted_drivers = set(top3_predicted['Abbreviation'])
    actual_drivers = set(actual_podium['Abbreviation'])
    correct = len(predicted_drivers & actual_drivers)
    
    correct_predictions += correct
    total_podiums += 3
    
    print(f"\nCorrect: {correct}/3")

print("\n" + "="*60)
print(f"OVERALL: {correct_predictions}/{total_podiums} podiums predicted correctly ({correct_predictions/total_podiums*100:.1f}%)")

In [None]:
# Replace the rolling average with an exponentially weighted mean, which puts
# more weight on recent races. The column names are kept so that downstream
# cells continue to work.

# Driver form: EWM of the driver's finishes, shifted so the current race is
# excluded. Relies on each driver's rows being in round order.
df_sorted['Driver_Last3_AvgFinish'] = (
    df_sorted.groupby('Abbreviation')['Position']
    .transform(lambda x: x.ewm(span=3, adjust=False).mean().shift(1))
)

# Team form: the frame is ordered by driver, not by round, so an EWM over the
# team's rows would run through one driver's season before the other's and
# leak late-season results into early rounds. Collapse to one value per
# (team, round) first; groupby sorts those keys, so the EWM runs in round
# order.
team_round_mean = df_sorted.groupby(['TeamName', 'RoundNumber'])['Position'].mean()
team_form = team_round_mean.groupby(level='TeamName').transform(
    lambda x: x.ewm(span=3, adjust=False).mean().shift(1)
)
team_keys = pd.MultiIndex.from_frame(df_sorted[['TeamName', 'RoundNumber']])
df_sorted['Team_Last3_AvgFinish'] = team_form.reindex(team_keys).to_numpy()

print("✓ Exponential weighting applied to form features")
print("\nExample - Verstappen's form over season:")
ver_sample = df_sorted[df_sorted['Abbreviation'] == 'VER'][['RoundNumber', 'Position', 'Driver_Last3_AvgFinish']].head(15)
print(ver_sample)

In [None]:
# Refit the random forest on the exponentially weighted form features.
print("Retraining model with exponentially weighted form features...")

# Same filtering as before: rows missing a required column are removed.
required_columns = ['Driver_Last3_AvgFinish', 'Team_Last3_AvgFinish',
                    'Gap_To_Teammate_Quali', 'TrackTemp']
model_df = df_sorted.dropna(subset=required_columns).copy()

print(f"Dataset: {len(model_df)} samples")

# Fresh encoders fitted on the filtered table.
team_encoder = LabelEncoder()
driver_encoder = LabelEncoder()
model_df['TeamName_encoded'] = team_encoder.fit_transform(model_df['TeamName'])
model_df['Driver_encoded'] = driver_encoder.fit_transform(model_df['Abbreviation'])

# Same feature list as before; the two form columns now hold EWM values.
features = [
    'GridPosition',
    'Quali_Position',
    'Grid_Penalty',
    'Penalty_Places',
    'Gap_To_Pole',
    'Driver_Last3_AvgFinish',
    'Team_Last3_AvgFinish',
    'Gap_To_Teammate_Quali',
    'Beat_Teammate',
    'AirTemp',
    'TrackTemp',
    'Humidity',
    'Rainfall',
    'TeamName_encoded',
    'Driver_encoded',
]

# Design matrix and target.
X = model_df[features]
y = model_df['Podium']

# Chronological hold-out: rounds 1-15 train, rounds 16-18 test.
train_mask = model_df['RoundNumber'].le(15)
test_mask = model_df['RoundNumber'].gt(15)

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

# Same forest configuration, stored under a new name so the earlier model
# remains available.
rf_model_ewm = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model_ewm.fit(X_train, y_train)
print("✓ Model retrained with exponential weighting!")

# Evaluate on the test set (races 16-18).
y_pred = rf_model_ewm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# NOTE: the 90.0% reference below is a hard-coded figure.
separator = '=' * 60
print(f"\n{separator}")
print(f"TEST SET PERFORMANCE (Races 16-18)")
print(f"{separator}")
print(f"New accuracy (exponential weighting): {accuracy*100:.1f}%")
print(f"Previous accuracy (simple average): 90.0%")
print(f"Change: {(accuracy - 0.90)*100:+.1f}%")

In [None]:
# Score the EWM model on the validation races (19-21).

# Form features for rounds 19-21 need the history from rounds 1-18, so stack
# both tables and order each driver's rows chronologically.
df_all_ewm = pd.concat([df_sorted, df_validation], ignore_index=True).sort_values(['Abbreviation', 'RoundNumber'])

# Driver form: EWM of past finishes (shift(1) excludes the current race).
df_all_ewm['Driver_Last3_AvgFinish'] = (
    df_all_ewm.groupby('Abbreviation')['Position']
    .transform(lambda x: x.ewm(span=3, adjust=False).mean().shift(1))
)

# Team form: the frame is ordered by driver, not by round, so an EWM over the
# team's rows would mix one driver's late rounds into another driver's early
# rounds. Collapse to one value per (team, round) first; groupby sorts those
# keys, so the EWM runs in round order.
team_round_mean = df_all_ewm.groupby(['TeamName', 'RoundNumber'])['Position'].mean()
team_form = team_round_mean.groupby(level='TeamName').transform(
    lambda x: x.ewm(span=3, adjust=False).mean().shift(1)
)
team_keys = pd.MultiIndex.from_frame(df_all_ewm[['TeamName', 'RoundNumber']])
df_all_ewm['Team_Last3_AvgFinish'] = team_form.reindex(team_keys).to_numpy()

# Filter to validation races
df_val_ewm = df_all_ewm[df_all_ewm['RoundNumber'].isin([19, 20, 21])].copy()

# The validation rows inherited empty quali/weather/teammate columns from the
# stacked table; remove them so the merges below do not create _x/_y
# duplicates.
cols_to_drop = ['Quali_Position', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall', 
                'Grid_Penalty', 'Penalty_Places', 'Gap_To_Pole', 'Gap_To_Teammate_Quali', 'Beat_Teammate']
df_val_ewm = df_val_ewm.drop(columns=cols_to_drop, errors='ignore')

# Now merge quali and weather
df_val_ewm = df_val_ewm.merge(
    df_val_quali[['RoundNumber', 'Abbreviation', 'Quali_Position']],
    on=['RoundNumber', 'Abbreviation'],
    how='left'
)

df_val_ewm = df_val_ewm.merge(
    df_val_weather[['RoundNumber', 'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall']],
    on='RoundNumber',
    how='left'
)

# Calculate other features
df_val_ewm['Grid_Penalty'] = (df_val_ewm['GridPosition'] != df_val_ewm['Quali_Position']).astype(int)
df_val_ewm['Penalty_Places'] = df_val_ewm['GridPosition'] - df_val_ewm['Quali_Position']
df_val_ewm['Gap_To_Pole'] = df_val_ewm['Quali_Position'] - 1

# Teammate features. The model was trained on real teammate gaps, so feeding
# constant zeros here would present it with inputs unlike anything it was
# fitted on. Use the same rule as in training: the teammate is the other
# driver of the same team in the same round; with no teammate the gap is 0.
teammate_gaps = []
for _, row in df_val_ewm.iterrows():
    teammate = df_val_ewm[
        (df_val_ewm['RoundNumber'] == row['RoundNumber']) &
        (df_val_ewm['TeamName'] == row['TeamName']) &
        (df_val_ewm['Abbreviation'] != row['Abbreviation'])
    ]
    if len(teammate) > 0:
        teammate_gaps.append(row['Quali_Position'] - teammate.iloc[0]['Quali_Position'])
    else:
        teammate_gaps.append(0)

# The merges above leave a fresh 0..n-1 index, so a positional list lines up.
df_val_ewm['Gap_To_Teammate_Quali'] = teammate_gaps
# A NaN gap compares False and therefore maps to 0, as in training.
df_val_ewm['Beat_Teammate'] = (df_val_ewm['Gap_To_Teammate_Quali'] < 0).astype(int)

# Encode with the encoders fitted for the EWM model. transform() raises for a
# team or driver that was not present when the encoder was fitted.
df_val_ewm['TeamName_encoded'] = team_encoder.transform(df_val_ewm['TeamName'])
df_val_ewm['Driver_encoded'] = driver_encoder.transform(df_val_ewm['Abbreviation'])

# Prepare features; any remaining gaps are filled with 0.
X_val_ewm = df_val_ewm[features].fillna(0)
y_val_ewm = df_val_ewm['Podium']

# Predict with new model
y_val_pred_ewm = rf_model_ewm.predict(X_val_ewm)
y_val_prob_ewm = rf_model_ewm.predict_proba(X_val_ewm)[:, 1]

val_accuracy_ewm = accuracy_score(y_val_ewm, y_val_pred_ewm)

# NOTE: the "previous" figures printed below (88.3%, 6/9) are hard-coded
# values, not recomputed here.
print(f"{'='*60}")
print(f"VALIDATION (Races 19-21) - Exponential Weighting")
print(f"{'='*60}")
print(f"New accuracy: {val_accuracy_ewm*100:.1f}%")
print(f"Previous accuracy: 88.3%")
print(f"Improvement: {(val_accuracy_ewm - 0.883)*100:+.1f}%")

# Race-by-race predictions
results_comparison_ewm = df_val_ewm[['RoundNumber', 'RaceName', 'Abbreviation', 
                                      'TeamName', 'GridPosition', 'Position', 'Podium']].copy()
results_comparison_ewm['Podium_Probability'] = y_val_prob_ewm

correct_predictions = 0
for round_num in [19, 20, 21]:
    race_data = results_comparison_ewm[results_comparison_ewm['RoundNumber'] == round_num].copy()
    race_name = race_data['RaceName'].iloc[0]
    
    # Predicted podium = the three highest podium probabilities in the race.
    top3_predicted = race_data.nlargest(3, 'Podium_Probability')
    actual_podium = race_data[race_data['Podium'] == 1]
    
    predicted_drivers = set(top3_predicted['Abbreviation'])
    actual_drivers = set(actual_podium['Abbreviation'])
    correct = len(predicted_drivers & actual_drivers)
    correct_predictions += correct
    
    print(f"\nRound {round_num}: {race_name} - {correct}/3 correct")

print(f"\n{'='*60}")
print(f"OVERALL: {correct_predictions}/9 podiums correct ({correct_predictions/9*100:.1f}%)")
print(f"Previous: 6/9 (66.7%)")