In [None]:
from pathlib import Path

import fastf1
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# FastF1 refuses to enable caching on a directory that does not exist, so
# create it up front. This keeps "Restart Kernel -> Run All" working on a
# fresh checkout where ../data/cache has not been created yet.
CACHE_DIR = Path('../data/cache')
CACHE_DIR.mkdir(parents=True, exist_ok=True)
fastf1.Cache.enable_cache(str(CACHE_DIR))

print("Setup complete!")

In [None]:
# Load 2024 season schedule.
# The schedule contains pre-season testing events unless told otherwise;
# exclude them so that the row count below really is the number of races.
season = fastf1.get_event_schedule(2024, include_testing=False)
print(f"2024 has {len(season)} races")
print("\nFirst 10 races:")
print(season[['RoundNumber', 'EventName', 'Location', 'Country']].head(10))

In [None]:
# Fetch the Bahrain 2024 race session (the season opener)
print("Loading Bahrain 2024 race data...")
race = fastf1.get_session(2024, 'Bahrain', 'R')  # session identifier 'R' = Race
race.load()  # slow on the first run (roughly 30 seconds)

# The first row of the results table is reported as the winner
opener_event = race.event
race_winner = race.results.iloc[0]

print(f"\n✓ Loaded: {opener_event['EventName']}")
print(f"Date: {opener_event['EventDate']}")
print(f"Winner: {race_winner['Abbreviation']} ({race_winner['TeamName']})")

In [None]:
# Keep a handle on the complete classification of the loaded race;
# the plotting cell below reads it through this name.
results = race.results

# Print the first ten rows, restricted to the columns of interest
top10_columns = ['Position', 'Abbreviation', 'TeamName', 'GridPosition', 'Points']
print("Top 10 finishers:\n")
print(results[top10_columns].head(10))

In [None]:
# Slope chart: one line per driver joining the grid slot (y=0) to the
# finishing position (y=1), for the first 15 rows of the results table.
fig, ax = plt.subplots(figsize=(10, 6))

top15 = results.head(15)
for grid_pos, finish_pos, driver in zip(
    top15['GridPosition'], top15['Position'], top15['Abbreviation']
):
    ax.plot([grid_pos, finish_pos], [0, 1], marker='o', label=driver)

ax.set_yticks([0, 1])
ax.set_yticklabels(['Qualifying', 'Race Finish'])
ax.set_xlabel('Position')
ax.set_title('Bahrain 2024: Grid Position → Race Finish')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend sits outside the axes
ax.grid(True, alpha=0.3)
ax.invert_xaxis()  # Position 1 on the left
fig.tight_layout()
plt.show()

In [None]:
# Load the first 5 races of 2024 and stack their results into one frame,
# tagging every row with the race it came from.
races_to_load = ['Bahrain', 'Saudi Arabia', 'Australia', 'Japan', 'China']

all_results = []

for race_name in races_to_load:
    print(f"Loading {race_name}...")
    session = fastf1.get_session(2024, race_name, 'R')
    session.load()

    # Deliberately NOT named `results`: that global holds the Bahrain
    # classification used by the earlier cells, and overwriting it here would
    # make re-running the Bahrain plot silently show the last race loaded.
    # `.assign` returns a new frame, so the session's own table is untouched.
    race_results = session.results.assign(RaceName=race_name)
    all_results.append(race_results)

# Combine all results into a single frame with a fresh 0..n-1 index
df = pd.concat(all_results, ignore_index=True)

print(f"\n✓ Loaded {len(df)} driver results across {len(races_to_load)} races")
print(f"\nSample data:")
print(df[['RaceName', 'Abbreviation', 'TeamName', 'GridPosition', 'Position']].head(10))

In [None]:
# Filter for drivers who started P1 (pole position)
pole_sitters = df[df['GridPosition'] == 1.0].copy()

print("Pole Position Analysis:")
print("=" * 50)

for idx, row in pole_sitters.iterrows():
    if row['Position'] == 1.0:
        result = "Won"
    elif pd.isna(row['Position']):
        # int(NaN) raises, so report a missing finishing position explicitly
        result = "No finishing position recorded"
    else:
        result = f"Finished P{int(row['Position'])}"
    print(f"{row['RaceName']:20s} - {row['Abbreviation']} ({row['TeamName']:20s}) - {result}")

# Calculate win rate (share of pole sitters whose finishing position is 1)
pole_wins = int((pole_sitters['Position'] == 1.0).sum())
total_poles = len(pole_sitters)

print("\n" + "=" * 50)
if total_poles:
    win_rate = (pole_wins / total_poles) * 100
    print(f"Pole position win rate: {pole_wins}/{total_poles} = {win_rate:.1f}%")
else:
    # No row has GridPosition == 1; avoid dividing by zero
    win_rate = float('nan')
    print("Pole position win rate: n/a (no pole sitters found)")

In [None]:
# Per-team summary: mean finishing position, number of results, total points.
# Note: 'Races' is the count of non-null Position values for the team, i.e. it
# counts driver results rather than distinct race weekends.
team_performance = (
    df.groupby('TeamName')
    .agg(
        Avg_Finish=('Position', 'mean'),
        Races=('Position', 'count'),
        Total_Points=('Points', 'sum'),
    )
    .round(2)
    .sort_values('Avg_Finish')  # best (lowest) average finish first
)

print("Team Performance (First 5 Races):")
print("=" * 60)
print(team_performance)

In [None]:
# Prepare data for modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build the modelling frame: three raw feature columns plus a binary 'Won'
# flag (1 when the finishing position is 1, otherwise 0). Rows with any
# missing value are discarded.
feature_source_columns = ['GridPosition', 'TeamName', 'Abbreviation']
model_df = (
    df[feature_source_columns]
    .assign(Won=(df['Position'] == 1.0).astype(int))
    .dropna()
)

total_rows = len(model_df)
winner_rows = model_df['Won'].sum()

print(f"Dataset: {total_rows} driver results")
print(f"Winners: {winner_rows}")
print(f"Non-winners: {total_rows - winner_rows}")

# With one winner per race, the 'Won' target is heavily imbalanced

In [None]:
# Switch the target from "won" to "finished on the podium" (position <= 3).
# The flag is aligned to model_df by index; a missing position compares as
# False and therefore becomes 0.
model_df['Podium'] = df['Position'].le(3.0).astype(int)

n_rows = len(model_df)
n_podium = model_df['Podium'].sum()
n_other = n_rows - n_podium

print("Podium Distribution:")
print(f"Podium finishes: {n_podium} ({n_podium/n_rows*100:.1f}%)")
print(f"Non-podium: {n_other} ({n_other/n_rows*100:.1f}%)")

# Preview the first rows of the modelling frame
preview_columns = ['GridPosition', 'TeamName', 'Abbreviation', 'Podium']
print("\nSample data:")
print(model_df[preview_columns].head(15))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix source columns and the podium target
X = model_df[['GridPosition', 'TeamName', 'Abbreviation']].copy()
y = model_df['Podium']

# One encoder per categorical column; both are kept as globals so that the
# same string -> integer mapping can be re-applied to new data later.
team_encoder, driver_encoder = LabelEncoder(), LabelEncoder()

X = X.assign(
    TeamName_encoded=team_encoder.fit_transform(X['TeamName']),
    Driver_encoded=driver_encoder.fit_transform(X['Abbreviation']),
)

# Numeric-only view that is passed to the model
model_feature_columns = ['GridPosition', 'TeamName_encoded', 'Driver_encoded']
X_final = X[model_feature_columns]

print("Features prepared:")
print(X_final.head())
print(f"\nFeature shape: {X_final.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the podium
# share similar in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

train_podiums = y_train.sum()
test_podiums = y_test.sum()

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nTrain podiums: {train_podiums}/{len(y_train)} ({train_podiums/len(y_train)*100:.1f}%)")
print(f"Test podiums: {test_podiums}/{len(y_test)} ({test_podiums/len(y_test)*100:.1f}%)")

# Random forest with class re-weighting to counter the podium/non-podium imbalance
rf_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)

print("\n✓ Model trained!")

In [None]:
# Score the held-out rows
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Model Performance:")
print("=" * 50)
print(f"Accuracy: {accuracy*100:.1f}%")
print("\nDetailed Results:")
class_labels = ['No Podium', 'Podium']  # display names for classes 0 and 1
print(classification_report(y_test, y_pred, target_names=class_labels))

# Feature importances, listed in the same order as the columns of X_final
# (GridPosition, TeamName_encoded, Driver_encoded)
feature_names = ['GridPosition', 'Team', 'Driver']
importances = rf_model.feature_importances_

print("\nFeature Importance:")
for position, name in enumerate(feature_names):
    importance = importances[position]
    print(f"{name:15s}: {importance:.3f} ({importance*100:.1f}%)")

In [None]:
# Hypothetical starting grid for an upcoming race:
# VER on pole, LEC P2, NOR P3, SAI P4, PER P5
grid_rows = [
    (1, 'VER', 'Red Bull Racing'),
    (2, 'LEC', 'Ferrari'),
    (3, 'NOR', 'McLaren'),
    (4, 'SAI', 'Ferrari'),
    (5, 'PER', 'Red Bull Racing'),
]
test_grid = pd.DataFrame(grid_rows, columns=['GridPosition', 'Abbreviation', 'TeamName'])

# Re-use the encoders fitted on the training data so the integer codes match
test_grid = test_grid.assign(
    TeamName_encoded=team_encoder.transform(test_grid['TeamName']),
    Driver_encoded=driver_encoder.transform(test_grid['Abbreviation']),
)

X_predict = test_grid[['GridPosition', 'TeamName_encoded', 'Driver_encoded']]

# Second probability column is taken as the podium class
# (assumes the model saw both classes 0 and 1 during training)
podium_probs = rf_model.predict_proba(X_predict)[:, 1]
test_grid['Podium_Probability'] = podium_probs

print("Raw Predictions:")
print(test_grid[['GridPosition', 'Abbreviation', 'TeamName', 'Podium_Probability']])

In [None]:
# Rank the grid by predicted podium probability and take the three most
# likely drivers as the predicted podium; everyone else is listed after them.
PODIUM_SIZE = 3

# A stable sort makes the ordering of tied probabilities deterministic.
test_grid_sorted = test_grid.sort_values(
    'Podium_Probability', ascending=False, kind='stable'
)

# Slice on PODIUM_SIZE rather than head(3)/tail(2) so that the two groups
# always partition the grid, whatever its length.
predicted_podium = test_grid_sorted.iloc[:PODIUM_SIZE]
predicted_rest = test_grid_sorted.iloc[PODIUM_SIZE:]

print("\n" + "=" * 60)
print("PREDICTED PODIUM (Top 3):")
print("=" * 60)

for idx, (i, row) in enumerate(predicted_podium.iterrows(), 1):
    print(f"P{idx}: {row['Abbreviation']:3s} ({row['TeamName']:20s}) - {row['Podium_Probability']*100:.1f}% confidence")

print("\n" + "=" * 60)
print("Predicted to miss podium:")
print("=" * 60)

# Numbering continues right after the podium places
for idx, (i, row) in enumerate(predicted_rest.iterrows(), PODIUM_SIZE + 1):
    print(f"P{idx}: {row['Abbreviation']:3s} ({row['TeamName']:20s}) - {row['Podium_Probability']*100:.1f}% confidence")