In [3]:
pip install fastf1 pandas numpy scikit-learn matplotlib seaborn joblib requests


Collecting fastf1
  Downloading fastf1-3.6.1-py3-none-any.whl.metadata (4.6 kB)
Collecting rapidfuzz (from fastf1)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting requests-cache>=1.0.0 (from fastf1)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting timple>=0.1.6 (from fastf1)
  Downloading timple-0.1.8-py3-none-any.whl.metadata (2.0 kB)
Collecting websockets<14,>=10.3 (from fastf1)
  Downloading websockets-13.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting cattrs>=22.2 (from requests-cache>=1.0.0->fastf1)
  Downloading cattrs-25.2.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache>=1.0.0->fastf1)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading fastf1-3.6.1-py3-none-any.whl (148 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [11]:
import fastf1
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
import json
from datetime import datetime

# Global plotting configuration, applied once for every figure drawn below:
# the 'seaborn-v0_8' matplotlib style sheet plus seaborn's "husl" colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

class F1RacePredictor:
    """Predict per-driver race pace from qualifying, weather and historical lap data.

    Typical flow: ``load_historical_data`` -> ``create_features`` ->
    ``train_model`` -> ``predict_race`` -> ``visualize_results``.

    Attributes:
        data: Unused placeholder kept for backward compatibility.
        historical_data: Per-driver summary produced by ``load_historical_data``
            (``None`` until that method has been called).
        model: Fitted scikit-learn ``Pipeline`` (imputer -> scaler -> regressor),
            or ``None`` before training/loading.
        feature_columns: Names of the columns the model was trained on.
        feature_importance: DataFrame of feature importances from the last training run.
    """

    # Identifier / qualifying columns that are never fed to the model.
    _NON_FEATURE_COLUMNS = ["Driver", "Team", "QualifyingTime (s)"]
    # Timedelta columns that get converted to float seconds.
    _TIME_COLUMNS = ["LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]

    def __init__(self, cache_dir="f1_cache"):
        """Initialize the F1 race predictor and enable the FastF1 on-disk cache.

        Args:
            cache_dir: Directory used by FastF1 for cached API responses. It is
                created if missing, because ``enable_cache`` requires it to exist.
        """
        import os

        os.makedirs(cache_dir, exist_ok=True)
        fastf1.Cache.enable_cache(cache_dir)
        self.data = None
        self.historical_data = None
        self.model = None
        self.feature_columns = None
        self.feature_importance = None

    @staticmethod
    def _default_weather():
        """Return a fresh copy of the fallback weather values."""
        return {
            "temperature": 26.5,  # Warm September day
            "humidity": 65,
            "rain_probability": 0.1,  # Low chance of rain
            "conditions": "Clear"
        }

    @staticmethod
    def _summarize_laps(laps):
        """Aggregate a lap table into one row of pace statistics per driver.

        Only laps with ``TrackStatus == "1"`` and complete lap/sector times are used.

        Args:
            laps: DataFrame with the FastF1 lap columns used below
                (``Team`` is optional).

        Returns:
            DataFrame with columns ``Driver``, mean lap/sector times in seconds,
            mean ``TyreLife``, max ``Stint``, min ``Position``, ``Consistency``
            (standard deviation of lap time) and ``Team``.
        """
        time_cols = F1RacePredictor._TIME_COLUMNS
        columns = ["Driver", *time_cols, "Compound", "TyreLife", "FreshTyre",
                   "Stint", "TrackStatus", "Position"]
        has_team = "Team" in laps.columns
        if has_team:
            # Keep the team column: it is needed for the driver -> team mapping below.
            columns.append("Team")

        # Only use green flag laps; .copy() so the new columns are not written to a view.
        green = laps.loc[laps["TrackStatus"] == "1", columns]
        green = green.dropna(subset=time_cols).copy()

        # Convert times to seconds
        for col in time_cols:
            green[f"{col} (s)"] = green[col].dt.total_seconds()

        per_driver = green.groupby("Driver")

        avg_times = per_driver.agg({
            "LapTime (s)": "mean",
            "Sector1Time (s)": "mean",
            "Sector2Time (s)": "mean",
            "Sector3Time (s)": "mean",
            "TyreLife": "mean",
            "Stint": "max",
            "Position": "min"  # Best position achieved
        }).reset_index()

        # Consistency = standard deviation of lap times (NaN for a single lap).
        consistency = per_driver["LapTime (s)"].std().reset_index()
        consistency.columns = ["Driver", "Consistency"]

        summary = pd.merge(avg_times, consistency, on="Driver")

        if has_team:
            driver_teams = per_driver["Team"].first()
            summary["Team"] = summary["Driver"].map(driver_teams).fillna("Unknown")
        else:
            summary["Team"] = "Unknown"

        return summary

    def load_historical_data(self, year, race_number, session_type="R"):
        """Load a FastF1 session and store a per-driver pace summary.

        Args:
            year: Season year.
            race_number: Event identifier forwarded to ``fastf1.get_session``.
            session_type: Session identifier forwarded to ``fastf1.get_session``.

        Returns:
            The per-driver summary DataFrame (also stored on ``self.historical_data``).
        """
        print(f"Loading {year} race data...")
        session = fastf1.get_session(year, race_number, session_type)
        session.load()

        self.historical_data = self._summarize_laps(session.laps)

        print(f"Loaded data for {len(self.historical_data)} drivers")
        return self.historical_data

    def get_weather_data(self, lat, lon, race_date, api_key=None):
        """Get weather data from the OpenWeatherMap forecast API, or simulated data.

        Args:
            lat: Latitude of the circuit.
            lon: Longitude of the circuit.
            race_date: Date string; matched as ``"<race_date> 15:00:00"`` against
                the forecast entries' ``dt_txt`` field.
            api_key: OpenWeatherMap API key. When ``None`` no request is made.

        Returns:
            Dict with ``temperature``, ``humidity``, ``rain_probability`` and
            ``conditions``. Fallback values are returned when no key is given,
            the request fails, or no forecast entry matches.
        """
        # If no API key, use simulated data for Monza (typical early September weather)
        if api_key is None:
            print("Using simulated weather data for Monza")
            return self._default_weather()

        race_time = f"{race_date} 15:00:00"
        try:
            # HTTPS so the API key is not sent in clear text; timeout so the
            # notebook cannot hang on an unresponsive server.
            response = requests.get(
                "https://api.openweathermap.org/data/2.5/forecast",
                params={"lat": lat, "lon": lon, "appid": api_key, "units": "metric"},
                timeout=10,
            )
            response.raise_for_status()
            weather_data = response.json()

            for forecast in weather_data["list"]:
                if forecast["dt_txt"] == race_time:
                    return {
                        "temperature": forecast["main"]["temp"],
                        "humidity": forecast["main"]["humidity"],
                        "rain_probability": forecast.get("pop", 0),
                        "conditions": forecast["weather"][0]["main"]
                    }
            print(f"No forecast entry for {race_time}; using default weather values")
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as exc:
            # Best-effort lookup: report the problem instead of hiding it, then fall back.
            print(f"Weather lookup failed ({exc!r}); using default weather values")

        return self._default_weather()

    def create_features(self, qualifying_data, weather_data, team_performance):
        """Create the feature table used for training and prediction.

        Args:
            qualifying_data: DataFrame with at least ``Driver`` and ``Team`` columns.
            weather_data: Dict as returned by ``get_weather_data``.
            team_performance: Mapping of team name -> performance score.

        Returns:
            Copy of ``qualifying_data`` with weather columns, ``TeamPerformance`` and,
            when historical data has been loaded, per-driver sector times and
            ``Consistency`` (left-joined, so unknown drivers get NaN).
        """
        features = qualifying_data.copy()

        # Weather is the same for every driver, so it is broadcast to all rows.
        rain_probability = weather_data["rain_probability"]
        features["Temperature"] = weather_data["temperature"]
        features["Humidity"] = weather_data["humidity"]
        features["RainProbability"] = rain_probability
        features["IsRainy"] = int(rain_probability > 0.5)

        # Teams missing from the mapping become NaN and are imputed later.
        features["TeamPerformance"] = features["Team"].map(team_performance)

        if self.historical_data is not None:
            historical_avgs = self.historical_data.groupby("Driver").agg({
                "Sector1Time (s)": "mean",
                "Sector2Time (s)": "mean",
                "Sector3Time (s)": "mean",
                "Consistency": "mean"
            }).reset_index()

            features = features.merge(historical_avgs, on="Driver", how="left")

        return features

    def train_model(self, features, target, test_size=0.2, random_state=42):
        """Train the prediction model and report hold-out metrics.

        Imputation, scaling and the regressor are fitted together as one
        ``Pipeline`` on the training split only, and the fitted pipeline is
        stored so ``predict_race`` applies exactly the same preprocessing.

        Args:
            features: Feature table from ``create_features``.
            target: One target value per row of ``features``, in the same order.
            test_size: Fraction of rows held out for evaluation.
            random_state: Seed for the split and the regressor.

        Returns:
            Tuple ``(model, mae, r2)`` where ``model`` is the fitted pipeline.

        Raises:
            ValueError: If ``features`` and ``target`` differ in length, the
                target contains NaN, or no usable feature column remains.
        """
        X = features.drop(self._NON_FEATURE_COLUMNS, axis=1, errors="ignore")
        y = np.asarray(target, dtype=float)

        if len(X) != len(y):
            raise ValueError(
                f"features has {len(X)} rows but target has {len(y)} values; "
                "align them (e.g. by driver) before training"
            )
        if np.isnan(y).any():
            raise ValueError("target contains NaN values; drop those rows before training")

        # Split first so preprocessing statistics never see the test rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Columns with no observed value cannot be mean-imputed; the imputer would
        # silently drop them and the importances would no longer line up with names.
        usable = X_train.columns[X_train.notna().any().to_numpy()]
        if len(usable) == 0:
            raise ValueError("no feature column with at least one non-missing value")
        X_train, X_test = X_train[usable], X_test[usable]

        self.model = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler()),
            ("regressor", GradientBoostingRegressor(
                n_estimators=150,
                learning_rate=0.1,
                max_depth=4,
                random_state=random_state
            )),
        ])
        self.model.fit(X_train, y_train)
        self.feature_columns = list(usable)

        # Evaluate model
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print(f"Model Performance:")
        print(f"MAE: {mae:.3f} seconds")
        print(f"R² Score: {r2:.3f}")

        # Store feature importance
        self.feature_importance = pd.DataFrame({
            "feature": self.feature_columns,
            "importance": self.model.named_steps["regressor"].feature_importances_
        }).sort_values("importance", ascending=False)

        return self.model, mae, r2

    def predict_race(self, features):
        """Predict race results for every row of ``features``.

        Args:
            features: Feature table containing ``Driver``, ``Team`` and
                ``QualifyingTime (s)`` plus the columns used during training.

        Returns:
            DataFrame sorted by ``PredictedRaceTime (s)`` with a 1-based
            ``PredictedPosition`` column.

        Raises:
            RuntimeError: If no model has been trained or loaded.
        """
        if self.model is None:
            raise RuntimeError(
                "No trained model available; call train_model() or load_model() first"
            )

        X = features.drop(self._NON_FEATURE_COLUMNS, axis=1, errors="ignore")

        # Present exactly the training columns, in training order. After
        # load_model() the names come from the fitted estimator, if it has them.
        columns = self.feature_columns
        if columns is None:
            columns = getattr(self.model, "feature_names_in_", None)
        if columns is not None:
            X = X.reindex(columns=list(columns))

        # The stored model is expected to be the pipeline built by train_model,
        # which applies its fitted imputer and scaler itself.
        predictions = self.model.predict(X)

        results = features[["Driver", "Team", "QualifyingTime (s)"]].copy()
        results["PredictedRaceTime (s)"] = predictions
        results = results.sort_values("PredictedRaceTime (s)")
        results["PredictedPosition"] = range(1, len(results) + 1)

        return results

    def visualize_results(self, predictions, historical_data=None,
                          output_path="monza_2025_prediction_results.png"):
        """Create a 2x2 figure summarising the predictions, save it and show it.

        Args:
            predictions: DataFrame returned by ``predict_race``.
            historical_data: Accepted for backward compatibility; not used.
            output_path: File the figure is written to.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Plot 1: Predicted race results
        ax = axes[0, 0]
        ax.barh(predictions["Driver"], predictions["PredictedRaceTime (s)"],
                color=plt.cm.viridis(np.linspace(0, 1, len(predictions))))
        ax.set_xlabel("Predicted Race Time (s)")
        ax.set_title("2025 Italian GP - Predicted Race Results")
        ax.invert_yaxis()

        # Plot 2: Qualifying vs Race pace
        ax = axes[0, 1]
        ax.scatter(predictions["QualifyingTime (s)"], predictions["PredictedRaceTime (s)"])
        labelled_points = zip(predictions["Driver"],
                              predictions["QualifyingTime (s)"],
                              predictions["PredictedRaceTime (s)"])
        for driver, quali_time, race_time in labelled_points:
            ax.annotate(driver, (quali_time, race_time),
                        xytext=(5, 5), textcoords='offset points', fontsize=8)
        ax.set_xlabel("Qualifying Time (s)")
        ax.set_ylabel("Predicted Race Time (s)")
        ax.set_title("Qualifying vs Race Pace (Monza 2025)")

        # Plot 3: Feature importance (only available after train_model)
        if self.feature_importance is not None:
            ax = axes[1, 0]
            ax.barh(self.feature_importance["feature"], self.feature_importance["importance"])
            ax.set_xlabel("Importance")
            ax.set_title("Feature Importance in Race Prediction")

        # Plot 4: Team performance comparison
        ax = axes[1, 1]
        team_avg = predictions.groupby("Team")["PredictedRaceTime (s)"].mean().sort_values()
        ax.barh(team_avg.index, team_avg.values)
        ax.set_xlabel("Average Predicted Race Time (s)")
        ax.set_title("Team Performance Comparison (Monza 2025)")

        plt.tight_layout()
        plt.savefig(output_path, dpi=300, bbox_inches="tight")
        plt.show()

    def save_model(self, filepath):
        """Save the trained model (including its preprocessing steps) to file.

        Raises:
            RuntimeError: If there is no model to save.
        """
        if self.model is None:
            raise RuntimeError("No trained model to save; call train_model() first")
        joblib.dump(self.model, filepath)
        print(f"Model saved to {filepath}")

    def load_model(self, filepath):
        """Load a trained model from file.

        NOTE: joblib files are pickles; only load files from a trusted source.
        """
        self.model = joblib.load(filepath)
        # Column names are re-read from the loaded estimator in predict_race.
        self.feature_columns = None
        print(f"Model loaded from {filepath}")

# Example usage for 2025 Italian GP
if __name__ == "__main__":
    # Initialize predictor
    predictor = F1RacePredictor()

    # Load historical data (2024 Italian GP for training).
    # The 2024 Italian GP at Monza was round 16 (round 15 was the Dutch GP).
    historical_data = predictor.load_historical_data(2024, 16, "R")

    # Define team performance scores (hand-picked illustrative values,
    # loosely ordered like the 2024 constructor standings)
    team_performance = {
        "Red Bull": 0.95,
        "Ferrari": 0.90,  # Ferrari typically strong at Monza
        "McLaren": 0.87,
        "Mercedes": 0.85,
        "Aston Martin": 0.78,
        "Alpine": 0.72,
        "Williams": 0.70,  # Williams often performs well at low-downforce tracks
        "RB": 0.65,
        "Kick Sauber": 0.62,
        "Haas": 0.60
    }

    # Simulated qualifying data for 2025 Italian GP
    # Monza is the "Temple of Speed" with very low downforce
    qualifying_data = pd.DataFrame({
        "Driver": ["VER", "LEC", "SAI", "NOR", "PIA", "RUS", "HAM", "ALO", "ALB", "GAS"],
        "Team": ["Red Bull", "Ferrari", "Ferrari", "McLaren", "McLaren",
                "Mercedes", "Mercedes", "Aston Martin", "Williams", "Alpine"],
        "QualifyingTime (s)": [76.2, 76.5, 76.7, 76.8, 77.0, 77.1, 77.3, 77.5, 77.6, 77.8]
    })

    # Get weather data for Monza, Italy (September 7, 2025)
    weather_data = predictor.get_weather_data(45.6156, 9.2814, "2025-09-07")

    # Create features for prediction
    features = predictor.create_features(qualifying_data, weather_data, team_performance)

    # For training, use each driver's historical mean lap time as the target.
    # Look the value up by driver code so every feature row is paired with the
    # right driver; the historical table covers the whole grid, not just the
    # drivers listed in qualifying_data, so positional pairing would be wrong.
    lap_time_by_driver = historical_data.set_index("Driver")["LapTime (s)"]
    target = features["Driver"].map(lap_time_by_driver)

    # Drivers without historical laps cannot be used for training
    # (they are still included in the prediction below).
    has_target = target.notna()
    if not has_target.all():
        missing = ", ".join(features.loc[~has_target, "Driver"])
        print(f"No historical lap data for: {missing} (excluded from training)")

    # Train the model
    model, mae, r2 = predictor.train_model(features[has_target], target[has_target])

    # Make predictions
    predictions = predictor.predict_race(features)

    # Display results
    print("\n🏁 Predicted 2025 Italian GP Results 🏁")
    print(predictions[["Driver", "Team", "PredictedRaceTime (s)", "PredictedPosition"]])

    # Visualize results
    predictor.visualize_results(predictions, historical_data)

    # Save model for future use
    predictor.save_model("monza_2025_predictor_model.pkl")

    # Compare with actual results (when available)
    print("\nNote: Once the actual 2025 Italian GP results are available,")
    print("you can compare these predictions with reality to validate the model!")

core           INFO 	Loading data for Monaco Grand Prix - Race [v3.6.1]
INFO:fastf1.fastf1.core:Loading data for Monaco Grand Prix - Race [v3.6.1]
req            INFO 	No cached data found for session_info. Loading data...
INFO:fastf1.fastf1.req:No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
INFO:fastf1.api:Fetching session info data...
req            INFO 	Data has been written to cache!
INFO:fastf1.fastf1.req:Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
INFO:fastf1.fastf1.req:No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
INFO:fastf1.api:Fetching driver list...
req            INFO 	Data has been written to cache!
INFO:fastf1.fastf1.req:Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
INFO:fastf1.fastf1.req:No cached data found for session_status_dat

KeyError: 'list'