In [1]:
import requests
import pandas as pd
import json
from datetime import datetime

# Base URL for OpenF1 API
BASE_URL = "https://api.openf1.org/v1"

def get_data(endpoint, params=None):
    """Helper function to fetch data from OpenF1 API"""
    url = f"{BASE_URL}/{endpoint}"
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None

# 1. First, let's see what meetings (race weekends) are available for 2023
print("=== EXPLORING 2023 RACE MEETINGS ===")
meetings_2023 = get_data("meetings", {"year": 2023})

if meetings_2023:
    meetings_df = pd.DataFrame(meetings_2023)
    print(f"Found {len(meetings_df)} meetings in 2023")
    print("\nSample meetings:")
    print(meetings_df[['meeting_name', 'country_name', 'date_start', 'meeting_key']].head())

    # Let's pick a specific race - Singapore GP 2023 (mentioned in the docs)
    singapore_meeting = meetings_df[meetings_df['country_name'] == 'Singapore'].iloc[0]
    print(f"\n=== SELECTED RACE: {singapore_meeting['meeting_name']} ===")
    print(f"Meeting Key: {singapore_meeting['meeting_key']}")
    print(f"Date: {singapore_meeting['date_start']}")

    meeting_key = singapore_meeting['meeting_key']

    # 2. Get all sessions for this meeting (Practice, Qualifying, Race)
    print("\n=== SESSIONS FOR SINGAPORE GP 2023 ===")
    sessions = get_data("sessions", {"meeting_key": meeting_key})

    if sessions:
        sessions_df = pd.DataFrame(sessions)
        print("Available sessions:")
        print(sessions_df[['session_name', 'session_type', 'session_key', 'date_start']])

        # Find qualifying and race sessions
        qualifying_session = sessions_df[sessions_df['session_name'] == 'Qualifying'].iloc[0]
        race_session = sessions_df[sessions_df['session_name'] == 'Race'].iloc[0]

        print(f"\nQualifying Session Key: {qualifying_session['session_key']}")
        print(f"Race Session Key: {race_session['session_key']}")

        # 3. Get drivers for this meeting
        print("\n=== DRIVERS IN SINGAPORE GP 2023 ===")
        drivers = get_data("drivers", {"session_key": qualifying_session['session_key']})

        if drivers:
            drivers_df = pd.DataFrame(drivers)
            print(f"Found {len(drivers_df)} drivers")
            print("Drivers and teams:")
            print(drivers_df[['driver_number', 'full_name', 'name_acronym', 'team_name']].head(10))

            # 4. Get qualifying positions (our main predictor)
            print("\n=== QUALIFYING POSITIONS ===")
            qual_positions = get_data("position", {"session_key": qualifying_session['session_key']})

            if qual_positions:
                # Get final qualifying positions (latest position for each driver)
                qual_df = pd.DataFrame(qual_positions)
                print(f"Found {len(qual_df)} position records")

                # Get the final qualifying order
                final_qual = qual_df.groupby('driver_number')['position'].last().reset_index()
                final_qual = final_qual.merge(drivers_df[['driver_number', 'full_name', 'team_name']],
                                             on='driver_number', how='left')
                final_qual = final_qual.sort_values('position')

                print("Final Qualifying Order:")
                print(final_qual[['position', 'full_name', 'team_name']].head(10))

                # 5. Get race results (our target variable)
                print("\n=== RACE RESULTS ===")
                race_positions = get_data("position", {"session_key": race_session['session_key']})

                if race_positions:
                    race_df = pd.DataFrame(race_positions)

                    # Get final race positions
                    final_race = race_df.groupby('driver_number')['position'].last().reset_index()
                    final_race = final_race.merge(drivers_df[['driver_number', 'full_name', 'team_name']],
                                                 on='driver_number', how='left')
                    final_race = final_race.sort_values('position')

                    print("Final Race Results:")
                    print(final_race[['position', 'full_name', 'team_name']].head(10))

                    # 6. Combine qualifying and race data to see the correlation
                    print("\n=== QUALIFYING vs RACE COMPARISON ===")
                    comparison = final_qual[['driver_number', 'position']].rename(columns={'position': 'qual_pos'}).merge(
                        final_race[['driver_number', 'position', 'full_name']].rename(columns={'position': 'race_pos'}),
                        on='driver_number'
                    )
                    comparison['position_change'] = comparison['race_pos'] - comparison['qual_pos']
                    comparison = comparison.sort_values('race_pos')

                    print("Top 10 Race Finishers (Qualifying vs Race):")
                    print(comparison[['full_name', 'qual_pos', 'race_pos', 'position_change']].head(10))

                    # 7. Let's also check weather data for this race
                    print("\n=== WEATHER CONDITIONS ===")
                    weather = get_data("weather", {"session_key": race_session['session_key']})

                    if weather and len(weather) > 0:
                        weather_df = pd.DataFrame(weather)
                        print(f"Found {len(weather_df)} weather records")

                        # Get average conditions
                        avg_weather = weather_df.agg({
                            'air_temperature': 'mean',
                            'track_temperature': 'mean',
                            'humidity': 'mean',
                            'rainfall': 'mean',
                            'wind_speed': 'mean'
                        })

                        print("Average race conditions:")
                        for metric, value in avg_weather.items():
                            print(f"  {metric}: {value:.1f}")

                        # Check if it rained
                        max_rainfall = weather_df['rainfall'].max()
                        print(f"  Max rainfall: {max_rainfall}")
                        print(f"  Race condition: {'WET' if max_rainfall > 0 else 'DRY'}")

                    print("\n=== DATA STRUCTURE SUMMARY ===")
                    print("✅ Successfully extracted:")
                    print(f"  - Meeting info: {len(meetings_df)} races in 2023")
                    print(f"  - Session data: {len(sessions_df)} sessions per race weekend")
                    print(f"  - Driver data: {len(drivers_df)} drivers with team info")
                    print(f"  - Qualifying positions: {len(final_qual)} drivers")
                    print(f"  - Race results: {len(final_race)} drivers")
                    print(f"  - Weather data: {len(weather_df) if weather else 0} records")

                    print("\n✅ Key features identified for ML model:")
                    print("  - Qualifying position (strongest predictor)")
                    print("  - Driver & team information")
                    print("  - Weather conditions")
                    print("  - Circuit information")
                    print("\n🎯 Next step: Build systematic data collection for 2022-2024!")

else:
    print("Failed to fetch meetings data")

=== EXPLORING 2023 RACE MEETINGS ===
Found 23 meetings in 2023

Sample meetings:
               meeting_name  country_name                 date_start  \
0        Pre-Season Testing       Bahrain  2023-02-23T07:00:00+00:00   
1        Bahrain Grand Prix       Bahrain  2023-03-03T11:30:00+00:00   
2  Saudi Arabian Grand Prix  Saudi Arabia  2023-03-17T13:30:00+00:00   
3     Australian Grand Prix     Australia  2023-03-31T01:30:00+00:00   
4     Azerbaijan Grand Prix    Azerbaijan  2023-04-28T09:30:00+00:00   

   meeting_key  
0         1140  
1         1141  
2         1142  
3         1143  
4         1207  

=== SELECTED RACE: Singapore Grand Prix ===
Meeting Key: 1219
Date: 2023-09-15T09:30:00+00:00

=== SESSIONS FOR SINGAPORE GP 2023 ===
Available sessions:
  session_name session_type  session_key                 date_start
0   Practice 1     Practice         9158  2023-09-15T09:30:00+00:00
1   Practice 2     Practice         9159  2023-09-15T13:00:00+00:00
2   Practice 3     Practi

In [2]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
import time
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Base URL for OpenF1 API
BASE_URL = "https://api.openf1.org/v1"

def get_data(endpoint, params=None, max_retries=3):
    """Helper function to fetch data from OpenF1 API with retry logic"""
    url = f"{BASE_URL}/{endpoint}"

    for attempt in range(max_retries):
        try:
            response = requests.get(url, params=params, timeout=30)
            if response.status_code == 200:
                return response.json()
            else:
                print(f"Error {response.status_code} on attempt {attempt + 1}: {response.text}")
                if attempt < max_retries - 1:
                    time.sleep(2)  # Wait before retry
        except Exception as e:
            print(f"Request failed on attempt {attempt + 1}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)

    return None

In [4]:
def get_final_positions(session_key, session_type="race"):
    """Get final positions for a session, handling DNFs as position 21+"""
    positions = get_data("position", {"session_key": session_key})

    if not positions:
        return pd.DataFrame()

    positions_df = pd.DataFrame(positions)

    # Get the latest position for each driver
    final_positions = positions_df.groupby('driver_number')['position'].last().reset_index()

    # Get all drivers for this session to catch any missing from position data (DNFs)
    drivers = get_data("drivers", {"session_key": session_key})
    if drivers:
        drivers_df = pd.DataFrame(drivers)
        all_drivers = set(drivers_df['driver_number'].unique())
        positioned_drivers = set(final_positions['driver_number'].unique())

        # Find drivers missing from position data (DNFs)
        dnf_drivers = all_drivers - positioned_drivers

        if dnf_drivers:
            print(f"  Found {len(dnf_drivers)} DNF drivers in {session_type}: {list(dnf_drivers)}")

            # Assign DNF positions starting from 21
            max_position = final_positions['position'].max() if len(final_positions) > 0 else 0
            dnf_position_start = max(21, max_position + 1)

            dnf_data = []
            for i, driver_num in enumerate(sorted(dnf_drivers)):
                dnf_data.append({
                    'driver_number': driver_num,
                    'position': dnf_position_start + i
                })

            if dnf_data:
                dnf_df = pd.DataFrame(dnf_data)
                final_positions = pd.concat([final_positions, dnf_df], ignore_index=True)

    return final_positions

In [5]:
def collect_race_data(year):
    """Collect all race data for a given year"""
    print(f"\n{'='*50}")
    print(f"COLLECTING DATA FOR {year}")
    print(f"{'='*50}")

    # Get all meetings for the year
    meetings = get_data("meetings", {"year": year})
    if not meetings:
        print(f"No meetings found for {year}")
        return pd.DataFrame()

    meetings_df = pd.DataFrame(meetings)

    # Filter out testing sessions - only keep Grand Prix races
    gp_meetings = meetings_df[meetings_df['meeting_name'].str.contains('Grand Prix', case=False, na=False)]
    print(f"Found {len(gp_meetings)} Grand Prix meetings in {year}")

    race_data = []

    for idx, meeting in tqdm(gp_meetings.iterrows(), total=len(gp_meetings), desc=f"Processing {year} races"):
        try:
            print(f"\n--- Processing: {meeting['meeting_name']} ---")

            # Get sessions for this meeting
            sessions = get_data("sessions", {"meeting_key": meeting['meeting_key']})
            if not sessions:
                print(f"  No sessions found for {meeting['meeting_name']}")
                continue

            sessions_df = pd.DataFrame(sessions)

            # Find qualifying and race sessions
            qualifying_sessions = sessions_df[sessions_df['session_name'] == 'Qualifying']
            race_sessions = sessions_df[sessions_df['session_name'] == 'Race']

            if len(qualifying_sessions) == 0:
                print(f"  No qualifying session found for {meeting['meeting_name']}")
                continue

            if len(race_sessions) == 0:
                print(f"  No race session found for {meeting['meeting_name']}")
                continue

            qualifying_session = qualifying_sessions.iloc[0]
            race_session = race_sessions.iloc[0]

            print(f"  Qualifying Key: {qualifying_session['session_key']}")
            print(f"  Race Key: {race_session['session_key']}")

            # Get drivers for this race
            drivers = get_data("drivers", {"session_key": qualifying_session['session_key']})
            if not drivers:
                print(f"  No driver data found for {meeting['meeting_name']}")
                continue

            drivers_df = pd.DataFrame(drivers)

            # Get qualifying positions
            print("  Getting qualifying positions...")
            qual_positions = get_final_positions(qualifying_session['session_key'], "qualifying")

            if len(qual_positions) == 0:
                print(f"  No qualifying positions found for {meeting['meeting_name']}")
                continue

            # Get race results
            print("  Getting race results...")
            race_positions = get_final_positions(race_session['session_key'], "race")

            if len(race_positions) == 0:
                print(f"  No race results found for {meeting['meeting_name']}")
                continue

            # Get weather data
            print("  Getting weather data...")
            weather = get_data("weather", {"session_key": race_session['session_key']})
            weather_features = {}

            if weather and len(weather) > 0:
                weather_df = pd.DataFrame(weather)
                weather_features = {
                    'avg_air_temp': weather_df['air_temperature'].mean(),
                    'avg_track_temp': weather_df['track_temperature'].mean(),
                    'avg_humidity': weather_df['humidity'].mean(),
                    'total_rainfall': weather_df['rainfall'].sum(),
                    'max_rainfall': weather_df['rainfall'].max(),
                    'avg_wind_speed': weather_df['wind_speed'].mean(),
                    'is_wet_race': weather_df['rainfall'].max() > 0
                }
            else:
                print("  No weather data available")
                weather_features = {
                    'avg_air_temp': None, 'avg_track_temp': None, 'avg_humidity': None,
                    'total_rainfall': None, 'max_rainfall': None, 'avg_wind_speed': None,
                    'is_wet_race': False
                }

            # Combine all data
            combined_data = qual_positions.merge(
                race_positions.rename(columns={'position': 'race_position'}),
                on='driver_number',
                how='outer'
            ).merge(
                drivers_df[['driver_number', 'full_name', 'name_acronym', 'team_name', 'country_code']],
                on='driver_number',
                how='left'
            )

            # Rename qualifying position column
            combined_data = combined_data.rename(columns={'position': 'qualifying_position'})

            # Add race metadata
            for col, value in meeting.items():
                combined_data[f'meeting_{col}'] = value

            # Add session keys
            combined_data['qualifying_session_key'] = qualifying_session['session_key']
            combined_data['race_session_key'] = race_session['session_key']

            # Add weather features
            for weather_col, weather_val in weather_features.items():
                combined_data[weather_col] = weather_val

            # Handle missing race positions (treat as DNF)
            combined_data['race_position'] = combined_data['race_position'].fillna(22)  # DNF position

            # Handle missing qualifying positions (DNS - Did Not Start)
            combined_data['qualifying_position'] = combined_data['qualifying_position'].fillna(21)  # DNS position

            race_data.append(combined_data)

            print(f"  ✅ Collected data for {len(combined_data)} drivers")

            # Larger delay to respect API rate limits (10 requests per 10 seconds)
            time.sleep(5)

        except Exception as e:
            print(f"  ❌ Error processing {meeting['meeting_name']}: {e}")
            continue

    if race_data:
        all_race_data = pd.concat(race_data, ignore_index=True)
        print(f"\n✅ Successfully collected data for {len(all_race_data)} driver-race combinations in {year}")
        return all_race_data
    else:
        print(f"\n❌ No data collected for {year}")
        return pd.DataFrame()


In [6]:
def add_derived_features(df):
    """Add derived features for ML model"""
    print("\n--- Adding derived features ---")

    # Ensure data is sorted by date
    date_col = 'meeting_date_start' if 'meeting_date_start' in df.columns else 'date_start'
    year_col = 'meeting_year' if 'meeting_year' in df.columns else 'year'

    df = df.sort_values([year_col, date_col]).reset_index(drop=True)

    derived_features = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Adding features"):
        features = row.to_dict()

        # Get historical data for this driver up to this race (excluding current race)
        # Use the correct column names with meeting_ prefix
        date_col = 'meeting_date_start' if 'meeting_date_start' in df.columns else 'date_start'
        circuit_col = 'meeting_circuit_key' if 'meeting_circuit_key' in df.columns else 'circuit_key'
        year_col = 'meeting_year' if 'meeting_year' in df.columns else 'year'

        driver_history = df[
            (df['driver_number'] == row['driver_number']) &
            (df[date_col] < row[date_col])
        ].copy()

        # Driver career features
        features['driver_races_completed'] = len(driver_history)
        features['driver_career_wins'] = len(driver_history[driver_history['race_position'] == 1])
        features['driver_career_podiums'] = len(driver_history[driver_history['race_position'] <= 3])
        features['driver_career_top5'] = len(driver_history[driver_history['race_position'] <= 5])
        features['driver_career_points_rate'] = len(driver_history[driver_history['race_position'] <= 10]) / max(1, len(driver_history))

        # Recent form (last 5 races)
        recent_history = driver_history.tail(5)
        if len(recent_history) > 0:
            features['driver_recent_avg_position'] = recent_history['race_position'].mean()
            features['driver_recent_avg_qual_position'] = recent_history['qualifying_position'].mean()
            features['driver_recent_wins'] = len(recent_history[recent_history['race_position'] == 1])
            features['driver_recent_podiums'] = len(recent_history[recent_history['race_position'] <= 3])
        else:
            features['driver_recent_avg_position'] = None
            features['driver_recent_avg_qual_position'] = None
            features['driver_recent_wins'] = 0
            features['driver_recent_podiums'] = 0

        # Circuit-specific performance
        circuit_history = driver_history[driver_history[circuit_col] == row[circuit_col]]
        features['driver_circuit_races'] = len(circuit_history)
        if len(circuit_history) > 0:
            features['driver_circuit_avg_position'] = circuit_history['race_position'].mean()
            features['driver_circuit_avg_qual'] = circuit_history['qualifying_position'].mean()
            features['driver_circuit_wins'] = len(circuit_history[circuit_history['race_position'] == 1])
            features['driver_circuit_podiums'] = len(circuit_history[circuit_history['race_position'] <= 3])
        else:
            features['driver_circuit_avg_position'] = None
            features['driver_circuit_avg_qual'] = None
            features['driver_circuit_wins'] = 0
            features['driver_circuit_podiums'] = 0

        # Team performance at this point in time
        team_history = df[
            (df['team_name'] == row['team_name']) &
            (df[date_col] < row[date_col]) &
            (df[year_col] == row[year_col])  # Same season
        ].copy()

        if len(team_history) > 0:
            features['team_season_avg_position'] = team_history['race_position'].mean()
            features['team_season_wins'] = len(team_history[team_history['race_position'] == 1])
            features['team_season_podiums'] = len(team_history[team_history['race_position'] <= 3])
            features['team_season_points_rate'] = len(team_history[team_history['race_position'] <= 10]) / max(1, len(team_history))
        else:
            features['team_season_avg_position'] = None
            features['team_season_wins'] = 0
            features['team_season_podiums'] = 0
            features['team_season_points_rate'] = None

        derived_features.append(features)

    result_df = pd.DataFrame(derived_features)
    print(f"✅ Added derived features. Dataset now has {len(result_df.columns)} columns")

    return result_df

In [7]:
# Main execution
print("🏎️  F1 PREDICTIVE ANALYTICS DATA COLLECTION PIPELINE")
print("=" * 60)

# Start with 2023 to validate the approach
print("Starting with 2023 data for validation...")

# Collect 2023 data
data_2023 = collect_race_data(2023)

if len(data_2023) > 0:
    print(f"\n📊 2023 DATA SUMMARY:")
    print(f"Total driver-race combinations: {len(data_2023)}")
    print(f"Unique races: {data_2023['meeting_meeting_name'].nunique()}")
    print(f"Unique drivers: {data_2023['driver_number'].nunique()}")
    print(f"Unique teams: {data_2023['team_name'].nunique()}")

    # Add derived features
    enhanced_2023 = add_derived_features(data_2023)

    # Show sample of final data
    print(f"\n📋 SAMPLE OF ENHANCED DATASET:")
    sample_cols = ['meeting_meeting_name', 'full_name', 'team_name', 'qualifying_position', 'race_position',
                   'driver_races_completed', 'driver_recent_avg_position', 'team_season_avg_position', 'is_wet_race']
    print(enhanced_2023[sample_cols].head(10))

    # Show data quality
    print(f"\n🔍 DATA QUALITY CHECK:")
    print(f"Missing qualifying positions: {enhanced_2023['qualifying_position'].isna().sum()}")
    print(f"Missing race positions: {enhanced_2023['race_position'].isna().sum()}")

    # Check for weather data
    weather_cols = ['avg_air_temp', 'air_temperature', 'track_temperature']
    weather_col = None
    for col in weather_cols:
        if col in enhanced_2023.columns:
            weather_col = col
            break

    if weather_col:
        print(f"Weather data coverage: {enhanced_2023[weather_col].notna().sum()}/{len(enhanced_2023)} races")
    else:
        print("Weather data: Not available")

    # Feature engineering summary
    feature_cols = ['driver_races_completed', 'driver_recent_avg_position', 'team_season_avg_position']
    available_features = [col for col in feature_cols if col in enhanced_2023.columns]
    print(f"Generated features: {len(available_features)}/{len(feature_cols)} ({', '.join(available_features)})")

    # Analysis of top 3 finishes
    top3_analysis = enhanced_2023[enhanced_2023['race_position'] <= 3]
    print(f"\n🏆 TOP 3 FINISHES ANALYSIS:")
    print(f"Total top 3 finishes: {len(top3_analysis)}")
    print("\nQualifying position distribution for top 3 finishers:")
    print(top3_analysis['qualifying_position'].value_counts().sort_index().head(10))

    print(f"\n✅ 2023 data collection completed successfully!")
    print(f"Ready to expand to 2022 and 2024, then proceed to ML modeling.")

else:
    print("❌ Failed to collect 2023 data. Please check API connectivity.")

print("\n🎯 NEXT STEPS:")
print("1. Expand collection to 2022 and 2024")
print("2. Handle new drivers (those with no historical data)")
print("3. Build and train ML models")
print("4. Validate on 2025 races")

🏎️  F1 PREDICTIVE ANALYTICS DATA COLLECTION PIPELINE
Starting with 2023 data for validation...

COLLECTING DATA FOR 2023
Found 22 Grand Prix meetings in 2023


Processing 2023 races:   0%|          | 0/22 [00:00<?, ?it/s]


--- Processing: Bahrain Grand Prix ---
  Qualifying Key: 7768
  Race Key: 7953
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:   5%|▍         | 1/22 [00:06<02:08,  6.11s/it]


--- Processing: Saudi Arabian Grand Prix ---
  Qualifying Key: 7775
  Race Key: 7779
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:   9%|▉         | 2/22 [00:13<02:14,  6.75s/it]


--- Processing: Australian Grand Prix ---
  Qualifying Key: 7783
  Race Key: 7787
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  14%|█▎        | 3/22 [00:20<02:08,  6.75s/it]


--- Processing: Azerbaijan Grand Prix ---
  Qualifying Key: 9064
  Race Key: 9070
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  18%|█▊        | 4/22 [00:26<01:56,  6.49s/it]


--- Processing: Miami Grand Prix ---
  Qualifying Key: 9074
  Race Key: 9078
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  23%|██▎       | 5/22 [00:35<02:06,  7.45s/it]


--- Processing: Monaco Grand Prix ---
  Qualifying Key: 9090
  Race Key: 9094
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  27%|██▋       | 6/22 [00:42<01:55,  7.22s/it]


--- Processing: Spanish Grand Prix ---
  Qualifying Key: 9098
  Race Key: 9102
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  32%|███▏      | 7/22 [00:48<01:46,  7.10s/it]


--- Processing: Canadian Grand Prix ---
  Qualifying Key: 9106
  Race Key: 9110
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  36%|███▋      | 8/22 [00:55<01:36,  6.86s/it]


--- Processing: Austrian Grand Prix ---
  Qualifying Key: 9112
  Race Key: 9118
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  41%|████      | 9/22 [01:01<01:27,  6.74s/it]


--- Processing: British Grand Prix ---
  Qualifying Key: 9122
  Race Key: 9126
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  45%|████▌     | 10/22 [01:09<01:23,  6.96s/it]


--- Processing: Hungarian Grand Prix ---
  Qualifying Key: 9129
  Race Key: 9133
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  50%|█████     | 11/22 [01:16<01:16,  6.92s/it]


--- Processing: Belgian Grand Prix ---
  Qualifying Key: 9135
  Race Key: 9141
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  55%|█████▍    | 12/22 [01:22<01:07,  6.77s/it]


--- Processing: Dutch Grand Prix ---
  Qualifying Key: 9145
  Race Key: 9149
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  59%|█████▉    | 13/22 [01:29<01:02,  6.90s/it]


--- Processing: Italian Grand Prix ---
  Qualifying Key: 9153
  Race Key: 9157
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  64%|██████▎   | 14/22 [01:36<00:55,  6.93s/it]


--- Processing: Singapore Grand Prix ---
  Qualifying Key: 9161
  Race Key: 9165
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  68%|██████▊   | 15/22 [01:43<00:48,  6.88s/it]


--- Processing: Japanese Grand Prix ---
  Qualifying Key: 9169
  Race Key: 9173
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  73%|███████▎  | 16/22 [01:50<00:41,  6.84s/it]


--- Processing: Qatar Grand Prix ---
  Qualifying Key: 9215
  Race Key: 9221
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  77%|███████▋  | 17/22 [01:56<00:34,  6.81s/it]


--- Processing: United States Grand Prix ---
  Qualifying Key: 9207
  Race Key: 9213
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  82%|████████▏ | 18/22 [02:03<00:26,  6.68s/it]


--- Processing: Mexico City Grand Prix ---
  Qualifying Key: 9177
  Race Key: 9181
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  86%|████████▋ | 19/22 [02:09<00:19,  6.56s/it]


--- Processing: São Paulo Grand Prix ---
  Qualifying Key: 9304
  Race Key: 9205
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  91%|█████████ | 20/22 [02:15<00:13,  6.51s/it]


--- Processing: Las Vegas Grand Prix ---
  Qualifying Key: 9314
  Race Key: 9189
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races:  95%|█████████▌| 21/22 [02:22<00:06,  6.46s/it]


--- Processing: Abu Dhabi Grand Prix ---
  Qualifying Key: 9193
  Race Key: 9197
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2023 races: 100%|██████████| 22/22 [02:28<00:00,  6.75s/it]



✅ Successfully collected data for 440 driver-race combinations in 2023

📊 2023 DATA SUMMARY:
Total driver-race combinations: 440
Unique races: 22
Unique drivers: 22
Unique teams: 10

--- Adding derived features ---


Adding features: 100%|██████████| 440/440 [00:02<00:00, 158.97it/s]

✅ Added derived features. Dataset now has 47 columns

📋 SAMPLE OF ENHANCED DATASET:
  meeting_meeting_name        full_name        team_name  qualifying_position  \
0   Bahrain Grand Prix   Max VERSTAPPEN  Red Bull Racing                    1   
1   Bahrain Grand Prix   Logan SARGEANT         Williams                   16   
2   Bahrain Grand Prix     Lando NORRIS          McLaren                   11   
3   Bahrain Grand Prix     Pierre GASLY           Alpine                   20   
4   Bahrain Grand Prix     Sergio PEREZ  Red Bull Racing                    2   
5   Bahrain Grand Prix  Fernando ALONSO     Aston Martin                    5   
6   Bahrain Grand Prix  Charles LECLERC          Ferrari                    3   
7   Bahrain Grand Prix     Lance STROLL     Aston Martin                    8   
8   Bahrain Grand Prix  Kevin MAGNUSSEN     Haas F1 Team                   17   
9   Bahrain Grand Prix    Nyck DE VRIES       AlphaTauri                   19   

   race_position  driver




In [8]:
data_2024 = collect_race_data(2024)


COLLECTING DATA FOR 2024
Found 24 Grand Prix meetings in 2024


Processing 2024 races:   0%|          | 0/24 [00:00<?, ?it/s]


--- Processing: Bahrain Grand Prix ---
  Qualifying Key: 9468
  Race Key: 9472
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:   4%|▍         | 1/24 [00:07<02:43,  7.13s/it]


--- Processing: Saudi Arabian Grand Prix ---
  Qualifying Key: 9476
  Race Key: 9480
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:   8%|▊         | 2/24 [00:13<02:30,  6.85s/it]


--- Processing: Australian Grand Prix ---
  Qualifying Key: 9484
  Race Key: 9488
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 19 drivers


Processing 2024 races:  12%|█▎        | 3/24 [00:19<02:16,  6.51s/it]


--- Processing: Japanese Grand Prix ---
  Qualifying Key: 9492
  Race Key: 9496
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  17%|█▋        | 4/24 [00:26<02:07,  6.36s/it]


--- Processing: Chinese Grand Prix ---
  Qualifying Key: 9664
  Race Key: 9673
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  21%|██        | 5/24 [00:32<01:59,  6.27s/it]


--- Processing: Miami Grand Prix ---
  Qualifying Key: 9498
  Race Key: 9507
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  25%|██▌       | 6/24 [00:38<01:52,  6.24s/it]


--- Processing: Emilia Romagna Grand Prix ---
  Qualifying Key: 9511
  Race Key: 9515
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  29%|██▉       | 7/24 [00:44<01:45,  6.22s/it]


--- Processing: Monaco Grand Prix ---
  Qualifying Key: 9519
  Race Key: 9523
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  33%|███▎      | 8/24 [00:50<01:38,  6.19s/it]


--- Processing: Canadian Grand Prix ---
  Qualifying Key: 9527
  Race Key: 9531
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  38%|███▊      | 9/24 [00:56<01:32,  6.16s/it]


--- Processing: Spanish Grand Prix ---
  Qualifying Key: 9535
  Race Key: 9539
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  42%|████▏     | 10/24 [01:04<01:35,  6.80s/it]


--- Processing: Austrian Grand Prix ---
  Qualifying Key: 9541
  Race Key: 9550
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  46%|████▌     | 11/24 [01:11<01:26,  6.69s/it]


--- Processing: British Grand Prix ---
  Qualifying Key: 9554
  Race Key: 9558
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  50%|█████     | 12/24 [01:18<01:20,  6.68s/it]


--- Processing: Hungarian Grand Prix ---
  Qualifying Key: 9562
  Race Key: 9566
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  54%|█████▍    | 13/24 [01:24<01:13,  6.71s/it]


--- Processing: Belgian Grand Prix ---
  Qualifying Key: 9570
  Race Key: 9574
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  58%|█████▊    | 14/24 [01:33<01:13,  7.37s/it]


--- Processing: Dutch Grand Prix ---
  Qualifying Key: 9578
  Race Key: 9582
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  62%|██████▎   | 15/24 [01:40<01:04,  7.17s/it]


--- Processing: Italian Grand Prix ---
  Qualifying Key: 9586
  Race Key: 9590
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  67%|██████▋   | 16/24 [01:46<00:55,  6.95s/it]


--- Processing: Azerbaijan Grand Prix ---
  Qualifying Key: 9594
  Race Key: 9598
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  71%|███████   | 17/24 [01:53<00:48,  6.97s/it]


--- Processing: Singapore Grand Prix ---
  Qualifying Key: 9602
  Race Key: 9606
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  75%|███████▌  | 18/24 [02:00<00:41,  6.86s/it]


--- Processing: United States Grand Prix ---
  Qualifying Key: 9608
  Race Key: 9617
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  79%|███████▉  | 19/24 [02:07<00:34,  6.96s/it]


--- Processing: Mexico City Grand Prix ---
  Qualifying Key: 9621
  Race Key: 9625
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  83%|████████▎ | 20/24 [02:14<00:27,  6.83s/it]


--- Processing: São Paulo Grand Prix ---
  Qualifying Key: 9627
  Race Key: 9636
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  88%|████████▊ | 21/24 [02:20<00:20,  6.74s/it]


--- Processing: Las Vegas Grand Prix ---
  Qualifying Key: 9640
  Race Key: 9644
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  92%|█████████▏| 22/24 [02:27<00:13,  6.67s/it]


--- Processing: Qatar Grand Prix ---
  Qualifying Key: 9646
  Race Key: 9655
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races:  96%|█████████▌| 23/24 [02:33<00:06,  6.55s/it]


--- Processing: Abu Dhabi Grand Prix ---
  Qualifying Key: 9658
  Race Key: 9662
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2024 races: 100%|██████████| 24/24 [02:39<00:00,  6.65s/it]


✅ Successfully collected data for 479 driver-race combinations in 2024





In [9]:
data_2025 = collect_race_data(2025)


COLLECTING DATA FOR 2025
Found 11 Grand Prix meetings in 2025


Processing 2025 races:   0%|          | 0/11 [00:00<?, ?it/s]


--- Processing: Australian Grand Prix ---
  Qualifying Key: 9689
  Race Key: 9693
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:   9%|▉         | 1/11 [00:06<01:00,  6.05s/it]


--- Processing: Chinese Grand Prix ---
  Qualifying Key: 9994
  Race Key: 9998
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:  18%|█▊        | 2/11 [00:12<00:55,  6.16s/it]


--- Processing: Japanese Grand Prix ---
  Qualifying Key: 10002
  Race Key: 10006
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:  27%|██▋       | 3/11 [00:18<00:49,  6.21s/it]


--- Processing: Bahrain Grand Prix ---
  Qualifying Key: 10010
  Race Key: 10014
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:  36%|███▋      | 4/11 [00:24<00:43,  6.16s/it]


--- Processing: Saudi Arabian Grand Prix ---
  Qualifying Key: 10018
  Race Key: 10022
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:  45%|████▌     | 5/11 [00:30<00:36,  6.13s/it]


--- Processing: Miami Grand Prix ---
  Qualifying Key: 10029
  Race Key: 10033
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:  55%|█████▍    | 6/11 [00:36<00:30,  6.11s/it]


--- Processing: Emilia Romagna Grand Prix ---
  Qualifying Key: 9983
  Race Key: 9987
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:  64%|██████▎   | 7/11 [00:43<00:24,  6.18s/it]


--- Processing: Monaco Grand Prix ---
  Qualifying Key: 9975
  Race Key: 9979
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:  73%|███████▎  | 8/11 [00:49<00:18,  6.16s/it]


--- Processing: Spanish Grand Prix ---
  Qualifying Key: 9967
  Race Key: 9971
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races:  82%|████████▏ | 9/11 [00:55<00:12,  6.17s/it]


--- Processing: Canadian Grand Prix ---
  Qualifying Key: 9959
  Race Key: 9963
  Getting qualifying positions...
  Getting race results...
  Getting weather data...
  ✅ Collected data for 20 drivers


Processing 2025 races: 100%|██████████| 11/11 [01:01<00:00,  5.60s/it]


--- Processing: Austrian Grand Prix ---
  No qualifying session found for Austrian Grand Prix

✅ Successfully collected data for 200 driver-race combinations in 2025





In [10]:
# Combine 2023 and 2024 data only
print("🏎️  COMBINING F1 DATA (2023-2024)")
print("=" * 50)

# Check what data we have
datasets = []
dataset_info = []

if 'data_2023' in locals() and len(data_2023) > 0:
    datasets.append(data_2023)
    dataset_info.append(f"2023: {len(data_2023)} records")
    print(f"✅ 2023 data: {len(data_2023)} driver-race combinations")

if 'data_2024' in locals() and len(data_2024) > 0:
    datasets.append(data_2024)
    dataset_info.append(f"2024: {len(data_2024)} records")
    print(f"✅ 2024 data: {len(data_2024)} driver-race combinations")

if 'data_2025' in locals() and len(data_2025) > 0:
    datasets.append(data_2025)
    dataset_info.append(f"2025: {len(data_2025)} records")
    print(f"✅ 2025 data: {len(data_2025)} driver-race combinations")

print(f"ℹ️  Note: Using 2023-2025 data only (2022 not available)")

if len(datasets) == 0:
    print("❌ No datasets found. Please ensure data_2023 and data_2024 are loaded.")
elif len(datasets) == 1:
    print("⚠️  Only one year of data found. Using single year for training.")
    combined_data = datasets[0]
else:
    # Combine both datasets
    print(f"\nCombining {len(datasets)} datasets...")
    combined_data = pd.concat(datasets, ignore_index=True)

print(f"✅ Training dataset: {len(combined_data)} total driver-race combinations")

🏎️  COMBINING F1 DATA (2023-2024)
✅ 2023 data: 440 driver-race combinations
✅ 2024 data: 479 driver-race combinations
✅ 2025 data: 200 driver-race combinations
ℹ️  Note: Using 2023-2025 data only (2022 not available)

Combining 3 datasets...
✅ Training dataset: 1119 total driver-race combinations


In [11]:
# Show data distribution by year
print(f"\n📊 DATA DISTRIBUTION:")
year_col = 'meeting_year'
if year_col in combined_data.columns:
    year_distribution = combined_data[year_col].value_counts().sort_index()
    total_races = 0
    for year, count in year_distribution.items():
        races = count // 20  # Assuming ~20 drivers per race
        total_races += races
        print(f"  {year}: {count} driver-race combinations (~{races} races)")
    print(f"  Total: ~{total_races} races across {len(year_distribution)} years")

# Check data consistency
print(f"\n🔍 DATA CONSISTENCY CHECK:")
print(f"Columns in combined dataset: {len(combined_data.columns)}")
print(f"Unique drivers across all years: {combined_data['driver_number'].nunique()}")
print(f"Unique teams across all years: {combined_data['team_name'].nunique()}")

# Show some basic statistics
meeting_name_col = 'meeting_meeting_name'
if meeting_name_col in combined_data.columns:
    print(f"Unique race meetings: {combined_data[meeting_name_col].nunique()}")

# Add derived features to the combined dataset
print(f"\n{'='*20} FEATURE ENGINEERING {'='*20}")
print("Adding derived features to combined dataset...")
print("This will take a few minutes as we calculate historical features...")

enhanced_combined = add_derived_features(combined_data)

# Final dataset summary
print(f"\n{'='*50}")
print(f"🏁 FINAL TRAINING DATASET SUMMARY")
print(f"{'='*50}")

print(f"Total driver-race combinations: {len(enhanced_combined)}")
print(f"Total races: {enhanced_combined[meeting_name_col].nunique()}")
print(f"Years covered: {sorted(enhanced_combined['meeting_year'].unique())}")
print(f"Unique drivers: {enhanced_combined['driver_number'].nunique()}")
print(f"Unique teams: {enhanced_combined['team_name'].nunique()}")
print(f"Total features: {len(enhanced_combined.columns)}")

# Show sample of enhanced dataset
print(f"\n📋 SAMPLE OF FINAL TRAINING DATASET:")
sample_cols = [meeting_name_col, 'meeting_year', 'full_name', 'team_name',
                'qualifying_position', 'race_position', 'driver_races_completed',
                'driver_recent_avg_position', 'team_season_avg_position', 'is_wet_race']
available_sample_cols = [col for col in sample_cols if col in enhanced_combined.columns]
print(enhanced_combined[available_sample_cols].head(10))

# Show evolution of a driver across years
print(f"\n👤 DRIVER EVOLUTION EXAMPLE (Max Verstappen):")
verstappen_data = enhanced_combined[enhanced_combined['full_name'] == 'Max VERSTAPPEN'].copy()
if len(verstappen_data) > 0:
    verstappen_sample = verstappen_data[['meeting_year', meeting_name_col, 'qualifying_position',
                                        'race_position', 'driver_races_completed', 'driver_career_wins']].head(10)
    print(verstappen_sample)

# Data quality check
print(f"\n🔍 FINAL DATA QUALITY CHECK:")
print(f"Missing qualifying positions: {enhanced_combined['qualifying_position'].isna().sum()}")
print(f"Missing race positions: {enhanced_combined['race_position'].isna().sum()}")

# Weather data coverage
weather_col = 'avg_air_temp'
if weather_col in enhanced_combined.columns:
    weather_coverage = enhanced_combined[weather_col].notna().sum()
    print(f"Weather data coverage: {weather_coverage}/{len(enhanced_combined)} records ({weather_coverage/len(enhanced_combined)*100:.1f}%)")

# Feature engineering summary
feature_cols = ['driver_races_completed', 'driver_recent_avg_position', 'team_season_avg_position',
                'driver_circuit_avg_position', 'driver_career_wins', 'team_season_wins']
available_features = [col for col in feature_cols if col in enhanced_combined.columns]
print(f"Generated features: {len(available_features)} features")

print(f"\nKey derived features:")
for feature in available_features:
    non_null = enhanced_combined[feature].notna().sum()
    print(f"  - {feature}: {non_null}/{len(enhanced_combined)} non-null ({non_null/len(enhanced_combined)*100:.1f}%)")

# Top 3 analysis across all years
top3_analysis = enhanced_combined[enhanced_combined['race_position'] <= 3]
print(f"\n🏆 TOP 3 FINISHES ANALYSIS (ALL YEARS):")
print(f"Total top 3 finishes: {len(top3_analysis)}")

print("\nQualifying position distribution for top 3 finishers:")
qual_dist = top3_analysis['qualifying_position'].value_counts().sort_index().head(15)
for pos, count in qual_dist.items():
    percentage = (count / len(top3_analysis)) * 100
    print(f"  P{pos}: {count} times ({percentage:.1f}%)")

# New drivers analysis
print(f"\n🆕 NEW DRIVERS ANALYSIS:")
for year in sorted(enhanced_combined['meeting_year'].unique()):
    year_data = enhanced_combined[enhanced_combined['meeting_year'] == year]
    new_drivers = year_data[year_data['driver_races_completed'] == 0]
    if len(new_drivers) > 0:
        new_driver_names = new_drivers['full_name'].unique()
        print(f"  {year}: {len(new_driver_names)} new drivers - {list(new_driver_names)}")
    else:
        print(f"  {year}: No new drivers")

# Team performance evolution
print(f"\n🏎️ TEAM PERFORMANCE EVOLUTION:")
team_year_performance = enhanced_combined.groupby(['team_name', 'meeting_year'])['race_position'].mean().reset_index()
team_year_performance = team_year_performance.sort_values(['team_name', 'meeting_year'])

# Show a few teams as examples
example_teams = ['Red Bull Racing', 'Ferrari', 'Mercedes']
for team in example_teams:
    team_data = team_year_performance[team_year_performance['team_name'] == team]
    if len(team_data) > 0:
        print(f"  {team}:")
        for _, row in team_data.iterrows():
            print(f"    {int(row['meeting_year'])}: Avg position {row['race_position']:.1f}")

# Final dataset ready message
print(f"\n💾 DATASET READY FOR ML MODELING!")
print(f"✅ Data collection: {len(enhanced_combined)} records across 2023-2024")
print(f"✅ Feature engineering: {len(enhanced_combined.columns)} total features")
print(f"✅ Data quality: High ({enhanced_combined['qualifying_position'].notna().sum()}/{len(enhanced_combined)} complete records)")

print(f"\n🎯 TRAINING STRATEGY WITH 2023-2024 DATA:")
print(f"  - Use 2023 as primary training data")
print(f"  - Use early 2024 races for validation")
print(f"  - Predict remaining 2024 races or 2025 races")
print(f"  - Historical features will be built from 2023 data for 2024 predictions")

print(f"\n🎯 KEY FEATURES FOR ML MODEL:")
key_features = ['qualifying_position', 'driver_races_completed', 'driver_recent_avg_position',
                'team_season_avg_position', 'driver_circuit_avg_position', 'is_wet_race',
                'driver_career_wins', 'driver_career_podiums']
available_key_features = [f for f in key_features if f in enhanced_combined.columns]
for i, feature in enumerate(available_key_features, 1):
    non_null = enhanced_combined[feature].notna().sum()
    print(f"  {i}. {feature}: {non_null/len(enhanced_combined)*100:.1f}% coverage")

print(f"\n🚀 READY FOR NEXT PHASE: ML MODEL DEVELOPMENT!")
print(f"Use the 'enhanced_combined' DataFrame for training your models.")


📊 DATA DISTRIBUTION:
  2023: 440 driver-race combinations (~22 races)
  2024: 479 driver-race combinations (~23 races)
  2025: 200 driver-race combinations (~10 races)
  Total: ~55 races across 3 years

🔍 DATA CONSISTENCY CHECK:
Columns in combined dataset: 29
Unique drivers across all years: 32
Unique teams across all years: 13
Unique race meetings: 24

Adding derived features to combined dataset...
This will take a few minutes as we calculate historical features...

--- Adding derived features ---


Adding features:   0%|          | 0/1119 [00:00<?, ?it/s]

Adding features: 100%|██████████| 1119/1119 [00:08<00:00, 137.19it/s]

✅ Added derived features. Dataset now has 47 columns

🏁 FINAL TRAINING DATASET SUMMARY
Total driver-race combinations: 1119
Total races: 24
Years covered: [2023, 2024, 2025]
Unique drivers: 32
Unique teams: 13
Total features: 47

📋 SAMPLE OF FINAL TRAINING DATASET:
  meeting_meeting_name  meeting_year        full_name        team_name  \
0   Bahrain Grand Prix          2023   Max VERSTAPPEN  Red Bull Racing   
1   Bahrain Grand Prix          2023   Logan SARGEANT         Williams   
2   Bahrain Grand Prix          2023     Lando NORRIS          McLaren   
3   Bahrain Grand Prix          2023     Pierre GASLY           Alpine   
4   Bahrain Grand Prix          2023     Sergio PEREZ  Red Bull Racing   
5   Bahrain Grand Prix          2023  Fernando ALONSO     Aston Martin   
6   Bahrain Grand Prix          2023  Charles LECLERC          Ferrari   
7   Bahrain Grand Prix          2023     Lance STROLL     Aston Martin   
8   Bahrain Grand Prix          2023  Kevin MAGNUSSEN     Haas F1 Te




In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

#### DELETE UPTIL HERE

In [17]:
enhanced_combined.columns

Index(['driver_number', 'qualifying_position', 'race_position', 'full_name',
       'name_acronym', 'team_name', 'country_code', 'meeting_meeting_key',
       'meeting_circuit_key', 'meeting_circuit_short_name',
       'meeting_meeting_code', 'meeting_location', 'meeting_country_key',
       'meeting_country_code', 'meeting_country_name', 'meeting_meeting_name',
       'meeting_meeting_official_name', 'meeting_gmt_offset',
       'meeting_date_start', 'meeting_year', 'qualifying_session_key',
       'race_session_key', 'avg_air_temp', 'avg_track_temp', 'avg_humidity',
       'total_rainfall', 'max_rainfall', 'avg_wind_speed', 'is_wet_race',
       'driver_races_completed', 'driver_career_wins', 'driver_career_podiums',
       'driver_career_top5', 'driver_career_points_rate',
       'driver_recent_avg_position', 'driver_recent_avg_qual_position',
       'driver_recent_wins', 'driver_recent_podiums', 'driver_circuit_races',
       'driver_circuit_avg_position', 'driver_circuit_avg_qual'

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("🏎️  ULTIMATE F1 PREDICTION MODEL (2023-2025)")
print("=" * 60)

# Check if enhanced_combined is available
if 'enhanced_combined' not in locals():
    print("❌ 'enhanced_combined' dataset not found!")
    print("Please ensure feature engineering has been completed.")
else:
    df = enhanced_combined.copy()
    print(f"✅ Dataset loaded: {len(df)} records with {len(df.columns)} features")
    
    # Create target variable
    df['top_3_finish'] = (df['race_position'] <= 3).astype(int)
    
    # Verify data distribution
    print(f"\n📊 DATA DISTRIBUTION:")
    year_dist = df['meeting_year'].value_counts().sort_index()
    total_top3 = df['top_3_finish'].sum()
    for year, count in year_dist.items():
        races = count // 20
        year_top3 = df[df['meeting_year'] == year]['top_3_finish'].sum()
        print(f"  {year}: {count} records (~{races} races), {year_top3} top 3 finishes")
    
    print(f"  Total: {len(df)} records, {total_top3} top 3 finishes ({total_top3/len(df)*100:.1f}%)")
    
    # Enhanced feature selection using your actual columns
    feature_columns = [
        # Core predictive features
        'qualifying_position',
        'driver_races_completed', 
        'driver_recent_avg_position',
        'team_season_avg_position',
        'driver_circuit_avg_position',
        
        # Enhanced career performance
        'driver_career_wins',
        'driver_career_podiums', 
        'driver_career_top5',
        'driver_career_points_rate',
        
        # Recent form features
        'driver_recent_avg_qual_position',
        'driver_recent_wins',
        'driver_recent_podiums',
        
        # Circuit-specific performance
        'driver_circuit_races',
        'driver_circuit_avg_qual',
        'driver_circuit_wins',
        'driver_circuit_podiums',
        
        # Team performance
        'team_season_wins',
        'team_season_podiums', 
        'team_season_points_rate',
        
        # Weather conditions
        'is_wet_race',
        'avg_air_temp',
        'avg_track_temp', 
        'avg_humidity',
        'total_rainfall',
        'max_rainfall',
        'avg_wind_speed'
    ]
    
    # Check feature availability
    available_features = [col for col in feature_columns if col in df.columns]
    missing_features = [col for col in feature_columns if col not in df.columns]
    
    print(f"\n🔍 FEATURE ANALYSIS:")
    print(f"Available features: {len(available_features)}/{len(feature_columns)}")
    if missing_features:
        print(f"Missing features: {missing_features}")
    
    # Prepare the dataset
    X = df[available_features].copy()
    y = df['top_3_finish'].copy()
    
    print(f"\n📋 FINAL DATASET:")
    print(f"Feature matrix: {X.shape}")
    print(f"Target distribution: {y.sum()}/{len(y)} top 3 finishes ({y.mean()*100:.1f}%)")
    
    # Handle missing values
    print(f"\n🔧 PREPROCESSING:")
    missing_counts = X.isnull().sum()
    print(f"Missing values per feature:")
    for feature, missing in missing_counts[missing_counts > 0].items():
        percentage = (missing / len(X)) * 100
        print(f"  {feature}: {missing} ({percentage:.1f}%)")
    
    # Convert boolean to integer
    bool_columns = X.select_dtypes(include=['bool']).columns
    if len(bool_columns) > 0:
        X[bool_columns] = X[bool_columns].astype(int)
    
    # Impute missing values
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)
    X_processed = pd.DataFrame(X_imputed, columns=available_features, index=X.index)
    
    print(f"✅ Preprocessing complete")
    
    # Time-based train/test split for realistic evaluation
    print(f"\n📊 TRAIN/TEST STRATEGY:")
    print("Using time-based split to simulate real-world usage:")
    
    # Split strategy: 2023-2024 for training, 2025 for testing
    train_mask = df['meeting_year'].isin([2023, 2024])
    test_mask = df['meeting_year'] == 2025
    
    X_train = X_processed[train_mask]
    X_test = X_processed[test_mask]
    y_train = y[train_mask]
    y_test = y[test_mask]
    
    print(f"  Training (2023-2024): {len(X_train)} samples ({y_train.mean()*100:.1f}% top 3)")
    print(f"  Testing (2025): {len(X_test)} samples ({y_test.mean()*100:.1f}% top 3)")
    
    # Advanced model ensemble
    print(f"\n🤖 TRAINING ENSEMBLE MODEL:")
    
    # Individual models with optimized parameters
    models = {
        'Logistic Regression': LogisticRegression(
            random_state=42, 
            max_iter=1000,
            class_weight='balanced'
        ),
        'Random Forest': RandomForestClassifier(
            random_state=42,
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            class_weight='balanced'
        ),
        'Gradient Boosting': GradientBoostingClassifier(
            random_state=42,
            n_estimators=150,
            learning_rate=0.1,
            max_depth=6
        )
    }
    
    # Scale features for logistic regression
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    results = {}
    
    # Train individual models
    for name, model in models.items():
        print(f"\nTraining {name}...")
        
        # Use scaled data for Logistic Regression
        if name == 'Logistic Regression':
            X_train_model = X_train_scaled
            X_test_model = X_test_scaled
        else:
            X_train_model = X_train
            X_test_model = X_test
        
        # Train
        model.fit(X_train_model, y_train)
        
        # Evaluate
        train_accuracy = model.score(X_train_model, y_train)
        test_accuracy = model.score(X_test_model, y_test)
        test_auc = roc_auc_score(y_test, model.predict_proba(X_test_model)[:, 1])
        
        # Cross-validation on training data
        cv_scores = cross_val_score(model, X_train_model, y_train, cv=5, scoring='roc_auc')
        
        results[name] = {
            'model': model,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'test_auc': test_auc,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'predictions': model.predict(X_test_model),
            'probabilities': model.predict_proba(X_test_model)[:, 1]
        }
        
        print(f"  Train Accuracy: {train_accuracy:.3f}")
        print(f"  Test Accuracy: {test_accuracy:.3f}")
        print(f"  Test AUC: {test_auc:.3f}")
        print(f"  CV AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std()*2:.3f})")
    
    # Create ensemble model
    print(f"\n🎯 CREATING ENSEMBLE MODEL:")
    ensemble_models = [
        ('lr', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')),
        ('rf', RandomForestClassifier(random_state=42, n_estimators=200, max_depth=10, class_weight='balanced')),
        ('gb', GradientBoostingClassifier(random_state=42, n_estimators=150, learning_rate=0.1))
    ]
    
    # Note: For ensemble, we'll use the non-scaled data and let each model handle its own preprocessing
    ensemble = VotingClassifier(estimators=ensemble_models, voting='soft')
    ensemble.fit(X_train, y_train)
    
    ensemble_test_accuracy = ensemble.score(X_test, y_test)
    ensemble_test_auc = roc_auc_score(y_test, ensemble.predict_proba(X_test)[:, 1])
    ensemble_predictions = ensemble.predict(X_test)
    ensemble_probabilities = ensemble.predict_proba(X_test)[:, 1]
    
    print(f"Ensemble Test Accuracy: {ensemble_test_accuracy:.3f}")
    print(f"Ensemble Test AUC: {ensemble_test_auc:.3f}")
    
    # Find best individual model
    best_individual = max(results.keys(), key=lambda k: results[k]['test_auc'])
    best_individual_auc = results[best_individual]['test_auc']
    
    # Choose final model (ensemble vs best individual)
    if ensemble_test_auc > best_individual_auc:
        final_model_name = "Ensemble"
        final_model = ensemble
        final_predictions = ensemble_predictions
        final_probabilities = ensemble_probabilities
        final_auc = ensemble_test_auc
        final_accuracy = ensemble_test_accuracy
        print(f"\n🏆 FINAL MODEL: Ensemble (AUC: {final_auc:.3f})")
    else:
        final_model_name = best_individual
        final_model = results[best_individual]['model']
        final_predictions = results[best_individual]['predictions']
        final_probabilities = results[best_individual]['probabilities']
        final_auc = results[best_individual]['test_auc']
        final_accuracy = results[best_individual]['test_accuracy']
        print(f"\n🏆 FINAL MODEL: {final_model_name} (AUC: {final_auc:.3f})")
    
    # Detailed evaluation
    print(f"\n📊 DETAILED EVALUATION ({final_model_name}):")
    print(f"Accuracy: {final_accuracy:.1%}")
    print(f"AUC Score: {final_auc:.3f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test, final_predictions, target_names=['Not Top 3', 'Top 3']))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, final_predictions)
    print(cm)
    
    # Feature importance analysis (for tree-based models)
    if final_model_name in ['Random Forest', 'Gradient Boosting']:
        feature_importance = pd.DataFrame({
            'feature': available_features,
            'importance': final_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print(f"\n🎯 FEATURE IMPORTANCE ({final_model_name}):")
        for idx, row in feature_importance.head(15).iterrows():
            print(f"  {row['feature']:<30}: {row['importance']:.3f}")
    
    # Performance analysis by year and qualifying position
    test_data = df[test_mask].copy()
    test_data['predicted_top3'] = final_predictions
    test_data['pred_probability'] = final_probabilities
    
    print(f"\n📈 PERFORMANCE ANALYSIS (2025 PREDICTIONS):")
    
    # By qualifying position
    qual_analysis = test_data.groupby('qualifying_position').agg({
        'top_3_finish': ['count', 'sum', 'mean'],
        'predicted_top3': 'sum',
        'pred_probability': 'mean'
    }).round(3)
    
    qual_analysis.columns = ['total_races', 'actual_top3', 'actual_rate', 'predicted_top3', 'avg_probability']
    qual_analysis = qual_analysis[qual_analysis['total_races'] >= 2]
    
    print("\nPerformance by qualifying position (2+ races):")
    print(qual_analysis.head(15))
    
    # Model accuracy by race
    race_accuracy = test_data.groupby('meeting_meeting_name').apply(
        lambda x: (x['top_3_finish'] == x['predicted_top3']).mean()
    ).sort_values(ascending=False)
    
    print(f"\n🏁 ACCURACY BY 2025 RACE:")
    for race, accuracy in race_accuracy.items():
        print(f"  {race:<30}: {accuracy:.1%}")
    
    # Save the final model components
    print(f"\n💾 SAVING MODEL COMPONENTS:")
    
    model_package = {
        'model': final_model,
        'imputer': imputer,
        'scaler': scaler if final_model_name == 'Logistic Regression' else None,
        'features': available_features,
        'model_type': final_model_name,
        'model_version': '3.0_ultimate',
        'created_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'performance_metrics': {
            'test_accuracy': final_accuracy,
            'test_auc': final_auc,
            'train_samples': len(X_train),
            'test_samples': len(X_test),
            'features_count': len(available_features)
        },
        'training_years': [2023, 2024],
        'validation_year': 2025
    }
    
    # Save model
    model_filename = 'f1_prediction_model.pkl'
    joblib.dump(model_package, model_filename)
    print(f"✅ Model saved as: {model_filename}")
    
    print(f"\n🎯 MODEL SUMMARY:")
    print(f"✅ Final Model: {final_model_name}")
    print(f"✅ Test Accuracy: {final_accuracy:.1%}")
    print(f"✅ AUC Score: {final_auc:.3f}")
    print(f"✅ Training Data: 2023-2024 ({len(X_train)} samples)")
    print(f"✅ Validation Data: 2025 ({len(X_test)} samples)")
    print(f"✅ Features: {len(available_features)} predictive variables")
    
    print(f"\n🚀 READY FOR PRODUCTION:")
    print(f"1. ✅ Model trained on comprehensive 3-year dataset")
    print(f"2. ✅ Validated on real 2025 race results") 
    print(f"3. ✅ Feature importance identified")
    print(f"4. ✅ Ready for Austrian GP predictions")
    print(f"5. ✅ Ready for Streamlit dashboard integration")
    
    # Store components for Streamlit
    print(f"\n💾 Model components ready for Streamlit:")
    print(f"  - final_model: Trained {final_model_name}")
    print(f"  - available_features: List of {len(available_features)} features")
    print(f"  - imputer: Preprocessing pipeline")
    print(f"  - Model file: {model_filename}")

🏎️  ULTIMATE F1 PREDICTION MODEL (2023-2025)
✅ Dataset loaded: 1119 records with 47 features

📊 DATA DISTRIBUTION:
  2023: 440 records (~22 races), 66 top 3 finishes
  2024: 479 records (~23 races), 72 top 3 finishes
  2025: 200 records (~10 races), 30 top 3 finishes
  Total: 1119 records, 168 top 3 finishes (15.0%)

🔍 FEATURE ANALYSIS:
Available features: 26/26

📋 FINAL DATASET:
Feature matrix: (1119, 26)
Target distribution: 168/1119 top 3 finishes (15.0%)

🔧 PREPROCESSING:
Missing values per feature:
  driver_recent_avg_position: 32 (2.9%)
  team_season_avg_position: 60 (5.4%)
  driver_circuit_avg_position: 573 (51.2%)
  driver_recent_avg_qual_position: 32 (2.9%)
  driver_circuit_avg_qual: 573 (51.2%)
  team_season_points_rate: 60 (5.4%)
✅ Preprocessing complete

📊 TRAIN/TEST STRATEGY:
Using time-based split to simulate real-world usage:
  Training (2023-2024): 919 samples (15.0% top 3)
  Testing (2025): 200 samples (15.0% top 3)

🤖 TRAINING ENSEMBLE MODEL:

Training Logistic Regres

In [19]:
import joblib
import pickle
import pandas as pd
import numpy as np
from datetime import datetime

print("💾 SAVING F1 PREDICTION MODEL")
print("=" * 40)

# Check what model components are available
components_available = {}
required_components = [
    'final_model',
    'available_features',
    'imputer',
    'enhanced_combine'  # The training data
]

optional_components = [
    'scaler',
    'results',  # Model evaluation results
    'X_train',
    'X_test',
    'y_train',
    'y_test'
]

# Check required components
print("🔍 Checking required components:")
for component in required_components:
    if component in locals() or component in globals():
        components_available[component] = True
        print(f"  ✅ {component}: Available")
    else:
        components_available[component] = False
        print(f"  ❌ {component}: Missing")

# Check optional components
print("\n🔍 Checking optional components:")
for component in optional_components:
    if component in locals() or component in globals():
        components_available[component] = True
        print(f"  ✅ {component}: Available")
    else:
        components_available[component] = False
        print(f"  ⚠️ {component}: Not available (optional)")

# Create model package if we have the essentials
if all(components_available[comp] for comp in required_components):
    print(f"\n💾 Creating model package...")

    # Package all components
    model_package = {
        'model': final_model,
        'features': available_features,
        'imputer': imputer,
        'training_data': enhanced_combined,
        'model_type': 'Logistic Regression',
        'model_version': '1.0',
        'training_years': [2023, 2024],
        'validation_year': 2025,
        'created_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'performance_metrics': {
            'test_accuracy': 0.88,  # From your results
            'auc_score': 0.964,
            'precision_top3': 0.56,
            'recall_top3': 1.00
        }
    }

    # Add optional components if available
    if components_available['scaler']:
        model_package['scaler'] = scaler

    if components_available['results']:
        model_package['evaluation_results'] = results

    # Save using joblib (recommended for sklearn models)
    model_filename = 'f1_prediction_model.pkl'
    joblib.dump(model_package, model_filename)
    print(f"✅ Model saved as: {model_filename}")

    # Also save a backup using pickle
    backup_filename = 'f1_model_backup.pkl'
    with open(backup_filename, 'wb') as f:
        pickle.dump(model_package, f)
    print(f"✅ Backup saved as: {backup_filename}")

    # Save feature names as text file for reference
    features_filename = 'f1_model_features.txt'
    with open(features_filename, 'w') as f:
        f.write("F1 Prediction Model Features\n")
        f.write("=" * 30 + "\n\n")
        for i, feature in enumerate(available_features, 1):
            f.write(f"{i:2d}. {feature}\n")
    print(f"✅ Features list saved as: {features_filename}")

    # Save model summary
    summary_filename = 'f1_model_summary.txt'
    with open(summary_filename, 'w') as f:
        f.write("F1 Prediction Model Summary\n")
        f.write("=" * 30 + "\n\n")
        f.write(f"Model Type: {model_package['model_type']}\n")
        f.write(f"Version: {model_package['model_version']}\n")
        f.write(f"Created: {model_package['created_date']}\n")
        f.write(f"Training Years: {model_package['training_years']}\n")
        f.write(f"Validation Year: {model_package['validation_year']}\n\n")
        f.write("Performance Metrics:\n")
        for metric, value in model_package['performance_metrics'].items():
            f.write(f"  {metric}: {value}\n")
        f.write(f"\nNumber of Features: {len(available_features)}\n")
        f.write(f"Training Records: {len(enhanced_combined)}\n")
    print(f"✅ Summary saved as: {summary_filename}")

    print(f"\n🎯 MODEL PACKAGE COMPLETE!")
    print(f"Files created:")
    print(f"  📦 {model_filename} - Main model file")
    print(f"  📦 {backup_filename} - Backup model file")
    print(f"  📄 {features_filename} - Feature list")
    print(f"  📄 {summary_filename} - Model summary")

else:
    print(f"\n❌ Cannot create model package - missing required components:")
    for comp in required_components:
        if not components_available[comp]:
            print(f"  - {comp}")

    print(f"\n🔧 To fix this, make sure you have run:")
    print(f"  1. The data collection script (enhanced_combined)")
    print(f"  2. The ultimate model training script (final_model, available_features, imputer)")

# Function to load the model later
def load_f1_model(filename='f1_prediction_model.pkl'):
    """
    Load the saved F1 prediction model

    Returns:
        dict: Model package with all components
    """
    try:
        model_package = joblib.load(filename)
        print(f"✅ Model loaded from {filename}")
        print(f"  Model type: {model_package['model_type']}")
        print(f"  Version: {model_package['model_version']}")
        print(f"  Features: {len(model_package['features'])}")
        return model_package
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return None

# Example usage
print(f"\n📖 TO LOAD THE MODEL LATER:")
print(f"```python")
print(f"import joblib")
print(f"")
print(f"# Load the model")
print(f"model_package = joblib.load('f1_prediction_model.pkl')")
print(f"")
print(f"# Extract components")
print(f"final_model = model_package['model']")
print(f"available_features = model_package['features']")
print(f"imputer = model_package['imputer']")
print(f"```")

print(f"\n🚀 Ready for Streamlit integration!")

💾 SAVING F1 PREDICTION MODEL
🔍 Checking required components:
  ✅ final_model: Available
  ✅ available_features: Available
  ✅ imputer: Available
  ❌ enhanced_combine: Missing

🔍 Checking optional components:
  ✅ scaler: Available
  ✅ results: Available
  ✅ X_train: Available
  ✅ X_test: Available
  ✅ y_train: Available
  ✅ y_test: Available

❌ Cannot create model package - missing required components:
  - enhanced_combine

🔧 To fix this, make sure you have run:
  1. The data collection script (enhanced_combined)
  2. The ultimate model training script (final_model, available_features, imputer)

📖 TO LOAD THE MODEL LATER:
```python
import joblib

# Load the model
model_package = joblib.load('f1_prediction_model.pkl')

# Extract components
final_model = model_package['model']
available_features = model_package['features']
imputer = model_package['imputer']
```

🚀 Ready for Streamlit integration!
