In [2]:
!pip install fastf1

Collecting fastf1
  Using cached fastf1-3.6.1-py3-none-any.whl.metadata (4.6 kB)
Collecting rapidfuzz (from fastf1)
  Using cached rapidfuzz-3.14.1-cp313-cp313-win_amd64.whl.metadata (12 kB)
Collecting requests-cache>=1.0.0 (from fastf1)
  Using cached requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting timple>=0.1.6 (from fastf1)
  Using cached timple-0.1.8-py3-none-any.whl.metadata (2.0 kB)
Collecting websockets<14,>=10.3 (from fastf1)
  Using cached websockets-13.1-cp313-cp313-win_amd64.whl.metadata (7.0 kB)
Collecting cattrs>=22.2 (from requests-cache>=1.0.0->fastf1)
  Using cached cattrs-25.3.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache>=1.0.0->fastf1)
  Using cached url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Collecting attrs>=21.2 (from requests-cache>=1.0.0->fastf1)
  Using cached attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting typing-extensions>=4.14.0 (from cattrs>=22.2->requests-cache>=1.0.0->fast

In [11]:
import os
import pandas as pd
import fastf1
import requests

# ---------- SETUP ----------
os.makedirs('cache', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)
fastf1.Cache.enable_cache('cache')

# ⚠️ CRITICAL: Use Jolpica F1 API (Ergast replacement)
fastf1.ergast.interface.BASE_URL = "https://api.jolpi.ca/ergast/f1"

# ---------- FUNCTION ----------
def build_race_features_for_ml(seasons=[2022, 2023]):
    """
    Build comprehensive race features using Jolpica F1 API
    """
    all_data = []
    
    for season in seasons:
        print(f"⏳ Processing {season} season...")
        
        # Fetch from Jolpica F1 API
        url = f"https://api.jolpi.ca/ergast/f1/{season}/results.json?limit=1000"
        
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            data = response.json()
            races = data['MRData']['RaceTable']['Races']
            print(f"   ✅ Found {len(races)} races")
        except Exception as e:
            print(f"   ❌ Error: {e}")
            continue
        
        for race in races:
            round_no = int(race['round'])
            race_name = race['raceName']
            circuit = race['Circuit']['circuitName']
            
            for result in race['Results']:
                driver = result['Driver']['familyName']
                driver_id = result['Driver']['driverId']
                team = result['Constructor']['name']
                team_id = result['Constructor']['constructorId']
                
                grid_pos = int(result['grid']) if result['grid'] != '0' else 20
                final_pos = int(result.get('position', 99))
                status = result['status']
                points = float(result.get('points', 0))
                
                qualifying_time = grid_pos * 1.5
                
                all_data.append({
                    'season': season,
                    'round': round_no,
                    'race_name': race_name,
                    'circuit': circuit,
                    'driver': driver,
                    'driver_id': driver_id,
                    'team': team,
                    'team_id': team_id,
                    'grid_position': grid_pos,
                    'qualifying_time': qualifying_time,
                    'final_position': final_pos,
                    'points': points,
                    'dnf': 1 if 'Finished' not in status and final_pos > 20 else 0
                })
    
    df = pd.DataFrame(all_data)
    print(f"\n📊 Total records: {len(df)}")
    
    # Sort chronologically
    df = df.sort_values(['driver_id', 'season', 'round']).reset_index(drop=True)
    
    # Feature engineering
    print("🔧 Engineering features...")
    
    # 1. Recent avg position
    df['recent_avg_position'] = df.groupby('driver_id')['final_position'].transform(
        lambda x: x.shift(1).rolling(3, min_periods=1).mean()
    )
    
    # 2. Avg track position
    df['avg_track_position'] = df.groupby(['driver_id', 'circuit'])['final_position'].transform(
        lambda x: x.shift(1).expanding(min_periods=1).mean()
    )
    
    # 3. Team season points
    df['team_season_points'] = df.groupby(['team_id', 'season'])['points'].transform(
        lambda x: x.shift(1).cumsum().fillna(0)
    )
    
    # 4. Driver DNF rate
    df['driver_dnf_rate'] = df.groupby('driver_id')['dnf'].transform(
        lambda x: x.shift(1).expanding(min_periods=1).mean().fillna(0)
    )
    
    # 5. Home advantage
    df['home_advantage'] = 0
    
    # 6. Tire strategy score
    df['tire_strategy_score'] = (20 - df['grid_position']) / 20.0
    df['tire_strategy_score'] = df['tire_strategy_score'].clip(0, 1)
    
    # Fill NaNs
    df.fillna({
        'recent_avg_position': 10.0,
        'avg_track_position': 10.0,
        'team_season_points': 0.0,
        'driver_dnf_rate': 0.1
    }, inplace=True)
    
    # Save
    output_path = '../data/processed/race_features.csv'
    df.to_csv(output_path, index=False)
    
    print(f"\n✅ Saved to {output_path}")
    print(f"   Shape: {df.shape}")
    print(f"   Columns: {list(df.columns)}")
    
    return df

# RUN
df = build_race_features_for_ml(seasons=[2022, 2023])
print("\n📊 Sample:")
print(df[['driver', 'race_name', 'grid_position', 'final_position', 
          'recent_avg_position', 'team_season_points']].head(10))

⏳ Processing 2022 season...
   ✅ Found 5 races
⏳ Processing 2023 season...
   ✅ Found 5 races

📊 Total records: 200
🔧 Engineering features...

✅ Saved to ../data/processed/race_features.csv
   Shape: (200, 19)
   Columns: ['season', 'round', 'race_name', 'circuit', 'driver', 'driver_id', 'team', 'team_id', 'grid_position', 'qualifying_time', 'final_position', 'points', 'dnf', 'recent_avg_position', 'avg_track_position', 'team_season_points', 'driver_dnf_rate', 'home_advantage', 'tire_strategy_score']

📊 Sample:
  driver                  race_name  grid_position  final_position  \
0  Albon         Bahrain Grand Prix             14              13   
1  Albon   Saudi Arabian Grand Prix             16              14   
2  Albon      Australian Grand Prix             20              10   
3  Albon  Emilia Romagna Grand Prix             18              11   
4  Albon           Miami Grand Prix             18               9   
5  Albon         Bahrain Grand Prix             15             