In [6]:
import os
import warnings
from functools import lru_cache

import fastf1
from fastf1 import plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
warnings.filterwarnings('ignore')

# Directory handed to fastf1.Cache.enable_cache below.
# The original project path stays as the default; set the FASTF1_CACHE
# environment variable to point the notebook at a different directory
# without editing this cell.
cache_path = os.environ.get('FASTF1_CACHE', r'D:\f1-prediction-ml-2025-zoe\cache')
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, replacing the separate exists()-then-makedirs() steps.
os.makedirs(cache_path, exist_ok=True)
fastf1.Cache.enable_cache(cache_path)

# Circuit classification, keyed by a substring of the event name.
# get_track_type() checks whether a key occurs (case-insensitively) inside the
# event name, so a venue must be listed under the name its Grand Prix carries,
# not only under its city or circuit name.
TRACK_TYPES = {
    'Australia': 'semi-permanent',
    'Monaco': 'street',
    'Miami': 'semi-permanent',
    'Singapore': 'street',
    'Baku': 'street',
    'Las Vegas': 'street',
    'Albert Park': 'semi-permanent',
    'Montreal': 'semi-permanent',
    'Jeddah': 'street',
    # Event-name aliases for the city keys above: 'Baku', 'Montreal' and
    # 'Jeddah' do not appear in "Azerbaijan Grand Prix", "Canadian Grand Prix"
    # or "Saudi Arabian Grand Prix", so without these entries those races
    # would fall through to the 'permanent' default.
    'Azerbaijan': 'street',
    'Canada': 'semi-permanent',
    'Canadian': 'semi-permanent',
    'Saudi Arabia': 'street',
} #all others are defaulted to permanent

In [None]:
#feature extraction functions
def get_track_type(event_name):
    """Classify an event's circuit using the TRACK_TYPES lookup table.

    Each TRACK_TYPES key is tested as a case-insensitive substring of
    ``event_name``; the value of the first key that matches (in the table's
    iteration order) is returned.

    Args:
        event_name: Name of the event, e.g. "Australian Grand Prix".

    Returns:
        The matching TRACK_TYPES value, or 'permanent' when no key matches.
    """
    matching_kinds = (
        circuit_kind
        for keyword, circuit_kind in TRACK_TYPES.items()
        if keyword.lower() in event_name.lower()
    )
    # next() stops at the first hit and falls back to the default otherwise
    return next(matching_kinds, 'permanent')

def get_weather_condition(session):
    """Label a session as 'wet' or 'dry' from its weather data.

    Args:
        session: Object exposing a ``weather_data`` attribute that is either
            None or a table with a 'Rainfall' column.

    Returns:
        'wet' if any 'Rainfall' entry is truthy, 'dry' if none is or if the
        weather data is None/empty, and 'unknown' if reading the data raises
        (the error is printed, not re-raised).
    """
    try:
        readings = session.weather_data
        has_readings = readings is not None and not readings.empty
        # a single rainy sample is enough to call the whole session wet
        if has_readings and readings['Rainfall'].any():
            return 'wet'
        # covers both "no rain recorded" and "no weather data at all"
        return 'dry'
    except Exception as err:
        print(f"Error extracting weather condition: {err}")
        return 'unknown'

@lru_cache(maxsize=1)
def _load_fp3_laps(year, event_name):
    """Load the FP3 session of an event and return its laps.

    Memoised for the most recently requested (year, event_name) only, so the
    per-driver calls made for one race share a single session load while
    older sessions are not kept in memory. A failed load raises and is
    therefore not cached.
    """
    fp3 = fastf1.get_session(year, event_name, 'FP3')
    fp3.load(laps=True)
    return fp3.laps


def get_car_pace_indicator(year, event_name, driver_code):
    """Return a driver's FP3 pace gap to the fastest FP3 lap, in percent.

    The value is (driver best lap - session best lap) / session best lap * 100,
    rounded to 3 decimals. Larger number = slower.

    Args:
        year: Season passed to fastf1.get_session.
        event_name: Event identifier passed to fastf1.get_session.
        driver_code: Driver identifier used to select that driver's laps.

    Returns:
        The percentage gap as a float. 0.0 is returned when the session has no
        fastest lap, when the driver has no fastest lap, or when anything
        raises (the error is printed, not re-raised). Note that 0.0 is also
        the value of the driver who set the session's fastest lap, so callers
        cannot tell "fastest" apart from "no data".
    """
    #why fp3 and not quali? (open question kept from the original author)
    try:
        fp3_laps = _load_fp3_laps(year, event_name)

        fastest_lap = fp3_laps.pick_fastest()
        if fastest_lap is None or fastest_lap.empty:
            # previously this path fell off the end and returned None
            return 0.0 #default if no data

        # pick_drivers is the non-deprecated replacement for pick_driver
        driver_lap = fp3_laps.pick_drivers(driver_code).pick_fastest()
        if driver_lap is None or driver_lap.empty:
            return 0.0 #default if no data

        #calculate pace difference as percentage
        fastest_seconds = fastest_lap['LapTime'].total_seconds()
        driver_seconds = driver_lap['LapTime'].total_seconds()
        pace_diff = (driver_seconds - fastest_seconds) / fastest_seconds * 100
        return round(pace_diff, 3)
    except Exception as e:
        print(f"Error getting pace for {driver_code} in {event_name}: {e}")
        return 0.0

def get_quali_position(quali_results, driver_code):
    """Return a driver's qualifying position as an int.

    Args:
        quali_results: Table with 'Abbreviation' and 'Position' columns.
        driver_code: Value looked up in the 'Abbreviation' column.

    Returns:
        The position from the driver's first matching row. Falls back to 20
        when the driver is not in the table, when the position is missing
        (NaN), or when the lookup raises (the error is printed, not re-raised).
    """
    default_position = 20 #default if no data
    try:
        driver_quali = quali_results[quali_results['Abbreviation'] == driver_code]
        if driver_quali.empty:
            # previously this path fell off the end and returned None
            return default_position
        pos = driver_quali.iloc[0]['Position']
        return default_position if pd.isna(pos) else int(pos)
    except Exception as e:
        # report the problem like the other feature helpers do instead of
        # hiding it, but keep the best-effort default
        print(f"Error getting qualifying position for {driver_code}: {e}")
        return default_position

print("Feature extraction functions defined.") # reached only if every definition above executed without raising

# Smoke test of get_track_type with a known event name
print(get_track_type("Australian Grand Prix")) # expected output: 'semi-permanent'

Feature extraction functions defined.
semi-permanent


In [8]:
def collect_race_data(year, event_name, round_no):
    """Build one feature record per driver for a single race.

    Loads the race ('R') and qualifying ('Q') sessions for the event, derives
    the race-level features once, then walks the race results row by row.
    Rows whose 'Abbreviation' is missing are skipped.

    Args:
        year: Season passed to fastf1.get_session.
        event_name: Event identifier passed to fastf1.get_session; also stored
            in each record under 'race'.
        round_no: Stored in each record under 'round'.

    Returns:
        A list of dicts, one per driver, holding the features and the target
        'finishing_position' (21 when the result has no position). An empty
        list is returned if anything raises; the error is printed.
    """
    try:
        race_session = fastf1.get_session(year, event_name, 'R')
        race_session.load(laps=True, weather=True)
        quali_session = fastf1.get_session(year, event_name, 'Q')
        quali_session.load()

        # race-level features, shared by every driver of this event
        circuit_kind = get_track_type(event_name)        # feature 1: track type
        conditions = get_weather_condition(race_session) # feature 2: weather

        # features 5-7 (driver history at the track, driver standing and
        # constructor standing before the race) are not implemented yet;
        # they all carry this placeholder value
        placeholder = 10.0

        records = []
        for _, result_row in race_session.results.iterrows():
            code = result_row['Abbreviation']
            if pd.isna(code):
                continue

            grid_position = get_quali_position(quali_session.results, code)  # feature 3
            fp3_pace = get_car_pace_indicator(year, event_name, code)        # feature 4

            # target: finishing position, 21 when no position is recorded (DNF)
            classified = result_row['Position']
            final_position = 21 if pd.isna(classified) else classified

            records.append({
                'year': year,
                'round': round_no,
                'race': event_name,
                'driver': code,
                'team': result_row['TeamName'],
                'qualifying_position': grid_position,
                'track_type': circuit_kind,
                'weather': conditions,
                'car_pace_indicator': fp3_pace,
                'driver_historical_track_performance': placeholder,
                'driver_championship_standing': placeholder,
                'constructor_championship_standing': placeholder,
                'finishing_position': int(final_position)
            })
        return records
    except Exception as err:
        print(f"Error processing {event_name} {year}: {err}")
        return []

In [9]:
#smoke test: collect the features of a single race (Bahrain 2024, round 1)
test_data = collect_race_data(2024, 'Bahrain', 1)

if not test_data:
    #an empty list means no driver rows were collected
    print("Failed to collect test data.")
else:
    test_df = pd.DataFrame(test_data)
    print(f"Collected {len(test_df)} data points.")
    display(test_df)

    #feature summary: track type and weather are computed once per race,
    #so the first row is representative of all of them
    print(f"\nFeature summary:")
    print(f"Track type: {test_df['track_type'].iloc[0]}")
    print(f"Weather: {test_df['weather'].iloc[0]}")
    print(f"Pace indicator stats: \nmin: {test_df['car_pace_indicator'].min()} \nmax: {test_df['car_pace_indicator'].max()} \nmean: {test_df['car_pace_indicator'].mean()}")

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req      

Collected 20 data points.


Unnamed: 0,year,round,race,driver,team,qualifying_position,track_type,weather,car_pace_indicator,driver_historical_track_performance,driver_championship_standing,constructor_championship_standing,finishing_position
0,2024,1,Bahrain,VER,Red Bull Racing,1,permanent,dry,0.262,10.0,10.0,10.0,1
1,2024,1,Bahrain,PER,Red Bull Racing,5,permanent,dry,0.467,10.0,10.0,10.0,2
2,2024,1,Bahrain,SAI,Ferrari,4,permanent,dry,0.0,10.0,10.0,10.0,3
3,2024,1,Bahrain,LEC,Ferrari,2,permanent,dry,0.297,10.0,10.0,10.0,4
4,2024,1,Bahrain,RUS,Mercedes,3,permanent,dry,0.403,10.0,10.0,10.0,5
5,2024,1,Bahrain,NOR,McLaren,7,permanent,dry,0.324,10.0,10.0,10.0,6
6,2024,1,Bahrain,HAM,Mercedes,9,permanent,dry,0.691,10.0,10.0,10.0,7
7,2024,1,Bahrain,PIA,McLaren,8,permanent,dry,0.425,10.0,10.0,10.0,8
8,2024,1,Bahrain,ALO,Aston Martin,6,permanent,dry,0.155,10.0,10.0,10.0,9
9,2024,1,Bahrain,STR,Aston Martin,12,permanent,dry,0.63,10.0,10.0,10.0,10



Feature summary:
Track type: permanent
Weather: dry
Pace indicator stats: 
min: 0.0 
max: 1.715 
mean: 0.7597
