In [1]:
import fastf1
import pandas as pd
import numpy as np
import os
import fastf1.events as f1events

In [None]:
# Folder for the on-disk FastF1 cache; "~" is expanded to the current user's home directory.
# NOTE(review): the original comment said that on Windows the path "MUST contain
# directory" - confirm what Windows-specific requirement was meant.
cache_dir = os.path.expanduser("~/f1_cache")

# Make sure the folder exists (no error if it is already there) ...
os.makedirs(cache_dir, exist_ok=True)

# ... and tell FastF1 to use it as its cache location.
fastf1.Cache.enable_cache(cache_dir)

- The code below sets up initial access to the FastF1 API and pulls all race data from 2019 to 2025.

In [None]:
# Download lap-level race data (with the nearest weather sample attached to each lap)
# for every non-sprint race weekend in YEARS and export it as a single CSV.
#
# `cache_dir` is defined in the cache-setup cell above; run that cell first, otherwise
# the next line raises NameError. Enabling the cache prevents re-downloading sessions
# that were already fetched.
fastf1.Cache.enable_cache(cache_dir)


YEARS = range(2019, 2026)  # seasons 2019-2025 inclusive
all_laps = []  # list holding one lap-level DataFrame per successfully loaded race

# Columns a lap must have to be kept in the export. Other columns (for example the pit
# in/out times) are empty on most laps, so they must not be part of the null check.
ESSENTIAL_COLS = ['Driver', 'DriverNumber', 'LapNumber', 'LapTime', 'Position']

for year in YEARS:
    schedule = fastf1.get_event_schedule(year, include_testing=False)

    # Exclude sprint weekends for control. The sprint format label differs between
    # seasons ('sprint', 'sprint_shootout', 'sprint_qualifying'), so match on the
    # substring rather than on the exact value 'sprint'.
    is_sprint = schedule['EventFormat'].astype(str).str.contains('sprint', case=False)
    non_sprint_events = schedule[~is_sprint]

    for i, event in non_sprint_events.iterrows():
        round_num = event['RoundNumber']
        event_name = event['EventName']

        try:
            session = fastf1.get_session(year, round_num, 'R')  # races only; qualifying and practices are excluded
            print(f"Loading {year} Round {round_num} - {event_name}")
            session.load(telemetry=True)
            laps = session.laps.copy()
            laps['RaceName'] = event_name
            laps['Year'] = year
            laps['PostRegulation'] = int(year >= 2022)  # 1 for seasons from 2022 onwards, else 0

            # merge weather data: express both timestamps as seconds so they share a numeric key
            weather = session.weather_data.copy()
            laps['LapStartTimeSec'] = laps['LapStartTime'].dt.total_seconds()
            weather['TimeSec'] = weather['Time'].dt.total_seconds()

            # merge_asof rejects null keys and unsorted keys; clean both sides first so one
            # lap without a start time does not cause the whole race to be skipped.
            laps = laps.dropna(subset=['LapStartTimeSec'])
            weather = weather.dropna(subset=['TimeSec']).sort_values('TimeSec')

            # attach the weather sample closest in time to each lap start
            laps = pd.merge_asof(
                laps.sort_values('LapStartTimeSec'),
                weather[['TimeSec', 'AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity']],
                left_on='LapStartTimeSec',
                right_on='TimeSec',
                direction='nearest'
            )

            all_laps.append(laps)
        except Exception as e:
            # best effort: report the failing race and continue with the remaining ones
            print(f"Skipped {event_name} ({year}) due to error: {e}")

# combine all datasets to one & export
if all_laps:
    final_df = pd.concat(all_laps, ignore_index=True)
    # Only drop laps that lack an essential field. A bare dropna() would discard every
    # lap with a NaN in ANY column, which removes almost the entire dataset.
    required_cols = [c for c in ESSENTIAL_COLS if c in final_df.columns]
    final_df = final_df.dropna(subset=required_cols).reset_index(drop=True)
    # File name matches the year range above and the file the aggregation cell reads.
    final_df.to_csv('laps_2019_2025.csv', index=False)
else:
    print("No races were loaded; no CSV was written.")

NameError: name 'cache_dir' is not defined

In [None]:
# Turn the lap-level export into:
#   * race-level data (one row per driver per race) for the random forest, and
#   * optionally, lap-level data for the linear mixed model (see SAVE_LAP_LEVEL).
df = pd.read_csv('laps_2019_2025.csv')

# convert time columns to timedelta and LapTime to seconds
time_cols = ['LapTime', 'Time', 'LapStartTime']
for col in time_cols:
    df[col] = pd.to_timedelta(df[col], errors='coerce')  # unparsable values become NaT
df['LapTime_sec'] = df['LapTime'].dt.total_seconds()

# drop rows with missing essential data
df_cleaned = df.dropna(
    subset=['Position', 'DriverNumber', 'Year', 'RaceName', 'LapNumber', 'Driver', 'LapTime_sec']
).copy()

# convert to ints & rename cols
df_cleaned['Position'] = pd.to_numeric(df_cleaned['Position']).astype(int)
df_cleaned['LapNumber'] = pd.to_numeric(df_cleaned['LapNumber']).astype(int)
df_cleaned['DriverNumber'] = pd.to_numeric(df_cleaned['DriverNumber']).astype(int)
df_cleaned['RegulationEra'] = df_cleaned['PostRegulation'].astype(int)


# overtake flag (needed by the race-level aggregation below, so it must always run)
# --------------------------------------------------

# sort so that each driver's laps are in race order; required for the shift below
df_cleaned = df_cleaned.sort_values(by=['Year', 'RaceName', 'DriverNumber', 'LapNumber'])

# position on the driver's previous lap in the same race (NaN on the driver's first lap)
df_cleaned['PreviousPosition'] = df_cleaned.groupby(
    ['Year', 'RaceName', 'DriverNumber']
)['Position'].shift(1)

# overtake condition: current position < previous position and not during pit-out (tire changes).
# The first lap of each driver compares against NaN and is therefore never an overtake.
overtake_mask = (
    (df_cleaned['Position'] < df_cleaned['PreviousPosition']) &
    (df_cleaned['PitOutTime'].isna())
)
df_cleaned['IsOvertake'] = overtake_mask


# optional lap-level export for the mixed effects model (likely not used)
# --------------------------------------------------
SAVE_LAP_LEVEL = False  # set to True to also write the lap-level CSV

lap_lmem_cols = [
    'Year', 'RaceName', 'Driver', 'DriverNumber', 'Team', 'LapNumber', 'LapTime_sec',
    'Position', 'PreviousPosition', 'IsOvertake', 'Compound', 'TyreLife',
    'AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity', 'TrackStatus', 'RegulationEra',
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST'
]

if SAVE_LAP_LEVEL:
    df_lap_final = df_cleaned[lap_lmem_cols].copy()

    # drop rows missing essential lap-level fields before saving (avoid dropping on optional columns)
    df_lap_final = df_lap_final.dropna(
        subset=['Year', 'RaceName', 'Driver', 'DriverNumber', 'LapNumber', 'LapTime_sec']
    ).reset_index(drop=True)
    lap_output_file = 'lap_data_for_lmem_final.csv'
    df_lap_final.to_csv(lap_output_file, index=False)


# race level aggregation for RF begins below
# ----------------------------------

# overtakes PER race and driver (drivers without any overtake get 0 after the merge below)
overtake_count = df_cleaned[df_cleaned['IsOvertake']].groupby(
    ['Year', 'RaceName', 'DriverNumber']
).size().reset_index(name='Overtakes_Per_Race')

# columns that are averaged over all of a driver's laps in a race
# NOTE(review): TrackStatus looks like a status code rather than a measurement, so its
# mean may not be meaningful - confirm before using Mean_TrackStatus as a feature.
mean_cols = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'AirTemp', 'TrackTemp', 'WindSpeed', 'Humidity', 'TrackStatus']

# define aggregation method
agg_dict = {
    'LapNumber': 'max',  # total laps completed
    'IsPersonalBest': 'any',
    'TyreLife': 'max',
    'RegulationEra': 'first',  # era constant per race
    'LapTime': 'mean',  # mean lap time (duration) per race-driver
    # Final compound: value on the driver's highest-numbered lap. Rows are sorted by
    # LapNumber above, so this is the last row of each group and no longer depends on
    # the row order of the CSV.
    'Compound': lambda s: s.iloc[-1],
    'Team': 'first',
}
for col in mean_cols:
    agg_dict[col] = 'mean'


# aggregate cleaned data
df_aggregated = df_cleaned.groupby(['Year', 'RaceName', 'Driver', 'DriverNumber']).agg(agg_dict).reset_index()

# merge overtakes
df_aggregated = df_aggregated.merge(
    overtake_count,
    on=['Year', 'RaceName', 'DriverNumber'],
    how='left'
)
df_aggregated['Overtakes_Per_Race'] = df_aggregated['Overtakes_Per_Race'].fillna(0).astype(int)

# determine final position & race time from each driver's highest-numbered lap
# NOTE(review): 'Time' is taken as-is from the lap data; confirm whether it is elapsed race
# time or a session timestamp before interpreting FinalRaceTime as a race duration.
idx = df_cleaned.groupby(['Year', 'RaceName', 'DriverNumber'])['LapNumber'].idxmax()
final_laps = df_cleaned.loc[idx, ['Year', 'RaceName', 'DriverNumber', 'Position', 'Time']].rename(
    columns={'Position': 'FinalPosition', 'Time': 'FinalRaceTime'}
)

df_aggregated = df_aggregated.merge(
    final_laps[['Year', 'RaceName', 'DriverNumber', 'FinalPosition', 'FinalRaceTime']],
    on=['Year', 'RaceName', 'DriverNumber'],
    how='left'
)

# give the aggregated columns descriptive names; mean columns get a 'Mean_' prefix
mean_names = [f'Mean_{col}' for col in mean_cols]
rename_map = {
    'LapNumber': 'TotalLapsCompleted',
    'Compound': 'FinalStintCompound',
}
rename_map.update(zip(mean_cols, mean_names))
df_aggregated = df_aggregated.rename(columns=rename_map)
df_aggregated[mean_names] = df_aggregated[mean_names].round(2)

# convert Timedelta objects to total seconds for easier analysis
df_aggregated['LapTime'] = df_aggregated['LapTime'].dt.total_seconds()
df_aggregated['FinalRaceTime'] = df_aggregated['FinalRaceTime'].dt.total_seconds()

# save race level data to csv
race_output_file = 'race_data_for_rf.csv'
df_aggregated.to_csv(race_output_file, index=False)