In [1]:
import fastf1
import pandas as pd
import time

### Testing

In [None]:
# Scratch cell: load only the results tables for one qualifying/race pair.
year, round_num = 2025, 6

# Telemetry, lap and weather data are skipped; only session results are needed here.
qual = fastf1.get_session(year, round_num, 'Q')
qual.load(laps=False, telemetry=False, weather=False)

race = fastf1.get_session(year, round_num, 'R')
race.load(laps=False, telemetry=False, weather=False)

core           INFO 	Loading data for Australian Grand Prix - Qualifying [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for race_control_messages. Loading data...
_api           INFO 	Fetching race control messages...
req            INFO 	Data has been written to cache!
core           INFO 	Finished loading data for 20 drivers: ['16', '1', '11', '4', '44', '63', '3', '31', '55', '14', '10', '77', '22', '24', '47', '23', '20', '5', '6', '18']
core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info da

In [14]:
# for feature: race_time per race
race_time = race.results.loc[:,["BroadcastName","Time"]]
race_time["Standardized_Time"] = race_time["Time"] / pd.Timedelta(seconds=1) # standardizing the time into seconds
max_val = race_time["Standardized_Time"].max()
race_time.loc[race_time["Standardized_Time"] == max_val, "Standardized_Time"] = 0 # set fastest time to 0
columns_to_fill = ["Standardized_Time", "Time"]
race_time[columns_to_fill] = race_time[columns_to_fill].fillna(-1)
race_time

Unnamed: 0,BroadcastName,Time,Standardized_Time
16,C LECLERC,0 days 01:27:46.548000,0.0
11,S PEREZ,0 days 00:00:20.524000,20.524
63,G RUSSELL,0 days 00:00:25.593000,25.593
44,L HAMILTON,0 days 00:00:28.543000,28.543
4,L NORRIS,0 days 00:00:53.303000,53.303
3,D RICCIARDO,0 days 00:00:53.737000,53.737
31,E OCON,0 days 00:01:01.683000,61.683
77,V BOTTAS,0 days 00:01:08.439000,68.439
10,P GASLY,0 days 00:01:16.221000,76.221
23,A ALBON,0 days 00:01:19.382000,79.382


In [15]:
# for feature: qual_q3_time per qual
qual_q3_results = qual.results.loc[:,["BroadcastName","Position","Q1","Q2","Q3"]]
qual_q3_results["qual_q3_time"] = qual_q3_results["Q3"] / pd.Timedelta(seconds=1) # standardizing the time into seconds
# leave to feature engineering step to standardize per race
columns_to_fill = ["qual_q3_time"]
qual_q3_results[columns_to_fill] = qual_q3_results[columns_to_fill].fillna(-1)
qual_q3_results

Unnamed: 0,BroadcastName,Position,Q1,Q2,Q3,qual_q3_time
16,C LECLERC,1.0,0 days 00:01:18.881000,0 days 00:01:18.606000,0 days 00:01:17.868000,77.868
1,M VERSTAPPEN,2.0,0 days 00:01:18.580000,0 days 00:01:18.611000,0 days 00:01:18.154000,78.154
11,S PEREZ,3.0,0 days 00:01:18.834000,0 days 00:01:18.340000,0 days 00:01:18.240000,78.24
4,L NORRIS,4.0,0 days 00:01:19.280000,0 days 00:01:19.066000,0 days 00:01:18.703000,78.703
44,L HAMILTON,5.0,0 days 00:01:19.401000,0 days 00:01:19.106000,0 days 00:01:18.825000,78.825
63,G RUSSELL,6.0,0 days 00:01:19.405000,0 days 00:01:19.076000,0 days 00:01:18.933000,78.933
3,D RICCIARDO,7.0,0 days 00:01:19.665000,0 days 00:01:19.130000,0 days 00:01:19.032000,79.032
31,E OCON,8.0,0 days 00:01:19.605000,0 days 00:01:19.136000,0 days 00:01:19.061000,79.061
55,C SAINZ,9.0,0 days 00:01:18.983000,0 days 00:01:18.469000,0 days 00:01:19.408000,79.408
14,F ALONSO,10.0,0 days 00:01:19.192000,0 days 00:01:18.815000,NaT,-1.0


### Code to build training and test dataset

In [2]:
# Get historical data. With the defaults this covers 2022-2024 and doesn't include 2025.
def get_train_meetings_df(start_year=2022, end_year=2024):
    """Return the combined race-weekend schedule for several seasons.

    Parameters
    ----------
    start_year, end_year : int
        First and last season to include (both inclusive). The defaults
        reproduce the original hard-coded 2022-2024 training range.

    Returns
    -------
    pandas.DataFrame
        The per-season frames from ``get_meetings_df`` stacked row-wise. Each
        season keeps its own index, so index labels may repeat across seasons.
        An empty frame with the expected columns is returned when the range
        contains no season.
    """
    # Collect the per-season frames and concatenate once instead of growing a
    # frame from an empty seed inside the loop.
    yearly_frames = [get_meetings_df(season) for season in range(start_year, end_year + 1)]
    if not yearly_frames:
        return pd.DataFrame(columns=["RoundNumber","Country","Location","EventFormat","Year"])
    return pd.concat(yearly_frames,axis=0)

# Get current year (2025) data. Test data
def get_test_meetings_df():
    """Return the 2025 race-weekend schedule (see ``get_meetings_df``)."""
    return get_meetings_df(2025)

def get_meetings_df(year):
    """Return the race-weekend schedule for a single season.

    The schedule comes from ``fastf1.get_event_schedule(year)``. Only the
    ``RoundNumber``, ``Country``, ``Location`` and ``EventFormat`` columns are
    kept, a constant ``Year`` column is added, and rows whose ``EventFormat``
    is ``"testing"`` are dropped. The schedule's own index is preserved.
    """
    meetings_df = fastf1.get_event_schedule(year)
    meetings_df = meetings_df.loc[:,["RoundNumber","Country","Location","EventFormat"]]
    meetings_df['Year'] = year

    # Testing events are not race weekends; exclude them.
    is_testing = meetings_df["EventFormat"] == "testing"
    return meetings_df[~is_testing]



In [51]:
def get_event_info(year,round_num,event_format,is_test_data_for_pred):
    """Build one row per driver for a single race weekend.

    Parameters
    ----------
    year, round_num
        Passed to ``fastf1.get_session`` to select the weekend.
    event_format : str
        One of ``"conventional"``, ``"sprint"``, ``"sprint_shootout"`` or
        ``"sprint_qualifying"``.
    is_test_data_for_pred : bool
        If true, the race session is not loaded and every race-derived column
        (sector times, SpeedST, Stint, Race_Position, Standardized_Time) is
        filled with ``None``, so the rows can be used as prediction input.

    Returns
    -------
    pandas.DataFrame
        Qualifying columns (``DriverNumber``, ``BroadcastName``, ``TeamName``,
        ``Qual_Position``, ``Qual_Q3_Time``), the race columns listed above,
        ``Sprint_Qual_Position``, ``Sprint_Race_Position`` and ``Event_Type``
        (``"Race"`` for conventional weekends, otherwise ``"Sprint"``).

    Raises
    ------
    Exception
        If ``event_format`` is not one of the known formats. This is checked
        before any session is requested, so no data is downloaded for an
        invalid call.
    """
    known_formats = ("conventional", "sprint", "sprint_shootout", "sprint_qualifying")
    if event_format not in known_formats:
        raise Exception(f"Unknown event format: {event_format}")

    one_second = pd.Timedelta(seconds=1)

    # Sprint session (only exists on non-conventional weekends)
    sprint = None
    if event_format != "conventional":
        sprint = fastf1.get_session(year,round_num,'S')
        sprint.load(telemetry=False, laps=False, weather=False,messages=False)

    # For Qualifying
    qual = fastf1.get_session(year,round_num,'Q')
    qual.load(telemetry=False, laps=False, weather=False,messages=False)

    # data from session.results
    qual_results_df = qual.results.loc[:,["DriverNumber","BroadcastName","TeamName","Position","Q3"]]
    qual_results_df = qual_results_df.rename(columns={"Position":"Qual_Position"})
    # Q3 time in seconds; drivers without a Q3 time get the sentinel -1.
    # Standardizing per race is left to the feature engineering step.
    qual_results_df["Qual_Q3_Time"] = (qual_results_df["Q3"] / one_second).fillna(-1)
    qual_results_df = qual_results_df.drop(columns="Q3")

    # For Race
    if is_test_data_for_pred:
        # No race data is available yet: keep the schema but leave it empty.
        for race_column in ("Sector1Time","Sector2Time","Sector3Time","SpeedST",
                            "Stint","Race_Position","Standardized_Time"):
            qual_results_df[race_column] = None
        merged_qual_race_df = qual_results_df

    else:
        race = fastf1.get_session(year,round_num,'R')
        race.load(telemetry=False, laps=True, weather=False,messages=False)

        # data from session.results
        race_results_df = race.results.loc[:,["DriverNumber","Position","Time"]]
        race_results_df["Standardized_Time"] = race_results_df["Time"] / one_second
        # The row(s) holding the maximum time are re-based to 0. In the sample
        # race output earlier in this notebook the winner carries the full race
        # duration and everyone else a gap, so the maximum is the winner.
        # NOTE(review): confirm that holds for every session.
        max_val = race_results_df["Standardized_Time"].max()
        race_results_df.loc[race_results_df["Standardized_Time"] == max_val, "Standardized_Time"] = 0
        # Only the numeric column needs the -1 sentinel; the timedelta "Time"
        # column is dropped right away, so it is no longer filled with an int.
        race_results_df["Standardized_Time"] = race_results_df["Standardized_Time"].fillna(-1)
        race_results_df = race_results_df.drop(columns="Time")

        # data from session.laps: per-driver mean sector times / speed trap, max stint
        sector_columns = ["Sector1Time","Sector2Time","Sector3Time"]
        race_laps_df = race.laps.loc[:,["DriverNumber",*sector_columns,"SpeedST","Stint"]]
        for sector in sector_columns:
            race_laps_df[sector] = race_laps_df[sector] / one_second
        final_race_laps_df = race_laps_df.groupby('DriverNumber').agg({
            'Sector1Time': 'mean',
            'Sector2Time': 'mean',
            'Sector3Time': 'mean',
            'SpeedST': 'mean',
            'Stint': 'max'
        }).reset_index()

        # outer join dfs --> complete data for a single race session
        merged_race_df = pd.merge(race_results_df,final_race_laps_df,on="DriverNumber",how="outer")

        # Merge qual and race data
        merged_qual_race_df = pd.merge(qual_results_df,merged_race_df,on="DriverNumber",how="outer")
        merged_qual_race_df = merged_qual_race_df.rename(columns={"Position":"Race_Position"})

    if event_format == "conventional":
        merged_qual_race_df["Sprint_Qual_Position"] = None
        merged_qual_race_df["Sprint_Race_Position"] = None
        merged_qual_race_df["Event_Type"] = "Race"
        return merged_qual_race_df

    if event_format == "sprint":
        # The qualifying position is reused as the sprint starting position.
        sprint_results_df = sprint.results.loc[:,["DriverNumber","Position"]]
        sprint_results_df = sprint_results_df.rename(columns={"Position":"Sprint_Race_Position"})
        merged_qual_race_df["Sprint_Qual_Position"] = merged_qual_race_df["Qual_Position"]
    else:
        # "sprint_shootout" / "sprint_qualifying": the sprint grid comes from the sprint results.
        sprint_results_df = sprint.results.loc[:,["DriverNumber","GridPosition","Position"]]
        sprint_results_df = sprint_results_df.rename(columns={"GridPosition":"Sprint_Qual_Position","Position":"Sprint_Race_Position"})

    # Merge Qual + Sprint + Race data
    merged_qual_race_df = pd.merge(merged_qual_race_df,sprint_results_df,on="DriverNumber",how="outer")
    merged_qual_race_df["Event_Type"] = "Sprint"

    return merged_qual_race_df

In [52]:
# Get data for all meetings in meetings_df
def get_data(meetings_df,is_test_data_for_pred,sleep_seconds=61):
    """Collect the per-driver rows of every meeting in ``meetings_df``.

    Parameters
    ----------
    meetings_df : pandas.DataFrame
        One row per meeting with the columns ``RoundNumber``, ``Country``,
        ``Location``, ``EventFormat`` and ``Year``.
    is_test_data_for_pred : bool
        Forwarded to ``get_event_info``.
    sleep_seconds : float, default 61
        Pause before each meeting is fetched (API rate limiting). The default
        keeps the original behaviour; a smaller value can be passed when the
        sessions are already cached.

    Returns
    -------
    pandas.DataFrame
        All meetings stacked row-wise in a fixed column order. Each meeting
        keeps its own index, so index labels may repeat. An empty frame is
        returned when ``meetings_df`` has no rows.
    """
    new_col_order = ["DriverNumber","BroadcastName","TeamName","Round_Number","Country","Location","Year","Event_Type",
                    "Sprint_Qual_Position","Sprint_Race_Position","Qual_Q3_Time","Qual_Position","Sector1Time","Sector2Time",
                    "Sector3Time","SpeedST","Stint","Standardized_Time","Race_Position"]

    # Collect the frames and concatenate once at the end instead of growing a
    # frame from an empty seed on every iteration.
    meeting_frames = []
    for index,row in meetings_df.iterrows():
        round_num = row["RoundNumber"]
        country = row["Country"]
        location = row["Location"]
        EventFormat = row["EventFormat"]
        year = row["Year"]

        print(f"{index}: {round_num}, {country}, {location}, {EventFormat}, {year}")

        time.sleep(sleep_seconds) # for api rate limiting

        meeting_data_df = get_event_info(year,round_num,EventFormat,is_test_data_for_pred)

        meeting_data_df["Round_Number"] = round_num
        meeting_data_df["Country"] = country
        meeting_data_df["Location"] = location
        meeting_data_df["Year"] = year

        # reorganize columns
        meeting_frames.append(meeting_data_df[new_col_order])

    if not meeting_frames:
        return pd.DataFrame()
    return pd.concat(meeting_frames,axis=0)

In [None]:
# Pre-load meeting data to cache. Returns all races that didn't load properly
def preload_cache(df, after_round=11):
    """Load the sessions of each meeting so FastF1 caches them.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per meeting with the columns ``RoundNumber``, ``Country``,
        ``Location``, ``EventFormat`` and ``Year``.
    after_round : int, default 11
        Only meetings with ``RoundNumber > after_round`` are loaded. The
        default keeps the previously hard-coded filter; pass 0 to preload
        every round.

    Returns
    -------
    list of str
        One ``"<round>-<location>-<event format>-<year>"`` label for every
        loaded meeting whose qualifying, race or (on non-conventional
        weekends) sprint results came back empty.
    """
    bad = []
    for count, (_, row) in enumerate(df.iterrows()):
        round_num = row["RoundNumber"]
        country = row["Country"]
        location = row["Location"]
        EventFormat = row["EventFormat"]
        year = row["Year"]

        print(f"{count}: {round_num}, {country}, {location}, {EventFormat}, {year}")

        if round_num <= after_round:
            continue

        # Pause only before meetings that are actually fetched.
        time.sleep(5)

        qual = fastf1.get_session(year,round_num,'Q')
        qual.load(telemetry=False, laps=False, weather=False)
        race = fastf1.get_session(year,round_num,'R')
        race.load(telemetry=False, laps=True, weather=False)

        # Reset on every iteration so a sprint session from an earlier
        # meeting is never checked for a conventional weekend.
        sprint = None
        if EventFormat != "conventional":
            sprint = fastf1.get_session(year,round_num,'S')
            sprint.load(telemetry=False, laps=False, weather=False)

        sprint_missing = sprint is not None and len(sprint.results) == 0
        if len(race.results) == 0 or len(qual.results) == 0 or sprint_missing:
            # Label with the event format. The session object used here before
            # was undefined for conventional weekends and raised a NameError.
            bad.append(f"{round_num}-{location}-{EventFormat}-{year}")

    return bad

# Run this to preload the cache so that each year's data can be retrieved quickly later.
#meetings_df = get_train_meetings_df()
#df = meetings_df.loc[meetings_df["Year"]==2022,:]
#preload_cache(df)

In [None]:
# Season (year) to build the dataset for.
year = 2025
#meetings_df = get_meetings_df(year)
#meetings_df

In [22]:
# build  data
rn = 6 # the latest round that you want to include
# Fetch the schedule here instead of relying on a `meetings_df` left over from
# another cell: on a fresh kernel that name does not exist yet at this point.
meetings_df = get_meetings_df(year)
meetings_df = meetings_df.loc[meetings_df["RoundNumber"] <= rn ,:]
#test_data = get_data(meetings_df,False)
#train_data = get_data(meetings_df,False)
#train_data.to_csv("train_data.csv",index=False)
#train_data.to_csv(f"train_data_{year}.csv",index=False)  # if you want a specific year
#test_data.to_csv(f"test_data_new.csv",index=False)

In [27]:
# Ask FastF1 where its cache lives instead of hard-coding a user-specific
# absolute path. get_cache_info() returns (cache directory, size in bytes).
cache_dir, cache_size_bytes = fastf1.Cache.get_cache_info()

# Clear the cache
#fastf1.Cache.clear_cache(cache_dir,True)


(cache_dir, cache_size_bytes)

('C:\\Users\\rohan\\AppData\\Local\\Temp\\fastf1', 2513382030)

In [24]:
# Combine training data sets
trn_data_2022 = pd.read_csv("train_data_2022_new.csv")
trn_data_2023 = pd.read_csv("train_data_2023_new.csv")
trn_data_2024 = pd.read_csv("train_data_2024_new.csv")

# Concatenate the seasons in one call; no empty seed frame and no repeated
# re-copying of the accumulated rows.
trn_data = pd.concat([trn_data_2022, trn_data_2023, trn_data_2024], axis=0, ignore_index=True)
#trn_data = trn_data.drop(columns=['Unnamed: 0'])

In [None]:
#trn_data.to_csv("./data/train_data_new.csv",index=False)

In [None]:
# Clean bad index on test dataset
#test_data = pd.read_csv("test_data_2025.csv")
#test_data = test_data.drop(columns=['Unnamed: 0'])

In [None]:
#test_data.to_csv("./data/test_data.csv",index=False)

### Generate new test data: either append it to the existing test data, or save only the new race data for predictions

In [59]:
# Change this depending on what you want:
#   True  -> generate new race test data with no race info (for predictions)
#   False -> generate regular test data with race features
is_test_data_for_pred = False

meetings_df = get_meetings_df(year=2025)

In [60]:
# Previously saved test set; the save cell further down appends the new race rows to it.
test_data = pd.read_csv("data/test_data_new.csv") 

In [61]:
# Edit which race(s) you want to add here. The selection is by position in
# meetings_df, not by round number.
race_positions = [6]
new_race_df = meetings_df.iloc[race_positions]
new_race_df = new_race_df.reset_index(drop=True)
new_race_df

Unnamed: 0,RoundNumber,Country,Location,EventFormat,Year
0,7,Italy,Imola,conventional,2025


In [62]:
# Fetch the per-driver rows for the selected race(s).
final_df = get_data(
    meetings_df=new_race_df,
    is_test_data_for_pred=is_test_data_for_pred,
)

0: 7, Italy, Imola, conventional, 2025


core           INFO 	Loading data for Emilia Romagna Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['81', '1', '63', '4', '14', '55', '23', '18', '6', '10', '16', '44', '12', '5', '43', '30', '27', '31', '87', '22']
core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '44', '23', 

In [63]:
# Inspect the new rows ordered by qualifying position.
final_df.sort_values("Qual_Position")
#final_df.sort_values(by="Race_Position")

Unnamed: 0,DriverNumber,BroadcastName,TeamName,Round_Number,Country,Location,Year,Event_Type,Sprint_Qual_Position,Sprint_Race_Position,Qual_Q3_Time,Qual_Position,Sector1Time,Sector2Time,Sector3Time,SpeedST,Stint,Standardized_Time,Race_Position
18,81,O PIASTRI,McLaren,7,Italy,Imola,2025,Race,,,74.67,1.0,27.863532,30.454,29.069381,275.888889,3.0,12.956,3.0
0,1,M VERSTAPPEN,Red Bull Racing,7,Italy,Imola,2025,Race,,,74.704,2.0,28.144258,30.006016,29.046667,271.825397,3.0,0.0,1.0
17,63,G RUSSELL,Mercedes,7,Italy,Imola,2025,Race,,,74.807,3.0,27.975694,30.485063,29.063048,280.809524,3.0,22.034,7.0
11,4,L NORRIS,McLaren,7,Italy,Imola,2025,Race,,,-1.0,4.0,27.811016,30.308175,29.143429,276.936508,3.0,6.109,2.0
3,14,F ALONSO,Aston Martin,7,Italy,Imola,2025,Race,,,75.431,5.0,27.870274,30.482476,29.241079,276.730159,3.0,27.25,11.0
15,55,C SAINZ,Williams,7,Italy,Imola,2025,Race,,,75.432,6.0,27.808,30.450603,29.259238,275.730159,3.0,22.898,8.0
7,23,A ALBON,Williams,7,Italy,Imola,2025,Race,,,75.473,7.0,27.745516,30.45827,29.229714,275.603175,3.0,17.945,5.0
5,18,L STROLL,Aston Martin,7,Italy,Imola,2025,Race,,,75.581,8.0,27.887484,30.492794,29.290508,273.555556,3.0,32.993,15.0
16,6,I HADJAR,Racing Bulls,7,Italy,Imola,2025,Race,,,75.746,9.0,27.735129,30.373016,29.394714,274.857143,3.0,23.586,9.0
1,10,P GASLY,Alpine,7,Italy,Imola,2025,Race,,,75.787,10.0,27.891339,30.52773,29.21573,278.888889,3.0,31.424,13.0


In [58]:
if is_test_data_for_pred:
    # Prediction input only: save the new race on its own.
    final_df.to_csv("data/new_race_data.csv",index=False)
else:
    test_data_with_new_race = pd.concat([test_data,final_df],axis=0,ignore_index=True)

    # This cell overwrites the file that `test_data` was read from, so running
    # it again after reloading that file would append the same race twice.
    # Keep only the newest row per (Year, Round_Number, DriverNumber).
    # The key is compared as text because the two frames may hold
    # DriverNumber with different types.
    # NOTE(review): assumes the CSV holds DriverNumber as whole numbers or text - confirm.
    row_key = test_data_with_new_race[["Year","Round_Number","DriverNumber"]].astype(str)
    is_stale_row = row_key.duplicated(keep="last")
    test_data_with_new_race = test_data_with_new_race.loc[~is_stale_row].reset_index(drop=True)

    test_data_with_new_race.to_csv("data/test_data_new.csv",index=False)

In [46]:
# for sanity check
#df = pd.read_csv("data/test_data_new.csv")
#df.loc[df["Round_Number"]==5,:].sort_values(by="Race_Position")