In [2]:
import fastf1
import pandas as pd
import time

In [14]:
# Get historical data. Doesn't include 2025.
# One trimmed schedule per season is collected in a list and concatenated once,
# instead of growing a DataFrame inside the loop from an empty seed (which copies
# all previous rows on every iteration and can emit a pandas FutureWarning about
# concatenating empty entries).
yearly_schedules = []
for year in range(2022, 2025):
    meeting = fastf1.get_event_schedule(year)
    # Keep only the columns used downstream and tag every row with its season.
    meeting = meeting.loc[:, ["RoundNumber", "Country", "Location", "EventFormat"]].assign(Year=year)
    yearly_schedules.append(meeting)

# Row labels from each season's schedule are kept as-is (no ignore_index),
# so the same label can appear once per season.
meetings_df = pd.concat(yearly_schedules, axis=0)

# Drop every row whose EventFormat is "testing".
condition = meetings_df["EventFormat"] == "testing"
meetings_df = meetings_df[~condition]


In [3]:
# Show the 2022-2024 schedule after rows with EventFormat == "testing" were removed.
meetings_df

Unnamed: 0,RoundNumber,Country,Location,EventFormat,Year
2,1,Bahrain,Sakhir,conventional,2022
3,2,Saudi Arabia,Jeddah,conventional,2022
4,3,Australia,Melbourne,conventional,2022
5,4,Italy,Imola,sprint,2022
6,5,United States,Miami,conventional,2022
...,...,...,...,...,...
20,20,Mexico,Mexico City,conventional,2024
21,21,Brazil,São Paulo,sprint_qualifying,2024
22,22,United States,Las Vegas,conventional,2024
23,23,Qatar,Lusail,sprint_qualifying,2024


In [11]:
# Get current year (2025) data. Test data
year = 2025

# A single schedule is fetched, so there is nothing to concatenate: the previous
# concat with an empty DataFrame only added a copy (and can emit a pandas
# FutureWarning about concatenating empty entries).
meeting = fastf1.get_event_schedule(year)
# Keep only the columns used downstream and tag every row with its season.
meeting = meeting.loc[:, ["RoundNumber", "Country", "Location", "EventFormat"]].assign(Year=year)

# Drop every row whose EventFormat is "testing".
condition = meeting["EventFormat"] == "testing"
meetings_2025_df = meeting[~condition]

In [12]:
# Show the 2025 schedule after rows with EventFormat == "testing" were removed.
meetings_2025_df

Unnamed: 0,RoundNumber,Country,Location,EventFormat,Year
1,1,Australia,Melbourne,conventional,2025
2,2,China,Shanghai,sprint_qualifying,2025
3,3,Japan,Suzuka,conventional,2025
4,4,Bahrain,Sakhir,conventional,2025
5,5,Saudi Arabia,Jeddah,conventional,2025
6,6,United States,Miami,sprint_qualifying,2025
7,7,Italy,Imola,conventional,2025
8,8,Monaco,Monaco,conventional,2025
9,9,Spain,Barcelona,conventional,2025
10,10,Canada,Montréal,conventional,2025


In [3]:
def get_event_info(year, round_num, event_format):
    """Build a per-driver table combining qualifying, race and sprint results.

    Loads the 'Q' and 'R' sessions (and the 'S' session for any non-conventional
    format) through ``fastf1.get_session`` and outer-joins their data on
    ``DriverNumber``.

    Parameters
    ----------
    year, round_num
        Forwarded unchanged to ``fastf1.get_session``.
    event_format : str
        One of ``"conventional"``, ``"sprint"``, ``"sprint_shootout"`` or
        ``"sprint_qualifying"``.

    Returns
    -------
    pandas.DataFrame
        Columns: DriverNumber, Qual_Position, BroadcastName, TeamName,
        Race_Position, Sector1Time, Sector2Time, Sector3Time (per-driver mean,
        in seconds), SpeedST (per-driver mean), Stint (per-driver max),
        Sprint_Qual_Position, Sprint_Race_Position, Event_Type.

    Raises
    ------
    ValueError
        If ``event_format`` is not one of the supported values. The check runs
        before any session is loaded, so no network/cache work is wasted.
    """
    sprint_formats = ("sprint", "sprint_shootout", "sprint_qualifying")
    if event_format != "conventional" and event_format not in sprint_formats:
        raise ValueError(f"Unknown event format: {event_format}")

    qual = fastf1.get_session(year, round_num, 'Q')
    qual.load()
    race = fastf1.get_session(year, round_num, 'R')
    race.load()

    # The sprint session is only loaded for sprint-style weekends.
    sprint = None
    if event_format in sprint_formats:
        sprint = fastf1.get_session(year, round_num, 'S')
        sprint.load()

    # Qualifying: finishing position per driver (from session.results).
    qual_results_df = qual.results.loc[:, ["DriverNumber", "Position"]]
    qual_results_df = qual_results_df.rename(columns={"Position": "Qual_Position"})

    # Race: identity and finishing position per driver (from session.results).
    race_results_df = race.results.loc[:, ["DriverNumber", "BroadcastName", "TeamName", "Position"]]

    # Race: per-lap data (from session.laps). Explicit copy so the column
    # rewrites below operate on an independent frame.
    race_laps_df = race.laps.loc[
        :, ["DriverNumber", "Sector1Time", "Sector2Time", "Sector3Time", "SpeedST", "Stint"]
    ].copy()
    # Dividing a timedelta column by one second yields float seconds, which can be averaged.
    for sector in ["Sector1Time", "Sector2Time", "Sector3Time"]:
        race_laps_df[sector] = race_laps_df[sector] / pd.Timedelta(seconds=1)

    # Collapse laps to one row per driver.
    final_race_laps_df = race_laps_df.groupby('DriverNumber').agg({
        'Sector1Time': 'mean',
        'Sector2Time': 'mean',
        'Sector3Time': 'mean',
        'SpeedST': 'mean',
        'Stint': 'max'
    }).reset_index()

    # Outer joins keep drivers that appear in only one of the inputs.
    merged_race_df = pd.merge(race_results_df, final_race_laps_df, on="DriverNumber", how="outer")
    merged_qual_race_df = pd.merge(qual_results_df, merged_race_df, on="DriverNumber", how="outer")
    merged_qual_race_df = merged_qual_race_df.rename(columns={"Position": "Race_Position"})

    if event_format == "conventional":
        # No sprint on this weekend: keep the columns so every event has the same schema.
        merged_qual_race_df["Sprint_Qual_Position"] = None
        merged_qual_race_df["Sprint_Race_Position"] = None
        merged_qual_race_df["Event_Type"] = "Race"

    elif event_format == "sprint":
        sprint_results_df = sprint.results.loc[:, ["DriverNumber", "Position"]]
        sprint_results_df = sprint_results_df.rename(columns={"Position": "Sprint_Race_Position"})

        # Sprint_Qual_Position is a copy of Qual_Position for this format
        # (presumably because the 'Q' session sets the sprint grid here -- TODO confirm).
        merged_qual_race_df["Sprint_Qual_Position"] = merged_qual_race_df["Qual_Position"]
        merged_qual_race_df = pd.merge(merged_qual_race_df, sprint_results_df, on="DriverNumber", how="outer")

        merged_qual_race_df["Event_Type"] = "Sprint"

    else:
        # "sprint_shootout" or "sprint_qualifying" (validated at the top):
        # the sprint grid position is used as the sprint qualifying position.
        sprint_results_df = sprint.results.loc[:, ["DriverNumber", "GridPosition", "Position"]]
        sprint_results_df = sprint_results_df.rename(
            columns={"GridPosition": "Sprint_Qual_Position", "Position": "Sprint_Race_Position"}
        )

        merged_qual_race_df = pd.merge(merged_qual_race_df, sprint_results_df, on="DriverNumber", how="outer")

        merged_qual_race_df["Event_Type"] = "Sprint"

    return merged_qual_race_df

In [None]:
# Get data for all meetings in a schedule (defaults to meetings_df)
def get_data(meetings=None):
    """Collect per-driver event data for every meeting in a schedule.

    Parameters
    ----------
    meetings : pandas.DataFrame, optional
        Schedule with the columns RoundNumber, Country, Location, EventFormat
        and Year. When omitted, the notebook-level ``meetings_df`` is used, so
        existing ``get_data()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The frames returned by ``get_event_info`` for each meeting, tagged with
        Round_Number, Country, Location and Year, reordered to a fixed column
        layout and stacked row-wise. If the schedule has no rows, an empty
        frame with that column layout is returned.
    """
    if meetings is None:
        meetings = meetings_df

    # Fixed output layout shared by every meeting.
    new_col_order = ["DriverNumber","BroadcastName","TeamName","Round_Number","Country","Location","Year","Event_Type",
                     "Sprint_Qual_Position","Sprint_Race_Position","Qual_Position","Sector1Time","Sector2Time","Sector3Time",
                     "SpeedST","Stint","Race_Position"]

    # Frames are collected and concatenated once at the end; concatenating inside
    # the loop re-copies all previously gathered rows on every iteration.
    per_meeting_frames = []
    for index, row in meetings.iterrows():
        round_num = row["RoundNumber"]
        country = row["Country"]
        location = row["Location"]
        EventFormat = row["EventFormat"]
        year = row["Year"]

        # Progress line so long runs show which meeting is being processed.
        print(f"{index}: {round_num}, {country}, {location}, {EventFormat}, {year}")

        meeting_data_df = get_event_info(year, round_num, EventFormat)

        meeting_data_df["Round_Number"] = round_num
        meeting_data_df["Country"] = country
        meeting_data_df["Location"] = location
        meeting_data_df["Year"] = year

        per_meeting_frames.append(meeting_data_df[new_col_order])

    if not per_meeting_frames:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame instead.
        return pd.DataFrame(columns=new_col_order)

    return pd.concat(per_meeting_frames, axis=0)

In [10]:
#d = get_data()
#d.to_csv("test_data_2025.csv")

In [15]:
# Pre-load meeting data to cache. Returns all races that didn't load properly
def preload_cache(df, start_round=5, delay_seconds=5):
    """Load the sessions of each meeting once and report meetings with empty results.

    Parameters
    ----------
    df : pandas.DataFrame
        Schedule with the columns RoundNumber, Country, Location, EventFormat
        and Year.
    start_round : int, default 5
        Meetings whose RoundNumber is below this value are skipped.
    delay_seconds : float, default 5
        Pause before each meeting that is actually loaded (presumably to avoid
        hammering the data source -- TODO confirm). Skipped meetings no longer
        incur the pause.

    Returns
    -------
    list of str
        One ``"{round}-{location}-{event format}-{year}"`` entry per meeting
        whose qualifying, race or (for non-conventional formats) sprint results
        came back empty.
    """
    bad = []
    # `count` is the position of the row in `df`, including skipped rows.
    for count, (_, row) in enumerate(df.iterrows()):
        round_num = row["RoundNumber"]
        country = row["Country"]
        location = row["Location"]
        EventFormat = row["EventFormat"]
        year = row["Year"]

        if round_num < start_round:
            continue

        if delay_seconds > 0:
            time.sleep(delay_seconds)

        print(f"{count}: {round_num}, {country}, {location}, {EventFormat}, {year}")

        qual = fastf1.get_session(year, round_num, 'Q')
        qual.load()
        race = fastf1.get_session(year, round_num, 'R')
        race.load()

        # Reset per meeting so a sprint session from an earlier iteration is never reused.
        sprint = None
        if EventFormat != "conventional":
            sprint = fastf1.get_session(year, round_num, 'S')
            sprint.load()

        sprint_missing = sprint is not None and len(sprint.results) == 0
        if len(race.results) == 0 or len(qual.results) == 0 or sprint_missing:
            # Report the event format; the previous code formatted the local `sprint`,
            # which is unbound for conventional events or a stale Session object.
            bad.append(f"{round_num}-{location}-{EventFormat}-{year}")

    return bad

# Rows of the schedule whose Year is 2024.
just2024 = meetings_df[meetings_df["Year"].eq(2024)]
#preload_cache(just2024)

In [25]:
# Combine training data sets
trn_data_2022 = pd.read_csv("train_data_2022.csv")
trn_data_2023 = pd.read_csv("train_data_2023.csv")
trn_data_2024 = pd.read_csv("train_data_2024.csv")

# Stack the three seasons in one concat with a fresh 0..n-1 index. Seeding with an
# empty DataFrame and concatenating three times re-copies the data each time and
# can emit a pandas FutureWarning about concatenating empty entries.
trn_data = pd.concat([trn_data_2022, trn_data_2023, trn_data_2024], axis=0, ignore_index=True)

# 'Unnamed: 0' is the index column that was written into the per-season CSVs.
trn_data = trn_data.drop(columns=['Unnamed: 0'])

In [31]:
# Write the combined training set; index=False keeps the row index out of the file.
trn_data.to_csv("train_data.csv",index=False)

In [None]:
# Clean bad index on test dataset
# Read the raw 2025 export and discard the 'Unnamed: 0' column in one step.
test_data = pd.read_csv("test_data_2025.csv").drop(columns=["Unnamed: 0"])
# Save without the row index so the stray column is not written again.
test_data.to_csv("test_data.csv", index=False)