In [6]:
import pandas as pd
import numpy as np
import os
import re 
import glob

In [7]:
def process_filename_info(df, filename):
    # Extract the base name (in case a full path is passed)
    base = os.path.basename(filename)
    
    # Regex to get race name and year
    match = re.match(r"f1_data_(.+)_([0-9]{4})\.csv", base)
    
    if match:
        race_name = match.group(1).replace("_", " ")  # Turn underscores into spaces
        year = int(match.group(2))
        
        df["RaceName"] = race_name
        df["Year"] = year
    else:
        print("⚠️ Filename doesn't match expected pattern.")
    
    return df

In [8]:
def impute_missing_practice_sessions(df):
    sessions = ["FP1", "FP2", "FP3"]
    penalty_seconds = 0.5

    # Initialize imputation flags explicitly
    for session in sessions:
        df[f"IsImputed_{session}"] = False

    # Loop through each session
    for session in sessions:
        lap_col = f"LapTime_{session}"
        
        # Drivers missing practice lap data for this session
        missing_drivers = df[df[lap_col].isna()]

        for idx, driver_row in missing_drivers.iterrows():
            team, race, year, driver = driver_row["TeamName"], driver_row["RaceName"], driver_row["Year"], driver_row["Driver"]

            # Teammate data
            teammate = df[
                (df["TeamName"] == team) &
                (df["RaceName"] == race) &
                (df["Year"] == year) &
                (df["Driver"] != driver) &
                (~df[lap_col].isna())
            ]

            if teammate.empty:
                print(f"⚠️ No teammate data for {driver} in {session} at {race} ({year}). Consider removing this row manually.")
                continue  # Skip if no teammate data available

            teammate_row = teammate.iloc[0]

            # Get all columns for this practice session
            session_cols = [col for col in df.columns if f"_{session}" in col]

            # Copy teammate's data for entire session
            df.loc[idx, session_cols] = teammate_row[session_cols].values

            # Penalize LapTime explicitly
            df.at[idx, lap_col] += penalty_seconds

            # Mark as imputed
            df.at[idx, f"IsImputed_{session}"] = True

            # Recalculate FastestPracticeTime clearly after imputation
            df.at[idx, "FastestPracticeTime"] = df.loc[idx, ["LapTime_FP1", "LapTime_FP2", "LapTime_FP3"]].min()

    return df



In [9]:
def process_race_data(csv_path, output_path):
    df = pd.read_csv(csv_path)

    process_filename_info(df, csv_path)
    
    # Check if 'TotalRaceTime' column exists before dropping
    if 'TotalRaceTime' in df.columns:
        df = df.dropna(subset=["TotalRaceTime"])
    else:
        print(f"⚠️ Column 'TotalRaceTime' missing in {csv_path}. Skipping file.")
        return  # skip further processing for this file

    # Convert all relevant time columns to seconds
    time_cols = [
        "LapTime_FP1", "LapTime_FP2", "LapTime_FP3",
        "AvgLapTime_LongestStint_FP1", "AvgLapTime_LongestStint_FP2", "AvgLapTime_LongestStint_FP3",
        "Delta_FirstLastLap_FP1", "Delta_FirstLastLap_FP2", "Delta_FirstLastLap_FP3",
        "Q1_FastestLap", "Q2_FastestLap", "Q3_FastestLap"
    ]
    for col in time_cols:
        df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

    # Fastest laps
    df["FastestPracticeTime"] = df[["LapTime_FP1", "LapTime_FP2", "LapTime_FP3"]].min(axis=1)
    df["FastestQualifyingTime"] = df[["Q1_FastestLap", "Q2_FastestLap", "Q3_FastestLap"]].min(axis=1)

    # Map compounds to categorical
    compounds = {"SOFT": 0, "MEDIUM": 1, "HARD": 2, "INTERMEDIATE": 3, "WET": 4}

    # Find compound of fastest FP lap
    conditions = [
        df["LapTime_FP1"] == df["FastestPracticeTime"],
        df["LapTime_FP2"] == df["FastestPracticeTime"],
        df["LapTime_FP3"] == df["FastestPracticeTime"]
    ]
    choices = [df["Compound_FP1"], df["Compound_FP2"], df["Compound_FP3"]]
    df["FastestPracticeCompound"] = np.select(conditions, choices, default=np.nan)
    
    # Map compounds
    compound_cols = ["FastestPracticeCompound", "LongestStintCompound_FP1", "LongestStintCompound_FP2", "LongestStintCompound_FP3",
                    "Compound_FP1", "Compound_FP2", "Compound_FP3",
                     ]
    for col in compound_cols:
        df[col] = df[col].map(compounds)


    # Normalize all speed trap data vs. session slowest
    speed_cols = [
        "SpeedST_FP1", "SpeedFL_FP1", "SpeedI1_FP1", "SpeedI2_FP1",
        "SpeedST_FP2", "SpeedFL_FP2", "SpeedI1_FP2", "SpeedI2_FP2",
        "SpeedST_FP3", "SpeedFL_FP3", "SpeedI1_FP3", "SpeedI2_FP3",
        "Q1_TopSpeedST", "Q2_TopSpeedST", "Q3_TopSpeedST"
    ]

    for col in speed_cols:
        df[col] = df[col] - df[col].min()


    # Normalize race time
    df["TotalRaceTime"] = pd.to_timedelta(df["TotalRaceTime"]).dt.total_seconds()
    totalTimeFirst = df["TotalRaceTime"].max()
    df["TotalRaceTime"] = df["TotalRaceTime"].fillna(totalTimeFirst)
    df.loc[df["TotalRaceTime"] != totalTimeFirst, "TotalRaceTime"] += totalTimeFirst

    df = impute_missing_practice_sessions(df)

    # Append to combined CSV (create it if doesn't exist)
    if not os.path.exists(output_path):
        df.to_csv(output_path, index=False)
    else:
        df.to_csv(output_path, mode='a', index=False, header=False)  # Append without writing header again

    print(f"Processed and saved: {csv_path}")


In [10]:
# Folder path (relative to script location)
data_folder = './data'

# Get all CSV files in the folder
csv_files = glob.glob(os.path.join(data_folder, '*.csv'))

# Loop through each CSV
for csv_file in csv_files:
    process_race_data(csv_file, "dataset/f1_data_combined.csv") 

Processed and saved: ./data/f1_data_Saudi_Arabian_Grand_Prix_2021.csv
⚠️ No teammate data for ALB in FP1 at Canadian Grand Prix (2023). Consider removing this row manually.
⚠️ No teammate data for GAS in FP1 at Canadian Grand Prix (2023). Consider removing this row manually.
⚠️ No teammate data for HAM in FP1 at Canadian Grand Prix (2023). Consider removing this row manually.
⚠️ No teammate data for OCO in FP1 at Canadian Grand Prix (2023). Consider removing this row manually.
Processed and saved: ./data/f1_data_Canadian_Grand_Prix_2023.csv
Processed and saved: ./data/f1_data_Monaco_Grand_Prix_2021.csv
Processed and saved: ./data/f1_data_Canadian_Grand_Prix_2022.csv
⚠️ No teammate data for LEC in FP1 at Abu Dhabi Grand Prix (2018). Consider removing this row manually.
Processed and saved: ./data/f1_data_Abu_Dhabi_Grand_Prix_2018.csv
⚠️ No teammate data for ALB in FP1 at Abu Dhabi Grand Prix (2024). Consider removing this row manually.
⚠️ No teammate data for TSU in FP1 at Abu Dhabi Gra