In [6]:
import pandas as pd
import numpy as np
import os
import re 
import glob

In [2]:
def process_filename_info(df, filename):
    """Tag ``df`` with the race name and year encoded in ``filename``.

    The file's base name is matched against ``f1_data_<race>_<YYYY>.csv``.
    On a match, two columns are written onto ``df`` in place:

    * ``RaceName`` - the ``<race>`` part with underscores turned into spaces
    * ``Year``     - the four-digit year as an ``int``

    Parameters
    ----------
    df : DataFrame-like object supporting ``df[column] = value``.
    filename : str
        File name or full path; only the base name is inspected.

    Returns
    -------
    The same ``df`` object that was passed in. It is left unchanged (and a
    warning is printed) when the name does not follow the pattern.
    """
    # Strip any directory component so full paths work as well
    file_name_only = os.path.basename(filename)

    # Capture group 1 = race name, group 2 = four-digit year
    parsed = re.match(r"f1_data_(.+)_([0-9]{4})\.csv", file_name_only)

    # Guard clause: nothing to add for names we cannot parse
    if parsed is None:
        print("⚠️ Filename doesn't match expected pattern.")
        return df

    raw_race, raw_year = parsed.groups()
    df["RaceName"] = raw_race.replace("_", " ")  # underscores -> spaces
    df["Year"] = int(raw_year)
    return df

In [4]:
def process_race_data(csv_path, output_path):
    """Clean one per-race CSV and append the result to the combined CSV.

    Processing steps:

    1. Read ``csv_path`` and tag the rows with ``RaceName`` / ``Year`` via
       ``process_filename_info``.
    2. Drop rows that have no ``TotalRaceTime``.
    3. Convert the practice / qualifying time columns to seconds.
    4. Derive ``FastestPracticeTime``, ``FastestQualifyingTime`` and
       ``FastestPracticeCompound``.
    5. Encode tyre compounds as integers (unknown labels become NaN).
    6. Shift each speed column so that its minimum is 0.
    7. Convert ``TotalRaceTime`` to seconds and add the largest value to
       every row that is not itself the largest.
    8. Write to ``output_path`` (creating it with a header if needed,
       otherwise appending rows aligned to the existing header).

    Parameters
    ----------
    csv_path : str
        Path of the per-race CSV to process.
    output_path : str
        Path of the combined CSV that rows are appended to.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the input file.
    """
    # Never feed the combined file back into itself: a glob over the data
    # folder can pick it up when the output lives next to the inputs.
    if os.path.abspath(csv_path) == os.path.abspath(output_path):
        print(f"Skipped (input is the combined output file): {csv_path}")
        return

    df = pd.read_csv(csv_path)

    # Use the returned frame rather than relying on in-place mutation
    df = process_filename_info(df, csv_path)

    # Drop rows without race time; copy so the column assignments below
    # operate on an independent frame rather than a possible view
    df = df.dropna(subset=["TotalRaceTime"]).copy()

    # Convert all relevant time columns to seconds
    time_cols = [
        "LapTime_FP1", "LapTime_FP2", "LapTime_FP3",
        "AvgLapTime_LongestStint_FP1", "AvgLapTime_LongestStint_FP2", "AvgLapTime_LongestStint_FP3",
        "Delta_FirstLastLap_FP1", "Delta_FirstLastLap_FP2", "Delta_FirstLastLap_FP3",
        "Q1_FastestLap", "Q2_FastestLap", "Q3_FastestLap"
    ]
    for col in time_cols:
        df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

    # Fastest laps (row-wise minimum; NaN entries are skipped by min)
    df["FastestPracticeTime"] = df[["LapTime_FP1", "LapTime_FP2", "LapTime_FP3"]].min(axis=1)
    df["FastestQualifyingTime"] = df[["Q1_FastestLap", "Q2_FastestLap", "Q3_FastestLap"]].min(axis=1)

    # Map compounds to categorical
    compounds = {"SOFT": 0, "MEDIUM": 1, "HARD": 2, "INTERMEDIATE": 3, "WET": 4}

    # Find compound of fastest FP lap: the first session whose lap time
    # equals the row minimum wins; rows with no practice time get NaN
    conditions = [
        df["LapTime_FP1"] == df["FastestPracticeTime"],
        df["LapTime_FP2"] == df["FastestPracticeTime"],
        df["LapTime_FP3"] == df["FastestPracticeTime"]
    ]
    choices = [df["Compound_FP1"], df["Compound_FP2"], df["Compound_FP3"]]
    df["FastestPracticeCompound"] = np.select(conditions, choices, default=np.nan)

    # Map compounds (labels missing from `compounds` become NaN)
    compound_cols = ["FastestPracticeCompound", "LongestStintCompound_FP1", "LongestStintCompound_FP2", "LongestStintCompound_FP3"]
    for col in compound_cols:
        df[col] = df[col].map(compounds)

    # Normalize speed vs. session slowest
    for col in ["SpeedST_FP1", "SpeedST_FP2", "SpeedST_FP3", "Q1_TopSpeedST", "Q2_TopSpeedST", "Q3_TopSpeedST"]:
        df[col] = df[col] - df[col].min()

    # Normalize race time.
    # NOTE(review): presumably the largest value is the winner's full race
    # time and the other rows hold gaps to the winner - confirm against the
    # data source. Every other row gets that largest value added.
    df["TotalRaceTime"] = pd.to_timedelta(df["TotalRaceTime"]).dt.total_seconds()
    totalTimeFirst = df["TotalRaceTime"].max()
    df["TotalRaceTime"] = df["TotalRaceTime"].fillna(totalTimeFirst)
    df.loc[df["TotalRaceTime"] != totalTimeFirst, "TotalRaceTime"] += totalTimeFirst

    # Append to combined CSV (create it if it doesn't exist or is empty)
    output_has_content = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    if output_has_content:
        # Appending without a header is positional, so align this frame to
        # the header already on disk to keep values under the right columns
        existing_columns = pd.read_csv(output_path, nrows=0).columns
        if list(df.columns) != list(existing_columns):
            print(f"⚠️ Column layout of {csv_path} differs from {output_path}; aligning to the existing header.")
            df = df.reindex(columns=existing_columns)
        df.to_csv(output_path, mode='a', index=False, header=False)  # Append without writing header again
    else:
        df.to_csv(output_path, index=False)

    print(f"Processed and saved: {csv_path}")


In [7]:
# Folder path (relative to the current working directory)
data_folder = './data'

# Combined output file; note that it lives in the same folder as the inputs
combined_path = "data/f1_data_combined.csv"

# Get all per-race CSV files in the folder. The combined output is excluded
# so it is never processed as an input, and the list is sorted because glob
# returns files in arbitrary order.
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(data_folder, '*.csv'))
    if os.path.abspath(path) != os.path.abspath(combined_path)
)

# process_race_data appends to the output, so remove any file left by a
# previous run; otherwise re-running this cell would duplicate every row.
if os.path.exists(combined_path):
    os.remove(combined_path)

# Loop through each CSV
for csv_file in csv_files:
    process_race_data(csv_file, combined_path)

Processed and saved: ./data/f1_data_Bahrain_Grand_Prix_2025.csv
Processed and saved: ./data/f1_data_Australian_Grand_Prix_2025.csv
Processed and saved: ./data/f1_data_Japanese_Grand_Prix_2025.csv
