# Step 0. Setup and Paths initialized.

In [2]:
# Two-Race QA - Comparing performance across races.
# Goal - Load 2 sessions and answer practical, business-style questions.
# Skills - Data loading, filtering, joining, grouping, aggregation, window ops, merge_asof, vectorization, defensiveness.

from pathlib import Path
import pandas as pd
import numpy as np
import json, glob

# The project root is taken to be the parent of the current working directory.
PROJECT_DIR = Path.cwd().parents[0]
# Processed session data lives under <project root>/data/processed.
PROCESSED_DATA_DIR = PROJECT_DIR.joinpath("data", "processed")

# Echo both locations so a wrong working directory is obvious straight away.
print(f"Project Directory: {PROJECT_DIR}")
print(f"Processed Data Directory: {PROCESSED_DATA_DIR}")

Project Directory: /Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline
Processed Data Directory: /Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline/data/processed


# Step 1. Picking 2 race sessions.

In [3]:
# We will pick 2 race sessions (directory names ending in _R). They can be hardcoded or chosen automatically by position in the sorted list of races.

# Every entry directly under the processed-data directory, as Path objects.
processed_entries = [Path(entry) for entry in glob.glob(str(PROCESSED_DATA_DIR / "*"))]

# Session keys are the names of the sub-directories, in sorted order.
all_sessions = sorted(entry.name for entry in processed_entries if entry.is_dir())

# Keep only the sessions whose key ends in '_R'.
race_sessions = [session for session in all_sessions if session.endswith('_R')]

race_sessions[:10], len(race_sessions)

(['2024_01_R',
  '2024_02_R',
  '2024_03_R',
  '2024_04_R',
  '2024_05_R',
  '2024_06_R',
  '2024_07_R',
  '2024_08_R',
  '2024_09_R',
  '2024_10_R'],
 24)

In [4]:
# Choosing 2 races to compare.
# Preferred picks are the races at positions 5 and 15 of the sorted list;
# with fewer races available, fall back to positions 0 and 1, else None.
n_races = len(race_sessions)

if n_races > 5:
    s1 = race_sessions[5]
elif n_races > 0:
    s1 = race_sessions[0]
else:
    s1 = None

if n_races > 15:
    s2 = race_sessions[15]
elif n_races > 1:
    s2 = race_sessions[1]
else:
    s2 = None

s1, s2

('2024_06_R', '2024_16_R')

# Step 2. Loading processed tables for both sessions.

In [5]:
# Loading the processed tables (fact_laps, fact_pitstops, fact_weather, dim_drivers) for s1 and s2 - suffix columns to keep them distinct when necessary.

def load_session_tables(session_key: str):
    """Load the processed parquet tables for one session.

    Looks in ``PROCESSED_DATA_DIR / session_key`` for the parquet files
    ``fact_laps``, ``fact_pitstops``, ``fact_weather`` and ``dim_drivers``.

    Parameters
    ----------
    session_key : str
        Name of the session sub-directory (e.g. ``"2024_06_R"``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(fact_laps, fact_pitstops, fact_weather, dim_drivers)``. A table
        whose parquet file is not present comes back as an empty DataFrame.
        A missing session key (``None`` or ``""``) yields four empty
        DataFrames.
    """
    table_names = ('fact_laps', 'fact_pitstops', 'fact_weather', 'dim_drivers')

    # The session-picking step falls back to None when too few races exist.
    # Joining a Path with None raises TypeError, and joining with "" would
    # glob the processed root itself, so return empty tables up front.
    if not session_key:
        return tuple(pd.DataFrame() for _ in table_names)

    session_dir = PROCESSED_DATA_DIR / session_key
    # Map file stem -> path for every parquet file found for this session.
    parquet_by_stem = {p.stem: p for p in session_dir.glob("*.parquet")}

    return tuple(
        pd.read_parquet(parquet_by_stem[name]) if name in parquet_by_stem else pd.DataFrame()
        for name in table_names
    )

In [6]:
# Unpack laps, pit stops, weather and drivers for each of the two sessions.
fl1, fp1, fw1, dd1 = load_session_tables(s1)
fl2, fp2, fw2, dd2 = load_session_tables(s2)

# Quick sanity check: (rows, cols) of every table, grouped per session.
(
    tuple(table.shape for table in (fl1, fp1, fw1, dd1)),
    tuple(table.shape for table in (fl2, fp2, fw2, dd2)),
)

(((1111, 16), (0, 5), (150, 10), (20, 6)),
 ((1008, 16), (0, 5), (133, 10), (20, 6)))

# Step 3. Normalizing driver keys and enriching with names.

In [9]:
# Normalizing driver_number to string and attaching names / teams for readability.

def enrich_laps(fl: pd.DataFrame, dd: pd.DataFrame) -> pd.DataFrame:
    """Attach driver name and team to each lap row.

    Parameters
    ----------
    fl : pandas.DataFrame
        Lap table; must contain ``session_key`` and ``driver_number``.
    dd : pandas.DataFrame
        Driver table; when non-empty it must contain ``session_key``,
        ``driver_number``, ``full_name`` and ``team``.

    Returns
    -------
    pandas.DataFrame
        ``fl`` itself when it is empty. Otherwise a copy of ``fl`` with
        ``driver_number`` cast to ``str`` and, when ``dd`` is non-empty,
        ``full_name`` and ``team`` left-joined on
        ``(session_key, driver_number)``. The result always has exactly one
        row per input lap row.
    """
    if fl.empty:
        return fl

    join_keys = ['session_key', 'driver_number']

    # Work on a copy so the caller's lap table is left untouched.
    laps = fl.copy()
    laps['driver_number'] = laps['driver_number'].astype(str)

    if dd.empty:
        return laps

    drivers = dd[join_keys + ['full_name', 'team']].copy()
    # Cast to str on both sides so the join keys compare equal.
    drivers['driver_number'] = drivers['driver_number'].astype(str)
    # A repeated (session_key, driver_number) row in the driver table would
    # multiply the matching lap rows in the left merge; keep the first
    # occurrence so the lap count is preserved.
    drivers = drivers.drop_duplicates(subset=join_keys, keep='first')

    return laps.merge(drivers, on=join_keys, how='left')

In [11]:
# Enrich each session's laps with that session's own driver table.
fl1e, fl2e = enrich_laps(fl1, dd1), enrich_laps(fl2, dd2)

# Two de-duplicated rows per session are enough to eyeball the new columns.
preview1 = fl1e.drop_duplicates().head(2)
preview2 = fl2e.drop_duplicates().head(2)

display(preview1.style.set_caption(f"Enriched Laps - Session: {s1}"))
display(preview2.style.set_caption(f"Enriched Laps - Session: {s2}"))

Unnamed: 0,driver_number,lap_number,tyre_compound,stint,track_status,tyre_life,is_accurate,speed_trap_kph,lap_time_ms,sector1_time_ms,sector2_time_ms,sector3_time_ms,is_inlap,is_outlap,is_pit,session_key,full_name,team
0,1,1.0,MEDIUM,1,1,1.0,False,305.0,94338.0,,34990.0,26211.0,False,False,False,2024_06_R,Max Verstappen,Red Bull Racing
1,1,2.0,MEDIUM,1,1,2.0,True,303.0,93093.0,31455.0,35489.0,26149.0,False,False,False,2024_06_R,Max Verstappen,Red Bull Racing


Unnamed: 0,driver_number,lap_number,tyre_compound,stint,track_status,tyre_life,is_accurate,speed_trap_kph,lap_time_ms,sector1_time_ms,sector2_time_ms,sector3_time_ms,is_inlap,is_outlap,is_pit,session_key,full_name,team
0,1,1.0,HARD,1,1,1.0,False,298.0,90354.0,,31144.0,28856.0,False,False,False,2024_16_R,Max Verstappen,Red Bull Racing
1,1,2.0,HARD,1,1,2.0,True,316.0,86170.0,28048.0,29474.0,28648.0,False,False,False,2024_16_R,Max Verstappen,Red Bull Racing


# Q1) Which drivers improved or regressed in average race pace from Race A to Race B?

In [None]:
# Question - Who got faster? Comparing average lap_time_ms across 2 races.

def pace_summary(fl: pd.DataFrame) -> pd.DataFrame:
    