# Step 0. Setup and path initialization.

In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
import json, glob

# Project root is taken as the parent of the current working directory.
# NOTE(review): this assumes the kernel runs from a folder directly under the
# project root (e.g. notebooks/) - confirm before running from elsewhere.
PROJECT_DIR = Path.cwd().parents[0]
PROCESSED_DIR = PROJECT_DIR.joinpath("data", "processed")

# Widen the pandas display limits so wide tables are not truncated.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 200),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

print(f"Project Directory: {PROJECT_DIR}")
print(f"Processed Data Directory: {PROCESSED_DIR}")

Project Directory: /Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline
Processed Data Directory: /Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline/data/processed


# Step 1. Picking a Session to Explore.

In [5]:
# List every sub-directory of the processed data folder (one per session key),
# sorted by name, then show the total count and the first 20 entries.

session_dirs = sorted(
    entry.name
    for entry in map(Path, glob.glob(str(PROCESSED_DIR / "*")))
    if entry.is_dir()
)
len(session_dirs), session_dirs[:20]

(60,
 ['2024_01_Q',
  '2024_01_R',
  '2024_02_Q',
  '2024_02_R',
  '2024_03_Q',
  '2024_03_R',
  '2024_04_Q',
  '2024_04_R',
  '2024_05_Q',
  '2024_05_R',
  '2024_05_S',
  '2024_05_SQ',
  '2024_06_Q',
  '2024_06_R',
  '2024_06_S',
  '2024_06_SQ',
  '2024_07_Q',
  '2024_07_R',
  '2024_08_Q',
  '2024_08_R'])

In [12]:
# Pick the first session whose key ends in "_R"; if there is none, fall back to
# the first available session. The fallback `session_dirs[0]` is evaluated
# eagerly, so guard the empty case explicitly and fail with an actionable
# message instead of an opaque IndexError.
if not session_dirs:
    raise FileNotFoundError(f"No processed session folders found under {PROCESSED_DIR}")

session_key = next((s for s in session_dirs if s.endswith("_R")), session_dirs[0])
session_key

'2024_01_R'

# Step 2. Loading the processed tables.

In [13]:
# Map each parquet file's stem (the table name) to its path for the chosen session.
in_dir = PROCESSED_DIR / session_key
paths = {}
for parquet_file in in_dir.glob("*.parquet"):
    paths[parquet_file.stem] = parquet_file
paths

{'fact_pitstops': PosixPath('/Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline/data/processed/2024_01_R/fact_pitstops.parquet'),
 'fact_laps': PosixPath('/Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline/data/processed/2024_01_R/fact_laps.parquet'),
 'fact_weather': PosixPath('/Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline/data/processed/2024_01_R/fact_weather.parquet'),
 'dim_drivers': PosixPath('/Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline/data/processed/2024_01_R/dim_drivers.parquet')}

In [14]:
def _read_table(table_name: str) -> pd.DataFrame:
    """Read the named parquet table, or return an empty frame when its file is absent."""
    if table_name in paths:
        return pd.read_parquet(paths[table_name])
    return pd.DataFrame()


# Load and preview each table in turn: laps, pit stops, weather, drivers.
fl = _read_table('fact_laps')
display(fl.head(10))
fp = _read_table('fact_pitstops')
display(fp.head(10))
fw = _read_table('fact_weather')
display(fw.head(10))
dd = _read_table('dim_drivers')
display(dd.head(10))

Unnamed: 0,driver_number,lap_number,tyre_compound,stint,track_status,tyre_life,is_accurate,speed_trap_kph,lap_time_ms,sector1_time_ms,sector2_time_ms,sector3_time_ms,is_inlap,is_outlap,is_pit,session_key
0,1,1.0,SOFT,1,12,4.0,False,251.0,97284.0,,41266.0,23616.0,False,False,False,2024_01_R
1,1,2.0,SOFT,1,1,5.0,True,287.0,96296.0,30916.0,41661.0,23719.0,False,False,False,2024_01_R
2,1,3.0,SOFT,1,1,6.0,True,290.0,96753.0,30999.0,41966.0,23788.0,False,False,False,2024_01_R
3,1,4.0,SOFT,1,1,7.0,True,,96647.0,30931.0,41892.0,23824.0,False,False,False,2024_01_R
4,1,5.0,SOFT,1,1,8.0,True,289.0,97173.0,31255.0,42056.0,23862.0,False,False,False,2024_01_R
5,1,6.0,SOFT,1,1,9.0,True,296.0,97092.0,31041.0,42187.0,23864.0,False,False,False,2024_01_R
6,1,7.0,SOFT,1,1,10.0,True,291.0,97038.0,31015.0,42118.0,23905.0,False,False,False,2024_01_R
7,1,8.0,SOFT,1,1,11.0,True,,97024.0,31041.0,42077.0,23906.0,False,False,False,2024_01_R
8,1,9.0,SOFT,1,1,12.0,True,292.0,97229.0,31100.0,42166.0,23963.0,False,False,False,2024_01_R
9,1,10.0,SOFT,1,12,13.0,True,293.0,96960.0,30986.0,42009.0,23965.0,False,False,False,2024_01_R


Unnamed: 0,driver_number,lap_number,pit_time_ms,compound_out,session_key


Unnamed: 0,session_key,session_time,time_utc,air_temp_c,track_temp_c,humidity_pct,pressure_hPa,wind_speed_mps,wind_direction_deg,rainfall
0,2024_01_R,0 days 00:00:14.093000,NaT,18.9,26.5,46.0,1017.1,0.25,162,False
1,2024_01_R,0 days 00:01:14.084000,NaT,18.9,26.5,46.0,1017.0,0.277778,55,False
2,2024_01_R,0 days 00:02:14.093000,NaT,18.9,26.5,46.0,1017.0,0.277778,55,False
3,2024_01_R,0 days 00:03:14.090000,NaT,18.9,26.2,45.0,1017.0,0.305556,85,False
4,2024_01_R,0 days 00:04:14.091000,NaT,18.9,26.2,46.0,1017.0,0.277778,178,False
5,2024_01_R,0 days 00:05:14.093000,NaT,18.9,26.2,46.0,1017.0,0.277778,178,False
6,2024_01_R,0 days 00:06:14.087000,NaT,18.9,26.1,46.0,1017.0,0.416667,56,False
7,2024_01_R,0 days 00:07:14.097000,NaT,18.9,26.2,46.0,1017.0,0.222222,155,False
8,2024_01_R,0 days 00:08:14.086000,NaT,18.9,26.2,46.0,1017.0,0.25,15,False
9,2024_01_R,0 days 00:09:14.100000,NaT,18.9,26.0,46.0,1017.0,0.277778,51,False


Unnamed: 0,driver_number,driver_id,broadcast_name,full_name,team,session_key
0,1,VER,M VERSTAPPEN,Max Verstappen,Red Bull Racing,2024_01_R
1,11,PER,S PEREZ,Sergio Perez,Red Bull Racing,2024_01_R
2,55,SAI,C SAINZ,Carlos Sainz,Ferrari,2024_01_R
3,16,LEC,C LECLERC,Charles Leclerc,Ferrari,2024_01_R
4,63,RUS,G RUSSELL,George Russell,Mercedes,2024_01_R
5,4,NOR,L NORRIS,Lando Norris,McLaren,2024_01_R
6,44,HAM,L HAMILTON,Lewis Hamilton,Mercedes,2024_01_R
7,81,PIA,O PIASTRI,Oscar Piastri,McLaren,2024_01_R
8,14,ALO,F ALONSO,Fernando Alonso,Aston Martin,2024_01_R
9,18,STR,L STROLL,Lance Stroll,Aston Martin,2024_01_R


# Step 3. Quick Schema and typing checks.

In [15]:
# Quick schema overview per table: first rows, dtypes, null counts and null rates.

def peek(df: pd.DataFrame, name: str, n: int = 5):
    """Print a banner for `name` and display a short schema summary of `df`.

    For a non-empty frame three things are displayed, in order: the first `n`
    rows, a table of per-column dtype and null count, and a one-column table of
    per-column null percentage (rounded to 2 decimals). An empty frame only
    prints an ``<empty>`` marker. Returns None.
    """
    print(f"--- {name} ---")
    if df.empty:
        print(" <empty> ")
        return

    schema = pd.DataFrame({
        'dtype' : df.dtypes,
        'num_nulls': df.isnull().sum()
    })
    null_share = df.isna().mean() * 100
    null_pct = null_share.round(2).rename("null_pct").to_frame()

    display(df.head(n))
    display(schema)
    display(null_pct)

# Summarise every loaded table in the same order it was read.
for table_label, table_frame in (
    ('fact_laps', fl),
    ('fact_pitstops', fp),
    ('fact_weather', fw),
    ('dim_drivers', dd),
):
    peek(table_frame, table_label)

--- fact_laps ---


Unnamed: 0,driver_number,lap_number,tyre_compound,stint,track_status,tyre_life,is_accurate,speed_trap_kph,lap_time_ms,sector1_time_ms,sector2_time_ms,sector3_time_ms,is_inlap,is_outlap,is_pit,session_key
0,1,1.0,SOFT,1,12,4.0,False,251.0,97284.0,,41266.0,23616.0,False,False,False,2024_01_R
1,1,2.0,SOFT,1,1,5.0,True,287.0,96296.0,30916.0,41661.0,23719.0,False,False,False,2024_01_R
2,1,3.0,SOFT,1,1,6.0,True,290.0,96753.0,30999.0,41966.0,23788.0,False,False,False,2024_01_R
3,1,4.0,SOFT,1,1,7.0,True,,96647.0,30931.0,41892.0,23824.0,False,False,False,2024_01_R
4,1,5.0,SOFT,1,1,8.0,True,289.0,97173.0,31255.0,42056.0,23862.0,False,False,False,2024_01_R


Unnamed: 0,dtype,num_nulls
driver_number,object,0
lap_number,float64,0
tyre_compound,object,0
stint,int64,0
track_status,object,0
tyre_life,float64,0
is_accurate,bool,0
speed_trap_kph,float64,170
lap_time_ms,float64,2
sector1_time_ms,float64,21


Unnamed: 0,null_pct
driver_number,0.0
lap_number,0.0
tyre_compound,0.0
stint,0.0
track_status,0.0
tyre_life,0.0
is_accurate,0.0
speed_trap_kph,15.06
lap_time_ms,0.18
sector1_time_ms,1.86


--- fact_pitstops ---
 <empty> 
--- fact_weather ---


Unnamed: 0,session_key,session_time,time_utc,air_temp_c,track_temp_c,humidity_pct,pressure_hPa,wind_speed_mps,wind_direction_deg,rainfall
0,2024_01_R,0 days 00:00:14.093000,NaT,18.9,26.5,46.0,1017.1,0.25,162,False
1,2024_01_R,0 days 00:01:14.084000,NaT,18.9,26.5,46.0,1017.0,0.277778,55,False
2,2024_01_R,0 days 00:02:14.093000,NaT,18.9,26.5,46.0,1017.0,0.277778,55,False
3,2024_01_R,0 days 00:03:14.090000,NaT,18.9,26.2,45.0,1017.0,0.305556,85,False
4,2024_01_R,0 days 00:04:14.091000,NaT,18.9,26.2,46.0,1017.0,0.277778,178,False


Unnamed: 0,dtype,num_nulls
session_key,object,0
session_time,timedelta64[ns],0
time_utc,datetime64[ns],157
air_temp_c,float64,0
track_temp_c,float64,0
humidity_pct,float64,0
pressure_hPa,float64,0
wind_speed_mps,float64,0
wind_direction_deg,int64,0
rainfall,bool,0


Unnamed: 0,null_pct
session_key,0.0
session_time,0.0
time_utc,100.0
air_temp_c,0.0
track_temp_c,0.0
humidity_pct,0.0
pressure_hPa,0.0
wind_speed_mps,0.0
wind_direction_deg,0.0
rainfall,0.0


--- dim_drivers ---


Unnamed: 0,driver_number,driver_id,broadcast_name,full_name,team,session_key
0,1,VER,M VERSTAPPEN,Max Verstappen,Red Bull Racing,2024_01_R
1,11,PER,S PEREZ,Sergio Perez,Red Bull Racing,2024_01_R
2,55,SAI,C SAINZ,Carlos Sainz,Ferrari,2024_01_R
3,16,LEC,C LECLERC,Charles Leclerc,Ferrari,2024_01_R
4,63,RUS,G RUSSELL,George Russell,Mercedes,2024_01_R


Unnamed: 0,dtype,num_nulls
driver_number,object,0
driver_id,object,0
broadcast_name,object,0
full_name,object,0
team,object,0
session_key,object,0


Unnamed: 0,null_pct
driver_number,0.0
driver_id,0.0
broadcast_name,0.0
full_name,0.0
team,0.0
session_key,0.0


In [16]:
# Memory footprint helper.
def mem_mb(df: pd.DataFrame):
    """Return the deep memory usage of `df` in MiB (bytes / 1024**2).

    Uses ``memory_usage(deep=True)`` so object columns are measured by their
    actual contents rather than pointer size.
    """
    bytes_per_mib = 1024 ** 2
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / bytes_per_mib

# Memory footprint (MiB) of each loaded table.
{
    table_label: mem_mb(table_frame)
    for table_label, table_frame in (
        ('fact_laps', fl),
        ('fact_pitstops', fp),
        ('fact_weather', fw),
        ('dim_drivers', dd),
    )
}

{'fact_laps': np.float64(0.33588123321533203),
 'fact_pitstops': np.float64(0.000125885009765625),
 'fact_weather': np.float64(0.01974010467529297),
 'dim_drivers': np.float64(0.007503509521484375)}

# Step 4. Keys and Uniqueness check (data quality)

In [17]:
# Data-quality check: (session, driver, lap) should identify a lap row uniquely.

if not fl.empty:
    lap_key = ['session_key', 'driver_number', 'lap_number']
    # Counts rows that repeat an earlier row's key (first occurrence is not counted).
    dup = fl.duplicated(subset=lap_key).sum()
    print(f"fact_laps duplicate key rows: {dup}")

fact_laps duplicate key rows: 0


In [19]:
# Timing sanity check: eyeball the lap-time distribution (values should be
# positive and plausible, e.g. under 3 minutes). This cell only displays the
# summary statistics; it does not enforce any limit.
if 'lap_time_ms' in fl.columns and not fl.empty:
    lap_times = fl['lap_time_ms']
    display(lap_times.describe(percentiles=[0.01, 0.1, 0.5, 0.99]))

count      1127.000000
mean      98004.108252
std        4500.312961
min       92608.000000
1%        94421.980000
10%       95262.200000
50%       97046.000000
99%      119337.620000
max      132438.000000
Name: lap_time_ms, dtype: float64

# Step 5. Joining with Driver Info.

In [20]:
# Attach driver name and team to every lap via a left join on
# (driver_number, session_key); with either table empty, carry the laps over as-is.

if fl.empty or dd.empty:
    fl_enriched = fl.copy()
else:
    join_keys = ['driver_number', 'session_key']
    driver_attrs = dd[['session_key', 'driver_number', 'full_name', 'team']]
    fl_enriched = fl.merge(driver_attrs, how='left', on=join_keys)
    display(fl_enriched.head(10))

Unnamed: 0,driver_number,lap_number,tyre_compound,stint,track_status,tyre_life,is_accurate,speed_trap_kph,lap_time_ms,sector1_time_ms,sector2_time_ms,sector3_time_ms,is_inlap,is_outlap,is_pit,session_key,full_name,team
0,1,1.0,SOFT,1,12,4.0,False,251.0,97284.0,,41266.0,23616.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
1,1,2.0,SOFT,1,1,5.0,True,287.0,96296.0,30916.0,41661.0,23719.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
2,1,3.0,SOFT,1,1,6.0,True,290.0,96753.0,30999.0,41966.0,23788.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
3,1,4.0,SOFT,1,1,7.0,True,,96647.0,30931.0,41892.0,23824.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
4,1,5.0,SOFT,1,1,8.0,True,289.0,97173.0,31255.0,42056.0,23862.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
5,1,6.0,SOFT,1,1,9.0,True,296.0,97092.0,31041.0,42187.0,23864.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
6,1,7.0,SOFT,1,1,10.0,True,291.0,97038.0,31015.0,42118.0,23905.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
7,1,8.0,SOFT,1,1,11.0,True,,97024.0,31041.0,42077.0,23906.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
8,1,9.0,SOFT,1,1,12.0,True,292.0,97229.0,31100.0,42166.0,23963.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing
9,1,10.0,SOFT,1,12,13.0,True,293.0,96960.0,30986.0,42009.0,23965.0,False,False,False,2024_01_R,Max Verstappen,Red Bull Racing


# Step 6. Groupby / Agg patterns (pace and dispersion)

In [21]:
# Common groupby patterns on non-pit laps:
# - aggregate stats (count, mean, std)
# - percentiles via quantile
# - sorting by metrics
#
# `pace` is only defined when there is lap data, so the display lives inside the
# same guard; otherwise an empty table would raise NameError on a fresh kernel
# (or silently show a stale `pace` left over from an earlier run).

if not fl_enriched.empty:
    pace = (
        fl_enriched
        # Drop laps flagged as pit laps; a missing flag is treated as "not a pit lap".
        .loc[~fl_enriched['is_pit'].fillna(False)]
        # dropna=False keeps laps whose driver name/team did not match in the join.
        .groupby(['driver_number', 'full_name', 'team'], dropna=False)['lap_time_ms']
        .agg(
            laps='size',  # row count per driver, including laps with a missing lap time
            avg_ms='mean',
            std_ms='std',
            p10=lambda x: x.quantile(0.1),
            p50=lambda x: x.quantile(0.5),
            p90=lambda x: x.quantile(0.9)
        )
        .reset_index()
        .sort_values(['avg_ms', 'std_ms'])
    )

    display(pace.head(20))

Unnamed: 0,driver_number,full_name,team,laps,avg_ms,std_ms,p10,p50,p90
0,1,Max Verstappen,Red Bull Racing,53,95685.150943,1079.79445,94342.0,95581.0,97077.0
2,11,Sergio Perez,Red Bull Racing,53,96115.566038,1168.256681,94743.4,96226.0,97801.2
16,55,Carlos Sainz,Ferrari,53,96188.301887,1222.48265,94854.6,96149.0,97724.0
4,16,Charles Leclerc,Ferrari,53,96404.207547,1142.070562,95032.6,96760.0,97876.0
17,63,George Russell,Mercedes,53,96526.09434,955.523643,95211.4,96635.0,97783.6
14,4,Lando Norris,McLaren,53,96571.150943,1267.604637,95187.2,96566.0,97995.2
15,44,Lewis Hamilton,Mercedes,53,96620.037736,1445.565034,95066.6,96666.0,98265.6
19,81,Oscar Piastri,McLaren,53,96650.603774,1317.568001,95242.0,96680.0,98055.2
11,27,Nico Hulkenberg,Haas F1 Team,50,96854.68,1067.070956,95467.2,96816.0,97891.5
3,14,Fernando Alonso,Aston Martin,53,97029.320755,1536.792348,94929.0,97202.0,98607.0


# Step 7. Working with Window Functions - Rolling and Expanding.

In [22]:
# Window functions - common in time-series analytics (stability, degradation, trends).
# Here: per-driver rolling median of lap time over the current lap and up to two before it.

if not fl_enriched.empty:
    def _median_last3(lap_times):
        """Rolling median over a 3-lap window; shorter windows are allowed at the start."""
        return lap_times.rolling(window=3, min_periods=1).median()

    # Laps must be in order within each driver for the window to be meaningful.
    fl_enriched = fl_enriched.sort_values(by=['driver_number', 'lap_number'])
    per_driver_times = fl_enriched.groupby('driver_number', group_keys=False)['lap_time_ms']
    fl_enriched['rolling_median_ms_w3'] = per_driver_times.apply(_median_last3)

    preview_cols = ['driver_number', 'lap_number', 'lap_time_ms', 'rolling_median_ms_w3']
    display(fl_enriched[preview_cols].head(20))

Unnamed: 0,driver_number,lap_number,lap_time_ms,rolling_median_ms_w3
0,1,1.0,97284.0,97284.0
1,1,2.0,96296.0,96790.0
2,1,3.0,96753.0,96753.0
3,1,4.0,96647.0,96647.0
4,1,5.0,97173.0,96753.0
5,1,6.0,97092.0,97092.0
6,1,7.0,97038.0,97092.0
7,1,8.0,97024.0,97038.0
8,1,9.0,97229.0,97038.0
9,1,10.0,96960.0,97024.0


# Step 8. Stint Analysis (Derived Feature)

In [24]:
# Stint view (tyre life / strategy signal): lap count and average lap time
# per driver, stint and tyre compound.

if not fl_enriched.empty and {'tyre_compound', 'stint'}.issubset(fl_enriched.columns):
    stint_keys = ['driver_number', 'full_name', 'stint', 'tyre_compound']
    stint_stats = (
        fl_enriched
        .groupby(stint_keys, dropna=False)
        .agg(
            laps=('lap_time_ms', 'size'),
            avg_ms=('lap_time_ms', 'mean')
        )
        .reset_index()
        .sort_values(['driver_number', 'stint'])
    )

    display(stint_stats.head(20))

Unnamed: 0,driver_number,full_name,stint,tyre_compound,laps,avg_ms
0,1,Max Verstappen,1,SOFT,17,97162.294118
1,1,Max Verstappen,2,HARD,20,96876.35
2,1,Max Verstappen,3,SOFT,20,95772.8
3,10,Pierre Gasly,1,SOFT,12,99847.25
4,10,Pierre Gasly,2,HARD,31,99252.935484
5,10,Pierre Gasly,3,SOFT,13,97088.538462
6,11,Sergio Perez,1,SOFT,12,98039.833333
7,11,Sergio Perez,2,HARD,24,97298.708333
8,11,Sergio Perez,3,SOFT,21,95978.666667
9,14,Fernando Alonso,1,SOFT,15,98906.333333


# Step 9. Pit Stops (counts and durations - even if NaN)

In [25]:
# Pit-stop count and mean pit time per driver, busiest drivers first.
if fp.empty:
    print("Pit Stops data is empty.")
else:
    pit_metrics = {
        'pit_events': ('pit_time_ms', 'size'),
        'avg_pit_ms': ('pit_time_ms', 'mean'),
    }
    pits = (
        fp
        .groupby('driver_number', dropna=False)
        .agg(**pit_metrics)
        .sort_values('pit_events', ascending=False)
    )

    display(pits.head(20))

Pit Stops data is empty.


# Step 10. Weather alignment (merge_asof)

In [38]:
# Realistic join - align each lap to the nearest weather reading with merge_asof.
#
# Both sides get an integer millisecond key `t_ms`:
# - laps: absolute timestamps were not persisted, so a placeholder clock is
#   synthesised per driver (lap index * 90 s).
# - weather: elapsed time since the first reading, taken from `session_time`
#   when it is a real clock (timedelta OR datetime); otherwise a
#   one-reading-per-minute placeholder.

if not fl_enriched.empty and not fw.empty:
    fl_enriched = fl_enriched.sort_values(['driver_number', 'lap_number']).copy()
    fl_enriched['t_ms'] = fl_enriched.groupby('driver_number').cumcount() * 90000 # assuming avg 90s per lap placeholder.

    fw2 = fw.copy()

    # `session_time` is stored as a timedelta in the processed tables, so accept
    # both timedelta and datetime dtypes instead of datetime only.
    has_clock = (
        'session_time' in fw2.columns
        and (
            pd.api.types.is_timedelta64_dtype(fw2['session_time'])
            or pd.api.types.is_datetime64_any_dtype(fw2['session_time'])
        )
        and fw2['session_time'].notna().any()
    )

    if has_clock:
        # merge_asof cannot use null keys, so drop readings without a timestamp.
        fw2 = fw2.loc[fw2['session_time'].notna()].copy()
        elapsed = fw2['session_time'] - fw2['session_time'].min()
        # seconds -> milliseconds (x1000), cast to int64 so the key dtype matches the lap side.
        fw2['t_ms'] = (elapsed.dt.total_seconds() * 1000).round().astype('int64')
    else:
        # No usable clock: rank order, assuming weather readings every minute.
        fw2['t_ms'] = np.arange(len(fw2), dtype='int64') * 60000

    # merge_asof requires both inputs sorted by the key.
    merged = pd.merge_asof(
        fl_enriched.sort_values('t_ms'),
        fw2.sort_values('t_ms'),
        on = 't_ms',
        direction = 'nearest'
    )

    display(merged[['driver_number', 'lap_number', 'lap_time_ms', 't_ms', 'air_temp_c', 'track_temp_c', 'humidity_pct']].head(20))

Unnamed: 0,driver_number,lap_number,lap_time_ms,t_ms,air_temp_c,track_temp_c,humidity_pct
0,1,1.0,97284.0,0,18.9,26.5,46.0
1,14,1.0,101679.0,0,18.9,26.5,46.0
2,16,1.0,98271.0,0,18.9,26.5,46.0
3,18,1.0,110214.0,0,18.9,26.5,46.0
4,2,1.0,105921.0,0,18.9,26.5,46.0
5,20,1.0,104349.0,0,18.9,26.5,46.0
6,22,1.0,103485.0,0,18.9,26.5,46.0
7,23,1.0,103888.0,0,18.9,26.5,46.0
8,11,1.0,100053.0,0,18.9,26.5,46.0
9,27,1.0,113555.0,0,18.9,26.5,46.0


# Step 11. Pivot Tables.

In [39]:
# Mean lap time by tyre compound (rows) and stint number (columns).
if not fl_enriched.empty:
    pivot = fl_enriched.pivot_table(
        values='lap_time_ms',
        index='tyre_compound',
        columns='stint',
        aggfunc='mean',
    )

    display(pivot.head(20))

stint,1,2,3
tyre_compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HARD,,97880.756962,
SOFT,99029.012931,,96667.628571


# Step 12. Vectorization Practice (working with np.where, masks, qcut)

In [40]:
# Vectorized logic instead of row-wise loops.
# - Push laps: lap time strictly faster than that driver's own 25th-percentile lap time.
# - Speed tiers: lap_time_ms binned into (up to) 5 quantile buckets with qcut.

if not fl_enriched.empty:
    lap_times = fl_enriched['lap_time_ms']

    # Per-driver 25th percentile, broadcast back to every lap of that driver.
    q25 = (
        fl_enriched
        .groupby('driver_number')['lap_time_ms']
        .transform(lambda driver_laps: driver_laps.quantile(0.25))
    )
    faster_than_q25 = lap_times < q25
    fl_enriched['is_push_lap'] = np.where(faster_than_q25, True, False)

    # labels=False yields integer bucket codes; duplicates='drop' merges identical bin edges.
    fl_enriched['speed_tier'] = pd.qcut(lap_times, q=5, labels=False, duplicates='drop')

    preview_cols = ['driver_number', 'lap_number', 'lap_time_ms', 'is_push_lap', 'speed_tier']
    display(fl_enriched[preview_cols].head(40))

Unnamed: 0,driver_number,lap_number,lap_time_ms,is_push_lap,speed_tier
0,1,1.0,97284.0,False,2.0
1,1,2.0,96296.0,False,1.0
2,1,3.0,96753.0,False,2.0
3,1,4.0,96647.0,False,1.0
4,1,5.0,97173.0,False,2.0
5,1,6.0,97092.0,False,2.0
6,1,7.0,97038.0,False,2.0
7,1,8.0,97024.0,False,2.0
8,1,9.0,97229.0,False,2.0
9,1,10.0,96960.0,False,2.0


# Step 13. Demonstrating Memory Optimization

In [41]:
# Memory optimisation on a copy: repeated labels become categoricals and the
# lap number becomes a nullable 32-bit integer.
opt = fl_enriched.copy()

category_candidates = ['driver_number', 'tyre_compound', 'team', 'track_status']
opt = opt.astype({col: 'category' for col in category_candidates if col in opt.columns})

if 'lap_number' in opt.columns:
    lap_numbers = pd.to_numeric(opt['lap_number'], errors='coerce')
    opt['lap_number'] = lap_numbers.astype('Int32')

# Footprint before vs after, in MiB.
dict(before_mb=mem_mb(fl_enriched), after_mb=mem_mb(opt))

{'before_mb': np.float64(0.5095605850219727),
 'after_mb': np.float64(0.2513580322265625)}

# Step 14. Saving a session summary json (for notes and reproducibility)

In [43]:
# Persist a session summary JSON of what was found - handy when collaborating / debugging later.

# Driver names fall back to an empty list when the driver table is empty or
# lacks `full_name` (calling sorted() on None would raise TypeError).
if not dd.empty and 'full_name' in dd.columns:
    driver_names = sorted(dd['full_name'].dropna().unique().tolist())
else:
    driver_names = []

summary = {
    'session_key' : session_key,
    'rows' : {
        'fact_laps' : len(fl),
        'fact_pitstops' : len(fp),
        'fact_weather' : len(fw),
        'dim_drivers' : len(dd)
    },
    'drivers' : driver_names,
    # `pace` / `pits` only exist when their cells had data to work with.
    'pace_top10' : (pace.head(10).to_dict(orient='records') if 'pace' in globals() else []),
    'pits_by_driver' : (pits.reset_index().to_dict(orient='records') if 'pits' in globals() and not fp.empty else []),
    'has_weather' : not fw.empty,
    # Plain floats keep the JSON payload free of numpy scalar types.
    'earlier_memory_usage_mb' : float(mem_mb(fl_enriched)),
    'optimized_memory_usage_mb' : float(mem_mb(opt))
}

out_path = PROJECT_DIR / 'notebooks' / '02_processed_exploration_summary.json'
# Create the target folder if needed so the write cannot fail on a fresh checkout.
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(summary, indent=2), encoding='utf-8')
out_path, summary['rows']

(PosixPath('/Users/pratyushagarwal/Desktop/PROJECTS/F1_Projects/race_weekend_data_pipeline/notebooks/02_processed_exploration_summary.json'),
 {'fact_laps': 1129,
  'fact_pitstops': 0,
  'fact_weather': 157,
  'dim_drivers': 20})