# 01 - Data Engineering

## Objective
Prepare a clean, modeling-ready race dataset from raw FastF1 timing data.

### Tasks
- Load race session
- Save a raw snapshot of the lap data
- Clean lap data
- Remove non-representative laps
- Engineer core performance features
- Save processed dataset


In [3]:
import sys
import os
sys.path.append("..")

import pandas as pd
import numpy as np

from src.config import RACE_YEAR, RACE_NAME, SESSION_TYPE
from src.data_loader import enable_cache, load_session
from src.preprocessing import clean_laps
from src.feature_engineering import add_features

In [7]:
# Switch caching on before the session is requested.
enable_cache()

# Load the laps for the race selected by the constants imported from src.config.
laps_raw = load_session(
    RACE_YEAR,
    RACE_NAME,
    SESSION_TYPE,
)

# Report the size of the unfiltered table, then preview its first five rows.
print(f"Total raw laps: {len(laps_raw)}")
laps_raw.head(5)


req            INFO 	No cached data found for season_schedule. Loading data...
_api           INFO 	Fetching season schedule...
req            INFO 	Data has been written to cache!
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers

Total raw laps: 1056


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate
0,0 days 01:04:15.902000,VER,1,0 days 00:01:39.019000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:42.414000,...,False,Red Bull Racing,0 days 01:02:36.652000,2023-03-05 15:03:38.501,12,1.0,False,,False,False
1,0 days 01:05:53.876000,VER,1,0 days 00:01:37.974000,2.0,1.0,NaT,NaT,0 days 00:00:31.342000,0 days 00:00:42.504000,...,False,Red Bull Racing,0 days 01:04:15.902000,2023-03-05 15:05:17.751,12,1.0,False,,False,True
2,0 days 01:07:31.882000,VER,1,0 days 00:01:38.006000,3.0,1.0,NaT,NaT,0 days 00:00:31.388000,0 days 00:00:42.469000,...,False,Red Bull Racing,0 days 01:05:53.876000,2023-03-05 15:06:55.725,1,1.0,False,,False,True
3,0 days 01:09:09.858000,VER,1,0 days 00:01:37.976000,4.0,1.0,NaT,NaT,0 days 00:00:31.271000,0 days 00:00:42.642000,...,False,Red Bull Racing,0 days 01:07:31.882000,2023-03-05 15:08:33.731,1,1.0,False,,False,True
4,0 days 01:10:47.893000,VER,1,0 days 00:01:38.035000,5.0,1.0,NaT,NaT,0 days 00:00:31.244000,0 days 00:00:42.724000,...,False,Red Bull Racing,0 days 01:09:09.858000,2023-03-05 15:10:11.707,1,1.0,False,,False,True


In [11]:
# Make sure the destination folder exists; an existing folder is not an error.
os.makedirs("../data/raw", exist_ok=True)

# Write the laps exactly as loaded, leaving the DataFrame index out of the file.
raw_csv_path = f"../data/raw/f1_laps_{RACE_YEAR}_{RACE_NAME}.csv"
laps_raw.to_csv(raw_csv_path, index=False)

print("Raw dataset saved.")

Raw dataset saved.


In [8]:
# Run the project cleaning step; the result is bound to a new name so
# laps_raw stays available for comparison.
laps_clean = clean_laps(laps_raw)

# Show how many laps remain, followed by the first five of them.
print(f"Clean laps: {len(laps_clean)}")
laps_clean.head(n=5)


Clean laps: 914


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,LapTimeSec,Sector1Sec,Sector2Sec,Sector3Sec
1,0 days 01:05:53.876000,VER,1,0 days 00:01:37.974000,2.0,1.0,NaT,NaT,0 days 00:00:31.342000,0 days 00:00:42.504000,...,12,1.0,False,,False,True,97.974,31.342,42.504,24.128
2,0 days 01:07:31.882000,VER,1,0 days 00:01:38.006000,3.0,1.0,NaT,NaT,0 days 00:00:31.388000,0 days 00:00:42.469000,...,1,1.0,False,,False,True,98.006,31.388,42.469,24.149
3,0 days 01:09:09.858000,VER,1,0 days 00:01:37.976000,4.0,1.0,NaT,NaT,0 days 00:00:31.271000,0 days 00:00:42.642000,...,1,1.0,False,,False,True,97.976,31.271,42.642,24.063
4,0 days 01:10:47.893000,VER,1,0 days 00:01:38.035000,5.0,1.0,NaT,NaT,0 days 00:00:31.244000,0 days 00:00:42.724000,...,1,1.0,False,,False,True,98.035,31.244,42.724,24.067
5,0 days 01:12:25.879000,VER,1,0 days 00:01:37.986000,6.0,1.0,NaT,NaT,0 days 00:00:31.341000,0 days 00:00:42.632000,...,1,1.0,False,,False,True,97.986,31.341,42.632,24.013


Cleaning applies the following steps:
- Remove inaccurate laps
- Remove pit in/out laps
- Convert lap and sector times to seconds


In [11]:
# Build the feature table from the cleaned laps.
laps_features = add_features(laps_clean)

# Preview the first five rows of the resulting table.
laps_features.head(n=5)


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Deleted,DeletedReason,FastF1Generated,IsAccurate,LapTimeSec,Sector1Sec,Sector2Sec,Sector3Sec,FuelProxy,StintLap
1,0 days 01:05:53.876000,VER,1,0 days 00:01:37.974000,2.0,1.0,NaT,NaT,0 days 00:00:31.342000,0 days 00:00:42.504000,...,False,,False,True,97.974,31.342,42.504,24.128,55.0,0
2,0 days 01:07:31.882000,VER,1,0 days 00:01:38.006000,3.0,1.0,NaT,NaT,0 days 00:00:31.388000,0 days 00:00:42.469000,...,False,,False,True,98.006,31.388,42.469,24.149,54.0,1
3,0 days 01:09:09.858000,VER,1,0 days 00:01:37.976000,4.0,1.0,NaT,NaT,0 days 00:00:31.271000,0 days 00:00:42.642000,...,False,,False,True,97.976,31.271,42.642,24.063,53.0,2
4,0 days 01:10:47.893000,VER,1,0 days 00:01:38.035000,5.0,1.0,NaT,NaT,0 days 00:00:31.244000,0 days 00:00:42.724000,...,False,,False,True,98.035,31.244,42.724,24.067,52.0,3
5,0 days 01:12:25.879000,VER,1,0 days 00:01:37.986000,6.0,1.0,NaT,NaT,0 days 00:00:31.341000,0 days 00:00:42.632000,...,False,,False,True,97.986,31.341,42.632,24.013,51.0,4


In [13]:
# First ten rows, restricted to the identifying columns and the two
# engineered columns (StintLap, FuelProxy).
(
    laps_features
    .loc[:, ['Driver', 'LapNumber', 'Stint', 'StintLap', 'FuelProxy']]
    .head(n=10)
)

Unnamed: 0,Driver,LapNumber,Stint,StintLap,FuelProxy
1,VER,2.0,1.0,0,55.0
2,VER,3.0,1.0,1,54.0
3,VER,4.0,1.0,2,53.0
4,VER,5.0,1.0,3,52.0
5,VER,6.0,1.0,4,51.0
6,VER,7.0,1.0,5,50.0
7,VER,8.0,1.0,6,49.0
8,VER,9.0,1.0,7,48.0
9,VER,10.0,1.0,8,47.0
10,VER,11.0,1.0,9,46.0


In [15]:
# Summary statistics for the LapTimeSec column of the feature table.
(
    laps_features
    .loc[:, 'LapTimeSec']
    .describe()
)


count    914.000000
mean      98.586856
std        1.308025
min       93.996000
25%       97.656500
50%       98.342000
75%       99.410500
max      102.267000
Name: LapTimeSec, dtype: float64

In [17]:
# Largest StintLap value within each (Driver, Stint) group; only the first
# five groups are displayed.
(
    laps_features
    .groupby(by=['Driver', 'Stint'])['StintLap']
    .max()
    .head(n=5)
)


Driver  Stint
ALB     1.0       8
        2.0      12
        3.0      11
        4.0      15
ALO     1.0      11
Name: StintLap, dtype: int64

In [19]:
# Compare FuelProxy against LapNumber on the first five rows.
(
    laps_features
    .loc[:, ['LapNumber', 'FuelProxy']]
    .head(n=5)
)


Unnamed: 0,LapNumber,FuelProxy
1,2.0,55.0
2,3.0,54.0
3,4.0,53.0
4,5.0,52.0
5,6.0,51.0


In [21]:
# Make sure the destination folder exists; an existing folder is not an error.
os.makedirs("../data/processed", exist_ok=True)

# Write the feature table, leaving the DataFrame index out of the file.
processed_csv_path = f"../data/processed/{RACE_YEAR}_{RACE_NAME}_processed.csv"
laps_features.to_csv(processed_csv_path, index=False)

print("Processed dataset saved.")

Processed dataset saved.
