In [1]:
import pandas as pd

In [None]:
#load data
laps_df = pd.read_csv("C:\data/lap_time.csv")

print("Loaded data")

Loaded data


In [3]:
laps_df.head()

Unnamed: 0,Year,Driver,Stint,LapNumber,Compound,LapTimeSeconds,TrackStatus,PitInTime,PitOutTime
0,2021,GAS,1.0,2.0,HARD,92.814,1,,
1,2021,GAS,1.0,3.0,HARD,91.533,1,,
2,2021,GAS,1.0,4.0,HARD,91.439,1,,
3,2021,GAS,1.0,5.0,HARD,91.367,1,,
4,2021,GAS,1.0,6.0,HARD,90.786,1,,


In [None]:
#split training and testing roughly 75/25
train_laps = laps_df[laps_df['Year'].isin([2021, 2022, 2023])].copy()

test_laps = laps_df[laps_df['Year'] == 2024].copy()


In [None]:
#extract features and working on feature selection
def extract_clean_features(laps, total_laps=58):
    #list to add all the features needed to train the model
    features = []
    #each driver on a certain tire 
    for (driver, stint), group in laps.groupby(['Driver', 'Stint']):
        group = group.sort_values('LapNumber')
        #best lap time on that stint
        baseline = group.iloc[0]['LapTimeSeconds']
        #where the current stint begans
        start_lap = group['LapNumber'].min()

        for _, row in group.iterrows():
            lap_number = row['LapNumber']
            #how long has the driver been on the said tyre
            laps_in_stint = lap_number - start_lap + 1
            #tyre health
            degradation = row['LapTimeSeconds'] - baseline
            #pitted or not
            pit_event = int(pd.notna(row['PitInTime']) or pd.notna(row['PitOutTime']))
            #safety cars and yellow flags
            flagged = 0 if row['TrackStatus'] == 1 else 1
            #add all the features to the list
            features.append({
                'Driver': row['Driver'],
                'LapNumber': lap_number,
                'Compound': row['Compound'],
                'LapsInStint': laps_in_stint,
                'Degradation': degradation,
                'PitEvent': pit_event,
                'Flagged': flagged,
                'LapPercent': lap_number / total_laps,
                'LapTimeSeconds': row['LapTimeSeconds']
            })

    return pd.DataFrame(features)


In [None]:
#extract only the needed features from our df
train_features = extract_clean_features(train_laps)
test_features = extract_clean_features(test_laps)

In [7]:
extract_clean_features(train_laps)

Unnamed: 0,Driver,LapNumber,Compound,LapsInStint,Degradation,PitEvent,Flagged,LapPercent,LapTimeSeconds
0,ALB,2.0,MEDIUM,1.0,0.000,0,0,0.034483,93.227
1,ALB,2.0,MEDIUM,1.0,0.333,0,0,0.034483,93.560
2,ALB,3.0,MEDIUM,2.0,-1.459,0,0,0.051724,91.768
3,ALB,3.0,MEDIUM,2.0,-0.871,0,0,0.051724,92.356
4,ALB,4.0,MEDIUM,3.0,-0.090,0,0,0.068966,93.137
...,...,...,...,...,...,...,...,...,...
2908,ZHO,56.0,SOFT,18.0,3.364,0,0,0.965517,92.315
2909,ZHO,56.0,MEDIUM,18.0,1.249,0,0,0.965517,90.200
2910,ZHO,57.0,SOFT,19.0,3.600,0,0,0.982759,92.551
2911,ZHO,57.0,MEDIUM,19.0,2.040,0,0,0.982759,90.991


In [8]:
extract_clean_features(test_laps)

Unnamed: 0,Driver,LapNumber,Compound,LapsInStint,Degradation,PitEvent,Flagged,LapPercent,LapTimeSeconds
0,ALB,4.0,MEDIUM,1.0,0.000,0,0,0.068966,90.060
1,ALB,5.0,MEDIUM,2.0,0.309,0,0,0.086207,90.369
2,ALB,6.0,MEDIUM,3.0,0.755,0,0,0.103448,90.815
3,ALB,7.0,MEDIUM,4.0,0.257,0,0,0.120690,90.317
4,ALB,8.0,MEDIUM,5.0,0.645,0,0,0.137931,90.705
...,...,...,...,...,...,...,...,...,...
916,ZHO,53.0,HARD,13.0,0.228,0,0,0.913793,88.483
917,ZHO,54.0,HARD,14.0,-0.233,0,0,0.931034,88.022
918,ZHO,55.0,HARD,15.0,-0.208,0,0,0.948276,88.047
919,ZHO,56.0,HARD,16.0,-0.273,0,1,0.965517,87.982


In [None]:
#store it to a pickle file so its easier to access
train_features.to_pickle("C:\data\train_features.pkl")

test_features.to_pickle("C:\data\test_features.pkl")