In [327]:
import pandas as pd
from sklearn.preprocessing import Normalizer, StandardScaler

In [328]:
from libs.transformer_utils import process_features_standardisation, encode_labels, one_hot_encode_field

## 1. Load the Dataset

In [329]:
# Read the final-dataset CSV into the working frame `df`.
source_csv = '../data/outputs/06_final_dataset.csv'
df = pd.read_csv(source_csv)

In [330]:
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly; a bare `df.shape` on its own line is evaluated and then discarded.
print(df.shape)
df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,...,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age,season_age
0,1,18,1,1,1,1,1,2008,1,1,...,21,16,21,19,12,9,103,283,39,23
1,2,18,2,2,5,2,1,2008,1,1,...,0,0,0,0,0,0,0,101,47,31
2,3,18,3,3,7,3,1,2008,1,1,...,0,0,0,0,0,6,23,144,39,23
3,4,18,4,4,11,4,1,2008,1,1,...,0,0,11,12,12,3,32,250,43,27
4,5,18,5,1,3,5,1,2008,1,1,...,0,0,0,0,0,11,1,36,43,27


## 2. Process Encoding

### 2.1. Encode the categorical values

In [331]:
# Fields passed, one after another, to the project helper one_hot_encode_field;
# each call replaces `df` with the frame the helper returns.
# 'circuitId' was part of an earlier version of this list and is currently not encoded.
to_one_enc = [
    'grid',
    'constructor_is_active',
    'driver_is_active',
]
for field in to_one_enc:
    df = one_hot_encode_field(df, field)

### 2.2. Normalizing the values

In [332]:
# Feature columns handed to the project helper process_features_standardisation
# together with the sklearn Normalizer class; whatever the helper returns is
# written back into the same columns of `df`.
# An earlier version of this list also included 'year'.
# NOTE(review): sklearn's Normalizer rescales each row (sample) to unit norm rather
# than scaling each column independently - confirm this is the intended transform
# (StandardScaler is imported at the top but not used in the visible cells).
# NOTE(review): 'driver_avg_point' and 'race_end_bf_2019' are dropped in section 2.4,
# yet they are still part of this list - confirm whether they should take part here.
cols = ['driver_avg_point', 'race_end_bf_2019', 'driver_nber_of_races_won',
        'race_end_in_2019', 'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022', 'race_end_in_2023']
df[cols] = process_features_standardisation(df, cols, Normalizer)

### 2.3. Discretizing the values

In [333]:
# Bucket 'driver_avg_speed' into five ordinal classes (1 = lowest bucket, 5 = highest)
# with interior cut points at 200, 202, 204 and 206.
# The outer edges are open-ended (-inf / +inf) instead of the column's own min/max:
# the resulting classes are the same, but the edges stay strictly increasing even
# when every value lies on one side of 200 or 206, a case in which min/max-based
# edges make pd.cut raise. It also keeps the bucket boundaries independent of
# whichever rows happen to be loaded.
bins = [float('-inf'), 200, 202, 204, 206, float('inf')]
labels = [1, 2, 3, 4, 5]
categories = pd.cut(df['driver_avg_speed'], bins=bins, labels=labels, include_lowest=True)
df['driver_avg_speed'] = categories

In [334]:
# Bucket 'constructor_avg_point' into five ordinal classes (1 = lowest bucket,
# 5 = highest) with interior cut points at 2, 6, 8 and 10.
# The outer edges are open-ended (-inf / +inf) instead of the column's own min/max:
# the resulting classes are the same, but the edges stay strictly increasing even
# when every value lies on one side of 2 or 10, a case in which min/max-based
# edges make pd.cut raise.
bins = [float('-inf'), 2, 6, 8, 10, float('inf')]
labels = [1, 2, 3, 4, 5]
categories = pd.cut(df['constructor_avg_point'], bins=bins, labels=labels, include_lowest=True)
df['constructor_avg_point'] = categories

### 2.4. Delete unused columns

In [335]:
# Columns removed from the working frame (in place).
# 'year' used to be in this list; it is kept now because the season filter in
# section 3.1 reads it.
unused_columns = ['raceId', 'resultId', 'age', 'driver_avg_point', 'race_end_bf_2019']
df.drop(columns=unused_columns, inplace=True)

## 3. Splitting the Data set

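### 3.1. Remove the races where the number of drivers is more than 23
Only seasons from 2006 onwards are kept (era of decreasing output and increasingly stringent regulations: 2006-2013); the one-hot columns for grid positions above 23 are then dropped.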


In [336]:
# Keep only the seasons after 2005. The explicit .copy() makes the result an
# independent frame, so in-place edits in later cells do not act on a slice of
# the unfiltered data (pandas reports that situation as SettingWithCopyWarning).
df = df[df['year'] > 2005].copy()
print(df.shape)

(7235, 56)


In [337]:
# Remove the indicator columns grid_24 .. grid_34 with a single drop call.
# Compared with dropping them one by one in place:
#   * the operation is atomic - if any of the columns is missing, a KeyError is
#     raised and `df` is left untouched instead of half-pruned;
#   * no in-place mutation happens on a frame that came out of a boolean filter,
#     which pandas flags with SettingWithCopyWarning;
#   * the frame is rebuilt once rather than eleven times.
extra_grid_columns = [f'grid_{i}' for i in range(24, 35)]
df = df.drop(columns=extra_grid_columns)

In [340]:
# Sanity check of the pruned frame: dimensions first, then a five-row preview
# (the preview is the cell's last expression, so it is rendered as a table).
print(df.shape)
df.head(n=5)

(7235, 45)


Unnamed: 0,driverId,constructorId,race_rank,statusId,year,round,circuitId,constructor_races_won,constructor_avg_point,constructor_times_in_top_10,...,grid_16,grid_17,grid_18,grid_19,grid_20,grid_21,grid_22,grid_23,constructor_is_active_1,driver_is_active_1
0,1,1,1,1,2008,1,1,179,2,1110,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,2,2,2,1,2008,1,1,1,2,91,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,3,3,1,2008,1,1,114,2,764,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,4,4,1,2008,1,1,35,2,390,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,1,5,1,2008,1,1,179,2,1110,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## 4. Export the result

In [339]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
processed_csv = '../data/outputs/07_dataset_processed.csv'
df.to_csv(processed_csv, index=False)