In [16]:
import sys
sys.path.append('../../')

import numpy as np
import pandas as pd

## 1. Load the Datasets

In [3]:
# Load the processed driver table and the race-results table from the outputs directory.
df_drivers, df_races_results = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/outputs/03_drivers_processed.csv',
        '../../data/outputs/03_races_results.csv',
    )
)

In [4]:
df_drivers.head()

Unnamed: 0,driverId,number,forename,surname,dob,nationality,driver_is_active
0,1,44,Lewis,Hamilton,1985-01-07,British,1
1,2,\N,Nick,Heidfeld,1977-05-10,German,0
2,3,6,Nico,Rosberg,1985-06-27,German,0
3,4,14,Fernando,Alonso,1981-07-29,Spanish,1
4,5,\N,Heikki,Kovalainen,1981-10-19,Finnish,0


In [5]:
df_races_results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,points,laps,milliseconds,fastestLap,fastestLapTime,fastestLapSpeed,statusId,year,round,circuitId,name
0,1,18,1,1,1,1,10.0,58,5690616,39,87452,218.3,1,2008,1,1,Australian Grand Prix
1,2,18,2,2,5,2,8.0,58,5696094,41,87739,217.586,1,2008,1,1,Australian Grand Prix
2,3,18,3,3,7,3,6.0,58,5698779,41,88090,216.719,1,2008,1,1,Australian Grand Prix
3,4,18,4,4,11,4,5.0,58,5707797,58,88603,215.464,1,2008,1,1,Australian Grand Prix
4,5,18,5,1,3,5,4.0,58,5708630,43,87418,218.385,1,2008,1,1,Australian Grand Prix


## 2. Process driver dataset

### 2.1. Combine the forename and surname into a single column

In [14]:
# Join forename and surname with a single space into one display column,
# then remove the two source columns from the frame in place.
df_drivers['full_name'] = df_drivers['forename'].add(' ').add(df_drivers['surname'])
df_drivers.drop(columns=['forename', 'surname'], inplace=True)

In [15]:
df_drivers.head()

Unnamed: 0,driverId,number,nationality,driver_is_active,age,full_name
0,1,44,British,1,39,Lewis Hamilton
1,2,\N,German,0,47,Nick Heidfeld
2,3,6,German,0,39,Nico Rosberg
3,4,14,Spanish,1,43,Fernando Alonso
4,5,\N,Finnish,0,43,Heikki Kovalainen


### 2.2. Calculate the driver age

In [8]:
import datetime

def calculate_age(row: pd.Series, today=None) -> int:
    """Return the age in completed years of the driver described by ``row``.

    Args:
        row: One driver row; ``row['dob']`` must expose ``year``, ``month``
            and ``day`` (e.g. a ``pd.Timestamp`` or ``datetime.date``).
        today: Reference date (anything with ``year``/``month``/``day``).
            Defaults to the current local date. Pass a fixed date to get
            reproducible results.

    Returns:
        The number of full years between ``row['dob']`` and ``today``.
    """
    if today is None:
        today = datetime.date.today()
    dob = row['dob']
    age = today.year - dob.year
    # A plain year difference over-counts by one until this year's birthday
    # has actually been reached.
    if (today.month, today.day) < (dob.month, dob.day):
        age -= 1
    return age

In [11]:
# Parse the date-of-birth strings into pandas datetimes so calculate_age
# receives real date values.
df_drivers['dob'] = df_drivers['dob'].pipe(pd.to_datetime)
# Derive the age row by row, then remove the raw birth date from the frame.
df_drivers['age'] = df_drivers.apply(lambda driver_row: calculate_age(driver_row), axis=1)
del df_drivers['dob']

In [12]:
df_drivers.head()

Unnamed: 0,driverId,number,forename,surname,nationality,driver_is_active,age
0,1,44,Lewis,Hamilton,British,1,39
1,2,\N,Nick,Heidfeld,German,0,47
2,3,6,Nico,Rosberg,German,0,39
3,4,14,Fernando,Alonso,Spanish,1,43
4,5,\N,Heikki,Kovalainen,Finnish,0,43


### 2.3. Calculate the average points for each driver

In [17]:
# List of all driver ids, in the order they appear in the driver table.
ids = df_drivers['driverId'].tolist()
# Create the average-points column as float64, starting at 0 for every driver.
df_drivers['driver_avg_point'] = np.float64(0.0)

In [18]:
# Mean points per driver, computed in one grouped pass over the results
# rather than re-filtering the whole results table once per driver.
avg_points_by_driver = df_races_results.groupby('driverId')['points'].mean()
# Drivers without any result row are absent from the grouped means and
# therefore get NaN, matching the mean of an empty selection.
df_drivers['driver_avg_point'] = (
    df_drivers['driverId'].map(avg_points_by_driver).astype('float64')
)

In [19]:
df_drivers.head()

Unnamed: 0,driverId,number,nationality,driver_is_active,age,full_name,driver_avg_point
0,1,44,British,1,39,Lewis Hamilton,14.100932
1,2,\N,German,0,47,Nick Heidfeld,1.407609
2,3,6,German,0,39,Nico Rosberg,7.740291
3,4,14,Spanish,1,43,Fernando Alonso,5.954054
4,5,\N,Finnish,0,43,Heikki Kovalainen,0.9375


### 2.4. Calculate the driver average speed

In [20]:
# Average fastest-lap speed per driver, using only rows with a strictly
# positive fastestLapSpeed.
# One grouped pass replaces the per-driver filtering of the full results table.
positive_speed_results = df_races_results[df_races_results['fastestLapSpeed'] > 0]
avg_speed_by_driver = positive_speed_results.groupby('driverId')['fastestLapSpeed'].mean()
# Drivers with no positive-speed row are missing from the grouped means and
# get NaN, matching the mean of an empty selection.
df_drivers['driver_avg_speed'] = (
    df_drivers['driverId'].map(avg_speed_by_driver).astype('float64')
)

In [21]:
df_drivers.head()

Unnamed: 0,driverId,number,nationality,driver_is_active,age,full_name,driver_avg_point,driver_avg_speed
0,1,44,British,1,39,Lewis Hamilton,14.100932,205.711759
1,2,\N,German,0,47,Nick Heidfeld,1.407609,204.044578
2,3,6,German,0,39,Nico Rosberg,7.740291,200.290369
3,4,14,Spanish,1,43,Fernando Alonso,5.954054,204.462531
4,5,\N,Finnish,0,43,Heikki Kovalainen,0.9375,198.246165


### 2.5. Number of races finished (statusId == 1) before 2015 and in each year from 2015 to 2023

In [23]:
# The sentinel 0 stands for "every year before 2015"; the other entries are
# the years that each get their own column.
years_list = [0, *range(2015, 2024)]

# Only result rows with statusId == 1 are counted by the race_end_* columns.
finished_results = df_races_results[df_races_results['statusId'] == 1]


def _count_rows_per_driver(results: pd.DataFrame) -> pd.Series:
    """Count, for every driver in df_drivers, the rows of ``results`` with a non-null raceId.

    Drivers that do not appear in ``results`` get 0. The returned Series is
    aligned with df_drivers' index and has dtype int64.
    """
    per_driver = results.groupby('driverId')['raceId'].count()
    return df_drivers['driverId'].map(per_driver).fillna(0).astype('int64')


# One grouped count per period replaces the nested driver x year loop that
# filtered the whole results table on every iteration. Columns are created in
# the order: before 2015, then 2015 ... 2023.
for year in years_list:
    if year == 0:
        df_drivers['race_end_bf_2015'] = _count_rows_per_driver(
            finished_results[finished_results['year'] < 2015]
        )
    else:
        df_drivers[f'race_end_in_{year}'] = _count_rows_per_driver(
            finished_results[finished_results['year'] == year]
        )

In [24]:
df_drivers.head()

Unnamed: 0,driverId,number,nationality,driver_is_active,age,full_name,driver_avg_point,driver_avg_speed,race_end_bf_2015,race_end_in_2015,race_end_in_2016,race_end_in_2017,race_end_in_2018,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023
0,1,44,British,1,39,Lewis Hamilton,14.100932,205.711759,117,18,19,19,20,21,16,21,19,12
1,2,\N,German,0,47,Nick Heidfeld,1.407609,204.044578,66,0,0,0,0,0,0,0,0,0
2,3,6,German,0,39,Nico Rosberg,7.740291,200.290369,108,17,20,0,0,0,0,0,0,0
3,4,14,Spanish,1,43,Fernando Alonso,5.954054,204.462531,183,3,8,3,5,0,0,11,12,12
4,5,\N,Finnish,0,43,Heikki Kovalainen,0.9375,198.246165,39,0,0,0,0,0,0,0,0,0


### 2.6. Circuit where each driver has won the most races

In [30]:
# Number of wins (race_rank == 1) per driver and per (circuitId, name) pair.
# NOTE(review): grouping on `name` as well as `circuitId` splits a circuit's
# wins across differently named races; kept as in the original — confirm intended.
win_counts = (
    df_races_results[df_races_results['race_rank'] == 1]
    .groupby(['driverId', 'circuitId', 'name'])['resultId']
    .count()
    .reset_index(name='win_count')
)

# For each driver keep the row with the highest win count. Ties are resolved
# explicitly (lowest circuitId, then name) so the result does not depend on
# the sort algorithm.
top_circuit_by_driver = (
    win_counts
    .sort_values(['win_count', 'circuitId', 'name'], ascending=[False, True, True])
    .drop_duplicates(subset='driverId', keep='first')
    .set_index('driverId')['circuitId']
)

# Drivers without any win get 0. This replaces the bare `except:` fallback,
# which also silenced unrelated errors.
df_drivers['driver_most_won_circuit_id'] = (
    df_drivers['driverId'].map(top_circuit_by_driver).fillna(0).astype('int64')
)

In [31]:
df_drivers.head()

Unnamed: 0,driverId,number,nationality,driver_is_active,age,full_name,driver_avg_point,driver_avg_speed,race_end_bf_2015,race_end_in_2015,race_end_in_2016,race_end_in_2017,race_end_in_2018,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id
0,1,44,British,1,39,Lewis Hamilton,14.100932,205.711759,117,18,19,19,20,21,16,21,19,12,9
1,2,\N,German,0,47,Nick Heidfeld,1.407609,204.044578,66,0,0,0,0,0,0,0,0,0,0
2,3,6,German,0,39,Nico Rosberg,7.740291,200.290369,108,17,20,0,0,0,0,0,0,0,6
3,4,14,Spanish,1,43,Fernando Alonso,5.954054,204.462531,183,3,8,3,5,0,0,11,12,12,3
4,5,\N,Finnish,0,43,Heikki Kovalainen,0.9375,198.246165,39,0,0,0,0,0,0,0,0,0,11


### 2.7. Number of races won

In [32]:
# Number of result rows with race_rank == 1 for each driver, computed with a
# single value_counts instead of filtering the results table once per driver.
wins_by_driver = df_races_results.loc[
    df_races_results['race_rank'] == 1, 'driverId'
].value_counts()
# Drivers that never appear among the winners get 0.
df_drivers['driver_nber_of_races_won'] = (
    df_drivers['driverId'].map(wins_by_driver).fillna(0).astype('int64')
)

In [33]:
df_drivers.head()

Unnamed: 0,driverId,number,nationality,driver_is_active,age,full_name,driver_avg_point,driver_avg_speed,race_end_bf_2015,race_end_in_2015,race_end_in_2016,race_end_in_2017,race_end_in_2018,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won
0,1,44,British,1,39,Lewis Hamilton,14.100932,205.711759,117,18,19,19,20,21,16,21,19,12,9,103
1,2,\N,German,0,47,Nick Heidfeld,1.407609,204.044578,66,0,0,0,0,0,0,0,0,0,0,0
2,3,6,German,0,39,Nico Rosberg,7.740291,200.290369,108,17,20,0,0,0,0,0,0,0,6,23
3,4,14,Spanish,1,43,Fernando Alonso,5.954054,204.462531,183,3,8,3,5,0,0,11,12,12,3,32
4,5,\N,Finnish,0,43,Heikki Kovalainen,0.9375,198.246165,39,0,0,0,0,0,0,0,0,0,11,1


### 2.8. Number of times in the top 10

In [34]:
# Number of result rows with race_rank <= 10 for each driver, computed with a
# single value_counts instead of filtering the results table once per driver.
# NOTE(review): the condition has no lower bound; if race_rank can be 0 or
# negative for unclassified results those rows are counted too — confirm.
top_10_by_driver = df_races_results.loc[
    df_races_results['race_rank'] <= 10, 'driverId'
].value_counts()
# Drivers with no matching row get 0.
df_drivers['driver_nber_of_times_in_top_10'] = (
    df_drivers['driverId'].map(top_10_by_driver).fillna(0).astype('int64')
)

In [35]:
df_drivers.head()

Unnamed: 0,driverId,number,nationality,driver_is_active,age,full_name,driver_avg_point,driver_avg_speed,race_end_bf_2015,race_end_in_2015,...,race_end_in_2017,race_end_in_2018,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10
0,1,44,British,1,39,Lewis Hamilton,14.100932,205.711759,117,18,...,19,20,21,16,21,19,12,9,103,283
1,2,\N,German,0,47,Nick Heidfeld,1.407609,204.044578,66,0,...,0,0,0,0,0,0,0,0,0,101
2,3,6,German,0,39,Nico Rosberg,7.740291,200.290369,108,17,...,0,0,0,0,0,0,0,6,23,144
3,4,14,Spanish,1,43,Fernando Alonso,5.954054,204.462531,183,3,...,3,5,0,0,11,12,12,3,32,250
4,5,\N,Finnish,0,43,Heikki Kovalainen,0.9375,198.246165,39,0,...,0,0,0,0,0,0,0,11,1,36


## 3. Save the dataset

In [36]:
# Write the enriched driver table to the db folder; index=False leaves the
# DataFrame index out of the CSV.
df_drivers.to_csv(path_or_buf='../../data/db/01_driver.csv', index=False)