In [1]:
import sys
sys.path.append('../../')

import pandas as pd

In [2]:
from src.time_transformer import time_to_milliseconds
from src.transformer_utils import fill_na, print_df_missing_values, set_the_right_col_type

## 1. Chargement du Dataset

In [3]:
# Load the results dataset from the inputs folder (path is relative to this notebook).
results_csv_path = '../../data/inputs/results.csv'
df_results = pd.read_csv(results_csv_path)

In [4]:
# Print the (rows, columns) shape, then display the first five rows.
print(df_results.shape)
df_results.head()

(26080, 18)


Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [5]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
df_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26080 entries, 0 to 26079
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   resultId         26080 non-null  int64  
 1   raceId           26080 non-null  int64  
 2   driverId         26080 non-null  int64  
 3   constructorId    26080 non-null  int64  
 4   number           26080 non-null  object 
 5   grid             26080 non-null  int64  
 6   position         26080 non-null  object 
 7   positionText     26080 non-null  object 
 8   positionOrder    26080 non-null  int64  
 9   points           26080 non-null  float64
 10  laps             26080 non-null  int64  
 11  time             26080 non-null  object 
 12  milliseconds     26080 non-null  object 
 13  fastestLap       26080 non-null  object 
 14  rank             26080 non-null  object 
 15  fastestLapTime   26080 non-null  object 
 16  fastestLapSpeed  26080 non-null  object 
 17  statusId    

## 2. Traitement des valeurs nulles
### 2.1. Recherchons les valeurs nulles
En se référant à la documentation du jeu de données, nous remarquons que les valeurs nulles sont plutôt représentées par la valeur `\N`.

In [6]:
# The dataset marks missing values with the literal string \N; the backslash
# is escaped so `car` holds exactly those two characters.
car = '\\N'
# Report, per column, how many entries match that marker (see printed output).
print_df_missing_values(df_results, car)

resultId: 0
raceId: 0
driverId: 0
constructorId: 0
number: 6
grid: 0
position: 10873
positionText: 0
positionOrder: 0
points: 0
laps: 0
time: 18829
milliseconds: 18830
fastestLap: 18465
rank: 18249
fastestLapTime: 18465
fastestLapSpeed: 18465
statusId: 0


In [7]:
# Count genuine NaN/None entries per column. The \N markers are ordinary
# strings, so isnull() does not flag them and every count is 0.
df_results.isnull().sum()

resultId           0
raceId             0
driverId           0
constructorId      0
number             0
grid               0
position           0
positionText       0
positionOrder      0
points             0
laps               0
time               0
milliseconds       0
fastestLap         0
rank               0
fastestLapTime     0
fastestLapSpeed    0
statusId           0
dtype: int64

### 2.2. Ajouter une valeur par défaut dans les lignes qui ont la valeur \N

In [8]:
# These three columns are treated as the same kind of field, so they share
# one placeholder value (0).
cols = ['milliseconds', 'fastestLap', 'fastestLapSpeed']

# Ordered column -> placeholder mapping. 'fastestLapTime' holds a time string,
# so it receives '90:00' (described in the original note as a 90-minute default).
placeholder_by_column = dict.fromkeys(cols, 0)
placeholder_by_column['fastestLapTime'] = '90:00'

# Apply the placeholders in insertion order: the three numeric columns first,
# then 'fastestLapTime'.
for column_name, placeholder in placeholder_by_column.items():
    df_results = fill_na(df_results, column_name, placeholder)

In [9]:
# Re-run the \N count to check the effect of the default values set above.
car = '\\N'
print_df_missing_values(df_results, car)

resultId: 0
raceId: 0
driverId: 0
constructorId: 0
number: 6
grid: 0
position: 10873
positionText: 0
positionOrder: 0
points: 0
laps: 0
time: 18829
milliseconds: 0
fastestLap: 0
rank: 18249
fastestLapTime: 0
fastestLapSpeed: 0
statusId: 0


## 2. Convertir le champ fastestLapTime en millisecondes

### 2.2. Convert the fastestLapTime field

In [10]:
# Convert 'fastestLapTime' to milliseconds. Series.apply calls
# time_to_milliseconds once per value (element-wise, not a vectorized operation).
# The column is overwritten, so re-running this cell would pass the
# already-converted values back into time_to_milliseconds.
df_results['fastestLapTime'] = df_results['fastestLapTime'].apply(time_to_milliseconds)

In [11]:
# Preview the first rows after the fastestLapTime conversion.
df_results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,87452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,87739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,88090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,88603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,87418,218.385,1


## 3. Retirer les champs inutilisés

Nous pouvons retrouver ces informations à partir de l'ordre de position (`positionOrder`). Cette information est présente dans le dataset des pilotes.

In [12]:
# Drop the columns this notebook treats as unused. The result is reassigned
# instead of using inplace=True, so the step is an explicit transformation
# rather than a hidden mutation; 'columns=' replaces the positional list + axis=1.
df_results = df_results.drop(columns=['positionText', 'number', 'position', 'rank', 'time'])

In [13]:
# Preview the first rows after dropping the unused columns.
df_results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,positionOrder,points,laps,milliseconds,fastestLap,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,1,1,10.0,58,5690616,39,87452,218.3,1
1,2,18,2,2,5,2,8.0,58,5696094,41,87739,217.586,1
2,3,18,3,3,7,3,6.0,58,5698779,41,88090,216.719,1
3,4,18,4,4,11,4,5.0,58,5707797,58,88603,215.464,1
4,5,18,5,1,3,5,4.0,58,5708630,43,87418,218.385,1


## 4. Cast numeric fields to int64 / float64

In [14]:
# Cast the cleaned columns to numeric dtypes: the first group to int64,
# then the second group to float64, in that order.
dtype_plan = (
    (['milliseconds', 'fastestLap'], 'int64'),
    (['fastestLapSpeed'], 'float64'),
)
for column_group, target_dtype in dtype_plan:
    df_results = set_the_right_col_type(df_results, column_group, target_dtype)

In [15]:
# Re-check column dtypes and non-null counts after the casts.
df_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26080 entries, 0 to 26079
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   resultId         26080 non-null  int64  
 1   raceId           26080 non-null  int64  
 2   driverId         26080 non-null  int64  
 3   constructorId    26080 non-null  int64  
 4   grid             26080 non-null  int64  
 5   positionOrder    26080 non-null  int64  
 6   points           26080 non-null  float64
 7   laps             26080 non-null  int64  
 8   milliseconds     26080 non-null  int64  
 9   fastestLap       26080 non-null  int64  
 10  fastestLapTime   26080 non-null  int64  
 11  fastestLapSpeed  26080 non-null  float64
 12  statusId         26080 non-null  int64  
dtypes: float64(2), int64(11)
memory usage: 2.6 MB


In [16]:
# Write the cleaned frame to the outputs folder; index=False leaves the
# row index out of the CSV.
cleaned_results_path = '../../data/outputs/01_results_cleaned.csv'
df_results.to_csv(cleaned_results_path, index=False)