In [22]:
%matplotlib inline
import pandas as pd

In [23]:
from libs.time_transformer import time_to_milliseconds

## 1. Loading the Data set

In [24]:
df_results = pd.read_csv('../data/inputs/results.csv')

In [25]:
print(df_results.shape)
df_results.head()

(26080, 18)


Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


## 2. Converting fatestLapTime in milliseconds

### 2.1. add a default values to a results without fastestLapTime defined (\N)

In [26]:
# Filter rows where fastestLapTime is '\\N' and replace with '90:00'
df_results.loc[df_results.fastestLapTime == '\\N', 'fastestLapTime'] = '90:00'

### 2.2. Convert a fastestLapTime field

In [27]:
# Convert 'fastestLapTime' to milliseconds using vectorized operation
df_results['fastestLapTime'] = df_results['fastestLapTime'].apply(time_to_milliseconds)

In [28]:
df_results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,87452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,87739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,88090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,88603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,87418,218.385,1


## 3. Delete the time field and add a default to \N in time

In [29]:
df_results.drop('time', axis=1, inplace=True)

In [30]:
df_results.loc[df_results.milliseconds == '\\N', 'milliseconds'] = '0'

In [31]:

df_results.tail(10)

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
26070,26076,1110,842,214,10,12,11,11,11,0.0,44,5033534,25,12,110911,227.339,1
26071,26077,1110,822,51,77,13,12,12,12,0.0,44,5035641,26,10,110515,228.153,1
26072,26078,1110,855,51,24,17,13,13,13,0.0,44,5045891,27,5,110188,228.830,1
26073,26079,1110,848,3,23,15,14,14,14,0.0,44,5046634,35,3,109841,229.553,1
26074,26080,1110,825,210,20,16,15,15,15,0.0,44,5052204,27,14,110993,227.171,1
26075,26081,1110,817,213,3,19,16,16,16,0.0,44,5053521,25,15,110994,227.169,1
26076,26082,1110,858,3,2,18,17,17,17,0.0,44,5054926,37,9,110485,228.213,1
26077,26083,1110,807,210,27,0,18,18,18,0.0,44,5060900,26,4,109907,229.415,1
26078,26084,1110,832,6,55,4,\N,R,19,0.0,23,0,9,19,113138,222.864,130
26079,26085,1110,857,1,81,5,\N,R,20,0.0,0,0,\N,0,5400000,\N,130


## 4. Remove unnecessary fields

Because we can have these informations in position Order, for the number it is present in the driver dataset

In [32]:
df_results.drop(['positionText', 'number', 'position', 'positionOrder'], axis=1, inplace=True)

In [33]:
df_results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,points,laps,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,1,10.0,58,5690616,39,2,87452,218.3,1
1,2,18,2,2,5,8.0,58,5696094,41,3,87739,217.586,1
2,3,18,3,3,7,6.0,58,5698779,41,5,88090,216.719,1
3,4,18,4,4,11,5.0,58,5707797,58,7,88603,215.464,1
4,5,18,5,1,3,4.0,58,5708630,43,1,87418,218.385,1


## 5. Put the right type to each fields

In [34]:
df_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26080 entries, 0 to 26079
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   resultId         26080 non-null  int64  
 1   raceId           26080 non-null  int64  
 2   driverId         26080 non-null  int64  
 3   constructorId    26080 non-null  int64  
 4   grid             26080 non-null  int64  
 5   points           26080 non-null  float64
 6   laps             26080 non-null  int64  
 7   milliseconds     26080 non-null  object 
 8   fastestLap       26080 non-null  object 
 9   rank             26080 non-null  object 
 10  fastestLapTime   26080 non-null  int64  
 11  fastestLapSpeed  26080 non-null  object 
 12  statusId         26080 non-null  int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 2.6+ MB


### 5.1. Rename the field rank beacause it is a reversed work 

In [35]:
df_results.rename({'rank': 'race_rank'}, axis=1, inplace=True)

### 5.1. Fill the \N value (NA) in the fields that has type object

In [36]:
df_results.loc[df_results.milliseconds == '\\N']

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,points,laps,milliseconds,fastestLap,race_rank,fastestLapTime,fastestLapSpeed,statusId


In [37]:
df_results.loc[df_results.fastestLap == '\\N', 'fastestLap'] = 0
df_results.loc[df_results.fastestLap == '\\N']

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,points,laps,milliseconds,fastestLap,race_rank,fastestLapTime,fastestLapSpeed,statusId


In [38]:
df_results.loc[df_results.race_rank == '\\N', 'race_rank'] = '0'
df_results.loc[df_results.race_rank == '\\N']

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,points,laps,milliseconds,fastestLap,race_rank,fastestLapTime,fastestLapSpeed,statusId


In [39]:
df_results.loc[df_results.fastestLapSpeed == '\\N', 'fastestLapSpeed'] = 0
df_results.loc[df_results.fastestLapSpeed == '\\N']

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,points,laps,milliseconds,fastestLap,race_rank,fastestLapTime,fastestLapSpeed,statusId


### 5.1. Int fields to int64

In [40]:
df_results.milliseconds = df_results.milliseconds.astype('int64')
df_results.fastestLap = df_results.fastestLap.astype('int64')
df_results.fastestLapSpeed = df_results.fastestLapSpeed.astype('float64')

In [41]:
df_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26080 entries, 0 to 26079
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   resultId         26080 non-null  int64  
 1   raceId           26080 non-null  int64  
 2   driverId         26080 non-null  int64  
 3   constructorId    26080 non-null  int64  
 4   grid             26080 non-null  int64  
 5   points           26080 non-null  float64
 6   laps             26080 non-null  int64  
 7   milliseconds     26080 non-null  int64  
 8   fastestLap       26080 non-null  int64  
 9   race_rank        26080 non-null  object 
 10  fastestLapTime   26080 non-null  int64  
 11  fastestLapSpeed  26080 non-null  float64
 12  statusId         26080 non-null  int64  
dtypes: float64(2), int64(10), object(1)
memory usage: 2.6+ MB


In [42]:
df_results.to_csv('../data/outputs/01_results_cleaned.csv', index=False)