In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import timedelta

### Importing data

In [2]:
# Load every raw CSV from datasets/original/ into its own DataFrame.
# Missing values in these files show up as the literal string `\N` (visible in
# the races and drivers previews further down), so columns containing them load
# as object dtype and are converted explicitly in the cleaning cells.

# Circuit, constructor, driver, race and status tables
circuits = pd.read_csv("datasets/original/circuits.csv")
constructors_df = pd.read_csv("datasets/original/constructors.csv")
drivers_df = pd.read_csv("datasets/original/drivers.csv")
races_df = pd.read_csv("datasets/original/races.csv")
status_df = pd.read_csv("datasets/original/status.csv")

# Standings tables
constructors_standings = pd.read_csv("datasets/original/constructor_standings.csv")
driver_standings = pd.read_csv("datasets/original/driver_standings.csv")

# Qualifying, sprint and race results
quali_df = pd.read_csv("datasets/original/qualifying.csv")
sprints_df = pd.read_csv("datasets/original/sprint_results.csv")
results_df = pd.read_csv("datasets/original/results.csv")

# Lap times and pit stops
lap_times = pd.read_csv("datasets/original/lap_times.csv")
pit_stops = pd.read_csv("datasets/original/pit_stops.csv")

### EDA and Feature Engineering

In [3]:
# Schema summary followed by a five-row preview of the status lookup table.
status_df.info()
status_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   statusId  139 non-null    int64 
 1   status    139 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.3+ KB


Unnamed: 0,statusId,status
0,1,Finished
1,2,Disqualified
2,3,Accident
3,4,Collision
4,5,Engine


Sprint results data

In [4]:
# Schema summary followed by a five-row preview of the raw sprint results.
sprints_df.info()
sprints_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   resultId        100 non-null    int64 
 1   raceId          100 non-null    int64 
 2   driverId        100 non-null    int64 
 3   constructorId   100 non-null    int64 
 4   number          100 non-null    int64 
 5   grid            100 non-null    int64 
 6   position        100 non-null    object
 7   positionText    100 non-null    object
 8   positionOrder   100 non-null    int64 
 9   points          100 non-null    int64 
 10  laps            100 non-null    int64 
 11  time            100 non-null    object
 12  milliseconds    100 non-null    object
 13  fastestLap      100 non-null    object
 14  fastestLapTime  100 non-null    object
 15  statusId        100 non-null    int64 
dtypes: int64(10), object(6)
memory usage: 12.6+ KB


Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,fastestLapTime,statusId
0,1,1061,830,9,33,2,1,1,1,3,17,25:38.426,1538426,14,1:30.013,1
1,2,1061,1,131,44,1,2,2,2,2,17,+1.430,1539856,17,1:29.937,1
2,3,1061,822,131,77,3,3,3,3,1,17,+7.502,1545928,17,1:29.958,1
3,4,1061,844,6,16,4,4,4,4,0,17,+11.278,1549704,16,1:30.163,1
4,5,1061,846,1,4,6,5,5,5,0,17,+24.111,1562537,16,1:30.566,1


In [5]:
# `position` holds the finishing position only for drivers who finished (it has
# no value otherwise), whereas `positionOrder` gives the order every driver ended
# up in after the race. Keep `positionOrder` and drop `position`/`positionText`
# together with the `resultId` and `number` columns.
# The labels are passed as a list (an ordered list-like) rather than a set literal.
sprints_df = sprints_df.drop(columns=['resultId', 'number', 'position', 'positionText'])
sprints_df.head()

Unnamed: 0,raceId,driverId,constructorId,grid,positionOrder,points,laps,time,milliseconds,fastestLap,fastestLapTime,statusId
0,1061,830,9,2,1,3,17,25:38.426,1538426,14,1:30.013,1
1,1061,1,131,1,2,2,17,+1.430,1539856,17,1:29.937,1
2,1061,822,131,3,3,1,17,+7.502,1545928,17,1:29.958,1
3,1061,844,6,4,4,0,17,+11.278,1549704,16,1:30.163,1
4,1061,846,1,6,5,0,17,+24.111,1562537,16,1:30.566,1


In [6]:
# Coerce the count/duration columns to numbers; anything non-numeric becomes NaN
# and is then zero-filled.
for col in ('milliseconds', 'fastestLap'):
    sprints_df[col] = pd.to_numeric(sprints_df[col], errors='coerce').fillna(0)

# Fastest-lap strings look like 'm:ss.fff'; prefixing '00:' gives 'hh:mm:ss.fff',
# which to_timedelta can parse. Unparseable values become NaT and are set to zero.
# The filled Series is assigned back to the frame: calling
# `sprints_df.fastestLapTime.fillna(..., inplace=True)` acts on a column selection
# and does not update the frame under pandas Copy-on-Write.
sprints_df['fastestLapTime'] = pd.to_timedelta(
    '00:' + sprints_df['fastestLapTime'], errors='coerce').fillna(pd.Timedelta(0))

# Rebuild `time` as an absolute duration from `milliseconds` (the raw column mixes
# the winner's time with '+gap' strings). `milliseconds` is already zero-filled
# above, so the trailing fillna is only a safeguard.
sprints_df['time'] = pd.to_timedelta(
    sprints_df['milliseconds'], unit='ms').fillna(pd.Timedelta(0))

sprints_df.to_csv(r'datasets/cleaned/sprint_results.csv', index=False)
sprints_df.info()
sprints_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype          
---  ------          --------------  -----          
 0   raceId          100 non-null    int64          
 1   driverId        100 non-null    int64          
 2   constructorId   100 non-null    int64          
 3   grid            100 non-null    int64          
 4   positionOrder   100 non-null    int64          
 5   points          100 non-null    int64          
 6   laps            100 non-null    int64          
 7   time            100 non-null    timedelta64[ns]
 8   milliseconds    100 non-null    float64        
 9   fastestLap      100 non-null    float64        
 10  fastestLapTime  100 non-null    timedelta64[ns]
 11  statusId        100 non-null    int64          
dtypes: float64(2), int64(8), timedelta64[ns](2)
memory usage: 9.5 KB


Unnamed: 0,raceId,driverId,constructorId,grid,positionOrder,points,laps,time,milliseconds,fastestLap,fastestLapTime,statusId
0,1061,830,9,2,1,3,17,0 days 00:25:38.426000,1538426.0,14.0,0 days 00:01:30.013000,1
1,1061,1,131,1,2,2,17,0 days 00:25:39.856000,1539856.0,17.0,0 days 00:01:29.937000,1
2,1061,822,131,3,3,1,17,0 days 00:25:45.928000,1545928.0,17.0,0 days 00:01:29.958000,1
3,1061,844,6,4,4,0,17,0 days 00:25:49.704000,1549704.0,16.0,0 days 00:01:30.163000,1
4,1061,846,1,6,5,0,17,0 days 00:26:02.537000,1562537.0,16.0,0 days 00:01:30.566000,1


Race results data

In [7]:
# Schema summary followed by a five-row preview of the raw race results.
results_df.info()
results_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25660 entries, 0 to 25659
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   resultId         25660 non-null  int64  
 1   raceId           25660 non-null  int64  
 2   driverId         25660 non-null  int64  
 3   constructorId    25660 non-null  int64  
 4   number           25660 non-null  object 
 5   grid             25660 non-null  int64  
 6   position         25660 non-null  object 
 7   positionText     25660 non-null  object 
 8   positionOrder    25660 non-null  int64  
 9   points           25660 non-null  float64
 10  laps             25660 non-null  int64  
 11  time             25660 non-null  object 
 12  milliseconds     25660 non-null  object 
 13  fastestLap       25660 non-null  object 
 14  rank             25660 non-null  object 
 15  fastestLapTime   25660 non-null  object 
 16  fastestLapSpeed  25660 non-null  object 
 17  statusId    

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [8]:
# Drop the `resultId` and `number` columns plus `position`/`positionText`;
# `positionOrder` is kept as the finishing-order column.
# The labels are passed as a list (an ordered list-like) rather than a set literal.
results_df = results_df.drop(columns=['resultId', 'number', 'position', 'positionText'])
results_df.head()

Unnamed: 0,raceId,driverId,constructorId,grid,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,18,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,18,2,2,5,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,18,3,3,7,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,18,4,4,11,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,18,5,1,3,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [9]:
# Every fill below is assigned back to the frame. Calling
# `results_df.<col>.fillna(..., inplace=True)` acts on a column selection and
# does not update the frame under pandas Copy-on-Write.

# 1) Coerce the numeric-looking object columns; non-numeric entries become NaN.
for col in ('milliseconds', 'fastestLap', 'rank', 'fastestLapSpeed'):
    results_df[col] = pd.to_numeric(results_df[col], errors='coerce')

# 2) Rebuild `time` as an absolute duration from `milliseconds` (the raw column
#    mixes the winner's time with '+gap' strings). Rows without a millisecond
#    value become NaT and are set to a zero duration.
results_df['time'] = pd.to_timedelta(
    results_df['milliseconds'], unit='ms').fillna(pd.Timedelta(0))

# 3) Zero-fill the remaining gaps in the plain numeric columns.
for col in ('milliseconds', 'fastestLap', 'rank'):
    results_df[col] = results_df[col].fillna(0)

# 4) Impute missing fastest-lap speed with the mean speed of rows sharing the same
#    statusId; where a status group has no speed at all, use the overall mean.
#    Both means are computed on the column before it is overwritten.
results_df['fastestLapSpeed'] = results_df['fastestLapSpeed'].fillna(
    results_df.groupby('statusId')['fastestLapSpeed'].transform('mean')
    .fillna(results_df['fastestLapSpeed'].mean()))

# 5) Fastest-lap strings look like 'm:ss.fff'; prefixing '00:' gives 'hh:mm:ss.fff',
#    which to_timedelta can parse. Unparseable values become a zero duration.
results_df['fastestLapTime'] = pd.to_timedelta(
    '00:' + results_df['fastestLapTime'], errors='coerce').fillna(pd.Timedelta(0))

results_df.to_csv(r'datasets/cleaned/race_results.csv', index=False)
results_df.info()
results_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25660 entries, 0 to 25659
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype          
---  ------           --------------  -----          
 0   raceId           25660 non-null  int64          
 1   driverId         25660 non-null  int64          
 2   constructorId    25660 non-null  int64          
 3   grid             25660 non-null  int64          
 4   positionOrder    25660 non-null  int64          
 5   points           25660 non-null  float64        
 6   laps             25660 non-null  int64          
 7   time             25660 non-null  timedelta64[ns]
 8   milliseconds     25660 non-null  float64        
 9   fastestLap       25660 non-null  float64        
 10  rank             25660 non-null  float64        
 11  fastestLapTime   25660 non-null  timedelta64[ns]
 12  fastestLapSpeed  25660 non-null  float64        
 13  statusId         25660 non-null  int64          
dtypes: float64(5), int64(7

Unnamed: 0,raceId,driverId,constructorId,grid,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,18,1,1,1,1,10.0,58,0 days 01:34:50.616000,5690616.0,39.0,2.0,0 days 00:01:27.452000,218.3,1
1,18,2,2,5,2,8.0,58,0 days 01:34:56.094000,5696094.0,41.0,3.0,0 days 00:01:27.739000,217.586,1
2,18,3,3,7,3,6.0,58,0 days 01:34:58.779000,5698779.0,41.0,5.0,0 days 00:01:28.090000,216.719,1
3,18,4,4,11,4,5.0,58,0 days 01:35:07.797000,5707797.0,58.0,7.0,0 days 00:01:28.603000,215.464,1
4,18,5,1,3,5,4.0,58,0 days 01:35:08.630000,5708630.0,43.0,1.0,0 days 00:01:27.418000,218.385,1


Qualifying results data

In [10]:
# Schema summary followed by a five-row preview of the raw qualifying results.
quali_df.info()
quali_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9395 entries, 0 to 9394
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   qualifyId      9395 non-null   int64 
 1   raceId         9395 non-null   int64 
 2   driverId       9395 non-null   int64 
 3   constructorId  9395 non-null   int64 
 4   number         9395 non-null   int64 
 5   position       9395 non-null   int64 
 6   q1             9387 non-null   object
 7   q2             9244 non-null   object
 8   q3             9101 non-null   object
dtypes: int64(6), object(3)
memory usage: 660.7+ KB


Unnamed: 0,qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
0,1,18,1,1,22,1,1:26.572,1:25.187,1:26.714
1,2,18,9,2,4,2,1:26.103,1:25.315,1:26.869
2,3,18,5,1,23,3,1:25.664,1:25.452,1:27.079
3,4,18,13,6,2,4,1:25.994,1:25.691,1:27.178
4,5,18,2,2,3,5,1:25.960,1:25.518,1:27.236


In [11]:
# Drop the `qualifyId` and `number` columns.
# The labels are passed as a list (an ordered list-like) rather than a set literal.
quali_df = quali_df.drop(columns=['qualifyId', 'number'])
quali_df.head()

Unnamed: 0,raceId,driverId,constructorId,position,q1,q2,q3
0,18,1,1,1,1:26.572,1:25.187,1:26.714
1,18,9,2,2,1:26.103,1:25.315,1:26.869
2,18,5,1,3,1:25.664,1:25.452,1:27.079
3,18,13,6,4,1:25.994,1:25.691,1:27.178
4,18,2,2,5,1:25.960,1:25.518,1:27.236


In [12]:
# Session times look like 'm:ss.fff'; prefixing '00:' gives 'hh:mm:ss.fff', which
# to_timedelta can parse. Missing or unparseable entries become NaT and are set to
# a zero duration.
# The fill is applied per session column instead of to the whole frame, so a gap
# in any non-time column can never be replaced by a Timedelta.
for col in ('q1', 'q2', 'q3'):
    quali_df[col] = pd.to_timedelta(
        '00:' + quali_df[col], errors='coerce').fillna(pd.Timedelta(0))

quali_df.to_csv(r'datasets/cleaned/qualifying_results.csv', index=False)

quali_df.info()
quali_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9395 entries, 0 to 9394
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   raceId         9395 non-null   int64          
 1   driverId       9395 non-null   int64          
 2   constructorId  9395 non-null   int64          
 3   position       9395 non-null   int64          
 4   q1             9395 non-null   timedelta64[ns]
 5   q2             9395 non-null   timedelta64[ns]
 6   q3             9395 non-null   timedelta64[ns]
dtypes: int64(4), timedelta64[ns](3)
memory usage: 513.9 KB


Unnamed: 0,raceId,driverId,constructorId,position,q1,q2,q3
0,18,1,1,1,0 days 00:01:26.572000,0 days 00:01:25.187000,0 days 00:01:26.714000
1,18,9,2,2,0 days 00:01:26.103000,0 days 00:01:25.315000,0 days 00:01:26.869000
2,18,5,1,3,0 days 00:01:25.664000,0 days 00:01:25.452000,0 days 00:01:27.079000
3,18,13,6,4,0 days 00:01:25.994000,0 days 00:01:25.691000,0 days 00:01:27.178000
4,18,2,2,5,0 days 00:01:25.960000,0 days 00:01:25.518000,0 days 00:01:27.236000


Races data

In [13]:
# Schema summary followed by a five-row preview of the raw races table.
races_df.info()
races_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1079 entries, 0 to 1078
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   raceId       1079 non-null   int64 
 1   year         1079 non-null   int64 
 2   round        1079 non-null   int64 
 3   circuitId    1079 non-null   int64 
 4   name         1079 non-null   object
 5   date         1079 non-null   object
 6   time         1079 non-null   object
 7   url          1079 non-null   object
 8   fp1_date     1079 non-null   object
 9   fp1_time     1079 non-null   object
 10  fp2_date     1079 non-null   object
 11  fp2_time     1079 non-null   object
 12  fp3_date     1079 non-null   object
 13  fp3_time     1079 non-null   object
 14  quali_date   1079 non-null   object
 15  quali_time   1079 non-null   object
 16  sprint_date  1079 non-null   object
 17  sprint_time  1079 non-null   object
dtypes: int64(4), object(14)
memory usage: 151.9+ KB


Unnamed: 0,raceId,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N


In [14]:
# Drop the session schedule columns (fp1_date .. sprint_time), which hold no
# values until 2021, together with the race start `time` and the `url` column.
# The labels are passed as a list (an ordered list-like) rather than a set literal.
races_df = races_df.drop(columns=[
    'time', 'url',
    'fp1_date', 'fp1_time',
    'fp2_date', 'fp2_time',
    'fp3_date', 'fp3_time',
    'quali_date', 'quali_time',
    'sprint_date', 'sprint_time',
])
races_df.head()

Unnamed: 0,raceId,year,round,circuitId,name,date
0,1,2009,1,1,Australian Grand Prix,2009-03-29
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05
2,3,2009,3,17,Chinese Grand Prix,2009-04-19
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26
4,5,2009,5,4,Spanish Grand Prix,2009-05-10


In [15]:
# Convert the race date strings to datetime64 and persist the trimmed table.
races_df['date'] = races_df['date'].pipe(pd.to_datetime)
races_df.to_csv(r'datasets/cleaned/races.csv', index=False)

# Confirm the resulting dtypes and preview the first rows.
races_df.info()
races_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1079 entries, 0 to 1078
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   raceId     1079 non-null   int64         
 1   year       1079 non-null   int64         
 2   round      1079 non-null   int64         
 3   circuitId  1079 non-null   int64         
 4   name       1079 non-null   object        
 5   date       1079 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 50.7+ KB


Unnamed: 0,raceId,year,round,circuitId,name,date
0,1,2009,1,1,Australian Grand Prix,2009-03-29
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05
2,3,2009,3,17,Chinese Grand Prix,2009-04-19
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26
4,5,2009,5,4,Spanish Grand Prix,2009-05-10


Pit stop data

In [16]:
# Schema summary followed by a five-row preview of the raw pit-stop table.
pit_stops.info()
pit_stops.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9299 entries, 0 to 9298
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   raceId        9299 non-null   int64 
 1   driverId      9299 non-null   int64 
 2   stop          9299 non-null   int64 
 3   lap           9299 non-null   int64 
 4   time          9299 non-null   object
 5   duration      9299 non-null   object
 6   milliseconds  9299 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 508.7+ KB


Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds
0,841,153,1,1,17:05:23,26.898,26898
1,841,30,1,1,17:05:52,25.021,25021
2,841,17,1,11,17:20:48,23.426,23426
3,841,4,1,12,17:22:34,23.251,23251
4,841,13,1,13,17:24:10,23.842,23842


In [17]:
# Parse the stop duration once: first as 'ss.fff', and for values that do not
# match that format, as 'm:ss.fff'. Parsing yields timestamps on a placeholder
# date, so subtracting the midnight-normalized timestamp leaves just the elapsed
# time as a timedelta.
# The original expression repeated the two-format parse on both sides of the
# subtraction (four passes over the column); the result here is the same.
duration_ts = pd.to_datetime(pit_stops['duration'], format='%S.%f', errors='coerce').fillna(
    pd.to_datetime(pit_stops['duration'], format='%M:%S.%f', errors='coerce'))
pit_stops['duration'] = duration_ts - duration_ts.dt.normalize()

# `time` is an 'hh:mm:ss' string in the preview; store it as a timedelta.
pit_stops['time'] = pd.to_timedelta(pit_stops['time'])

pit_stops.to_csv(r'datasets/cleaned/pit_stops.csv', index=False)

pit_stops.info()
pit_stops.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9299 entries, 0 to 9298
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   raceId        9299 non-null   int64          
 1   driverId      9299 non-null   int64          
 2   stop          9299 non-null   int64          
 3   lap           9299 non-null   int64          
 4   time          9299 non-null   timedelta64[ns]
 5   duration      9299 non-null   timedelta64[ns]
 6   milliseconds  9299 non-null   int64          
dtypes: int64(5), timedelta64[ns](2)
memory usage: 508.7 KB


Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds
0,841,153,1,1,0 days 17:05:23,0 days 00:00:26.898000,26898
1,841,30,1,1,0 days 17:05:52,0 days 00:00:25.021000,25021
2,841,17,1,11,0 days 17:20:48,0 days 00:00:23.426000,23426
3,841,4,1,12,0 days 17:22:34,0 days 00:00:23.251000,23251
4,841,13,1,13,0 days 17:24:10,0 days 00:00:23.842000,23842


Lap time data

In [18]:
# Schema summary followed by a five-row preview of the raw lap-time table.
lap_times.info()
lap_times.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528785 entries, 0 to 528784
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   raceId        528785 non-null  int64 
 1   driverId      528785 non-null  int64 
 2   lap           528785 non-null  int64 
 3   position      528785 non-null  int64 
 4   time          528785 non-null  object
 5   milliseconds  528785 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 24.2+ MB


Unnamed: 0,raceId,driverId,lap,position,time,milliseconds
0,841,20,1,1,1:38.109,98109
1,841,20,2,1,1:33.006,93006
2,841,20,3,1,1:32.713,92713
3,841,20,4,1,1:32.803,92803
4,841,20,5,1,1:32.342,92342


In [19]:
# Parse the lap time once: first as 'm:ss.fff', and for values that do not match
# that format, as 'h:mm:ss.fff'. Parsing yields timestamps on a placeholder date,
# so subtracting the midnight-normalized timestamp leaves just the elapsed time
# as a timedelta.
# The original expression repeated the two-format parse on both sides of the
# subtraction (four passes over the column); the result here is the same.
lap_ts = pd.to_datetime(lap_times['time'], format='%M:%S.%f', errors='coerce').fillna(
    pd.to_datetime(lap_times['time'], format='%H:%M:%S.%f', errors='coerce'))
lap_times['time'] = lap_ts - lap_ts.dt.normalize()

lap_times.to_csv(r'datasets/cleaned/lap_times.csv', index=False)

lap_times.info()
lap_times.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528785 entries, 0 to 528784
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype          
---  ------        --------------   -----          
 0   raceId        528785 non-null  int64          
 1   driverId      528785 non-null  int64          
 2   lap           528785 non-null  int64          
 3   position      528785 non-null  int64          
 4   time          528785 non-null  timedelta64[ns]
 5   milliseconds  528785 non-null  int64          
dtypes: int64(5), timedelta64[ns](1)
memory usage: 24.2 MB


Unnamed: 0,raceId,driverId,lap,position,time,milliseconds
0,841,20,1,1,0 days 00:01:38.109000,98109
1,841,20,2,1,0 days 00:01:33.006000,93006
2,841,20,3,1,0 days 00:01:32.713000,92713
3,841,20,4,1,0 days 00:01:32.803000,92803
4,841,20,5,1,0 days 00:01:32.342000,92342


Drivers data

In [20]:
# Schema summary followed by a five-row preview of the raw drivers table.
drivers_df.info()
drivers_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 854 entries, 0 to 853
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   driverId     854 non-null    int64 
 1   driverRef    854 non-null    object
 2   number       854 non-null    object
 3   code         854 non-null    object
 4   forename     854 non-null    object
 5   surname      854 non-null    object
 6   dob          854 non-null    object
 7   nationality  854 non-null    object
 8   url          854 non-null    object
dtypes: int64(1), object(8)
memory usage: 60.2+ KB


Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen


In [21]:
# Drop the `number`, `code` and `url` columns.
# The labels are passed as a list (an ordered list-like) rather than a set literal.
drivers_df = drivers_df.drop(columns=['number', 'code', 'url'])
drivers_df.head()

Unnamed: 0,driverId,driverRef,forename,surname,dob,nationality
0,1,hamilton,Lewis,Hamilton,1985-01-07,British
1,2,heidfeld,Nick,Heidfeld,1977-05-10,German
2,3,rosberg,Nico,Rosberg,1985-06-27,German
3,4,alonso,Fernando,Alonso,1981-07-29,Spanish
4,5,kovalainen,Heikki,Kovalainen,1981-10-19,Finnish


In [22]:
# Build a single `name` column from forename + surname, parse `dob` as a date,
# then remove the two source name columns.
drivers_df = (
    drivers_df
    .assign(
        name=drivers_df['forename'] + ' ' + drivers_df['surname'],
        dob=pd.to_datetime(drivers_df['dob']),
    )
    .drop(columns=['forename', 'surname'])
)

drivers_df.to_csv(r'datasets/cleaned/drivers.csv', index=False)

# Confirm the resulting dtypes and preview the first rows.
drivers_df.info()
drivers_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 854 entries, 0 to 853
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   driverId     854 non-null    int64         
 1   driverRef    854 non-null    object        
 2   dob          854 non-null    datetime64[ns]
 3   nationality  854 non-null    object        
 4   name         854 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 33.5+ KB


Unnamed: 0,driverId,driverRef,dob,nationality,name
0,1,hamilton,1985-01-07,British,Lewis Hamilton
1,2,heidfeld,1977-05-10,German,Nick Heidfeld
2,3,rosberg,1985-06-27,German,Nico Rosberg
3,4,alonso,1981-07-29,Spanish,Fernando Alonso
4,5,kovalainen,1981-10-19,Finnish,Heikki Kovalainen


In [23]:
# Show the raw schema first, then drop the `driverStandingsId` and `positionText`
# columns and persist the result.
# The labels are passed as a list (an ordered list-like) rather than a set literal.
driver_standings.info()
driver_standings = driver_standings.drop(columns=['driverStandingsId', 'positionText'])

driver_standings.to_csv(r'datasets/cleaned/driver_standings.csv', index=False)

driver_standings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33686 entries, 0 to 33685
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   driverStandingsId  33686 non-null  int64  
 1   raceId             33686 non-null  int64  
 2   driverId           33686 non-null  int64  
 3   points             33686 non-null  float64
 4   position           33686 non-null  int64  
 5   positionText       33686 non-null  object 
 6   wins               33686 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 1.8+ MB


Unnamed: 0,raceId,driverId,points,position,wins
0,18,1,10.0,1,1
1,18,2,8.0,2,0
2,18,3,6.0,3,0
3,18,4,5.0,4,0
4,18,5,4.0,5,0


Constructors data

In [24]:
# Show the raw schema first, then drop the `constructorStandingsId` and
# `positionText` columns and persist the result.
# The labels are passed as a list (an ordered list-like) rather than a set literal.
constructors_standings.info()
constructors_standings = constructors_standings.drop(
    columns=['constructorStandingsId', 'positionText'])

constructors_standings.to_csv(r'datasets/cleaned/constructor_standings.csv', index=False)

constructors_standings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12841 entries, 0 to 12840
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   constructorStandingsId  12841 non-null  int64  
 1   raceId                  12841 non-null  int64  
 2   constructorId           12841 non-null  int64  
 3   points                  12841 non-null  float64
 4   position                12841 non-null  int64  
 5   positionText            12841 non-null  object 
 6   wins                    12841 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 702.4+ KB


Unnamed: 0,raceId,constructorId,points,position,wins
0,18,1,14.0,1,1
1,18,2,8.0,3,0
2,18,3,9.0,2,0
3,18,4,5.0,4,0
4,18,5,2.0,5,0


In [25]:
# Show the raw schema, remove the `url` column, persist the table and preview it.
constructors_df.info()
constructors_df = constructors_df.drop(labels='url', axis='columns')
constructors_df.to_csv(r'datasets/cleaned/constructors.csv', index=False)

constructors_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   constructorId   211 non-null    int64 
 1   constructorRef  211 non-null    object
 2   name            211 non-null    object
 3   nationality     211 non-null    object
 4   url             211 non-null    object
dtypes: int64(1), object(4)
memory usage: 8.4+ KB


Unnamed: 0,constructorId,constructorRef,name,nationality
0,1,mclaren,McLaren,British
1,2,bmw_sauber,BMW Sauber,German
2,3,williams,Williams,British
3,4,renault,Renault,French
4,5,toro_rosso,Toro Rosso,Italian
