In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

### Data Preprocessing and Cleaning

In [3]:
# Read the six raw tables from ../datasets, one DataFrame per CSV file.
# The names on the left are bound in the same order as the file stems in the
# tuple, so keep the two sequences aligned when editing.
results, races, drivers, constructors, circuits, qualifying = (
    pd.read_csv(f'../datasets/{table}.csv')
    for table in ('results', 'races', 'drivers', 'constructors', 'circuits', 'qualifying')
)

In [12]:
# Build one wide table by inner-joining the raw tables in sequence:
# races -> results -> qualifying -> drivers -> constructors -> circuits.
# Inner joins keep only rows that have a match on both sides.
df1 = races.merge(results, how='inner', on='raceId')
df2 = df1.merge(qualifying, how='inner', on=['raceId', 'driverId', 'constructorId'])
df3 = df2.merge(drivers, how='inner', on='driverId')

# Discard the suffixed url columns that exist at this point.  The printed
# column list still contains url_x/url_y afterwards, so the circuits join
# below evidently produces a fresh pair (presumably both sides carry a
# `url` column -- verify against the CSV headers).
df4 = df3.merge(constructors, how='inner', on='constructorId').drop(columns=['url_x', 'url_y'])

df5 = df4.merge(circuits, how='inner', on='circuitId')

# List every column of the fully joined table.
column_names = list(df5.columns)
print(column_names)

['raceId', 'year', 'round', 'circuitId', 'name_x', 'date', 'time_x', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time', 'resultId', 'driverId', 'constructorId', 'number_x', 'grid', 'position_x', 'positionText', 'positionOrder', 'points', 'laps', 'time_y', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId', 'qualifyId', 'number_y', 'position_y', 'q1', 'q2', 'q3', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob', 'nationality_x', 'constructorRef', 'name_y', 'nationality_y', 'url_x', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng', 'alt', 'url_y']


In [14]:
# Trim the joined table down to the columns used from here on.
# The labels are grouped by column name purely for readability; the order in
# which labels are listed does not affect the result.
data = df5.drop(
    columns=[
        # keys and row identifiers
        'round', 'circuitId', 'resultId', 'driverId', 'constructorId', 'qualifyId',
        # link columns
        'url_x', 'url_y',
        # suffixed time columns
        'time_x', 'time_y',
        # practice / qualifying / sprint session date and time columns
        'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
        'quali_date', 'quali_time', 'sprint_date', 'sprint_time',
        # number and position columns
        'number_x', 'number_y', 'number',
        'position_x', 'position_y', 'positionText', 'positionOrder',
        # lap-related columns
        'laps', 'rank', 'fastestLapTime', 'fastestLapSpeed',
        # q1-q3 columns
        'q1', 'q2', 'q3',
        # driver reference columns
        'driverRef', 'code', 'dob',
        # suffixed name/nationality columns and constructor reference
        'name_y', 'nationality_y', 'constructorRef',
        # lat / lng / alt columns
        'lat', 'lng', 'alt',
    ]
)

In [15]:
# Keep only the rows whose `year` value is 1980 or later.
data = data.loc[data['year'].ge(1980)]

In [16]:
# Print the column labels that remain in `data` at this point.
column_names = list(data.columns)
print(column_names)

['raceId', 'year', 'name_x', 'date', 'grid', 'points', 'milliseconds', 'fastestLap', 'statusId', 'forename', 'surname', 'nationality_x', 'circuitRef', 'name', 'location', 'country']


In [8]:
# Preview the first five rows of the working table.
data.head(n=5)

Unnamed: 0,raceId,year,name_x,date,grid,points,milliseconds,fastestLap,statusId,forename,surname,nationality_x,circuitRef,name,location,country
0,1,2009,Australian Grand Prix,2009-03-29,1,10.0,5655784,17,1,Jenson,Button,British,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia
1,1,2009,Australian Grand Prix,2009-03-29,2,8.0,5656591,43,1,Rubens,Barrichello,Brazilian,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia
2,1,2009,Australian Grand Prix,2009-03-29,20,6.0,5657388,50,1,Jarno,Trulli,Italian,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia
3,1,2009,Australian Grand Prix,2009-03-29,19,5.0,5660219,53,1,Timo,Glock,German,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia
4,1,2009,Australian Grand Prix,2009-03-29,10,4.0,5660663,53,1,Fernando,Alonso,Spanish,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia


In [17]:
# Give the surviving columns descriptive names and add a full-name column.
# rename() returns a new frame here instead of mutating in place (pandas
# discourages inplace=True), and assign() appends `driver` in the same chain.
#
# The original mapping also listed 'position_y' -> 'position',
# 'name_y' -> 'constructor' and 'nationality_y' -> 'constructor_nationality'.
# Those source columns are removed by the earlier drop cell, and rename()
# silently skips missing labels, so the entries never had any effect and are
# left out.  If the constructor/position columns are wanted, stop dropping
# them upstream and restore the entries.
#
# NOTE(review): in the displayed rows 'name' holds the circuit name
# ("Albert Park Grand Prix Circuit") while 'name_x' holds the Grand Prix
# name, so 'GP_name' ends up labelling the circuit -- confirm this is intended.
data = (
    data
    .rename(columns={
        'name': 'GP_name',
        'grid': 'quali_pos',
        'nationality_x': 'driver_nationality',
    })
    .assign(driver=lambda frame: frame['forename'] + ' ' + frame['surname'])
)