In [15]:
# Import dependencies

from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from config import *

In [16]:
# Connect to the database and extract the full mldata table into a DataFrame.
# The cursor and the connection are released as soon as the rows are fetched,
# so a failed query or a re-run of this cell does not leave connections open.
conn = psycopg2.connect(database=DATABASE, user=USER, password=PASSWORD, host=HOST, port=PORT)
try:
    # The cursor context manager closes the cursor on exit (it does not close the connection).
    with conn.cursor() as cursor:
        cursor.execute("SELECT * FROM mldata")
        data = cursor.fetchall()
        # cursor.description holds one entry per result column; its first field is the column name
        colnames = [desc[0] for desc in cursor.description]
finally:
    conn.close()

mldata = pd.DataFrame(data, columns=colnames)
print(mldata.shape)
mldata.head()

(7940, 17)


Unnamed: 0,index,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,Win,podium
0,0,2008,1,1,18,1,1,1,1,1,1,0,0,0,0,1,1
1,1,2008,1,2,18,1,2,2,5,2,1,0,0,0,0,0,2
2,2,2008,1,3,18,1,3,3,7,3,1,0,0,0,0,0,3
3,3,2008,1,4,18,1,4,4,11,4,1,0,0,0,0,0,0
4,4,2008,1,5,18,1,5,1,3,5,1,0,0,0,0,0,0


In [17]:
# Work on an independent (deep) copy of mldata when building the train and
# test datasets, so the frame loaded from the database stays untouched
df = mldata.copy(deep=True)

In [18]:
# Feature columns fed to the model. Defined once so that the train and test
# matrices are guaranteed to use the same columns in the same order.
FEATURES = ['year', 'round', 'circuitId', 'driverId', 'constructorId', 'grid',
            'weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy']
# Season held out for evaluation; every earlier season is used for training.
TEST_YEAR = 2019

# Train the data with all outcomes before 2019.
# reset_index(drop=True) gives y_train the same 0..n-1 row labels as the
# scaled X_train that is rebuilt from a bare array below.
train = df[df['year'] < TEST_YEAR].reset_index(drop=True)
X_train = train[FEATURES]
y_train = train.Win

# Fit the scaler on the training rows only; the test rows are transformed
# with these same statistics further down.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# Create the test dataset.
# drop=True discards the old row labels directly. The table already contains a
# column named 'index', so a plain reset_index() would store the old labels as
# 'level_0' and a following drop(["index"]) would remove the source column instead.
test = df[df.year == TEST_YEAR].reset_index(drop=True)

X_test = test[FEATURES]
y_test = test.Win
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [19]:
# Build the logistic-regression classifier (lbfgs solver, at most 1000
# iterations) and fit it on the scaled training data
classifier = LogisticRegression(max_iter=1000, solver='lbfgs')
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000)

When predicting race winners, the model does not know that every race must have a winner, so for many races it predicts no winner at all. To solve this issue, we also calculate the predicted probability of each outcome. We then assign the win in each race to the driver with the highest predicted probability of winning that race (regardless of the magnitude of the probability).

In [20]:
# Predicted class label for every test row
prediction = classifier.predict(X_test)
# Predicted probability of each class for every test row
prediction1 = classifier.predict_proba(X_test)

# Predicted and actual outcome side by side
Z = pd.DataFrame({"Prediction": prediction, "Actual": y_test})
# One probability column per class
Z1 = pd.DataFrame(prediction1, columns=['proba_0', 'proba_1'])

# Combine outcomes and probabilities row by row (outer join on the row index)
Z_final = Z.merge(Z1, how='outer', left_index=True, right_index=True)

In [21]:
# Calculate the accuracy score of the model.
# accuracy_score is imported in the import cell at the top of the notebook.
# NOTE(review): most rows are non-winners, so accuracy is dominated by the
# majority class -- compare against an "always predict 0" baseline before
# reading this number as evidence that winners are predicted well.
print(accuracy_score(y_test, prediction))

0.9380952380952381


In [22]:
# Attach the race / driver identifiers from `test` to the predictions (outer
# join on the row index), then keep only the columns needed downstream.
# Note: this cell overwrites Z_final, so re-running it on its own merges
# `test` in a second time; re-run the prediction cell first.
Z_final = Z_final.merge(test, how='outer', left_index=True, right_index=True)
Z_final = Z_final.reindex(
    columns=['raceId', 'round', 'circuitId','driverId','Prediction', 'Actual', 'proba_1', 'Win', 'podium']
)
print(Z_final.shape)
Z_final.head()

(420, 9)


Unnamed: 0,raceId,round,circuitId,driverId,Prediction,Actual,proba_1,Win,podium
0,1010,1,1,822,0,1,0.116191,1,1
1,1010,1,1,1,0,0,0.43372,0,2
2,1010,1,1,830,0,0,0.03174,0,3
3,1010,1,1,20,0,0,0.157642,0,0
4,1010,1,1,844,0,0,0.017344,0,0


In [23]:
# Group by 'raceId' and take the highest predicted win probability in each race.
# maxprob is the per-race summary: one row per race with its 'proba_1_max'.
maxprob = (
    Z_final.groupby('raceId', as_index=False)['proba_1']
    .max()
    .rename(columns={'proba_1': 'proba_1_max'})
)

# Broadcast each race's maximum back onto every row of that race.
# transform('max') returns a value per original row, so no merge is needed and
# re-running this cell simply overwrites the column instead of producing
# duplicated 'proba_1_max_x' / 'proba_1_max_y' columns.
Z_final = Z_final.assign(
    proba_1_max=Z_final.groupby('raceId')['proba_1'].transform('max')
)
Z_final

Unnamed: 0,raceId,round,circuitId,driverId,Prediction,Actual,proba_1,Win,podium,proba_1_max
0,1010,1,1,822,0,1,0.116191,1,1,0.433720
1,1010,1,1,1,0,0,0.433720,0,2,0.433720
2,1010,1,1,830,0,0,0.031740,0,3,0.433720
3,1010,1,1,20,0,0,0.157642,0,0,0.433720
4,1010,1,1,844,0,0,0.017344,0,0,0.433720
...,...,...,...,...,...,...,...,...,...,...
415,1030,21,24,841,0,0,0.000032,0,0,0.471344
416,1030,21,24,847,0,0,0.000009,0,0,0.471344
417,1030,21,24,842,0,0,0.000580,0,0,0.471344
418,1030,21,24,9,0,0,0.000016,0,0,0.471344


In [24]:
# Number of times our predicted winner is the actual winner.
# A row is the predicted winner of its race when its win probability equals the
# race maximum; it counts as a hit when that driver's 'podium' value is 1.
# Vectorized boolean mask instead of a row-by-row iterrows() loop.
is_predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
winner = (is_predicted_winner & Z_final['podium'].isin([1])).astype(int)

Z_final['winner'] = winner
Z_final['winner'].sum()

8

In [25]:
# Number of times our predicted winner is in the top 2.
# A row is the predicted winner of its race when its win probability equals the
# race maximum; it counts as a hit when that driver's 'podium' value is 1 or 2.
# Vectorized boolean mask instead of a row-by-row iterrows() loop.
is_predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
top2 = (is_predicted_winner & Z_final['podium'].isin([1, 2])).astype(int)

Z_final['top2'] = top2
Z_final['top2'].sum()

11

In [26]:
# Number of times our predicted winner is in the top 3.
# A row is the predicted winner of its race when its win probability equals the
# race maximum; it counts as a hit when that driver's 'podium' value is 1, 2 or 3.
# Vectorized boolean mask instead of a row-by-row iterrows() loop.
is_predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
top3 = (is_predicted_winner & Z_final['podium'].isin([1, 2, 3])).astype(int)

Z_final['top3'] = top3
Z_final['top3'].sum()

11