In [1]:
# Import dependencies

import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score
from config import *
import psycopg2

In [2]:
# Open a PostgreSQL connection (the connection settings are presumably supplied by
# `from config import *`) and load the whole `mldata` table into a DataFrame.
conn = psycopg2.connect(
    database=DATABASE,
    user=USER,
    password=PASSWORD,
    host=HOST,
    port=PORT,
)
cursor = conn.cursor()
cursor.execute("SELECT * FROM mldata")
data = cursor.fetchall()
# The first item of every cursor.description entry is used as the column name.
colnames = [column[0] for column in cursor.description]
mldata = pd.DataFrame(data, columns=colnames)
print(mldata.shape)
mldata.head()

(7940, 22)


Unnamed: 0,index,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,...,weather_dry,weather_wet,weather_cloudy,Win,podium,age,driverposition,driverwin,constructorposition,constructorwin
0,0,2008,1,1,18,1,1,1,1,1,...,0,0,0,1,1,23.0,0.0,0.0,0.0,0.0
1,1,2008,1,2,18,1,2,2,5,2,...,0,0,0,0,2,31.0,0.0,0.0,0.0,0.0
2,2,2008,1,3,18,1,3,3,7,3,...,0,0,0,0,3,23.0,0.0,0.0,0.0,0.0
3,3,2008,1,4,18,1,4,4,11,4,...,0,0,0,0,0,27.0,0.0,0.0,0.0,0.0
4,4,2008,1,5,18,1,5,1,3,5,...,0,0,0,0,0,26.0,0.0,0.0,0.0,0.0


In [3]:
# Replace every NaN in the frame with 0 (the next cell casts several columns to int,
# which cannot be done while they still contain NaN).
# NOTE(review): this turns "unknown" into a literal 0 in every column — confirm that
# 0 is a sensible stand-in for each of them.
mldata = mldata.replace(np.nan, 0)

In [4]:
# Cast the float-typed columns listed below to integers in one assignment.
int_columns = ['age', 'driverposition', 'driverwin', 'constructorposition', 'constructorwin']
mldata[int_columns] = mldata[int_columns].astype(int)

## Building the Race Winner Predictor

In [5]:
# Work on a copy so that building the train and test datasets never touches mldata itself.
df = mldata.copy()

In [6]:
# Chronological split: every season up to and including 2016 trains the model,
# the 2017 season is held out for evaluation.
# The feature list is defined once so the train and test matrices cannot drift apart.
FEATURE_COLUMNS = ['year', 'round', 'circuitId', 'driverId', 'constructorId', 'grid',
                   'weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy']

train = df[df['year'] <= 2016]
X_train = train[FEATURE_COLUMNS]
y_train = train.Win

# Fit the scaler on the training features only; the same fitted scaler is applied
# to the test features below.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# Create the test dataset with a fresh 0..n-1 index: later cells line predictions up
# with `test` by index. drop=True discards the old row labels directly; the data
# already has its own 'index' column, so a plain reset_index() followed by
# drop("index") would remove that column and leave a stray 'level_0' behind.
test = df[df.year == 2017].reset_index(drop=True)

X_test = test[FEATURE_COLUMNS]
y_test = test.Win
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [7]:
# Create the logistic-regression classifier (lbfgs solver, up to 1000 iterations)
# and fit it on the scaled training features against the Win label.
classifier = LogisticRegression(solver='lbfgs', max_iter=1000)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

When predicting race winners, the model does not know that every race must have exactly one winner, so for many races it predicts no winner at all. To solve this issue, we also calculate the predicted win probability for every driver. Finally, we assign the win in each race to the driver who has the highest probability to win that race (regardless of the magnitude of the probability).

In [8]:
# Hard class labels for every row of the test set
prediction = classifier.predict(X_test)
# Per-class probabilities for the same rows
prediction1 = classifier.predict_proba(X_test)
# Predicted label side by side with the actual label
Z = pd.DataFrame({"Prediction": prediction, "Actual": y_test})
# Probabilities as a frame; the column names assume the class order is [0, 1]
# NOTE(review): confirm against classifier.classes_
Z1 = pd.DataFrame(prediction1, columns=['proba_0', 'proba_1'])
# Index-aligned outer join of the labels and the probabilities
Z_final = Z.join(Z1, how='outer')

In [9]:
# Accuracy and class-weighted F1 of the hard 0/1 predictions on the test set.
# NOTE(review): winners are presumably a small minority of the rows, so both numbers
# are dominated by the non-winner class — interpret with care.
classifier_accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
classifier_f1 = f1_score(y_true=y_test, y_pred=prediction, average='weighted')
for label, value in (('Accuracy (classifier): ', classifier_accuracy),
                     ('F1 (classifier): ', classifier_f1)):
    print(label, "%.2f" % (value*100))

Accuracy (classifier):  94.75
F1 (classifier):  92.44


In [10]:
# Attach the race and driver columns of `test` to the predictions (index-aligned),
# then keep only the columns that the following cells use.
keep_columns = ['raceId', 'round', 'circuitId', 'driverId', 'Prediction', 'Actual',
                'proba_1', 'Win', 'podium']
Z_final = pd.merge(Z_final, test, how='outer', left_index=True, right_index=True)
Z_final = Z_final.reindex(columns=keep_columns)
print(Z_final.shape)
Z_final.head()

(400, 9)


Unnamed: 0,raceId,round,circuitId,driverId,Prediction,Actual,proba_1,Win,podium
0,969,1,1,20,0,1,0.258741,1,1
1,969,1,1,1,0,0,0.408776,0,2
2,969,1,1,822,0,0,0.040636,0,3
3,969,1,1,8,0,0,0.096855,0,0
4,969,1,1,830,0,0,0.011839,0,0


In [11]:
# For every race (grouped by 'raceId') take the highest win probability among its
# rows and store it on each of that race's rows as 'proba_1_max'.
# transform('max') returns a Series aligned with Z_final's own index, so this is a
# plain column assignment: re-running the cell overwrites the column, whereas a
# repeated merge would create 'proba_1_max_x' / 'proba_1_max_y' and break the
# cells below.
Z_final['proba_1_max'] = Z_final.groupby('raceId')['proba_1'].transform('max')
print(Z_final.shape)
Z_final.head()

(400, 10)


Unnamed: 0,raceId,round,circuitId,driverId,Prediction,Actual,proba_1,Win,podium,proba_1_max
0,969,1,1,20,0,1,0.258741,1,1,0.408776
1,969,1,1,1,0,0,0.408776,0,2,0.408776
2,969,1,1,822,0,0,0.040636,0,3,0.408776
3,969,1,1,8,0,0,0.096855,0,0,0.408776
4,969,1,1,830,0,0,0.011839,0,0,0.408776


## Predicting the Race Winner

In [12]:
# Number of times our predicted winner is the actual winner.
# A row is the predicted winner of its race when its win probability equals the
# race maximum; it counts only if that row's 'podium' value is 1.
predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
finished_first = Z_final['podium'].isin([1])
winner = (predicted_winner & finished_first).astype(int).tolist()

Z_final['winner'] = winner
Z_final['winner'].sum()

10

In [13]:
# Number of times our predicted winner is in the top 2.
# Same predicted-winner mask as before (probability equals the race maximum),
# counted when the row's 'podium' value is 1 or 2.
predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
finished_top2 = Z_final['podium'].isin([1, 2])
top2 = (predicted_winner & finished_top2).astype(int).tolist()

Z_final['top2'] = top2
Z_final['top2'].sum()

16

In [14]:
# Number of times our predicted winner is in the top 3.
# Same predicted-winner mask as before (probability equals the race maximum),
# counted when the row's 'podium' value is 1, 2 or 3.
predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
finished_top3 = Z_final['podium'].isin([1, 2, 3])
top3 = (predicted_winner & finished_top3).astype(int).tolist()

Z_final['top3'] = top3
Z_final['top3'].sum()

16

In [15]:
# Combine the identifier columns and actual finishing position from `test` with the
# win probability from Z_final (index-aligned).
# This dataframe will be used to predict all 20 race positions.
# `test` itself is left untouched: joining the probability onto `test` in place made
# the cell fail on a second run because the 'proba_1' column would already exist.
probability = Z_final["proba_1"].copy()
id_columns = ['round', 'resultId', 'raceId', 'circuitId', 'driverId', 'constructorId', 'position']
classifier_pred = test[id_columns].join(probability, how='outer')
print(classifier_pred.shape)
classifier_pred.head()

(400, 8)


Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1
0,1,23379,969,1,20,6,1,0.258741
1,1,23380,969,1,1,131,2,0.408776
2,1,23381,969,1,822,131,3,0.040636
3,1,23382,969,1,8,6,4,0.096855
4,1,23383,969,1,830,9,5,0.011839


## Processing classifier_pred for predicting all 20 Race outcomes

In order to predict all 20 race outcomes, we will sort the drivers within each race by their win probability in descending order. We can then add a counter to each row, starting at 1 and ending at 20, which will represent the driver's predicted finishing position in the race.

In [16]:
# Copy classifier_pred and order it race by race ('round' ascending), with the
# highest win probability first inside each race ('proba_1' descending).
X = classifier_pred.copy().sort_values(by=["round", "proba_1"], ascending=[True, False])
X.head()

Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1
1,1,23380,969,1,1,131,2,0.408776
0,1,23379,969,1,20,6,1,0.258741
16,1,23395,969,1,817,9,0,0.199275
3,1,23382,969,1,8,6,4,0.096855
2,1,23381,969,1,822,131,3,0.040636


In [17]:
# Number the rows of every race 1, 2, 3, ... in their current order; this running
# count is the driver's predicted finishing position.
# It relies on X being sorted by descending 'proba_1' within each 'round' (done in
# the previous cell), so the most likely winner of a race gets position 1.
# cumcount() is 0-based, hence the +1. Grouping on the data itself replaces the
# hard-coded list of rounds 1..21, no longer shadows the built-in `round`, and no
# longer overwrites the `prediction` array produced by the classifier.
X["Pred_Position"] = X.groupby("round").cumcount() + 1
X.head()

Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1,Pred_Position
1,1,23380,969,1,1,131,2,0.408776,1
0,1,23379,969,1,20,6,1,0.258741,2
16,1,23395,969,1,817,9,0,0.199275,3
3,1,23382,969,1,8,6,4,0.096855,4
2,1,23381,969,1,822,131,3,0.040636,5


In [18]:
# Absolute difference between the actual and the predicted finishing position.
# NOTE(review): the frame contains rows with position 0; if 0 encodes "not classified"
# rather than a real placing, this difference is not a meaningful distance — confirm.
X['Delta'] = (X['position'] - X['Pred_Position']).abs()

In [19]:
# Flag each row: 1 when the prediction is at most 2 places off, otherwise 0.
X["Spread"] = (X["Delta"] <= 2).astype("int64")

In addition to predicting every race position (with a spread of 2), we also want to predict the driver's finishing group or bin. To do that, we will create additional columns where we will map the predicted and actual finishing positions to their respective bins: Podium, topsix, topten and else. 

In [20]:
# Add two more columns holding the finishing-position bin of the actual and of the
# predicted position.

podium = [1,2,3]
topsix = [4,5,6]
topten = [7,8,9,10]

def position_group(x):
    """Return the bin code of finishing position `x`.

    1 if `x` is in `podium`, 2 if in `topsix`, 3 if in `topten`;
    every other value (including 0) maps to 4.
    """
    for code, members in ((1, podium), (2, topsix), (3, topten)):
        if x in members:
            return code
    return 4


X["pos_group"] = X["position"].apply(position_group)
X["pred_pos_group"] = X["Pred_Position"].apply(position_group)

In [21]:
# This is the final dataframe which we will use to calculate the prediction percentages.
print(X.shape)
X

(400, 13)


Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1,Pred_Position,Delta,Spread,pos_group,pred_pos_group
1,1,23380,969,1,1,131,2,0.408776,1,1,1,1,1
0,1,23379,969,1,20,6,1,0.258741,2,1,1,1,1
16,1,23395,969,1,817,9,0,0.199275,3,3,0,4,1
3,1,23382,969,1,8,6,4,0.096855,4,0,1,2,2
2,1,23381,969,1,822,131,3,0.040636,5,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,20,23779,988,24,840,3,18,0.000032,16,2,1,4,4
395,20,23777,988,24,842,5,16,0.000010,17,1,1,4,4
393,20,23775,988,24,836,15,14,0.000005,18,4,0,4,4
396,20,23778,988,24,828,15,17,0.000003,19,2,1,4,4


## Predicting all 20 race positions with a spread of 2

In [22]:
# Percentage of rows whose predicted position is within the 2-place spread.
correct = int((X["Spread"] == 1).sum())
total = X.shape[0]
score = (correct / total) * 100

print(f"Model predicted {score} percentage of all outcomes within a spread of 2 positions")

Model predicted 41.5 percentage of all outcomes within a spread of 2 positions


## Predicting driver finishing bins

In [23]:
# Percentage of rows whose predicted bin equals the actual bin.
same_bin = X["pos_group"] == X["pred_pos_group"]
group_correct = int(same_bin.sum())
total = X.shape[0]
group_score = (group_correct / total) * 100
print(f"Model predicted {group_score} percentage of all outcome groups for 2017")

Model predicted 58.25 percentage of all outcome groups for 2017


In [24]:
def _bin_hit_rate(frame, group):
    """Fraction of the rows whose actual bin is `group` that were also predicted as `group`.

    Parameters:
        frame: DataFrame with 'pos_group' (actual bin) and 'pred_pos_group' (predicted bin).
        group: bin code to evaluate.

    Returns:
        A float between 0 and 1, or NaN when no row has `group` as its actual bin
        (instead of raising ZeroDivisionError).
    """
    in_group = frame["pos_group"] == group
    n_in_group = int(in_group.sum())
    if n_in_group == 0:
        return float("nan")
    n_hits = int((in_group & (frame["pred_pos_group"] == group)).sum())
    return n_hits / n_in_group


podium_percentage = _bin_hit_rate(X, 1)
topsix_percentage = _bin_hit_rate(X, 2)
topten_percentage = _bin_hit_rate(X, 3)
bottomten_percentage = _bin_hit_rate(X, 4)

print(f"Model predicted {podium_percentage * 100} percentage of all podiums for 2017")
print(f"Model predicted {topsix_percentage * 100} percentage of all topsix for 2017")
print(f"Model predicted {topten_percentage * 100} percentage of all topten for 2017")
# The original message for bin 4 was labelled "topten" by copy-paste.
print(f"Model predicted {bottomten_percentage * 100} percentage of all bottomten for 2017")

Model predicted 61.66666666666667 percentage of all podiums for 2017
Model predicted 35.0 percentage of all topsix for 2017
Model predicted 35.0 percentage of all topten for 2017
Model predicted 73.5 percentage of all topten for 2017


## Create Final Output File

In [25]:
# Keep the identifiers plus the actual and predicted position and bin for the export.
output_columns = ['resultId', 'raceId', 'circuitId', 'driverId', 'constructorId',
                  'position', 'pos_group', 'Pred_Position', 'pred_pos_group']
output1 = X.loc[:, output_columns]

In [26]:
# Load the `races` table from the database into a DataFrame.
cursor.execute("Select * FROM races")
data = cursor.fetchall()
colnames = [column[0] for column in cursor.description]
races_df = pd.DataFrame(data, columns=colnames)

In [27]:
# Load the `drivers` table from the database into a DataFrame.
cursor.execute("Select * FROM drivers")
data = cursor.fetchall()
colnames = [column[0] for column in cursor.description]
drivers_df = pd.DataFrame(data, columns=colnames)

In [28]:
# Load the `constructors` table from the database into a DataFrame.
cursor.execute("Select * FROM constructors")
data = cursor.fetchall()
colnames = [desc[0] for desc in cursor.description]
constructors_df = pd.DataFrame(data, columns=colnames)

# No later cell in this notebook queries the database, so release the cursor and
# the connection here instead of leaving them open until the kernel exits.
cursor.close()
conn.close()

In [29]:
# Two-column lookup frames: each id together with the label column merged in below.
races = races_df.loc[:, ['raceId', 'name']].copy()
drivers = drivers_df.loc[:, ['driverId', 'driverRef']].copy()
constructors = constructors_df.loc[:, ['constructorId', 'constructorRef']].copy()

In [30]:
# Left-join the race name onto output1 via raceId
output1 = output1.merge(races, on='raceId', how='left')

In [31]:
# Left-join the driver reference onto output1 via driverId
output1 = output1.merge(drivers, on='driverId', how='left')

In [32]:
# Left-join the constructor reference onto output1 via constructorId
output1 = output1.merge(constructors, on='constructorId', how='left')

In [33]:
# Order the rows by resultId and keep the readable labels plus the actual and
# predicted position and bin; the raw id columns other than resultId are left out.
final_columns = ['resultId', 'name', 'driverRef', 'constructorRef',
                 'position', 'pos_group', 'Pred_Position', 'pred_pos_group']
cl_output = output1.sort_values("resultId")[final_columns].copy()
cl_output

Unnamed: 0,resultId,name,driverRef,constructorRef,position,pos_group,Pred_Position,pred_pos_group
1,23379,Australian Grand Prix,vettel,ferrari,1,1,2,1
0,23380,Australian Grand Prix,hamilton,mercedes,2,1,1,1
4,23381,Australian Grand Prix,bottas,mercedes,3,1,5,2
3,23382,Australian Grand Prix,raikkonen,ferrari,4,2,4,2
7,23383,Australian Grand Prix,max_verstappen,red_bull,5,2,8,3
...,...,...,...,...,...,...,...,...
396,23777,Abu Dhabi Grand Prix,gasly,toro_rosso,16,4,17,4
398,23778,Abu Dhabi Grand Prix,ericsson,sauber,17,4,19,4
395,23779,Abu Dhabi Grand Prix,stroll,williams,18,4,16,4
391,23780,Abu Dhabi Grand Prix,sainz,renault,0,4,12,4


In [34]:
# Column names used in the exported file
export_names = {
    "position": "Outcome",
    "pos_group": "Bin",
    "Pred_Position": "CL_Outcome",
    "pred_pos_group": "CL_Bin",
}
cl_output = cl_output.rename(columns=export_names)

In [35]:
# Write the final table to CSV. The path is relative to the notebook's working directory.
# NOTE(review): the frame's row labels are written as the first (unnamed) column —
# confirm downstream readers expect that.
export_path = '../../../Resources/PythonExport/ML_Export/2017/cl_output_2017.csv'
cl_output.to_csv(export_path)