In [1]:
# Import dependencies

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm, datasets
from sklearn.svm import SVC
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report
from config import *
import psycopg2

In [2]:
# Connect to database and extract the mldata table.
# The cursor is closed by its context manager and the connection in the
# `finally` clause, so neither is left open if the query fails.
conn = psycopg2.connect(database=DATABASE, user=USER, password=PASSWORD, host=HOST, port=PORT)
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT * FROM mldata")
        data = cursor.fetchall()
        # cursor.description holds one entry per result column; item 0 is the column name
        colnames = [desc[0] for desc in cursor.description]
finally:
    conn.close()

mldata = pd.DataFrame(data, columns=colnames)
print(mldata.shape)
mldata.head()

(7940, 22)


Unnamed: 0,index,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,...,weather_dry,weather_wet,weather_cloudy,Win,podium,age,driverposition,driverwin,constructorposition,constructorwin
0,0,2008,1,1,18,1,1,1,1,1,...,0,0,0,1,1,23.0,0.0,0.0,0.0,0.0
1,1,2008,1,2,18,1,2,2,5,2,...,0,0,0,0,2,31.0,0.0,0.0,0.0,0.0
2,2,2008,1,3,18,1,3,3,7,3,...,0,0,0,0,3,23.0,0.0,0.0,0.0,0.0
3,3,2008,1,4,18,1,4,4,11,4,...,0,0,0,0,0,27.0,0.0,0.0,0.0,0.0
4,4,2008,1,5,18,1,5,1,3,5,...,0,0,0,0,0,26.0,0.0,0.0,0.0,0.0


In [3]:
# Fill every missing value in the table with 0
mldata = mldata.replace(to_replace=np.nan, value=0)

In [4]:
# Cast the float-valued age / standings columns to integers
int_columns = ['age', 'driverposition', 'driverwin', 'constructorposition', 'constructorwin']
mldata[int_columns] = mldata[int_columns].astype(int)

## Building the Race Winner Predictor

In [5]:
# Work on an independent copy of mldata when building the train and test sets
df = mldata.copy(deep=True)

In [6]:
# Feature columns shared by the training and test matrices
feature_columns = ['year', 'round', 'circuitId', 'driverId', 'constructorId', 'grid',
                   'weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy']

# Training set: every result up to and including 2018
train = df[df['year'] <= 2018]
X_train = train[feature_columns]
y_train = train.Win

# Fit the scaler on the training features only; the same fitted scaler is reused for the test set
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# Test set: the 2019 rows, re-indexed from 0
test = df[(df.year == 2019)].reset_index().drop(["index"], axis=1)

X_test = test[feature_columns]
y_test = test.Win
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [7]:
# Create the SVM model (RBF kernel) and fit it to the training data.
# probability=True enables predict_proba, which is needed later to rank drivers.
# The probability calibration shuffles the data internally, so random_state is
# fixed to make the estimated probabilities reproducible across runs.
rbf = svm.SVC(kernel='rbf', probability=True, random_state=42).fit(X_train, y_train)

When predicting race winners, the model does not understand that every race must have a winner. Therefore the model predicts many races which have no winners. In order to solve this issue, we also calculate the probability of each predicted outcome. Finally, we will assign the winner of the race to the driver who has the highest probability to win for that race (regardless of the magnitude of the probability).

In [8]:
# Class prediction for every test row
rbf_pred = rbf.predict(X=X_test)
# Per-class probabilities for the same rows
rbf_pred1 = rbf.predict_proba(X=X_test)

In [9]:
# Score the class predictions against the actual outcomes
rbf_accuracy = accuracy_score(y_true=y_test, y_pred=rbf_pred)
rbf_f1 = f1_score(y_true=y_test, y_pred=rbf_pred, average='weighted')

# Report both metrics as percentages with two decimals
for label, value in (('Accuracy (Rbf Kernel): ', rbf_accuracy), ('F1 (Rbf Kernel): ', rbf_f1)):
    print(label, "%.2f" % (value*100))

Accuracy (Rbf Kernel):  95.00
F1 (Rbf Kernel):  92.56


In [10]:
# Predicted vs. actual outcome, one row per test entry
Z = pd.DataFrame({"Prediction": rbf_pred, "Actual": y_test})
# Class probabilities as a separate frame
Z1 = pd.DataFrame(rbf_pred1, columns=['proba_0', 'proba_1'])
# Combine the two frames on their row index
Z_final = Z.merge(Z1, how='outer', left_index=True, right_index=True)

In [11]:
# Join the predictions with the test rows on the row index,
# then keep only the columns needed from the combined frame
keep_columns = ['raceId', 'round', 'circuitId','driverId','Prediction', 'Actual', 'proba_1', 'Win', 'podium']
merged = Z_final.merge(test, left_index=True, right_index=True, how='outer')
Z_final = merged.reindex(columns=keep_columns)
print(Z_final.shape)
Z_final.columns

(420, 9)


Index(['raceId', 'round', 'circuitId', 'driverId', 'Prediction', 'Actual',
       'proba_1', 'Win', 'podium'],
      dtype='object')

In [12]:
# Group by raceId and take the highest win probability within each race
maxprob = (
    Z_final.groupby(['raceId'])
    .agg({'proba_1': 'max'})
    .reset_index()
    .rename(columns={'proba_1': 'proba_1_max'})
)

# Attach proba_1_max to every row of its race
Z_final = Z_final.merge(maxprob, how='left', on=['raceId'])
print(Z_final.shape)
Z_final.head()

(420, 10)


Unnamed: 0,raceId,round,circuitId,driverId,Prediction,Actual,proba_1,Win,podium,proba_1_max
0,1010,1,1,822,0,1,0.02915,1,1,0.071518
1,1010,1,1,1,0,0,0.071518,0,2,0.071518
2,1010,1,1,830,0,0,0.059994,0,3,0.071518
3,1010,1,1,20,0,0,0.061176,0,0,0.071518
4,1010,1,1,844,0,0,0.060163,0,0,0.071518


## Predicting the Race Winner

In [13]:
# Number of times our predicted winner is the actual winner.
# A row is the predicted winner of its race when its probability equals the
# race maximum; it scores 1 when that driver's podium value is 1, else 0.
# Computed column-wise instead of looping over rows with iterrows().
is_predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
Z_final['winner'] = (is_predicted_winner & Z_final['podium'].isin([1])).astype(int)
Z_final['winner'].sum()

12

In [14]:
# Number of times our predicted winner is in the top 2.
# A row is the predicted winner of its race when its probability equals the
# race maximum; it scores 1 when that driver's podium value is 1 or 2, else 0.
# Computed column-wise instead of looping over rows with iterrows().
is_predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
Z_final['top2'] = (is_predicted_winner & Z_final['podium'].isin([1, 2])).astype(int)
Z_final['top2'].sum()

15

In [15]:
# Number of times our predicted winner is in the top 3.
# A row is the predicted winner of its race when its probability equals the
# race maximum; it scores 1 when that driver's podium value is 1, 2 or 3, else 0.
# Computed column-wise instead of looping over rows with iterrows().
is_predicted_winner = Z_final['proba_1'] == Z_final['proba_1_max']
Z_final['top3'] = (is_predicted_winner & Z_final['podium'].isin([1, 2, 3])).astype(int)
Z_final['top3'].sum()

16

In [16]:
# Attach the win probability from Z_final to the test rows (joined on the row index).
# The resulting frame is used to predict all 20 race positions.
probability = Z_final["proba_1"].copy()
test = test.join(probability, how='outer')
selected_columns = ['round', 'resultId', 'raceId', 'circuitId', 'driverId', 'constructorId','position', "proba_1"]
svm_pred = test[selected_columns]
print(svm_pred.shape)
svm_pred.head()

(420, 8)


Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1
0,1,24203,1010,1,822,131,1,0.02915
1,1,24204,1010,1,1,131,2,0.071518
2,1,24205,1010,1,830,9,3,0.059994
3,1,24206,1010,1,20,6,4,0.061176
4,1,24207,1010,1,844,6,5,0.060163


## Processing svm_pred for predicting all 20 Race outcomes

In order to predict all 20 race outcomes, we will sort the probabilities in descending order for each race, so the driver with the highest win probability comes first. We can then add a counter for each outcome, starting from 1 and ending at 20, which will represent the driver's predicted finishing position in the race. 

In [17]:
# Work on a copy of svm_pred
X = svm_pred.copy()
# Order rows by round (ascending), then by win probability (highest first)
X = X.sort_values(by=["round", "proba_1"], ascending=[True, False])
X.head()

Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1
1,1,24204,1010,1,1,131,2,0.071518
3,1,24206,1010,1,20,6,4,0.061176
6,1,24209,1010,1,807,4,7,0.060758
11,1,24214,1010,1,846,1,12,0.060658
4,1,24207,1010,1,844,6,5,0.060163


In [18]:
# Number the rows of each round 1, 2, 3, ... in their current order; the count
# restarts whenever the round changes.
# X is sorted by descending win probability within each round, so this running
# count is the driver's predicted finishing position.
# groupby().cumcount() replaces the nested row loop: it works for any number of
# rounds and no longer shadows the builtin `round`.
X["Pred_Position"] = X.groupby("round").cumcount() + 1
X.head()

Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1,Pred_Position
1,1,24204,1010,1,1,131,2,0.071518,1
3,1,24206,1010,1,20,6,4,0.061176,2
6,1,24209,1010,1,807,4,7,0.060758,3
11,1,24214,1010,1,846,1,12,0.060658,4
4,1,24207,1010,1,844,6,5,0.060163,5


In [19]:
# Absolute gap between the actual and the predicted finishing position
X['Delta'] = (X['position'] - X['Pred_Position']).abs()

In [20]:
# Flag each row: 1 when the delta is at most 2 positions, otherwise 0
X["Spread"] = X["Delta"].apply(lambda delta: int(delta <= 2))

In addition to predicting every race position (with a spread of 2), we also want to predict the driver's finishing group or bin. To do that, we will create additional columns where we will map the predicted and actual finishing positions to their respective bins: Podium, topsix, topten and else. 

In [21]:
# Seed the two bin columns with the actual and the predicted finishing positions

for bin_column, source_column in (("pos_group", "position"), ("pred_pos_group", "Pred_Position")):
    X[bin_column] = X[source_column].copy()

# Finishing positions that make up each bin
podium = [1, 2, 3]
topsix = [4, 5, 6]
topten = [7, 8, 9, 10]

def position_group(x):
    """Map a finishing position to its bin code.

    Returns 1 for a podium position, 2 for the top-six bin, 3 for the
    top-ten bin, and 4 for any other value.
    """
    for code, members in enumerate((podium, topsix, topten), start=1):
        if x in members:
            return code
    return 4
    

# Convert both position columns into their bin codes
for bin_column in ("pos_group", "pred_pos_group"):
    X[bin_column] = X[bin_column].apply(position_group)

In [22]:
# This is the final dataframe which we will use to calculate the prediction percentages.
print(X.shape)
X

(420, 13)


Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1,Pred_Position,Delta,Spread,pos_group,pred_pos_group
1,1,24204,1010,1,1,131,2,0.071518,1,1,1,1,1
3,1,24206,1010,1,20,6,4,0.061176,2,2,1,2,1
6,1,24209,1010,1,807,4,7,0.060758,3,4,0,3,1
11,1,24214,1010,1,846,1,12,0.060658,4,8,0,4,2
4,1,24207,1010,1,844,6,5,0.060163,5,0,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,21,24624,1030,24,9,3,19,0.019570,16,3,0,4,4
413,21,24619,1030,24,825,210,14,0.016710,17,3,0,4,4
403,21,24609,1030,24,822,131,4,0.012433,18,14,0,2,4
412,21,24618,1030,24,8,51,13,0.010786,19,6,0,4,4


In [23]:
# Save the full prediction table as X.csv (relative path)
X.to_csv(path_or_buf='X.csv')

In [24]:
# Create the confusion matrix and classification report for the race-win prediction

# 1/0 labels as 1-D arrays: did the driver actually finish first,
# and was the driver predicted to finish first?
actual1 = (X['position'] == 1).astype(int).to_numpy()
prediction1 = (X['Pred_Position'] == 1).astype(int).to_numpy()

# `matrix` only ever holds the confusion matrix (it is printed in the next cell)
matrix = confusion_matrix(actual1, prediction1)
report = classification_report(actual1, prediction1)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       399
           1       0.57      0.57      0.57        21

    accuracy                           0.96       420
   macro avg       0.77      0.77      0.77       420
weighted avg       0.96      0.96      0.96       420



In [30]:
# Show the race-win confusion matrix built in the cell above
print(matrix)

[[390   9]
 [  9  12]]


## Predicting all 20 race positions with a spread of 2

In [25]:
# Percentage of rows flagged as within the spread (Spread == 1)
total = len(X)
correct = int(X["Spread"].eq(1).sum())
score = (correct / total) * 100

print(f"Model predicted {score} percentage of all outcomes within a spread of 2 positions")

Model predicted 38.095238095238095 percentage of all outcomes within a spread of 2 positions


## Predicting driver finishing bins

In [26]:
# Percentage of rows whose predicted bin equals the actual bin
same_group = X["pos_group"] == X["pred_pos_group"]
group_correct = int(same_group.sum())
total = len(X)
group_score = (group_correct / total) * 100
print(f"Model predicted {group_score} percentage of all outcome groups for 2019")

Model predicted 55.952380952380956 percentage of all outcome groups for 2019


In [27]:
def _bin_hit_rate(frame, group):
    """Return the fraction of rows whose actual bin is `group` that were also predicted as `group`.

    Raises ZeroDivisionError when no row has `group` as its actual bin.
    """
    in_group = frame["pos_group"] == group
    hits = in_group & (frame["pos_group"] == frame["pred_pos_group"])
    return int(hits.sum()) / int(in_group.sum())


# Bin codes: 1 = podium, 2 = topsix, 3 = topten, 4 = everything else
podium_percentage = _bin_hit_rate(X, 1)
topsix_percentage = _bin_hit_rate(X, 2)
topten_percentage = _bin_hit_rate(X, 3)
bottomten_percentage = _bin_hit_rate(X, 4)

print(f"Model predicted {podium_percentage * 100} percentage of all podiums for 2019")
print(f"Model predicted {topsix_percentage * 100} percentage of all topsix for 2019")
print(f"Model predicted {topten_percentage * 100} percentage of all topten for 2019")
# This line previously repeated the "topten" label for the bottom bin
print(f"Model predicted {bottomten_percentage * 100} percentage of all bottomten for 2019")

Model predicted 60.317460317460316 percentage of all podiums for 2019
Model predicted 33.33333333333333 percentage of all topsix for 2019
Model predicted 39.285714285714285 percentage of all topten for 2019
Model predicted 68.0952380952381 percentage of all topten for 2019


## Create Final Output File

In [28]:
# Select the result id and the two prediction columns, renamed for the export
svm_output = X[['resultId', 'Pred_Position', 'pred_pos_group']].rename(
    columns={"Pred_Position": "SVM_Outcome", "pred_pos_group": "SVM_Bin"}
)

In [29]:
# Write the SVM predictions to the 2019 export CSV
export_path = '../../../Resources/PythonExport/ML_Export/2019/svm_output_2019.csv'
svm_output.to_csv(export_path)