In [1]:
# Import dependcies

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from config import *
import psycopg2

In [2]:
# Connect to database and extract mldata table
conn = psycopg2.connect(database=DATABASE, user=USER, password=PASSWORD, host=HOST, port= PORT)
cursor = conn.cursor()
cursor.execute("SELECT * FROM mldata")
data = cursor.fetchall()
colnames = [desc[0] for desc in cursor.description]
mldata = pd.DataFrame(data, columns=colnames)
print(mldata.shape)
mldata.head()

(7940, 22)


Unnamed: 0,index,year,round,resultId,raceId,circuitId,driverId,constructorId,grid,position,...,weather_dry,weather_wet,weather_cloudy,Win,podium,age,driverposition,driverwin,constructorposition,constructorwin
0,0,2008,1,1,18,1,1,1,1,1,...,0,0,0,1,1,23.0,0.0,0.0,0.0,0.0
1,1,2008,1,2,18,1,2,2,5,2,...,0,0,0,0,2,31.0,0.0,0.0,0.0,0.0
2,2,2008,1,3,18,1,3,3,7,3,...,0,0,0,0,3,23.0,0.0,0.0,0.0,0.0
3,3,2008,1,4,18,1,4,4,11,4,...,0,0,0,0,0,27.0,0.0,0.0,0.0,0.0
4,4,2008,1,5,18,1,5,1,3,5,...,0,0,0,0,0,26.0,0.0,0.0,0.0,0.0


In [3]:
# Replace all null values with 0s
mldata = mldata.replace(np.nan, 0)

In [4]:
# Convert float64 columns to integers
mldata[['age', 'driverposition', 'driverwin', 
       'constructorposition', 'constructorwin']] = mldata[['age', 'driverposition', 'driverwin', 
                                                         'constructorposition', 'constructorwin']].astype(int)

## Building the Race Winner Predictor

In [5]:
# Make a copy of mldata in order to create test and train datasets
df = mldata.copy()

In [6]:
# Train the data with all outcomes before 2017
train = df[df['year']<=2016]
X_train = train[['year', 'round', 'circuitId','driverId', 'constructorId', 'grid', 'weather_warm', 
                 'weather_cold', 'weather_dry','weather_wet', 'weather_cloudy']]
y_train = train.Win

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

# Create the test dataset
test = df[(df.year >=2017)]
test = test.reset_index()
test = test.drop(["index"], axis=1)

X_test = test[['year', 'round','circuitId','driverId', 'constructorId', 'grid', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy']]
y_test = test.Win
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [7]:
# Create the randomforrest model and fit the data to the model
rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'gini', random_state = 42)
rf_model = rf_model.fit(X_train, y_train)

When predicting race winners, the model does not understand that every race must have a winner. Therefore the model predicts many races which have no winners. In order to solve this issue, we also calculate the probability of each predicted outcome. Finally, we will assign the winner of the race to the driver who has the highest probability to win for that race (regardless of the magnitude of the probability.

In [8]:
# Predict Outcome 
predictions = rf_model.predict(X_test)
# Predict Outcome probabilities
prediction1 = rf_model.predict_proba(X_test)

In [9]:
# Calculate the accuracy score of the model
rnf_acc_score = accuracy_score(y_test, predictions)
rnf_f1 = f1_score(y_test, predictions, average='weighted')
print(f'Accuracy Score (Random Forrest): ', "%.2f" % (rnf_acc_score*100))
print('F1 (Random Forrest): ', "%.2f" % (rnf_f1*100))

Accuracy Score (Random Forrest):  94.76
F1 (Random Forrest):  93.60


In [10]:
# Create Data Frame for Predicted Outcome
Z = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# Create second Data Frame for prediction probabilities
Z1 = pd.DataFrame(prediction1, columns = ['proba_0', 'proba_1'])
# Merging the Z and Z1 dataframes 
Z_final = pd.merge(Z, Z1, left_index=True, right_index=True, how='outer')

In [11]:
# Merge Z_final with test. Only bring in columns from test that we need
Z_final = (Z_final.merge(test, left_index=True, right_index=True, how='outer').reindex(
            columns=['raceId', 'round', 'circuitId','driverId','Prediction', 'Actual', 'proba_1', 'Win', 'podium']))
print(Z_final.shape)
Z_final.columns

(1240, 9)


Index(['raceId', 'round', 'circuitId', 'driverId', 'Prediction', 'Actual',
       'proba_1', 'Win', 'podium'],
      dtype='object')

In [12]:
# We will groupby 'round' and take the max probability for a driver to win
maxprob = Z_final.groupby(['round']).agg({'proba_1':'max'})
maxprob = maxprob.reset_index()
maxprob = maxprob.rename(columns={'proba_1':'proba_1_max'})

# We will then merge the proba_1_max column to Z_Final
Z_final = pd.merge(Z_final, maxprob, how='left', on=['round'])
print(Z_final.shape)
Z_final.head()

(1240, 10)


Unnamed: 0,raceId,round,circuitId,driverId,Prediction,Actual,proba_1,Win,podium,proba_1_max
0,969,1,1,20,1,1,0.6,1,1,0.6
1,969,1,1,1,0,0,0.1,0,2,0.6
2,969,1,1,822,0,0,0.3,0,3,0.6
3,969,1,1,8,0,0,0.0,0,0,0.6
4,969,1,1,830,0,0,0.2,0,0,0.6


## Predicting the Race Winner

In [13]:
# Number of times our predicted winner is the actual winner
winner = []
for index, row in Z_final.iterrows():
    if row['proba_1'] == row['proba_1_max']:
        if row['podium'] in [1]:
            winner.append(1)
        else:
            winner.append(0)
    else:
        winner.append(0)
        
Z_final['winner'] = winner
Z_final['winner'].sum()

19

In [14]:
# Number of times our predicted winner is in the top 2
top2 = []
for index, row in Z_final.iterrows():
    if row['proba_1'] == row['proba_1_max']:
        if row['podium'] in [1,2]:
            top2.append(1)
        else:
            top2.append(0)
    else:
        top2.append(0)
        
Z_final['top2'] = top2
Z_final['top2'].sum()

23

In [15]:
# Number of times our predicted winner is in the top 2
top3 = []
for index, row in Z_final.iterrows():
    if row['proba_1'] == row['proba_1_max']:
        if row['podium'] in [1,2,3]:
            top3.append(1)
        else:
            top3.append(0)
    else:
        top3.append(0)
        
Z_final['top3'] = top3
Z_final['top3'].sum()

26

In [16]:
# Merge Z_Final and test to create a new dataframe. 
# This dataframe will be used to predict all 20 race positions. 
probability = Z_final["proba_1"].copy()
test = test.join(probability, how='outer')
rand_pred = test[['round', 'resultId', 'raceId', 'circuitId', 'driverId', 'constructorId','position', "proba_1"]]
print(rand_pred.shape)
rand_pred.head()

(1240, 8)


Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1
0,1,23379,969,1,20,6,1,0.6
1,1,23380,969,1,1,131,2,0.1
2,1,23381,969,1,822,131,3,0.3
3,1,23382,969,1,8,6,4,0.0
4,1,23383,969,1,830,9,5,0.2


## Processing rand_pred for predicting all 20 Race outcomes

In order to predict all 20 race outcomes, we will sort the probabilities in ascending order for each race. We can then add a counter for each outcome starting from 1 ending at 20 which will represent the driver's ending position in the race.

In [17]:
# Make a copy of svm_pred
X = rand_pred.copy()
# Sort X in ascending order
X = X.sort_values(["round", "proba_1"], ascending = (True, False))
X.head()

Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1
0,1,23379,969,1,20,6,1,0.6
401,1,23783,989,1,1,131,2,0.4
822,1,24205,1010,1,830,9,3,0.4
2,1,23381,969,1,822,131,3,0.3
402,1,23784,989,1,8,6,3,0.3


In [18]:
# Create a forLoop and add a counter to each row. Once the raceId changes, the counter will refresh
# This counter will represent the driver's final predicted race outcome

round = [1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
count = 1
prediction = []

for i in round:
    count = 0
    for index, row in X.iterrows():
        if row["round"] == i:
            count = count +1
            prediction.append(count)
        else:
            continue
        

X["Pred_Position"] = prediction
X.head()

Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1,Pred_Position
0,1,23379,969,1,20,6,1,0.6,1
401,1,23783,989,1,1,131,2,0.4,2
822,1,24205,1010,1,830,9,3,0.4,3
2,1,23381,969,1,822,131,3,0.3,4
402,1,23784,989,1,8,6,3,0.3,5


In [19]:
# We will create a new column which shows the delta between actual and predicted finishing position 
X['Delta'] = X['position'] - X['Pred_Position']
X['Delta'] = X['Delta'].abs()

In [20]:
# If the delta is within 2, we will assign the number 1 to it. Otherwise, we will assign 0. 
X["Spread"] = X["Delta"].copy()
X["Spread"] = X["Spread"].apply(lambda x: 1 if x<= 2 else 0)

In addition to predicting every race position (with a spread of 2), we also want to predict the driver's finishing group or bin. To do that, we will create additional columns where we will map the predicted and actual finishing positions to their respective bins: Podium, topsix, topten and else. 

In [21]:
# Add two more columns to create ending position bins

X["pos_group"] = X["position"].copy()
X["pred_pos_group"] = X["Pred_Position"].copy()

podium = [1,2,3]
topsix = [4,5,6]
topten = [7,8,9,10]

def position_group(x):
    if x in podium:
        return 1
    if x in topsix:
        return 2
    if x in topten:
        return 3
    else:
        return 4
    

X["pos_group"] = X["pos_group"].apply(position_group)  
X["pred_pos_group"] = X["pred_pos_group"].apply(position_group)  

In [22]:
# This is the final dataframe which we will use to calute the prediction percentages. 
print(X.shape)
X

(1240, 13)


Unnamed: 0,round,resultId,raceId,circuitId,driverId,constructorId,position,proba_1,Pred_Position,Delta,Spread,pos_group,pred_pos_group
0,1,23379,969,1,20,6,1,0.6,1,0,1,1,1
401,1,23783,989,1,1,131,2,0.4,2,0,1,1,1
822,1,24205,1010,1,830,9,3,0.4,3,0,1,1,1
2,1,23381,969,1,822,131,3,0.3,4,1,1,1,2
402,1,23784,989,1,8,6,3,0.3,5,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235,21,24621,1030,24,841,51,16,0.0,36,20,0,4,4
1236,21,24622,1030,24,847,3,17,0.0,37,20,0,4,4
1237,21,24623,1030,24,842,5,18,0.0,38,20,0,4,4
1238,21,24624,1030,24,9,3,19,0.0,39,20,0,4,4


## Predicting all 20 race positions with a spread of 2

In [23]:
correct = len(X.loc[X["Spread"]==1])
total = len(X)
score = (correct / total) * 100

print(f"Model predicted {score} percentage of all outcomes within a spread of 2 positions")

Model predicted 7.016129032258064 percentage of all outcomes within a spread of 2 positions


## Predicting driver finishing bins

In [24]:
group_correct = len(X.loc[X["pos_group"]==X["pred_pos_group"]])
total = len(X)
group_score = (group_correct / total) * 100
print(f"Model predicted {group_score} percentage of all outcome groups for 2017-19")

Model predicted 52.66129032258065 percentage of all outcome groups for 2017-19


In [25]:
podium_percentage = len(X.loc[(X["pos_group"]==1) & (X["pos_group"]==X["pred_pos_group"])]) / len(X.loc[X["pos_group"]==1])
topsix_percentage = len(X.loc[(X["pos_group"]==2) & (X["pos_group"]==X["pred_pos_group"])]) / len(X.loc[X["pos_group"]==2])
topten_percentage = len(X.loc[(X["pos_group"]==3) & (X["pos_group"]==X["pred_pos_group"])]) / len(X.loc[X["pos_group"]==3])
bottomten_percentage = len(X.loc[(X["pos_group"]==4) & (X["pos_group"]==X["pred_pos_group"])]) / len(X.loc[X["pos_group"]==4])

print(f"Model predicted {podium_percentage * 100} percentage of all podiums for 2017-19")
print(f"Model predicted {topsix_percentage * 100} percentage of all topsix for 2017-19")
print(f"Model predicted {topten_percentage * 100} percentage of all topten for 2017-19")
print(f"Model predicted {bottomten_percentage * 100} percentage of all topten for 2017-19")

Model predicted 23.118279569892472 percentage of all podiums for 2017-19
Model predicted 4.864864864864865 percentage of all topsix for 2017-19
Model predicted 1.6129032258064515 percentage of all topten for 2017-19
Model predicted 96.1352657004831 percentage of all topten for 2017-19


## Create Final Output File

In [26]:
rnf_output = X[['resultId', 'Pred_Position', 'pred_pos_group']]
rnf_output = rnf_output.rename(columns={"Pred_Position": "RNF_Outcome", "pred_pos_group": "RNF_Bin"})

In [27]:
# Extract CSV file
rnf_output.to_csv('../../../Resources/PythonExport/ML_Export/2017-19/rnf_output_2017-19.csv')