# Exploring Logistic Regression and Random Forest Results

In [1]:
%matplotlib inline
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
from scipy.stats import stats
from scipy.stats import norm
from statsmodels.stats.weightstats import ztest
from statsmodels.stats.proportion import proportions_ztest
import math

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [2]:
# Load the pickled frame holding the Random Forest and Logistic Regression
# predictions. The context manager guarantees the file handle is closed
# (the previous `infile.close` lacked parentheses and never closed it).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by a trusted source.
with open('../PickledFiles/RFandLR_preds', 'rb') as infile:
    predsRFandLR = pickle.load(infile)

<function BufferedReader.close>

In [3]:
# Preview the first rows of the loaded predictions frame.
predsRFandLR.head()

Unnamed: 0,prob_of_homewin_RF,pred_RF,game_result,prob_of_homewin_LR,pred_LR
21867,0.55,1,1,0.536126,1
21868,0.518,1,1,0.423835,0
21869,0.721,1,0,0.689292,1
21870,0.461,0,0,0.430619,0
21871,0.59,1,1,0.487055,0


In [4]:
# Ground-truth labels for every game in the frame, reused by the evaluation
# cells below (presumably 1 = home win, 0 = otherwise -- TODO confirm).
ytest = predsRFandLR['game_result']

In [5]:
# Random Forest confusion matrix: actual game_result passed as y_true,
# pred_RF passed as y_pred.
confusion_matrix(ytest, predsRFandLR.pred_RF)

array([[479, 669],
       [332, 951]])

In [6]:
# Logistic Regression confusion matrix: actual game_result passed as y_true,
# pred_LR passed as y_pred.
confusion_matrix(ytest, predsRFandLR.pred_LR)

array([[677, 471],
       [538, 745]])

In [7]:
# Ensemble the two models by averaging their predicted home-win probabilities.
# Thresholding that mean at 0.5 means that, when the models disagree, the one
# whose probability lies further from 0.5 (the more confident one) decides.
rf_prob = predsRFandLR['prob_of_homewin_RF']
lr_prob = predsRFandLR['prob_of_homewin_LR']
predsRFandLR['AvgProb'] = (rf_prob + lr_prob) / 2

# 1 when the averaged probability is strictly above 0.5, otherwise 0.
avg_above_half = predsRFandLR['AvgProb'] > .5
predsRFandLR['AvgPred'] = np.where(avg_above_half, 1, 0)

In [8]:
# Evaluate the averaged (RF + LR) ensemble on the full set of games.
print("[Test Classification Report:]")
print(classification_report(ytest, predsRFandLR['AvgPred']))
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the order now matches the classification_report call above.
print('Test Accuracy: ', accuracy_score(ytest, predsRFandLR['AvgPred']))

# ROC / AUC use the averaged probability as the score, not the 0/1 prediction.
yprobs = predsRFandLR['AvgProb']
fpr, tpr, threshold = roc_curve(ytest, yprobs)
roc_auc = auc(fpr, tpr)
print('AUC: ', roc_auc)

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.57      0.50      0.53      1148
           1       0.60      0.66      0.63      1283

   micro avg       0.59      0.59      0.59      2431
   macro avg       0.58      0.58      0.58      2431
weighted avg       0.58      0.59      0.58      2431

Test Accuracy:  0.5857671740024681
AUC:  0.6118743906512665


In [9]:
# Evaluate only the games where both models predict the same winner.
match = predsRFandLR[predsRFandLR.pred_RF == predsRFandLR.pred_LR]
print("[Test Classification Report:]")
# On this subset pred_RF == pred_LR, so either column is the shared prediction.
print(classification_report(match.game_result, match.pred_RF))
print('Test Accuracy: ', accuracy_score(match.game_result, match.pred_RF))

# ROC / AUC must be computed from labels and scores of the SAME rows.
# The previous code referenced a truncated attribute (`match.game_re`) and
# paired subset labels with the full-length `yprobs`, so it could not run.
# Dedicated names are used so the global `ytest` / `yprobs` for the full
# test set are not overwritten.
ytest_match = match['game_result']
yprobs_match = match['AvgProb']
fpr, tpr, threshold = roc_curve(ytest_match, yprobs_match)
roc_auc = auc(fpr, tpr)
print('AUC: ', roc_auc)

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.60      0.50      0.55       896
           1       0.62      0.70      0.66      1021

   micro avg       0.61      0.61      0.61      1917
   macro avg       0.61      0.60      0.60      1917
weighted avg       0.61      0.61      0.61      1917

Test Accuracy:  0.6098069900886802


In [10]:
# (rows, columns) of the subset where both models agree.
match.shape

(1917, 7)

In [11]:
# Preview the first rows of the agreement subset.
match.head()

Unnamed: 0,prob_of_homewin_RF,pred_RF,game_result,prob_of_homewin_LR,pred_LR,AvgProb,AvgPred
21867,0.55,1,1,0.536126,1,0.543063,1
21869,0.721,1,0,0.689292,1,0.705146,1
21870,0.461,0,0,0.430619,0,0.44581,0
21872,0.484,0,0,0.421855,0,0.452928,0
21874,0.669,1,0,0.586793,1,0.627896,1


In [12]:
# NOTE(review): this import belongs with the other sklearn.metrics imports in
# the first cell; it is kept here so this cell still runs on its own.
from sklearn.metrics import log_loss

# Log loss of each model's predicted home-win probability against the actual
# result. The explicit `eps=1e-15` argument was dropped: it was the default on
# older scikit-learn releases, was deprecated in 1.3 and removed in 1.5, where
# passing it raises a TypeError.
print('Ran. For. Log Loss: ', log_loss(predsRFandLR.game_result, predsRFandLR.prob_of_homewin_RF))
print('Log Reg Log Loss: ', log_loss(predsRFandLR.game_result, predsRFandLR.prob_of_homewin_LR))

Ran. For. Log Loss:  0.6768342560697054
Log Reg Log Loss:  0.6744207602469292
