# Simulation

In [1]:
%matplotlib inline
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
from scipy.stats import stats
from scipy.stats import norm
from statsmodels.stats.weightstats import ztest
from statsmodels.stats.proportion import proportions_ztest
import math

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [2]:
#load data frame predictions from the Random Forest model and the Logsitic Regresion model
infile = open('../PickledFiles/RFandLR_preds', 'rb')
RF_LRpreds = pickle.load(infile)
infile.close

infile = open('../PickledFiles/gamelogsRoll_ext', 'rb')
glRoll = pickle.load(infile)
infile.close

<function BufferedReader.close>

In [3]:
results = pd.concat([glRoll[glRoll.Date>'2018-01'], RF_LRpreds], axis=1)
results.shape

(2431, 385)

In [4]:
results.head()

Unnamed: 0,Date,DoubleHeader,DayOfWeek,VisitingTeam,VisitingTeamLeague,VisitingTeamGameNumber,HomeTeam,HomeTeamLeague,HomeTeamGameNumber,VisitorRunsScored,...,AvgPitchBABIP_diff,AvgAper9_diff,AvgEper9_diff,pctWminL_SPdiff,AvgSpread_diff,prob_of_homewin_RF,pred_RF,game_result,prob_of_homewin_LR,pred_LR
21867,2018-03-29,0,Thu,COL,NL,1,ARI,NL,1,2,...,-0.010999,-0.812006,0.192501,-0.033333,1.444444,0.55,1,1,0.536126,1
21868,2018-03-29,0,Thu,PHI,NL,1,ATL,NL,1,5,...,-0.001366,0.503619,0.093545,-0.1,-0.061728,0.518,1,1,0.423835,0
21869,2018-03-29,0,Thu,SFN,NL,1,LAN,NL,1,1,...,-0.027265,-0.915768,0.009094,0.582051,3.08642,0.721,1,0,0.689292,1
21870,2018-03-29,0,Thu,CHN,NL,1,FLO,NL,1,8,...,0.012379,-0.603012,-0.122898,0.0,-0.703704,0.461,0,0,0.430619,0
21871,2018-03-29,0,Thu,SLN,NL,1,NYN,NL,1,4,...,0.020489,-0.75769,-0.006177,0.033333,-0.962963,0.59,1,1,0.487055,0


In [5]:
results['predwinner_LR'] = np.where(results.pred_LR==1, 'Home', 'Visitor')
results['predwinner_RF'] = np.where(results.pred_RF==1, 'Home', 'Visitor')
results['Winner'] = np.where(results.game_result ==1, 'Home','Visitor')

In [6]:
def client_winnings(df):
    investment = 100* df.shape[0]
    df['LR_winnings'] = np.where(df.pred_LR == df.HomeWin, 50, -100)
    df['RF_winnings'] = np.where(df.pred_RF == df.HomeWin, 50, -100)
    ROI_LR = sum(df.LR_winnings)/investment
    ROI_RF = sum(df.RF_winnings)/investment    
    return sum(df.LR_winnings), ROI_LR, sum(df.RF_winnings), ROI_RF

In [7]:
sample_games = results.sample(n=30, random_state = 123).sort_index()

In [8]:
client_winnings(sample_games)

(-600, -0.2, 150, 0.05)

In [9]:
sample_games[['Date','VisitingTeam','HomeTeam','prob_of_homewin_LR','LR_winnings','prob_of_homewin_RF',  'RF_winnings', 'Winner']]

Unnamed: 0,Date,VisitingTeam,HomeTeam,prob_of_homewin_LR,LR_winnings,prob_of_homewin_RF,RF_winnings,Winner
22051,2018-04-12,CHA,MIN,0.56938,50,0.629,50,Home
22056,2018-04-13,MIL,NYN,0.457341,-100,0.554,50,Home
22057,2018-04-13,SFN,SDN,0.50147,50,0.553,50,Home
22133,2018-04-19,TOR,NYA,0.568426,50,0.699,50,Home
22284,2018-04-30,NYA,HOU,0.48609,-100,0.561,50,Home
22298,2018-05-01,NYA,HOU,0.515514,-100,0.561,-100,Visitor
22333,2018-05-04,CHN,SLN,0.48129,-100,0.414,-100,Home
22482,2018-05-15,CLE,DET,0.399107,-100,0.494,-100,Home
22545,2018-05-20,LAN,WAS,0.491341,50,0.492,50,Visitor
22556,2018-05-21,FLO,NYN,0.504208,50,0.512,50,Home


In [10]:
client_winnings(results)

(-29800, -0.12258329905388729, -28600, -0.11764705882352941)

In [11]:

high_conf_LR = results[(results.prob_of_homewin_LR > 0.55) | (results.prob_of_homewin_LR < 0.45)]
total = high_conf_LR.shape[0]
print(total)
print(len(high_conf_LR[high_conf_LR.LR_winnings == 50])/total)

1392
0.6170977011494253


In [12]:
#if only bet on games where the Log Reg model had a predicted probability lower than .4 or higher than .6 (583 games)
client_winnings(high_conf_LR)[:2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(-10350, -0.07435344827586207)

In [13]:

high_conf_RF = results[(results.prob_of_homewin_RF > 0.6) | (results.prob_of_homewin_RF < 0.4)]
high_conf_RF.shape[0]

624

In [14]:
#if only bet on games where the Ran. Forest model had a predicted probability lower than .4 or higher than .6 (624 games)
client_winnings(high_conf_RF)[2:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(-3600, -0.057692307692307696)

In [15]:
agree = results[results.predwinner_LR == results.predwinner_RF]
agree.shape[0]

1917

In [16]:
#if bet only on games where both models agreed (1917 games)
client_winnings(agree)[:2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(-16350, -0.08528951486697965)