# Extending the Baseline

In [1]:
%matplotlib inline
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from datetime import datetime
from scipy.stats import stats
from scipy.stats import norm
from statsmodels.stats.weightstats import ztest
from statsmodels.stats.proportion import proportions_ztest
import math

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import auc



In [2]:
def train_test(df, start_date, split_date):
    """Chronologically split ``df`` into training and test features/targets.

    Training rows satisfy ``start_date < Date < split_date``; test rows satisfy
    ``Date >= split_date``.  The test window includes ``split_date`` itself so
    that a row dated exactly on the split is not silently dropped from both
    sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``Date`` column.  Features are selected positionally: every
        column except the first and the last.  The last column is the target.
        (The column lists built in this notebook start with 'Date' and end
        with 'HomeWin'.)
    start_date :
        Exclusive lower bound of the training window.  The original author uses
        it to drop April 2009, which contains many nulls.
    split_date :
        Boundary between training (strictly before) and test (on or after).
        The original author uses it to hold out the 2018 season.

    Returns
    -------
    (Xtrain, Xtest, ytrain, ytest)
    """
    dates = df.Date
    in_training_window = (dates > start_date) & (dates < split_date)
    training_data = df[in_training_window]
    test_data = df[dates >= split_date]

    # Positional selection: drop the first column, keep the last as the target.
    Xtrain = training_data.iloc[:, 1:-1]
    Xtest = test_data.iloc[:, 1:-1]
    ytrain = training_data.iloc[:, -1]
    ytest = test_data.iloc[:, -1]
    return Xtrain, Xtest, ytrain, ytest

def run_logreg(Xtrain, Xtest, ytrain, ytest, scaler, cw):
    """Scale the features and grid-search a logistic regression over ``C``.

    The scaler is fitted on ``Xtrain`` only and then applied to ``Xtest``.
    A 10-fold ``GridSearchCV`` over ``C`` is run on the scaled training data;
    the best parameters and best cross-validation score are printed.

    ``cw`` is forwarded as the model's ``class_weight``.

    Returns the fitted search object, the scaled train and test matrices, and
    the two target vectors unchanged.
    """
    scaled_train = scaler.fit_transform(Xtrain)
    scaled_test = scaler.transform(Xtest)

    estimator = LogisticRegression(solver = 'lbfgs', class_weight = cw)
    search = GridSearchCV(
        estimator,
        {'C': [0.001, 0.1, 1, 10, 100]},
        cv = 10,
    )
    fitted_search = search.fit(scaled_train, ytrain)

    print("Tuned Logistic Regression Parameters: {}".format(search.best_params_)) 
    print("Best score is {}".format(search.best_score_))
    return fitted_search, scaled_train, scaled_test, ytrain, ytest
    
def run_report(fitted, Xtrain_scld, Xtest_scld, ytrain, ytest):
    """Print train/test classification reports, accuracies and the test AUC.

    ``fitted`` must expose ``predict`` and ``predict_proba``.  Nothing is
    returned; all results go to stdout.
    """
    test_pred = fitted.predict(Xtest_scld)
    train_pred = fitted.predict(Xtrain_scld)

    # The ROC curve is scored with the second predict_proba column
    # (class 1 when the labels are 0/1).
    positive_scores = fitted.predict_proba(Xtest_scld)[:,1]
    fpr, tpr, _ = roc_curve(ytest,  positive_scores)
    roc_auc = auc(fpr, tpr)

    sections = (
        ("[Training Classification Report:]", 'Training Accuracy: ', ytrain, train_pred),
        ("[Test Classification Report:]", 'Test Accuracy: ', ytest, test_pred),
    )
    for header, accuracy_label, y_true, y_hat in sections:
        print(header)
        print(classification_report(y_true, y_hat))
        print(accuracy_label, accuracy_score(y_hat, y_true))
        print('')

    print('AUC: ', roc_auc)
    return 

def split_fit_report(df, start_date = '2009-04-30',split_date = '2018-01',scaler = StandardScaler(), cw=None):
    """Run the whole experiment for one frame: split, fit/tune, report.

    Chains ``train_test`` -> ``run_logreg`` -> ``run_report`` and returns what
    ``run_logreg`` produced: the fitted search object, the scaled train and
    test matrices, and the train and test targets.

    Note: the default ``scaler`` instance is created once, when this function
    is defined, and is therefore shared by every call that does not pass its
    own scaler.
    """
    splits = train_test(df, start_date, split_date)
    print('')
    results = run_logreg(*splits, scaler, cw)
    print('')
    run_report(*results)
    return results

## Recall the Baseline Results

AUC:  0.560206370630681

## Trial 1: Using class_weight = 'balanced'

In [3]:
# Load the game-log frame and the list of relevant columns.
# `with` guarantees the files are closed; the previous `infile.close`
# (no parentheses) only referenced the method and never closed anything.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this project.
with open('../PickledFiles/gamelogsS09', 'rb') as infile:
    gamelogsS09 = pickle.load(infile)

with open('../PickledFiles/rel_cols', 'rb') as infile:
    rel_cols = pickle.load(infile)


<function BufferedReader.close>

In [4]:
# Trial 1: the rel_cols columns with balanced class weights.
# `fit` holds the full 5-tuple returned by split_fit_report.
fit = split_fit_report(
    gamelogsS09[rel_cols],
    cw='balanced',
)


Tuned Logistic Regression Parameters: {'C': 0.001}
Best score is 0.5404753063497958

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.51      0.55      0.53      9948
           1       0.59      0.55      0.57     11596

   micro avg       0.55      0.55      0.55     21544
   macro avg       0.55      0.55      0.55     21544
weighted avg       0.55      0.55      0.55     21544

Training Accuracy:  0.5487838841440772

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.51      0.57      0.54      1148
           1       0.57      0.52      0.54      1283

   micro avg       0.54      0.54      0.54      2431
   macro avg       0.54      0.54      0.54      2431
weighted avg       0.54      0.54      0.54      2431

Test Accuracy:  0.5405183052241875

AUC:  0.5602124810915183


AUC is essentially unchanged from the baseline (0.5602 in both cases). Test accuracy decreased by about 0.007.

## Trial 2: Using a Smaller Subset of Variables

In [5]:
# Hand-picked home-side columns. The last entry has no same-named visitor
# counterpart, so it is skipped when the names are mirrored below and its
# road equivalent is appended explicitly to my_stats.
home_stats = ['pctWminL_HSP','AvgFIPnoConst_HSP','AvgIP_HSP','AvgSB_H', 'AvgCS_H', 'AvgGDP_H',
              'AvgDB_H', 'AvgOBP_H','AvgISO_H','AvgAper9_H', 'AvgPitchBABIP_H', 'AvgRelFIPnoConst_H',
              'AvgEper9_H', 'AvgRunDiffAtHome_H']

# Mirror each home column name to its visitor name: ..._HSP -> ..._VSP, ..._H -> ..._V.
visit_stats = []
for stat in home_stats[:-1]:
    suffix_len, visit_suffix = (3, 'VSP') if stat.endswith('HSP') else (1, 'V')
    visit_stat = stat[:-suffix_len] + visit_suffix
    visit_stats.append(visit_stat)

# 'Date' must come first and 'HomeWin' last: train_test selects columns by position.
my_stats = ['Date'] +home_stats + visit_stats + ['AvgRunDiffOnRoad_V','AvgAttendance','HomeWin']

In [6]:
# Trial 2: same frame, restricted to the hand-picked my_stats columns.
fit2 = split_fit_report(
    gamelogsS09[my_stats],
    cw='balanced',
)


Tuned Logistic Regression Parameters: {'C': 0.1}
Best score is 0.540893056071296

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.51      0.55      0.53      9948
           1       0.59      0.55      0.57     11596

   micro avg       0.55      0.55      0.55     21544
   macro avg       0.55      0.55      0.55     21544
weighted avg       0.55      0.55      0.55     21544

Training Accuracy:  0.5498978834014111

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.52      0.64      0.57      1148
           1       0.59      0.48      0.53      1283

   micro avg       0.55      0.55      0.55      2431
   macro avg       0.56      0.56      0.55      2431
weighted avg       0.56      0.55      0.55      2431

Test Accuracy:  0.5528589058000822

AUC:  0.5775756950309732


Compared with the baseline model, the training accuracy decreases very slightly (by about 0.001), but the test accuracy improves by about 0.006. The area under the ROC curve also improves, from 0.56 to 0.578.

## Trial 3: Using Data just from Current Season 

In [7]:
# Load the frame used for Trial 3.
# `with` closes the file; the previous `infile.close` (no parentheses)
# never actually called close().
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('../PickledFiles/gamelogs3', 'rb') as infile:
    gamelogs3 = pickle.load(infile)

<function BufferedReader.close>

In [8]:
# Add the derived features that were also added in the AveragesSince2009 and
# RollingAverages notebooks. The columns are written into gamelogs3 in place,
# so the frame pickled in the next cell contains them.

# Average errors per 9 defensive innings (new column, home and visitor).
gamelogs3['AvgEper9_H'] = gamelogs3['AvgE_H']*9/gamelogs3['AvgDefInnings_H']
gamelogs3['AvgEper9_V'] = gamelogs3['AvgE_V']*9/gamelogs3['AvgDefInnings_V']

# Average strikeouts by the offense per 9 offensive innings (new column).
gamelogs3['AvgKper9_H'] = gamelogs3['AvgK_H']*9/gamelogs3['AvgOffenseInnings_H']
gamelogs3['AvgKper9_V'] = gamelogs3['AvgK_V']*9/gamelogs3['AvgOffenseInnings_V']

# Starting pitcher: percent of games won minus percent of games lost.
gamelogs3['pctWminL_HSP'] = gamelogs3['pctW_HSP']-gamelogs3['pctL_HSP']
gamelogs3['pctWminL_VSP'] = gamelogs3['pctW_VSP']-gamelogs3['pctL_VSP']

# Relief wild pitches: a team-level wild-pitch column minus the starting
# pitcher's wild pitches.
# NOTE(review): the home column is built from 'VisitorWP' and the visitor
# column from 'HomeWP'. Presumably the team WP columns are recorded from the
# batting side's perspective - confirm against the data dictionary, otherwise
# the two sides are swapped.
# NOTE(review): unlike the columns above, these inputs have no 'Avg' prefix;
# verify they are not same-game values before using them as predictors.
gamelogs3['HomeReliefWP'] = gamelogs3['VisitorWP'] - gamelogs3['WP_HSP']
gamelogs3['VisitorReliefWP'] = gamelogs3['HomeWP'] - gamelogs3['WP_VSP']

In [9]:
# Persist the augmented frame. `with` closes the file even if pickle.dump raises.
with open('../PickledFiles/gamelogs3_A', 'wb') as outfile:
    pickle.dump(gamelogs3, outfile)

In [10]:
# Restrict the frame to the modelling columns listed in rel_cols.
gamelogs3_rel = gamelogs3.loc[:, rel_cols]

In [11]:
# Trial 3: the rel_cols columns of gamelogs3, balanced class weights.
fit3 = split_fit_report(
    gamelogs3_rel,
    cw='balanced',
)


Tuned Logistic Regression Parameters: {'C': 0.001}
Best score is 0.5462309691793539

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.52      0.56      0.53      9948
           1       0.59      0.55      0.57     11596

   micro avg       0.55      0.55      0.55     21544
   macro avg       0.55      0.55      0.55     21544
weighted avg       0.56      0.55      0.55     21544

Training Accuracy:  0.5534719643520237

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.53      0.63      0.58      1148
           1       0.61      0.51      0.55      1283

   micro avg       0.57      0.57      0.57      2431
   macro avg       0.57      0.57      0.57      2431
weighted avg       0.57      0.57      0.56      2431

Test Accuracy:  0.5656108597285068

AUC:  0.5934336987841541


This improves test accuracy by about 1.8 percentage points (from 0.547 to 0.565). AUC also increases (from 0.560 to 0.593).

### Using the subset of variables I chose above

In [12]:
# Trial 3 again, restricted to the hand-picked my_stats columns.
fit3sub = split_fit_report(
    gamelogs3_rel[my_stats],
    cw='balanced',
)


Tuned Logistic Regression Parameters: {'C': 0.001}
Best score is 0.5480876346082436

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.52      0.56      0.54      9948
           1       0.59      0.55      0.57     11596

   micro avg       0.55      0.55      0.55     21544
   macro avg       0.55      0.55      0.55     21544
weighted avg       0.56      0.55      0.55     21544

Training Accuracy:  0.5533791310805792

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.54      0.65      0.59      1148
           1       0.61      0.50      0.55      1283

   micro avg       0.57      0.57      0.57      2431
   macro avg       0.57      0.57      0.57      2431
weighted avg       0.58      0.57      0.57      2431

Test Accuracy:  0.5680789798436857

AUC:  0.596847409571969


These results are not much different from using the full set of features on the same data set. Test accuracy and recall for class 0 are slightly higher.

## Trial 4: Using a Rolling Window (see RollingAverages)

In [13]:
# Load the frame used for Trial 4.
# `with` closes the file; the previous `infile.close` (no parentheses)
# never actually called close().
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('../PickledFiles/gamelogsRoll', 'rb') as infile:
    gamelogsRoll = pickle.load(infile)

<function BufferedReader.close>

In [14]:
# Restrict the frame to the modelling columns listed in rel_cols.
glRoll_rel = gamelogsRoll.loc[:, rel_cols]

In [15]:
# Trial 4: the rel_cols columns of gamelogsRoll. The scaled matrices and
# targets are kept this time, not just the fitted search object.
(
    fit4,
    Xtrain_scld4,
    Xtest_scld4,
    ytrain,
    ytest,
) = split_fit_report(glRoll_rel, cw='balanced')


Tuned Logistic Regression Parameters: {'C': 0.1}
Best score is 0.5506405495729669

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.52      0.56      0.54      9948
           1       0.60      0.56      0.57     11596

   micro avg       0.56      0.56      0.56     21544
   macro avg       0.56      0.56      0.56     21544
weighted avg       0.56      0.56      0.56     21544

Training Accuracy:  0.5573245451169699

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.55      0.62      0.58      1148
           1       0.61      0.54      0.57      1283

   micro avg       0.58      0.58      0.58      2431
   macro avg       0.58      0.58      0.58      2431
weighted avg       0.58      0.58      0.58      2431

Test Accuracy:  0.5771287535993418

AUC:  0.6084457431814045


Best results yet. Training accuracy, test accuracy, and AUC are all slightly higher than when using the data computed from just the current season.

In [16]:
# Trial 4 again, restricted to the hand-picked my_stats columns.
fit4sub = split_fit_report(
    gamelogsRoll[my_stats],
    cw='balanced',
)


Tuned Logistic Regression Parameters: {'C': 0.1}
Best score is 0.553704047530635

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.52      0.56      0.54      9948
           1       0.59      0.56      0.57     11596

   micro avg       0.56      0.56      0.56     21544
   macro avg       0.56      0.56      0.56     21544
weighted avg       0.56      0.56      0.56     21544

Training Accuracy:  0.5560712959524694

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.55      0.62      0.58      1148
           1       0.62      0.54      0.58      1283

   micro avg       0.58      0.58      0.58      2431
   macro avg       0.58      0.58      0.58      2431
weighted avg       0.58      0.58      0.58      2431

Test Accuracy:  0.5795968737145207

AUC:  0.6104343587139245


Test accuracy and AUC are slightly higher than with the full set of features on the rolling-average data. Precision, recall, and F1 score are essentially the same.

## Trial 5: Explicitly Compare the Stats of the Home and Away Team
Each visiting-team stat is subtracted from the corresponding home-team stat.

In [17]:
# Every column in rel_cols whose name ends in 'H' or 'HSP', in rel_cols order.
home_suffixes = ('H', 'HSP')
home_stats = [col for col in rel_cols if col.endswith(home_suffixes)]
home_stats

['pctSho_HSP',
 'AvgIP_HSP',
 'AvgWP_HSP',
 'AvgFIPnoConst_HSP',
 'AvgSH_H',
 'AvgSF_H',
 'AvgIBB_H',
 'AvgSB_H',
 'AvgCS_H',
 'AvgGDP_H',
 'AvgCI_H',
 'AvgPassed_H',
 'AvgDB_H',
 'AvgTP_H',
 'AvgReliefIP_H',
 'AvgReliefWP_H',
 'AvgOBP_H',
 'AvgISO_H',
 'AvgRelFIPnoConst_H',
 'AvgPitchBABIP_H',
 'AvgAper9_H',
 'AvgEper9_H',
 'pctWminL_HSP',
 'AvgRunDiffAtHome_H']

In [18]:
# Add one home-minus-visitor column per home stat, written into gamelogsRoll
# in place. The last home stat is skipped here ([:-1]).
for home_stat in home_stats[:-1]:
    if not home_stat.endswith('H'):
        # Name ends in 'HSP': pair with '...VSP', result named '...SPdiff'.
        stat = home_stat[:-3]
        visit_stat, diff_stat = stat + 'VSP', stat + 'SPdiff'
    else:
        # Name ends in 'H': pair with '...V', result named '...diff'.
        stat = home_stat[:-1]
        visit_stat, diff_stat = stat + 'V', stat + 'diff'
    gamelogsRoll[diff_stat] = gamelogsRoll[home_stat] - gamelogsRoll[visit_stat]

In [19]:
# Run-differential gap: home team's value at home minus visitor's value on the road.
gamelogsRoll['AvgSpread_diff'] = gamelogsRoll['AvgRunDiffAtHome_H'].sub(
    gamelogsRoll['AvgRunDiffOnRoad_V']
)

In [20]:
# Pick up every column whose name ends in 'diff'.
diff_stats = [name for name in gamelogsRoll.columns if name.endswith('diff')]
# 'Date' first and 'HomeWin' last: train_test selects columns by position.
rel_stats_diff = ['Date', *diff_stats, 'AvgAttendance', 'HomeWin']

In [21]:
# Trial 5: difference features only; keep just the fitted search object.
fit5, _, _, _, _ = split_fit_report(
    gamelogsRoll[rel_stats_diff],
    cw='balanced',
)


Tuned Logistic Regression Parameters: {'C': 1}
Best score is 0.5529613813590791

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.52      0.56      0.54      9948
           1       0.60      0.56      0.58     11596

   micro avg       0.56      0.56      0.56     21544
   macro avg       0.56      0.56      0.56     21544
weighted avg       0.56      0.56      0.56     21544

Training Accuracy:  0.557927961381359

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.56      0.59      0.57      1148
           1       0.61      0.58      0.60      1283

   micro avg       0.58      0.58      0.58      2431
   macro avg       0.58      0.58      0.58      2431
weighted avg       0.59      0.58      0.58      2431

Test Accuracy:  0.583710407239819

AUC:  0.6090812311084919


In [22]:
# Home-side columns of the hand-picked subset, in my_stats order.
my_home_stats = [stat for stat in my_stats if stat.endswith(('H','HSP'))]

# Difference-column names for the subset: '..._H' -> '..._diff',
# '..._HSP' -> '..._SPdiff'. The last home stat is skipped ([:-1]).
# Only names are built here, so the unused visitor name is no longer computed.
my_diff_stats = [
    home_stat[:-1] + 'diff' if home_stat.endswith('H') else home_stat[:-3] + 'SPdiff'
    for home_stat in my_home_stats[:-1]
]

In [23]:
# Wrap the difference names with 'Date' (first) and the trailing columns
# ('HomeWin' last), because train_test selects columns by position.
# The guard keeps the cell idempotent: without it, re-running this cell would
# prepend 'Date' and append the trailing columns a second time.
if my_diff_stats[:1] != ['Date']:
    my_diff_stats = ['Date'] + my_diff_stats  + ['AvgSpread_diff','AvgAttendance', 'HomeWin']

In [24]:
# Trial 5 again, restricted to the difference columns of the hand-picked subset.
fit5sub, _, _, _, _ = split_fit_report(
    gamelogsRoll[my_diff_stats],
    cw='balanced',
)


Tuned Logistic Regression Parameters: {'C': 0.1}
Best score is 0.555235796509469

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.52      0.56      0.54      9948
           1       0.59      0.56      0.57     11596

   micro avg       0.56      0.56      0.56     21544
   macro avg       0.56      0.56      0.56     21544
weighted avg       0.56      0.56      0.56     21544

Training Accuracy:  0.5569067953954697

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.56      0.59      0.57      1148
           1       0.61      0.58      0.60      1283

   micro avg       0.58      0.58      0.58      2431
   macro avg       0.58      0.58      0.58      2431
weighted avg       0.59      0.58      0.58      2431

Test Accuracy:  0.5845331139448786

AUC:  0.6115193049826055


AUC and the test precision, recall, and F1 score are all at their highest.

## Pickle

In [25]:
# Extended column list: rel_cols without its last entry, then the difference
# columns (rel_stats_diff minus its leading 'Date' and its trailing two
# entries), then 'HomeWin' as the final column.
all_rels = [*rel_cols[:-1], *rel_stats_diff[1:-2], 'HomeWin']

In [26]:
# Persist the results. `with` closes each file even if pickle.dump raises.

# gamelogsRoll, including the difference columns added above.
with open('../PickledFiles/gamelogsRoll_ext', 'wb') as outfile:
    pickle.dump(gamelogsRoll, outfile)

# Column list for the difference-only model.
with open('../PickledFiles/rel_diffs', 'wb') as outfile:
    pickle.dump(rel_stats_diff, outfile)

# NOTE(review): this writes the unmodified rel_cols under the name
# 'rel_cols_extended', while the all_rels list built in the previous cell is
# never used. Presumably all_rels was meant to be dumped here - confirm what
# downstream notebooks expect before changing the pickled content.
with open('../PickledFiles/rel_cols_extended', 'wb') as outfile:
    pickle.dump(rel_cols, outfile)