In [113]:
import pandas as pd
df = pd.read_csv('pitchers cleaned.csv')

In [154]:
import warnings
warnings.filterwarnings("ignore")

Based on trial and error in the last model, resampling and boosting to even out the classes is the way to go

In [114]:
dfc = df[df['CYA'] == 1].sample(n=300, replace = True)

In [115]:
dfn = df[df['CYA'] == 0].sample(n=1500, replace = False)

In [116]:
df_resample = pd.concat([dfc, dfn], axis = 0)

In [117]:
df_resample.drop(columns='Unnamed: 0', inplace = True)

In [118]:
df_resample.columns

Index(['Season', 'Name', 'Team', 'W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9',
       'BB/9', 'HR/9', 'BABIP', 'LOB%', 'ERA', 'FIP', 'WAR', 'ERA-', 'FIP-',
       'H', 'HR', 'SO', 'WHIP', 'playerid', 'CYA'],
      dtype='object')

In [119]:
traditional = ['W', 'IP', 'SO', 'H', 'HR', 'ERA', 'L']
sabr = ['K/9', 'ERA-', 'FIP-', 'BB/9', 'LOB%', 'WHIP', 'WAR']

In [120]:
df_resample['LOB%'] = df_resample['LOB%'].str[:-1].astype(float)/100

In [121]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np

In [122]:

params = {
    'penalty': ['l1', 'l2'],
    'C': [1, 2,3,4,5,6,7,8,9,10]
}

Grid Searching the traditional model

In [155]:
grid_t = GridSearchCV(LogisticRegression(), param_grid=params, cv=3)
grid_s = GridSearchCV(LogisticRegression(), param_grid=params, cv=3)

In [156]:
X_t = df_resample[traditional]
X_s = df_resample[sabr]
y = df_resample['CYA']

In [160]:
X_train, X_test, y_train, y_test = train_test_split(X_t, y, stratify = y, random_state = 42)

In [161]:
grid_t.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [162]:
grid_t.best_params_

{'C': 6, 'penalty': 'l2'}

In [128]:
from sklearn.ensemble import AdaBoostClassifier

In [163]:
ada_t = AdaBoostClassifier(base_estimator=LogisticRegression(penalty = 'l2', C = 6))

In [164]:
ada_t.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=6, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [165]:
preds = ada_t.predict(df_resample[traditional])

In [166]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [167]:
confusion_matrix(y, preds)

array([[1463,   37],
       [  47,  253]])

In [168]:
print(f'Accuracy is {accuracy_score(y, preds)}')
print(f'Recall is {recall_score(y, preds)}')
print(f'Specificity is {1-precision_score(y, preds)}')

Accuracy is 0.9533333333333334
Recall is 0.8433333333333334
Specificity is 0.12758620689655176


In [184]:
X_train, X_test, y_train, y_test = train_test_split(X_s, y, stratify = y)

In [185]:
grid_s.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [186]:
grid_s.best_params_

{'C': 8, 'penalty': 'l1'}

In [187]:
ada_s = AdaBoostClassifier(base_estimator=LogisticRegression(C=8, penalty='l1'))

In [188]:
ada_s.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [189]:
preds2 = ada_s.predict(df_resample[sabr])

In [190]:
confusion_matrix(y, preds2)

array([[1455,   45],
       [  76,  224]])

In [191]:
print(f'Accuracy is {accuracy_score(y, preds2)}')
print(f'Recall is {recall_score(y, preds2)}')
print(f'Specificity is {1-precision_score(y, preds2)}')

Accuracy is 0.9327777777777778
Recall is 0.7466666666666667
Specificity is 0.16728624535315983


In [192]:
df_resample['trad_proj'] = preds
df_resample['sabr_proj'] = preds2

In [193]:
df_resample.head()

Unnamed: 0,Season,Name,Team,W,L,SV,G,GS,IP,K/9,...,ERA-,FIP-,H,HR,SO,WHIP,playerid,CYA,trad_proj,sabr_proj
91,1998,Roger Clemens,Blue Jays,20,6,0,33,33,234.2,10.39,...,57,59,169,11,271,1.1,815,1,1,1
194,2016,Max Scherzer,Nationals,20,7,0,34,34,228.1,11.19,...,70,78,165,31,284,0.97,3137,1,1,0
530,2005,Bartolo Colon,Angels,21,8,0,33,33,222.2,6.35,...,82,88,215,26,157,1.16,375,1,0,0
5,2014,Clayton Kershaw,Dodgers,21,3,0,27,27,198.1,10.85,...,51,49,139,9,239,0.86,2036,1,1,1
6,2013,Clayton Kershaw,Dodgers,16,9,0,33,33,236.0,8.85,...,51,64,164,11,232,0.92,2036,1,1,1


In [194]:
df_resample[(df_resample['CYA'] == df_resample['trad_proj'])&(df_resample['CYA'] == 1)&(df_resample['sabr_proj'] == 0)]

Unnamed: 0,Season,Name,Team,W,L,SV,G,GS,IP,K/9,...,ERA-,FIP-,H,HR,SO,WHIP,playerid,CYA,trad_proj,sabr_proj
194,2016,Max Scherzer,Nationals,20,7,0,34,34,228.1,11.19,...,70,78,165,31,284,0.97,3137,1,1,0
137,2005,Chris Carpenter,Cardinals,21,5,0,33,33,241.2,7.93,...,68,69,204,18,213,1.06,1292,1,1,0
73,2012,David Price,Rays,20,5,0,31,31,211.0,8.74,...,66,75,173,16,205,1.1,3184,1,1,0
59,2015,Dallas Keuchel,Astros,20,8,0,33,33,232.0,8.38,...,64,72,185,17,216,1.02,9434,1,1,0
295,2016,Rick Porcello,Red Sox,22,4,0,33,33,223.0,7.63,...,71,81,193,23,189,1.01,2717,1,1,0
73,2012,David Price,Rays,20,5,0,31,31,211.0,8.74,...,66,75,173,16,205,1.1,3184,1,1,0
59,2015,Dallas Keuchel,Astros,20,8,0,33,33,232.0,8.38,...,64,72,185,17,216,1.02,9434,1,1,0
113,2002,Barry Zito,Athletics,23,5,0,35,35,229.1,7.14,...,63,89,182,24,182,1.13,944,1,1,0
137,2005,Chris Carpenter,Cardinals,21,5,0,33,33,241.2,7.93,...,68,69,204,18,213,1.06,1292,1,1,0
168,2013,Max Scherzer,Tigers,21,3,0,32,32,214.1,10.08,...,72,69,152,18,240,0.97,3137,1,1,0


In [195]:
df_resample[(df_resample['CYA'] == df_resample['sabr_proj'])&(df_resample['CYA'] == 1)&(df_resample['trad_proj'] == 0)]

Unnamed: 0,Season,Name,Team,W,L,SV,G,GS,IP,K/9,...,ERA-,FIP-,H,HR,SO,WHIP,playerid,CYA,trad_proj,sabr_proj
63,2017,Max Scherzer,Nationals,16,6,0,31,31,200.2,12.02,...,57,67,126,22,268,0.9,3137,1,0,1
28,2010,Felix Hernandez,Mariners,13,12,0,34,34,249.2,8.36,...,59,75,194,17,232,1.06,4772,1,0,1
270,2006,Brandon Webb,Diamondbacks,16,8,0,33,33,235.0,6.82,...,66,68,216,15,178,1.13,1692,1,0,1
345,1996,Pat Hentgen,Blue Jays,20,10,0,35,35,265.2,6.0,...,65,81,238,20,177,1.25,145,1,0,1
63,2017,Max Scherzer,Nationals,16,6,0,31,31,200.2,12.02,...,57,67,126,22,268,0.9,3137,1,0,1
63,2017,Max Scherzer,Nationals,16,6,0,31,31,200.2,12.02,...,57,67,126,22,268,0.9,3137,1,0,1
270,2006,Brandon Webb,Diamondbacks,16,8,0,33,33,235.0,6.82,...,66,68,216,15,178,1.13,1692,1,0,1
270,2006,Brandon Webb,Diamondbacks,16,8,0,33,33,235.0,6.82,...,66,68,216,15,178,1.13,1692,1,0,1
28,2010,Felix Hernandez,Mariners,13,12,0,34,34,249.2,8.36,...,59,75,194,17,232,1.06,4772,1,0,1
345,1996,Pat Hentgen,Blue Jays,20,10,0,35,35,265.2,6.0,...,65,81,238,20,177,1.25,145,1,0,1
