In [97]:
import pandas as pd
df = pd.read_csv('pitchers cleaned.csv')

In [98]:
import warnings
warnings.filterwarnings("ignore")

Based on trial and error in the last model, resampling and boosting to even out the classes is the way to go

In [99]:
dfc = df[df['CYA'] == 1].sample(n=300, replace = True)

In [100]:
dfn = df[df['CYA'] == 0].sample(n=1500, replace = False)

In [101]:
df_resample = pd.concat([dfc, dfn], axis = 0)

In [102]:
df_resample.drop(columns='Unnamed: 0', inplace = True)

In [103]:
df_resample.columns

Index(['Season', 'Name', 'Team', 'W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9',
       'BB/9', 'HR/9', 'BABIP', 'LOB%', 'ERA', 'FIP', 'WAR', 'ERA-', 'FIP-',
       'H', 'HR', 'SO', 'WHIP', 'playerid', 'CYA'],
      dtype='object')

In [104]:
df_resample.to_csv('pitcher_resample.csv')

In [105]:
traditional = ['W', 'IP', 'SO', 'H', 'HR', 'ERA', 'L']
sabr = ['K/9', 'ERA-', 'FIP-', 'BB/9', 'LOB%', 'WHIP', 'WAR']

In [106]:
df_resample['LOB%'] = df_resample['LOB%'].str[:-1].astype(float)/100

In [107]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np

In [108]:

params = {
    'penalty': ['l1', 'l2'],
    'C': [1, 2,3,4,5,6,7,8,9,10]
}

Grid Searching the traditional model

In [109]:
grid_t = GridSearchCV(LogisticRegression(), param_grid=params, cv=3)
grid_s = GridSearchCV(LogisticRegression(), param_grid=params, cv=3)

In [110]:
X_t = df_resample[traditional]
X_s = df_resample[sabr]
y = df_resample['CYA']

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X_t, y, stratify = y)

In [125]:
grid_t.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [126]:
a = grid_t.best_params_

In [127]:
from sklearn.ensemble import AdaBoostClassifier

In [128]:
ada_t = AdaBoostClassifier(base_estimator=LogisticRegression(penalty = a['penalty'], C = a['C']))

In [129]:
ada_t.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [131]:
preds = ada_t.predict(X_test)

In [72]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, 

In [132]:
confusion_matrix(y_test, preds)

array([[366,   9],
       [ 10,  65]])

In [134]:
print(f'Accuracy is {accuracy_score(y_test, preds)}')
print(f'Recall is {recall_score(y_test, preds)}')
print(f'Specificity is {recall_score(y_test, preds, pos_label=0)}')

Accuracy is 0.9577777777777777
Recall is 0.8666666666666667
Specificity is 0.976


In [135]:
preds_full = ada_t.predict(X_t)

In [136]:
print(f'Accuracy is {accuracy_score(y, preds_full)}')
print(f'Recall is {recall_score(y, preds_full)}')
print(f'Specificity is {recall_score(y, preds_full, pos_label=0)}')

Accuracy is 0.9533333333333334
Recall is 0.8566666666666667
Specificity is 0.9726666666666667


In [137]:
X_train, X_test, y_train, y_test = train_test_split(X_s, y, stratify = y)

In [138]:
grid_s.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [139]:
a = grid_s.best_params_

In [141]:
ada_s = AdaBoostClassifier(base_estimator=LogisticRegression(C=a['C'], penalty=a['penalty']))

In [142]:
ada_s.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [143]:
preds2 = ada_s.predict(X_test)

In [144]:
confusion_matrix(y_test, preds2)

array([[367,   8],
       [ 23,  52]])

In [145]:
print(f'Accuracy is {accuracy_score(y_test, preds2)}')
print(f'Recall is {recall_score(y_test, preds2)}')
print(f'Specificity is {recall_score(y_test, preds2, pos_label=0)}')

Accuracy is 0.9311111111111111
Recall is 0.6933333333333334
Specificity is 0.9786666666666667


In [146]:
preds2_full = ada_s.predict(X_s)

In [147]:
confusion_matrix(y, preds2_full)

array([[1461,   39],
       [  92,  208]])

In [148]:
print(f'Accuracy is {accuracy_score(y, preds2_full)}')
print(f'Recall is {recall_score(y, preds2_full)}')
print(f'Specificity is {recall_score(y, preds2_full, pos_label=0)}')

Accuracy is 0.9272222222222222
Recall is 0.6933333333333334
Specificity is 0.974


In [149]:
df_resample['trad_proj'] = preds_full
df_resample['sabr_proj'] = preds2_full

In [150]:
df_resample.head()

Unnamed: 0,Season,Name,Team,W,L,SV,G,GS,IP,K/9,...,ERA-,FIP-,H,HR,SO,WHIP,playerid,CYA,trad_proj,sabr_proj
25,2017,Corey Kluber,Indians,18,4,0,29,29,203.2,11.71,...,49,56,141,21,265,0.87,2429,1,1,1
70,2008,Cliff Lee,Indians,22,3,0,31,31,223.1,6.85,...,59,67,214,12,170,1.11,1636,1,1,1
69,2007,Jake Peavy,Padres,19,6,0,34,34,223.1,9.67,...,63,65,169,13,240,1.06,1051,1,1,1
295,2016,Rick Porcello,Red Sox,22,4,0,33,33,223.0,7.63,...,71,81,193,23,189,1.01,2717,1,1,0
30,2011,Clayton Kershaw,Dodgers,21,5,0,33,33,233.1,9.57,...,62,65,174,15,248,0.98,2036,1,1,1


In [151]:
df_resample[(df_resample['CYA'] == df_resample['trad_proj'])&(df_resample['CYA'] == 1)&(df_resample['sabr_proj'] == 0)]

Unnamed: 0,Season,Name,Team,W,L,SV,G,GS,IP,K/9,...,ERA-,FIP-,H,HR,SO,WHIP,playerid,CYA,trad_proj,sabr_proj
295,2016,Rick Porcello,Red Sox,22,4,0,33,33,223.0,7.63,...,71,81,193,23,189,1.01,2717,1,1,0
113,2002,Barry Zito,Athletics,23,5,0,35,35,229.1,7.14,...,63,89,182,24,182,1.13,944,1,1,0
137,2005,Chris Carpenter,Cardinals,21,5,0,33,33,241.2,7.93,...,68,69,204,18,213,1.06,1292,1,1,0
59,2015,Dallas Keuchel,Astros,20,8,0,33,33,232.0,8.38,...,64,72,185,17,216,1.02,9434,1,1,0
204,2004,Roger Clemens,Astros,18,4,0,33,33,214.1,9.15,...,68,69,169,15,218,1.16,815,1,1,0
168,2013,Max Scherzer,Tigers,21,3,0,32,32,214.1,10.08,...,72,69,152,18,240,0.97,3137,1,1,0
168,2013,Max Scherzer,Tigers,21,3,0,32,32,214.1,10.08,...,72,69,152,18,240,0.97,3137,1,1,0
295,2016,Rick Porcello,Red Sox,22,4,0,33,33,223.0,7.63,...,71,81,193,23,189,1.01,2717,1,1,0
567,2001,Roger Clemens,Yankees,20,3,0,33,33,220.1,8.70,...,80,74,205,19,213,1.26,815,1,1,0
137,2005,Chris Carpenter,Cardinals,21,5,0,33,33,241.2,7.93,...,68,69,204,18,213,1.06,1292,1,1,0


In [152]:
df_resample[(df_resample['CYA'] == df_resample['sabr_proj'])&(df_resample['CYA'] == 1)&(df_resample['trad_proj'] == 0)]

Unnamed: 0,Season,Name,Team,W,L,SV,G,GS,IP,K/9,...,ERA-,FIP-,H,HR,SO,WHIP,playerid,CYA,trad_proj,sabr_proj
49,2010,Roy Halladay,Phillies,21,10,0,33,33,250.2,7.86,...,61,75,231,24,219,1.04,1303,1,0,1
49,2010,Roy Halladay,Phillies,21,10,0,33,33,250.2,7.86,...,61,75,231,24,219,1.04,1303,1,0,1
49,2010,Roy Halladay,Phillies,21,10,0,33,33,250.2,7.86,...,61,75,231,24,219,1.04,1303,1,0,1
345,1996,Pat Hentgen,Blue Jays,20,10,0,35,35,265.2,6.0,...,65,81,238,20,177,1.25,145,1,0,1
270,2006,Brandon Webb,Diamondbacks,16,8,0,33,33,235.0,6.82,...,66,68,216,15,178,1.13,1692,1,0,1
28,2010,Felix Hernandez,Mariners,13,12,0,34,34,249.2,8.36,...,59,75,194,17,232,1.06,4772,1,0,1
28,2010,Felix Hernandez,Mariners,13,12,0,34,34,249.2,8.36,...,59,75,194,17,232,1.06,4772,1,0,1
28,2010,Felix Hernandez,Mariners,13,12,0,34,34,249.2,8.36,...,59,75,194,17,232,1.06,4772,1,0,1
270,2006,Brandon Webb,Diamondbacks,16,8,0,33,33,235.0,6.82,...,66,68,216,15,178,1.13,1692,1,0,1
49,2010,Roy Halladay,Phillies,21,10,0,33,33,250.2,7.86,...,61,75,231,24,219,1.04,1303,1,0,1
