# Baseball Pitch Data Analysis
Class: cse6242
Christian Rivera
Team: Philly Philly

In [513]:
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score
from sklearn import preprocessing
import numpy as np

import pickle
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import xgboost as xgb

## Load the data sets

In [2]:
players = pd.read_csv("player_lookup.csv")
data = pd.read_csv("Modeling_data.csv")

In [436]:
players['last'] = players['MLBNAME'].str.split(" ").str.get(-1)
players[players['last']=='Happ']

Unnamed: 0,MLBID,MLBNAME,last
841,664023.0,Ian Happ,Happ
842,457918.0,J.A. Happ,Happ


## Below are the functions to clean the data, select a player and create training and testing data.  All the cleaning functions are called by the dataOrchestrator function so that the user be only concerned with the dataOrchestrator and createTrainTest functions

In [3]:
def cleanData(df):
    """
    One-hot encode categorical attributes + drop not useful attributes
    input: df (pitch data)
    output: df (cleaned pitched data)
    """
    
    attributes = ['stand','outcomelag3','outcomelag2','outcomelag1',
                  'pitch_typelag1', 'pitch_typelag2', 'pitch_typelag3']
    
    for trait in attributes:
        try:
            dummies = pd.get_dummies(df[trait], prefix = trait)
            df = pd.concat([df, dummies], axis=1)
        except:
            pass
    
    attributes = attributes + ['inning1', 'inning2',
       'inning3', 'inning4', 'inning5', 'inning6', 'inning7', 'inning8',
       'inning9', 'inning10', 'BR_EMPTY', 'BR_1B_2B', 'BR_1B_3B',
        'BR_2B_3B','BR_FULL','batting_order1', 'batting_order2', 'batting_order3', 'batting_order4',
       'batting_order5', 'batting_order6', 'batting_order7', 'batting_order8',
       'batting_order9','pitcher','batter']
    df.drop(attributes,axis=1,inplace=True)
    
    return df

In [4]:
def simplifyOutcomes(df):
    """
    Standardize pitch outcomes.  Shrink dimension space from 23 results to 6 for
    all the "outcome lag" columns
    """
    
    
    subs = {'ball':'ball',
         'called_strike':'strike',
         'catcher_interf':'hit',
         'double':'hit',
         'double_play':'out',
         'field_error':'hit',
         'field_out':'out',
         'fielders_choice':'hit',
         'fielders_choice_out':'out',
         'force_out':'out',
         'foul':'foul',
         'grounded_into_double_play':'out',
         'hit_by_pitch':'hit',
         'home_run':'score',
         'offensive_substitution': 'other',
         'sac_bunt': 'score',
         'sac_bunt_double_play':'score',
         'sac_fly': 'score',
         'sac_fly_double_play': 'score',
         'single': 'hit',
         'swinging_strike':'strike',
         'triple': 'hit',
         'triple_play':'out'}
    
    attributes = ['outcomelag1', 'outcomelag2', 'outcomelag3']
    for trait in attributes:
        df[trait] = df[trait].map(subs)
    
    return df

In [5]:
def Determine_Pitch_Type_To_Keep_Pitcher_Specific(the_pitcher_id, the_data=None):
    print("Determining the PitchTypes to use with pitcher {}\n".format(the_pitcher_id))
    all_pitch_types = list(data.pitch_type.unique())
    # Getting count for each type of pitch
    pitch_type_count_dict = the_data.pitch_type.value_counts()
    num_of_pitch = len(the_data["pitch_type"])
    list_of_pitch_types_used = list(the_data.pitch_type.unique())
    threshold = 0.02
    knuckle_thresh = 0.5
    # Remove woba and swstrike columns relating to pitch types not thrown by the pitcher
    for pitchType in all_pitch_types:
        if pitchType not in list_of_pitch_types_used:
            woba_column_to_remove = "woba.{}".format(pitchType)
            swstrike_column_to_remove = "swstrike_pct.{}".format(pitchType)
            if woba_column_to_remove in the_data.columns:
                the_data.drop(woba_column_to_remove, axis=1, inplace=True)
            if swstrike_column_to_remove in the_data.columns:
                the_data.drop(swstrike_column_to_remove, axis=1, inplace=True)

    for key, value in pitch_type_count_dict.iteritems():
        current_pitch_type_percentage_of_total = value/(len(the_data["pitch_type"]))
        # If the pitcher throws more than knuckle_thresh, KN, then we remove all woba and swstrike columns
        if (key == "KN") and (current_pitch_type_percentage_of_total > knuckle_thresh):
            current_pitch_types = list(the_data.pitch_type.unique())
            for pitch in current_pitch_types:
                if pitch in the_data.columns:
                    the_data.drop(pitch, axis=1, inplace=True)
                if pitch in the_data.columns:
                    the_data.drop(pitch, axis=1, inplace=True)
        # Finds and removes pitch types if they have not been used enough by the pitcher
        #   specified by the threshold
        if current_pitch_type_percentage_of_total < threshold:
            print("Pitch total {}, current pitch type {} and total {}, percentage {}".format(
                (len(the_data["pitch_type"])), key, value, current_pitch_type_percentage_of_total))
            the_data = the_data[the_data.pitch_type != key]
            print("The number of pitches now {}".format(len(the_data["pitch_type"])))


In [6]:
def fillData(df):
    """
    1. Fill NaN values in WOBA fields with mean value of dataset
    2. Fill NaN values for all other fields with 0
    """
    
    attributes = ['woba.FF','woba.SL', 'woba.CH', 'woba.CU', 'woba.FT', 'woba.SI',
                  'woba.FC','woba.FS', 'woba.KC', 'woba.KN','swstrike_pct.FF', 'swstrike_pct.SL',
       'swstrike_pct.CH', 'swstrike_pct.CU', 'swstrike_pct.FT',
       'swstrike_pct.SI', 'swstrike_pct.FC', 'swstrike_pct.FS',
       'swstrike_pct.KC', 'swstrike_pct.KN']#pitch_number','score_diff']
    
    for trait in attributes:
        try:
            df[trait] = df[trait].fillna((df[trait].mean()))
            #df[trait]=(df[trait]-df[trait].mean())/df[trait].std()
            df[trait]=(df[trait]-df[trait].min())/(df[trait].max()-df[trait].min())

        except:
            pass
        
    attributes = ['release_speedlag1', 'release_speedlag2',
       'release_speedlag3', 'avg2_release_speed', 'avg3_release_speed',
                  'plate_xlag1', 'plate_xlag2', 'plate_xlag3','plate_xlag1', 
                  'plate_xlag2', 'plate_xlag3', 'plate_zlag1',
       'plate_zlag2', 'plate_zlag3', 'avg2_plate_x', 'avg2_plate_z',
       'avg3_plate_x', 'avg3_plate_z', 'pfx_xlag1', 'pfx_xlag2', 'pfx_xlag3',
       'pfx_zlag1', 'pfx_zlag2', 'pfx_zlag3', 'avg2_pfx_x', 'avg2_pfx_z',
       'avg3_pfx_x', 'avg3_pfx_z']
    
    for trait in attributes:
        try:
            df[trait] = df[trait].fillna((0))
            #df[trait]=(df[trait]-df[trait].mean())/df[trait].std()
            df[trait]=(df[trait]-df[trait].min())/(df[trait].max()-df[trait].min())
        except:
            pass
        
    return df

In [7]:
def dataOrchestrator(mainData, reference, playerNames = ["Justin Verlander"]):
    """
    Receives mainData, MLBID reference data, and specific player name.
    
    Runs all the above cleaning code to produce a "cleaned" dataset that can be 
    used as an input for the "createTrainTest" function
    
    """
    
    
    mlbids = list(reference[reference['MLBNAME'].isin(playerNames)]['MLBID'])
    
    
    
    df = mainData[mainData['pitcher'].isin(mlbids)]
    
    Determine_Pitch_Type_To_Keep_Pitcher_Specific(mlbids,df)
    
    """df = df[['game_year','pitch_type', 'pitch_number','game_pitch_number',
             'outs_when_up', 'BR_1B',
       'BR_2B', 'BR_3B','zero_zero_count',
       'zero_one_count', 'zero_two_count', 'one_zero_count', 'one_one_count',
       'one_two_count', 'two_zero_count', 'two_one_count', 'two_two_count',
       'three_zero_count', 'three_one_count', 'three_two_count', 'score_diff',
              'stand','outcomelag1', 'outcomelag2', 'outcomelag3','woba.FF',
       'woba.SL', 'woba.CH', 'woba.CU', 'woba.FT', 'woba.SI', 'woba.FC',
       'woba.FS', 'woba.KC', 'woba.KN','release_speedlag1', 'release_speedlag2',
       'release_speedlag3', 'avg2_release_speed', 'avg3_release_speed','plate_xlag1', 'plate_xlag2', 'plate_xlag3'
             ]]"""
    
    df = simplifyOutcomes(df)
    df = cleanData(df)
    
    df = fillData(df)
    print("Number of rows:{0}".format(len(df)))
    
    return df
    

In [8]:
def createTrainTest(df,testYear=2018,trainYearStart=2016):
    """
    Create training and testing data outputs
    
    inputs:
        - testYear (int)
        - traingYearStart (int): all data starting from this value up to but not including
                                the testYear
    
    outputs:
        xTrain, yTrain, xTest, yTest numpy arrays
    """
    
    yData = pd.get_dummies(df['pitch_type'])
    
    yData['game_year'] = df['game_year']
    
    xTrain = df[(df['game_year'] < testYear) & (df['game_year'] >= trainYearStart)]
    xTest = df[df['game_year'] >= testYear]
    
    xTrain.drop(['game_year','pitch_type'],axis=1,inplace=True)
    xTest.drop(['game_year','pitch_type'],axis=1,inplace=True)
    
    yTrain = yData[(yData['game_year'] < testYear) & (yData['game_year'] >= trainYearStart)]
    yTest = yData[yData['game_year'] >= testYear]
    
    yTrain.drop(['game_year'],axis=1,inplace=True)
    yTest.drop(['game_year'],axis=1,inplace=True)
    
    xTrain = xTrain.values
    xTest = xTest.values
    yTrain = yTrain.values
    yTest = yTest.values
    
    return xTrain, xTest, yTrain, yTest
    


## Procure the cleaned data for predicting

Determining the PitchTypes to use with pitcher [605483.0]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Number of rows:6905


## Create the model

In [512]:
cleaned = dataOrchestrator(data, players, playerNames = ['Blake Snell']);

xTrain, xTest, yTrain, yTest = createTrainTest(cleaned,2018,2015)

resultTrain = yTrain.argmax(axis=1)
resultTest = yTest.argmax(axis=1)
dtrain = xgb.DMatrix(xTrain, label=resultTrain)
dtest = xgb.DMatrix(xTest, label=resultTest)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

param = {
    'max_depth': 4,  # the maximum depth of each tree
    "n_estimators" :1000,
    'learning_rate':0.1,
    'eta': 0.1,  # the training step for each iteration
    "min_child_weight" :1,
    "gamma" : 1, 
    "subsample": 1,
    "colsample_bytree" : 1, 
    "scale_pos_weight": 1,
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': yTrain.shape[1],
    "tree_method" :'gpu_hist',
    "cv":kfold}  
num_round = 100  # the number of training iterations

bst = xgb.train(param, dtrain,num_round)

preds = bst.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])

precision = round(precision_score(resultTest, best_preds, average='micro'),4)
print ("Numpy array precision:", round(precision*100,2))



Numpy array precision: 52.15


In [507]:
naive = cleaned[cleaned['game_year'] ==2018]
naive = naive.groupby(['pitch_type'])[['pitch_type']].count()
naive['rate'] = naive / naive.sum() * 100

naive

Unnamed: 0_level_0,pitch_type,rate
pitch_type,Unnamed: 1_level_1,Unnamed: 2_level_1
CH,559,19.189839
CU,588,20.185376
FF,1500,51.493306
SL,266,9.13148


In [489]:
from scipy.stats import chi2_contingency

num_pitches = naive['pitch_type'].sum()
naive_guess = naive['pitch_type'].max()
model_guess = round(num_pitches * precision)
chi_inputs = [[num_pitches -model_guess,model_guess], [num_pitches-naive_guess,naive_guess]]
print(naive_guess)
g, p, dof, expctd = chi2_contingency(chi_inputs)
print("P value is:",p)

1590
P value is: 0.12411916914839463


In [149]:
naive['pitch_type'].max()

1005

## notes
1. Corey Kluber 'max_depth': 4,"min_child_weight" :0.5, wins 36.02% versus  slider at 32.67%  (depth = 4)
2. Justin Verlander "min_child_weight" :0.5,'max_depth': 6,  wins 63.16% naive guess FF 61.17% (depth = 4)
3. Clayton Kershaw (best pitcher in baseball 3 cy youngs + nl mvp) 'max_depth': 8,"min_child_weight" :0.5,"colsample_bytree" : 0.4, , 48.86% vs  ff 41.77% 
4. Max Scherzer (3 cy youngs) "colsample_bytree" : 0.5, wins.. 50.52% vs 50.17% FF
5. Chris Sale (7 time all star 2012-2018) depth=8m, min_child_weight=0.1, colsample_bytree=0.5: 40.01% vs 39.10 FF
6. Zack Greinke (5x all star) 'max_depth': 4,"colsample_bytree" : 0.5,  46.53% vs 44.35% FF , p=0.09
7. Patrick Corbin (2x all star + 2018 but was bad in 2015-2017). 'max_depth': 4, "min_child_weight" :0.5, "colsample_bytree" : 0.5,   48.55% vs 40.93% SL.
8. Dallas Keuchel (all-star and cy-young) 'max_depth': 8, 42.73% vs FT 41.1%, p=0.18
9. Jake Arrieta (all star 2016 new team) 'max_depth': 4, "min_child_weight" :0.5, 54.04 vs 53.46% slider, p = 0.6
10. Carlos Carrasco (AL wins leader 2017)  35.99% vs 31.88% slider but p = 0.001
11. Trevor Bauer (2018 all star) 40.69% vs 36.38% FF p = 0.003
12. Charlie Morton (2018 all star)  "min_child_weight" :0.1, 38.64% vs 30.79% FF (balanced pitcher)
13. David Price (5x allstar, al comback 2017) 'max_depth': 8, "min_child_weight" :0.5, 33.91% vs 33.39% FT p = 0.7
14. Gerrit Cole (allstar 15,18, new team 2018) 48.48% vs 50.32% FF.  definite fail
15. Stephen Strasburg (3x allstar 16,17) 45.13% vs 45.41 FF.  fail
16. J.A. Happ (allstar 2018) "colsample_bytree" : 0.5,'learning_rate':0.01,  59.3% vs 59.26 FF.
17. Madison Bumgarner 30.3 vs 35.19% FC
18. Luis Severino (young allstar 17,18) 'learning_rate':0.01, "min_child_weight" :0.5, 52.44% vs 50.47% FF
19. Jacob deGrom (2018 cy young) "min_child_weight" :0.5, 42.93  42.88% FF
20. Blake Snell (2018 sweep, crappy 2017) 52.15% vs 51.49% FF

In [537]:
myPitchers = ['Corey Kluber','Justin Verlander','Clayton Kershaw','Max Scherzer','Chris Sale',
           'Zack Greinke','Patrick Corbin','Dallas Keuchel','Jake Arrieta','Carlos Carrasco','Trevor Bauer',
           'Charlie Morton','David Price','Gerrit Cole','Stephen Strasburg','J.A. Happ','Madison Bumgarner',
             'Luis Severino','Jacob deGrom','Blake Snell']
print(len(myPitchers))

20


In [536]:
name = 'Corey Kluber'

cleaned = dataOrchestrator(data, players, playerNames = [name]);

xTrain, xTest, yTrain, yTest = createTrainTest(cleaned,2018,2015)

resultTrain = yTrain.argmax(axis=1)
resultTest = yTest.argmax(axis=1)
dtrain = xgb.DMatrix(xTrain, label=resultTrain)
dtest = xgb.DMatrix(xTest, label=resultTest)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

param = {
    'max_depth': 4,  # the maximum depth of each tree
    "n_estimators" :1000,
    'learning_rate':0.1,
    'eta': 0.1,  # the training step for each iteration
    "min_child_weight" : 0.5,
    "gamma" : 1, 
    "subsample": 1,
    "colsample_bytree" : 1, 
    "scale_pos_weight": 1,
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': yTrain.shape[1],
    "tree_method" :'gpu_hist',
    "cv":kfold}  
num_round = 100  # the number of training iterations

bst = xgb.train(param, dtrain,num_round)

preds = bst.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])

precision = round(precision_score(resultTest, best_preds, average='micro'),4)
print ("Numpy array precision:", round(precision*100,2))

# save model to file
pickle.dump(bst, open("models/"+name+".dat", "wb"))


Determining the PitchTypes to use with pitcher [446372.0]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Number of rows:19216
Numpy array precision: 36.02


In [515]:
# load model from file
loaded_model = pickle.load(open("models/"+name+".dat", "rb"))

preds = loaded_model.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])
precision = round(precision_score(resultTest, best_preds, average='micro'),4)
print ("Numpy array precision:", round(precision*100,2))


Numpy array precision: 52.15


In [56]:
model = XGBClassifier(learning_rate=0.1,
                      n_estimators=150,
                      max_depth=6,
                      min_child_weight = 1,
                     scoring="merror",
                      objective = "multi:softprob",
                      num_class = yTrain.shape[1]
                     )
result = yTrain.argmax(axis=1)
model.fit(xTrain, result)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=150,
       n_jobs=1, nthread=None, num_class=6, objective='multi:softprob',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       scoring='merror', seed=None, silent=True, subsample=1)

In [57]:
model.predict([xTrain[0]])

array([5])

## Set up Grid 

In [106]:
# each fold takes 3 minutes
n_estimators = [150,1000]
max_depth = [4,6]
learning_rate = [0.1,0.01]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators,learning_rate=learning_rate)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
myModel = GridSearchCV(XGBClassifier(min_child_weight = 1,gamma = 0, subsample=0.8,
                                     colsample_bytree = 0.8, scale_pos_weight=1,
                 objective = "multi:softprob",tree_method = 'gpu_hist'
                ,num_class= yTrain.shape[1]), 
                           param_grid, 
                           n_jobs=-1, 
                           cv=kfold, 
                           verbose=1)

resultTrain = yTrain.argmax(axis=1)
resultTest = yTest.argmax(axis=1)


In [107]:
%%time
myModel.fit(xTrain,resultTrain)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Exception in thread QueueManagerThread:
Traceback (most recent call last):
  File "/home/christian/anaconda3/envs/tf-gpu/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/christian/anaconda3/envs/tf-gpu/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/home/christian/anaconda3/envs/tf-gpu/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py", line 747, in _queue_management_worker
    recursive_terminate(p)
  File "/home/christian/anaconda3/envs/tf-gpu/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/backend/utils.py", line 28, in recursive_terminate
    _recursive_terminate_without_psutil(process)
  File "/home/christian/anaconda3/envs/tf-gpu/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/backend/utils.py", line 53, in _recursive_terminate_without_ps

KeyboardInterrupt: 

In [None]:
myModel.best_params_

In [108]:
xgb.train?

## Predict on test data

In [110]:

resultTrain = yTrain.argmax(axis=1)
resultTest = yTest.argmax(axis=1)
dtrain = xgb.DMatrix(xTrain, label=resultTrain)
dtest = xgb.DMatrix(xTest, label=resultTest)

kfold = StratifiedKFold(n_splits=20, shuffle=True, random_state=7)

param = {
    'max_depth': 6,  # the maximum depth of each tree
    "n_estimators" :150,
    'learning_rate':0.01,
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': yTrain.shape[1],
    "tree_method" :'gpu_hist',
    "cv":kfold}  
num_round = 1000  # the number of training iterations

bst = xgb.train(param, dtrain,num_round)

preds = bst.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])
print ("Numpy array precision:", precision_score(resultTest, best_preds, average='micro'))
print ("Numpy array accuracy:", accuracy_score(resultTest, best_preds) * 100)

naive = cleaned[cleaned['game_year'] ==2018]
naive = naive.groupby(['pitch_type'])[['pitch_type']].count()
naive['rate'] = naive / naive.sum() * 100

naive

Numpy array precision: 0.3514304291287386
Numpy array accuracy: 35.14304291287386


In [83]:
naive = cleaned[cleaned['game_year'] ==2018]
naive = naive.groupby(['pitch_type'])[['pitch_type']].count()
naive['rate'] = naive / naive.sum() * 100

naive

Unnamed: 0_level_0,pitch_type,rate
pitch_type,Unnamed: 1_level_1,Unnamed: 2_level_1
CH,202,6.56697
CU,691,22.464239
FC,905,29.421326
FF,273,8.875163
SI,1005,32.672302
