## Important Links

* https://www.retrosheet.org/game.htm - RetroSheet home page for data querying
* https://www.retrosheet.org/datause.txt - How to use RetroSheet Event Files
* https://www.baseball-reference.com/about/coverage.shtml#all_pbp - Baseball Reference play-by-play coverage availability
* https://github.com/benryan03/Baseball-Simulator - Ben Ryan's Baseball-Simulator repository on GitHub (pitch simulator)

In [None]:
%load_ext autoreload
%autoreload 2
from pybaseball import statcast, pitching_stats
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from matplotlib import patches
%matplotlib inline

# (start, end) date ranges of Statcast data to use; Statcast is chosen so we
# can get spin rate, etc. Only the 2015 range is active -- the 2016-2018
# ranges are kept commented out below.
train_data_dates = [
    ('2015-04-05', '2015-10-04'),      # 2015 data
    # ('2016-04-03', '2016-10-02'),    # 2016 data
    # ('2017-04-02', '2017-10-01'),    # 2017 data
    # ('2018-03-29', '2018-10-01'),    # 2018 data
]

In [None]:
# Download Statcast data for every (start, end) range listed in
# train_data_dates and stack the results row-wise. Previously only the first
# range was fetched, so enabling extra seasons in the list had no effect.
# With a single range the result holds the same rows and columns as before.
raw_data = pd.concat(
    [statcast(start_dt=start, end_dt=end, verbose=0)
     for start, end in train_data_dates]
)
print(raw_data.shape)
raw_data.head()

In [None]:
# Column groups to keep from the raw Statcast frame.

# outcome column (the pitch type)
outcome = ['pitch_type']

# identifier columns
id_columns = ['game_pk', 'pitcher', 'batter']

# game-situation columns
situation_features = [
    'stand', 'p_throws', 'inning', 'balls', 'strikes',
    'on_1b', 'on_2b', 'on_3b', 'outs_when_up', 'pitch_number',
    'fld_score', 'bat_score',
]

# per-pitch result, movement, location, speed and spin columns
# (presumably used later as "previous pitch" features -- confirm downstream)
prev_pitch_features = [
    'type', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z',
    'release_speed', 'release_spin_rate',
]

# keep only the columns above, in group order
data = raw_data[[*outcome, *id_columns, *situation_features, *prev_pitch_features]]

print(data.shape)
data.head()

In [None]:
# Drop all rows with no pitch type categorization. The explicit .copy() makes
# `data` an independent frame, so the column assignments made on it afterwards
# operate on a real DataFrame rather than on a slice of the previous one
# (which is what triggers pandas' SettingWithCopyWarning).
data = data[pd.notnull(data['pitch_type'])].copy()
print(data.shape)
data.head()

In [None]:
# Binary pitch-type encoding: "Fastball" -> 1, "Off-speed" (anything else) -> 0.
fastball_pitches = ['FA', 'FF', 'FT', 'FC', 'FS', 'SI', 'SF']


def map_fastballs(x):
    """Return 1 if pitch code ``x`` is listed in ``fastball_pitches``, else 0."""
    return 1 if x in fastball_pitches else 0
# Replace each pitch-type code with its fastball (1) / off-speed (0) flag.
data['pitch_type'] = data['pitch_type'].map(map_fastballs)

data.head()

In [None]:
# Cast the ID columns, plus inning / balls / strikes / outs / pitch number,
# to ints in a single pass.
for col in id_columns + ['inning', 'balls', 'strikes', 'outs_when_up', 'pitch_number']:
    data[col] = data[col].astype(int)
    
# Extra innings are folded into the 9th: anything above 9 becomes 9.
def cap_extra_innings(x):
    """Return 9 when ``x`` is greater than 9, otherwise return ``x`` unchanged."""
    return 9 if x > 9 else x
data['inning'] = data['inning'].map(cap_extra_innings)

# Composite "<game_pk>_<pitcher>" key (game id + pitcher id) that we can use for groupby's
data['game_pitcher_id'] = data['game_pk'].astype(str).str.cat(data['pitcher'].astype(str), sep='_')

# Convert on_1b/on_2b/on_3b to booleans: True when the column holds a value,
# False when it is missing. Series.notna() is vectorized and treats NaN, None
# and pd.NA alike, whereas calling np.isnan() on each element raises on
# non-float missing markers (e.g. in nullable-integer or object columns).
for base_col in ['on_1b', 'on_2b', 'on_3b']:
    data[base_col] = data[base_col].notna()

# Handedness matchup: True when p_throws (pitcher's throwing side) equals
# stand (batter's hitting side). The two source columns are dropped afterwards.
data['pitch_bat_same_side'] = data['p_throws'].eq(data['stand'])
data.drop(columns=['p_throws', 'stand'], inplace=True)

# Score differential: fld_score minus bat_score. The two source columns are
# dropped once the difference is stored.
data['score_diff'] = data['fld_score'].sub(data['bat_score'])
data.drop(columns=['fld_score', 'bat_score'], inplace=True)

data.head()