In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, RandomForestRegressor
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix, roc_auc_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from sklearn.linear_model import Lasso, ElasticNet, Ridge

In [2]:
from sklearn.decomposition import PCA

In [3]:
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Recall, Precision
from tensorflow.keras.metrics import AUC

In [5]:
from pybaseball import statcast_pitcher
from pybaseball import playerid_lookup

In [6]:
from pybaseball import statcast

In [7]:
import lzma
import pickle

## preprocessing

In [23]:
with lzma.open("../data/pitches.xz", "rb") as f:
    pitches = pickle.load(f)

In [37]:
#  View data
pitches.head()

Unnamed: 0,vx0,vy0,vz0,release_speed,release_spin_rate,pfx_x,pfx_z,plate_x,plate_z,ax,...,release_pos_y,release_pos_z,vz0*2,pfx_z*2,plate_z*2,sz_top,sz_bot,pitch_type,player_name,pitch_no
3295,4.974465,-120.970548,-1.590698,83.1,2275,0.57,-0.13,-0.19,2.35,4.535602,...,53.14,5.97,2.530321,0.0169,5.5225,3.21,1.52,SL,"Romano, Jordan",1
3404,6.550197,-122.185723,-2.165955,84.0,2284,0.46,-0.13,0.13,1.99,3.33423,...,52.88,5.78,4.69136,0.0169,3.9601,3.08,1.52,SL,"Romano, Jordan",2
3519,8.688477,-126.547267,-4.968054,87.1,2224,0.46,-0.26,0.95,1.07,3.120242,...,52.88,5.9,24.681557,0.0676,1.1449,3.16,1.52,SL,"Romano, Jordan",3
3629,8.199169,-140.755107,-5.791813,96.9,2440,-0.01,1.61,0.17,3.2,-1.983056,...,52.91,6.02,33.545099,2.5921,10.24,3.45,1.58,FF,"Romano, Jordan",4
3702,7.799187,-136.869501,-4.917298,94.2,2353,-0.22,1.55,-0.5,3.01,-4.529335,...,52.67,5.73,24.179823,2.4025,9.0601,3.45,1.58,FF,"Romano, Jordan",5


In [38]:
#  View columns
pitches.columns

Index(['vx0', 'vy0', 'vz0', 'release_speed', 'release_spin_rate', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'ax', 'ay', 'az', 'release_pos_x',
       'release_pos_y', 'release_pos_z', 'vz0*2', 'pfx_z*2', 'plate_z*2',
       'sz_top', 'sz_bot', 'pitch_type', 'player_name', 'pitch_no'],
      dtype='object')

In [39]:
#  Check again for nulls
pitches.isnull().sum().sum()

0

In [29]:
X = pitches.drop(columns = 'pitch_no')

#  Square root transformation to normalize data 
## more effective at normalization than log-transform for this dataset
y = pd.DataFrame([x ** 0.5 for x in pitches['pitch_no']])

In [30]:
#  Preserve dummified columns for later processing
X['pitch_type_orig'] = X['pitch_type']
X['player_name_orig'] = X['player_name']

In [31]:
#  Dummify pitch type and pitcher name (only categorical columns in remaining data)
X = pd.get_dummies(X, columns = ['pitch_type','player_name'])

In [33]:
#  Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [40]:
#  Generating new features
for col in pitches.columns[:-3]: 
    
    # get avg metrics from train data by player and pitch type
    X_train[f'{col}_avg'] = X_train.groupby(['player_name_orig','pitch_type_orig'])[col].transform('mean')
    
    # store averages in a variable
    avgs = X_train.groupby(['player_name_orig', 'pitch_type_orig'])[col].mean().reset_index()
    
    # merge averages onto test data 
    merge = X_test.merge(avgs, on = ['player_name_orig','pitch_type_orig'], how = 'left')
    X_test[f'{col}_avg'] = merge[col + '_y']
    
    # create new columns for differential from train avg in both train and test data
    X_train[f'{col}_diff'] = X_train[col] - X_train[f'{col}_avg']
    X_test[f'{col}_diff'] = X_test[col] - X_test[f'{col}_avg']    
    

In [41]:
#  Drop categorical columns now that they have been dummified and used for grouped avgs
X_train.drop(columns = ['player_name_orig', 'pitch_type_orig'], inplace = True)
X_test.drop(columns = ['player_name_orig', 'pitch_type_orig'], inplace = True)

In [42]:
#  Check again for nulls
X_train.isnull().sum().sum()

0

In [43]:
X_test.isnull().sum().sum()

520

In [44]:
#  The merge above created nulls in X_test, but not in X_test
#  Combine X_test and y_test to drop indices of nulls

X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)
combined = pd.concat([X_test, y_test], axis = 1)
combined.dropna(inplace = True)
X_test = combined.iloc[:,:-1]
y_test = combined.iloc[:,-1]

In [None]:
# Check shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((107897, 623), (35953, 623), (107897, 1), (35953,))

In [47]:
#  Reshape y_train 
y_train = y_train.squeeze()

In [48]:
# Rescale and reshape as df
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [49]:
X_train = pd.DataFrame(X_train, columns = ss.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = ss.get_feature_names_out())

In [54]:
#  Compress large-width X dfs and save to data folder 
with lzma.open("../data/X_train.xz", "wb") as f:
    pickle.dump(X_train, f)
with lzma.open("../data/X_test.xz", "wb") as f:
    pickle.dump(X_test, f)

In [55]:
#  Write single-column y dfs to data folder as csv
y_train.to_csv('../data/y_train.csv',index = False)
y_test.to_csv('../data/y_test.csv', index = False)

In [56]:
# save scaler to app folder
pickle.dump(ss, open('../app/scaler.pkl', 'wb'))

## fatigue through game

### fewer swinging strikes

In [39]:
X = ck[['pitch_type','release_speed','pfx_x','pfx_z',
            'plate_x','plate_z','vx0','vy0','vz0',
            'ax','ay','az','sz_top','sz_bot',
            'release_spin_rate','release_extension','effective_speed','inning',
        'release_pos_x','release_pos_y','release_pos_z','pitch_no','swing']]

In [40]:
X.isnull().sum()

pitch_type           181
release_speed        176
pfx_x                181
pfx_z                181
plate_x              181
plate_z              181
vx0                  181
vy0                  181
vz0                  181
ax                   181
ay                   181
az                   181
sz_top               181
sz_bot               181
release_spin_rate    929
release_extension    345
effective_speed      174
inning                 0
delta_run_exp          6
release_pos_x        181
release_pos_y        181
release_pos_z        181
pitch_no               0
swing                  0
dtype: int64

In [41]:
X.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace = True)


In [42]:
X = pd.get_dummies(X, columns = ['pitch_type'])

In [43]:
y = X['swing']

In [44]:
X.drop(columns = 'swing', inplace = True)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42)

In [46]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

X_train = pd.DataFrame(X_train, columns = ss.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = ss.get_feature_names_out())

In [49]:
poly = PolynomialFeatures()
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

X_train = pd.DataFrame(X_train, columns = poly.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = poly.get_feature_names_out())

In [52]:
base = [np.mean(y_train)] * len(y_test)
roc_auc_score(y_test, base)

0.5

In [85]:
model = Sequential()
model.add(Dense(128,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1,activation = 'sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics = ['AUC'])

In [86]:
model.fit(X_train,y_train,validation_data=(X_test, y_test), epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x230e5f637c0>

In [87]:
y_pred = model.predict(X_test)



In [88]:
roc_auc_score(y_test, y_pred)

0.8447126978658592

### balls

In [248]:
ck['ball'] = ck['type'].apply(lambda x: 1 if x == 'B' else 0)

In [259]:
X = ck[['pitch_type','release_speed','vx0','vy0','vz0',
            'ax','ay','az','sz_top','sz_bot',
            'release_spin_rate','release_extension','effective_speed','inning',
            'release_pos_x','release_pos_y','release_pos_z','pitch_no','ball']]
X.dropna(inplace = True)
X = pd.get_dummies(X, columns = ['pitch_type'])
y = X['ball']
X = X.drop(columns = 'ball')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace = True)


In [260]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42)

In [None]:
X_train['ball']

KeyError: 'ball'

In [261]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

X_train = pd.DataFrame(X_train, columns = ss.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = ss.get_feature_names_out())

In [262]:
poly = PolynomialFeatures()
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

X_train = pd.DataFrame(X_train, columns = poly.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = poly.get_feature_names_out())

In [263]:
base = [np.mean(y_train)] * len(y_test)
roc_auc_score(y_test, base)

0.5

In [264]:
model = Sequential()
model.add(Dense(128,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1,activation = 'linear'))
model.compile(loss='mse',optimizer='adam')

In [265]:
model.fit(X_train,y_train,validation_data=(X_test, y_test), epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2308fdce880>

In [266]:
y_pred = model.predict(X_test)



In [267]:
roc_auc_score(y_test, y_pred)

0.5440852051111157

### fewer called strikes

In [None]:
#  events, description, hit_location, bb_type, hit_distance_sc, delta_run_exp

In [203]:
ck['events'].value_counts()

field_out                    1986
strikeout                    1407
single                        641
walk                          213
double                        168
home_run                      130
grounded_into_double_play      82
force_out                      74
field_error                    40
sac_bunt                       24
hit_by_pitch                   18
triple                         13
sac_fly                        11
fielders_choice                 8
fielders_choice_out             7
double_play                     6
catcher_interf                  4
strikeout_double_play           3
caught_stealing_2b              3
intent_walk                     2
caught_stealing_home            1
sac_bunt_double_play            1
Name: events, dtype: int64

In [90]:
ck.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

### delta run exp

In [168]:
X = ck[['pitch_type','release_speed','pfx_x','pfx_z',
            'plate_x','plate_z','vx0','vy0','vz0',
            'ax','ay','az','sz_top','sz_bot',
            'release_spin_rate','release_extension','effective_speed','inning',
            'release_pos_x','release_pos_y','release_pos_z','pitch_no','delta_run_exp']]

In [169]:
X.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace = True)


In [170]:
y = X['delta_run_exp']

In [171]:
X = X.drop(columns = 'delta_run_exp')

In [172]:
X = pd.get_dummies(X, columns = ['pitch_type'])

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42)

In [174]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

X_train = pd.DataFrame(X_train, columns = ss.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = ss.get_feature_names_out())

In [175]:
poly = PolynomialFeatures()
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

X_train = pd.DataFrame(X_train, columns = poly.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = poly.get_feature_names_out())

In [176]:
base = [np.mean(y_train)] * len(y_test)
mean_squared_error(y_test, base, squared=False)

0.2118249216952286

In [177]:
model = Sequential()
model.add(Dense(128,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1,activation = 'linear'))
model.compile(loss='mse',optimizer='adam')

In [178]:
model.fit(X_train,y_train,validation_data=(X_test, y_test), epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x230eef3f400>

In [179]:
y_pred = model.predict(X_test)



In [180]:
mean_squared_error(y_test, y_pred, squared=False)

0.21379986599690892

## mechanics

#### fastball velocity

In [191]:
ck = ck[ck['pitch_type'] == 'FF']

In [192]:
X = ck[['pfx_x','pfx_z',
            'plate_x','plate_z','vx0','vy0','vz0',
            'ax','ay','az','sz_top','sz_bot',
            'release_spin_rate','release_extension','effective_speed','inning',
            'release_pos_x','release_pos_y','release_pos_z','pitch_no','release_speed']]
X.dropna(inplace = True)
y = X['release_speed']
X.drop(columns = 'release_speed')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace = True)


Unnamed: 0,pfx_x,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,release_spin_rate,release_extension,effective_speed,inning,release_pos_x,release_pos_y,release_pos_z,pitch_no
1,-0.22,1.65,-0.97,2.82,-4.795000,-137.464000,-7.076000,-1.522000,34.627000,-10.191000,3.61,1.57,2377.0,6.5,94.0,5,0.92,50.00,6.14,2
4,-0.03,1.74,-1.78,1.84,-7.810000,-135.858000,-9.625000,1.261000,27.852000,-9.069000,3.42,1.51,2391.0,6.6,93.6,5,1.04,50.00,6.04,5
5,-0.66,1.58,-0.12,2.36,-1.940000,-137.062000,-7.767000,-7.657000,34.975000,-10.969000,3.42,1.47,2315.0,6.5,93.8,5,1.14,50.00,6.01,6
7,0.49,1.90,0.96,1.32,-2.652000,-137.073000,-11.628000,6.630000,29.854000,-6.227000,3.75,1.64,2338.0,6.6,94.5,5,1.48,50.00,6.04,8
8,0.18,1.72,-0.46,3.21,-4.678000,-137.196000,-6.301000,3.275000,30.735000,-9.518000,3.66,1.54,2253.0,6.7,94.7,5,1.04,50.00,6.19,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18184,-0.40,1.57,-0.33,2.52,-3.524664,-131.515107,-6.378435,-3.980430,26.766435,-12.799638,3.50,1.65,2342.0,6.4,90.7,2,1.43,54.11,6.11,88
18185,-0.37,1.51,0.82,4.29,-0.832675,-132.912806,-2.271370,-4.220362,27.587624,-14.005365,3.53,1.68,2365.0,6.5,91.8,2,1.46,54.02,6.23,89
18187,-0.10,1.71,0.84,3.56,-1.568255,-132.391093,-4.116400,-0.745037,33.478473,-11.592067,3.50,1.69,2322.0,6.5,90.6,2,1.55,54.02,6.13,91
18189,0.00,1.80,1.97,3.64,0.730560,-132.765704,-4.148554,-0.161907,32.548116,-10.343355,3.73,1.86,2502.0,6.6,91.1,3,1.68,53.94,6.11,93


In [193]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42)

In [194]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

X_train = pd.DataFrame(X_train, columns = ss.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = ss.get_feature_names_out())

In [195]:
poly = PolynomialFeatures()
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

X_train = pd.DataFrame(X_train, columns = poly.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns = poly.get_feature_names_out())

In [196]:
base = [np.mean(y_train)] * len(y_test)
mean_squared_error(y_test, base, squared=False)

1.8182370499669918

In [197]:
model = Sequential()
model.add(Dense(128,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1,activation = 'linear'))
model.compile(loss='mse',optimizer='adam')

In [198]:
model.fit(X_train,y_train,validation_data=(X_test, y_test), epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x230eedc1880>

In [199]:
y_pred = model.predict(X_test)



In [200]:
mean_squared_error(y_test, y_pred, squared=False)

7.054394702303111