In this notebook, variables that have high multicollinearity will be carefully removed. For this purpose, the Variance Inflation Factor (VIF) will be used.

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso
import statsmodels.api as sm

In [33]:
# Read the pickled input frame and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
data = pd.read_pickle('Datasets/Outliers.plk')
data.head()

Unnamed: 0,start_id,pbp_idx,game_id,top_bot,inning,batter_stands,pitcher_id,throws,EventType,postouts,...,pas_since_single,has_have_single,pas_since_walk,has_have_walk,pas_since_double,has_have_double,pas_since_home_run,has_have_home_run,pas_since_points_allowed,has_have_points_allowed
0,2019/04/01/anamlb-seamlb-1_433587,1,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,field_out,1,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
1,2019/04/01/anamlb-seamlb-1_433587,2,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,hit_by_pitch,1,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
2,2019/04/01/anamlb-seamlb-1_433587,3,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,force_out,2,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
3,2019/04/01/anamlb-seamlb-1_433587,4,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,field_out,3,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
4,2019/04/01/anamlb-seamlb-1_433587,5,2019/04/01/anamlb-seamlb-1,Y,2,R,433587,R,single,0,...,0.0,True,0.0,False,0.0,False,0.0,False,0.0,False


In [20]:
# Keep independent copies of the label column and of the starter id.
target = data['last_batter'].copy()
starter = data['start_id'].copy()

# Build the feature frame by removing the columns listed below.
# `data` itself is not modified, so the ids stay available there if needed later.
# 'pbp_idx' is deliberately NOT part of the drop list.
df = data.drop(
    columns=[
        'start_id', 'game_id',
        'top_bot', 'batter_stands', 'pitcher_id', 'EventType',
        'PostVisTeamScore', 'PostHomeTeamScore', 'home_away',
        'bats_right', 'throws_right', 'throws', 'next_batter_hand',
    ]
)
df.shape

(49398, 86)

In [34]:
# Preview the first rows of the feature frame.
# NOTE(review): the saved output of this cell already shows a `constant` column,
# which is only created in a later cell -- the cell was last executed out of order.
df.head()

Unnamed: 0,pbp_idx,inning,postouts,post_runner_on_first,post_runner_on_second,post_runner_on_third,pitches_in_pa,pitch_total,er_total,runners_on_base,...,has_have_single,pas_since_walk,has_have_walk,pas_since_double,has_have_double,pas_since_home_run,has_have_home_run,pas_since_points_allowed,has_have_points_allowed,constant
0,1,1,1,0,0,0,1,1,0,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,1
1,2,1,1,1,0,0,5,6,0,1,...,0,0.0,0,0.0,0,0.0,0,0.0,0,1
2,3,1,2,1,0,0,4,10,0,1,...,0,0.0,0,0.0,0,0.0,0,0.0,0,1
3,4,1,3,1,0,0,4,14,0,1,...,0,0.0,0,0.0,0,0.0,0,0.0,0,1
4,5,2,0,0,1,0,3,17,0,1,...,1,0.0,0,0.0,0,0.0,0,0.0,0,1


In [57]:
# Add an explicit intercept column; the VIF computations further down use the
# design matrix exactly as it is passed in.
df['constant'] = 1
# Adding 0 turns boolean columns into 0/1 integers so the whole frame is numeric.
df = df + 0

# `df` still contains the label column `last_batter`: it is used below to split
# the training rows by class and is removed from the features afterwards.
X_train, X_test, y_train, y_test = train_test_split(df, target, train_size=0.8, random_state=38)

changes = X_train.loc[X_train.last_batter == 1]
no_changes = X_train.loc[X_train.last_batter == 0]

# Downsample: keep every "change" play plus 1.5x as many "normal" plays,
# i.e. 60% of rows from "normal" plays and 40% from "change" plays.
# The sample size is derived from the data (it was hard-coded as 1766) so the
# ratio still holds if the input file or the split ever changes.
n_no_changes = int(len(changes) * 1.5)
X_train2 = pd.concat([changes,
                      no_changes.sample(n=n_no_changes, random_state=42)])

y_train2 = X_train2.last_batter

# Re-bind instead of dropping in place: X_test is an output of train_test_split,
# and an in-place drop on it triggers pandas' SettingWithCopyWarning.
X_train2 = X_train2.drop(columns='last_batter')
X_test = X_test.drop(columns='last_batter')

In [41]:
# Remove the row limit on displayed output so every entry of the VIF series is printed.
pd.set_option('display.max_rows', None)

In [37]:
# Variance Inflation Factor of every column currently in X_train2.
# statsmodels takes the column position; the Series is labelled with the column names.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

  vif = 1. / (1. - r_squared_i)


pbp_idx                                     inf
inning                              1341.019101
postouts                              56.823253
post_runner_on_first                        inf
post_runner_on_second                       inf
post_runner_on_third                        inf
pitches_in_pa                          1.353043
pitch_total                           22.927038
er_total                              18.027845
runners_on_base                             inf
score_diff                             1.860654
tying_run_on                           1.138689
total_outs_recorded                 1341.851279
total_bases_allowed                   54.204902
opposite_hand                          1.260517
end_of_inning                          3.434109
previously_walk                        1.929884
consec_walks                           1.134932
double_header                          1.022414
home_team                              1.039521
opposite_actual                        1

In [8]:
# The three per-base runner columns showed an infinite VIF in the previous output.
# Re-bind rather than drop in place, matching the later VIF cells.
X_train2 = X_train2.drop(
    columns=['post_runner_on_first', 'post_runner_on_second', 'post_runner_on_third']
)

# Recompute the VIF of every remaining column (labelled by column name).
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                                     inf
inning                              1339.586328
postouts                              51.743461
pitches_in_pa                          1.334767
pitch_total                           23.523630
er_total                              19.017472
runners_on_base                       24.074075
score_diff                             1.892393
tying_run_on                           1.141376
total_outs_recorded                 1351.746455
total_bases_allowed                   60.576366
opposite_hand                          1.235109
end_of_inning                          3.434221
previously_walk                        1.932370
consec_walks                           1.110132
double_header                          1.023707
home_team                              1.030174
opposite_actual                        1.923476
bats_left                              1.848422
throws_left                            2.759656
batter_order                           1

In [9]:
# Remove two more columns (their VIF values fall in the truncated part of the
# saved output). Re-bind rather than drop in place, matching the later VIF cells.
X_train2 = X_train2.drop(columns=['inning_cum_bats_right', 'inning_pa'])

# Recompute the VIF of every remaining column (labelled by column name).
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                                     inf
inning                              1243.649976
postouts                              37.506061
pitches_in_pa                          1.332723
pitch_total                           23.521021
er_total                              19.003829
runners_on_base                       16.743233
score_diff                             1.892218
tying_run_on                           1.140534
total_outs_recorded                 1294.934071
total_bases_allowed                   60.529476
opposite_hand                          1.233639
end_of_inning                          3.424562
previously_walk                        1.924499
consec_walks                           1.109723
double_header                          1.023700
home_team                              1.029542
opposite_actual                        1.920219
bats_left                              1.844351
throws_left                            2.759655
batter_order                           1

In [10]:
# total_outs_recorded had the largest finite VIF in the previous output.
# Re-bind rather than drop in place, matching the later VIF cells.
X_train2 = X_train2.drop(columns=['total_outs_recorded'])

# Recompute the VIF of every remaining column (labelled by column name).
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                                    inf
inning                              108.229929
postouts                             15.757253
pitches_in_pa                         1.332055
pitch_total                          23.494002
er_total                             18.925706
runners_on_base                      14.808104
score_diff                            1.892217
tying_run_on                          1.140312
total_bases_allowed                  58.803052
opposite_hand                         1.233514
end_of_inning                         3.424221
previously_walk                       1.892176
consec_walks                          1.109074
double_header                         1.023539
home_team                             1.028892
opposite_actual                       1.919046
bats_left                             1.841782
throws_left                           2.759368
batter_order                          1.154002
field_out                             6.484740
strikeout    

In [11]:
# pbp_idx still showed an infinite VIF in the previous output; remove cum_bats_right
# and check again. Re-bind rather than drop in place, matching the later VIF cells.
X_train2 = X_train2.drop(columns=['cum_bats_right'])

# Recompute the VIF of every remaining column (labelled by column name).
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                             346.254559
inning                              108.229929
postouts                             15.757253
pitches_in_pa                         1.332055
pitch_total                          23.494002
er_total                             18.925706
runners_on_base                      14.808104
score_diff                            1.892217
tying_run_on                          1.140312
total_bases_allowed                  58.803052
opposite_hand                         1.233514
end_of_inning                         3.424221
previously_walk                       1.892176
consec_walks                          1.109074
double_header                         1.023539
home_team                             1.028892
opposite_actual                       1.919046
bats_left                             1.841782
throws_left                           2.759368
batter_order                          1.154002
field_out                             6.484740
strikeout    

In [12]:
# Next VIF iteration: remove total_bases_allowed, then recompute.
X_train2 = X_train2.drop(columns=['total_bases_allowed'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                             285.034783
inning                               99.928378
postouts                             15.655291
pitches_in_pa                         1.331007
pitch_total                          23.493866
er_total                             17.635215
runners_on_base                      14.771138
score_diff                            1.891995
tying_run_on                          1.140295
opposite_hand                         1.233445
end_of_inning                         3.422695
previously_walk                       1.890104
consec_walks                          1.109033
double_header                         1.023449
home_team                             1.028644
opposite_actual                       1.918964
bats_left                             1.841407
throws_left                           2.758213
batter_order                          1.152937
field_out                             6.410300
strikeout                             5.076569
single       

In [13]:
# Next VIF iteration: remove inning, then recompute.
X_train2 = X_train2.drop(columns=['inning'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              87.147601
postouts                             13.836042
pitches_in_pa                         1.330210
pitch_total                          23.481700
er_total                             17.518798
runners_on_base                      14.217083
score_diff                            1.888428
tying_run_on                          1.140053
opposite_hand                         1.233356
end_of_inning                         3.422684
previously_walk                       1.888653
consec_walks                          1.108850
double_header                         1.023447
home_team                             1.026550
opposite_actual                       1.918633
bats_left                             1.840488
throws_left                           2.757479
batter_order                          1.151609
field_out                             6.346612
strikeout                             5.027403
single                                3.998214
walk         

In [14]:
# Next VIF iteration: remove pitch_total, then recompute.
X_train2 = X_train2.drop(columns=['pitch_total'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              75.102097
postouts                             13.835765
pitches_in_pa                         1.207164
er_total                             17.511962
runners_on_base                      14.215693
score_diff                            1.885066
tying_run_on                          1.139872
opposite_hand                         1.232481
end_of_inning                         3.416976
previously_walk                       1.888573
consec_walks                          1.108845
double_header                         1.023139
home_team                             1.026494
opposite_actual                       1.917360
bats_left                             1.840269
throws_left                           2.757392
batter_order                          1.150872
field_out                             6.343423
strikeout                             5.000144
single                                3.997789
walk                                  3.738023
double       

In [15]:
# Next VIF iteration: remove er_total, then recompute.
X_train2 = X_train2.drop(columns=['er_total'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              73.540943
postouts                             13.834206
pitches_in_pa                         1.205634
runners_on_base                      14.211403
score_diff                            1.884739
tying_run_on                          1.139310
opposite_hand                         1.232274
end_of_inning                         3.416158
previously_walk                       1.887042
consec_walks                          1.108809
double_header                         1.023090
home_team                             1.026490
opposite_actual                       1.917128
bats_left                             1.840208
throws_left                           2.757345
batter_order                          1.150591
field_out                             6.343052
strikeout                             4.999837
single                                3.989656
walk                                  3.732712
double                                2.710316
home_run     

In [16]:
# Next VIF iteration: remove inning_cum_points_allowed, then recompute.
X_train2 = X_train2.drop(columns=['inning_cum_points_allowed'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              71.855111
postouts                             11.485891
pitches_in_pa                         1.204492
runners_on_base                       7.261171
score_diff                            1.883689
tying_run_on                          1.139240
opposite_hand                         1.231896
end_of_inning                         3.401202
previously_walk                       1.884893
consec_walks                          1.108611
double_header                         1.022990
home_team                             1.026465
opposite_actual                       1.905738
bats_left                             1.839028
throws_left                           2.757113
batter_order                          1.150582
field_out                             6.329932
strikeout                             4.990784
single                                3.988992
walk                                  3.732073
double                                2.700644
home_run     

In [17]:
# Next VIF iteration: remove postouts, then recompute.
X_train2 = X_train2.drop(columns=['postouts'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              69.132210
pitches_in_pa                         1.204330
runners_on_base                       6.415979
score_diff                            1.882207
tying_run_on                          1.138606
opposite_hand                         1.231866
end_of_inning                         2.645396
previously_walk                       1.883238
consec_walks                          1.108434
double_header                         1.022988
home_team                             1.026264
opposite_actual                       1.902164
bats_left                             1.837337
throws_left                           2.756153
batter_order                          1.150278
field_out                             5.541383
strikeout                             4.437388
single                                3.601982
walk                                  3.486844
double                                2.532401
home_run                              2.718239
cum_post_runn

In [18]:
# Next VIF iteration: remove inning_cum_post_runner_on_first, then recompute.
X_train2 = X_train2.drop(columns=['inning_cum_post_runner_on_first'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              68.988636
pitches_in_pa                         1.203243
runners_on_base                       5.159387
score_diff                            1.881600
tying_run_on                          1.137969
opposite_hand                         1.231764
end_of_inning                         2.630215
previously_walk                       1.805829
consec_walks                          1.105342
double_header                         1.021847
home_team                             1.025457
opposite_actual                       1.898405
bats_left                             1.836483
throws_left                           2.756075
batter_order                          1.149837
field_out                             5.541248
strikeout                             4.436294
single                                3.310869
walk                                  3.172004
double                                2.518418
home_run                              2.716788
cum_post_runn

In [19]:
# Next VIF iteration: remove has_have_home_run, then recompute.
X_train2 = X_train2.drop(columns=['has_have_home_run'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              68.808761
pitches_in_pa                         1.203143
runners_on_base                       5.155079
score_diff                            1.881321
tying_run_on                          1.137966
opposite_hand                         1.231755
end_of_inning                         2.629850
previously_walk                       1.805543
consec_walks                          1.105218
double_header                         1.021605
home_team                             1.024944
opposite_actual                       1.897995
bats_left                             1.836481
throws_left                           2.756010
batter_order                          1.149787
field_out                             5.540726
strikeout                             4.436134
single                                3.310507
walk                                  3.171996
double                                2.518413
home_run                              2.696618
cum_post_runn

In [20]:
# Next VIF iteration: remove cum_single, then recompute.
X_train2 = X_train2.drop(columns=['cum_single'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              53.143529
pitches_in_pa                         1.203120
runners_on_base                       5.151856
score_diff                            1.879892
tying_run_on                          1.137111
opposite_hand                         1.230433
end_of_inning                         2.626797
previously_walk                       1.799182
consec_walks                          1.104321
double_header                         1.021398
home_team                             1.023423
opposite_actual                       1.896015
bats_left                             1.836481
throws_left                           2.755165
batter_order                          1.149666
field_out                             5.518760
strikeout                             4.428096
single                                3.277381
walk                                  3.116854
double                                2.509788
home_run                              2.688223
cum_post_runn

In [21]:
# Next VIF iteration: remove has_have_double, then recompute.
X_train2 = X_train2.drop(columns=['has_have_double'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

pbp_idx                              53.135222
pitches_in_pa                         1.202217
runners_on_base                       5.149101
score_diff                            1.879561
tying_run_on                          1.136614
opposite_hand                         1.230330
end_of_inning                         2.626067
previously_walk                       1.798771
consec_walks                          1.103869
double_header                         1.021381
home_team                             1.023422
opposite_actual                       1.895844
bats_left                             1.836251
throws_left                           2.754490
batter_order                          1.148591
field_out                             5.516293
strikeout                             4.425827
single                                3.275414
walk                                  3.116452
double                                2.494819
home_run                              2.688186
cum_post_runn

In [None]:
# Last VIF iteration: remove cum_bats_left, then recompute.
X_train2 = X_train2.drop(columns=['cum_bats_left'])

# VIF of every remaining column, labelled by column name.
vif = pd.Series(
    data=[
        variance_inflation_factor(X_train2.values, position)
        for position in range(len(X_train2.columns))
    ],
    index=X_train2.columns,
)

vif

In [None]:
# Remove the helper intercept column from the final feature set (presumably it was
# only needed for the VIF computations). Re-bound rather than dropped in place,
# consistent with the VIF iteration cells that reassign X_train2.
X_train2 = X_train2.drop(columns='constant')

In [28]:
# Largest VIF among the entries that are below 100.
# Corresponds to pbp_idx; will be kept as it summarises other variables.
vif.loc[vif.lt(100)].max()

53.13522167371571

In [23]:
# Restrict the test features to the columns still present in X_train2, in the same order.
X_test = X_test[list(X_train2.columns)]

In [69]:
# Optional shortcut -- do NOT run as part of the normal flow.
# These are the final variables chosen; run this cell only when you do not want
# to print each VIF iteration.
# NOTE(review): `vars` shadows the Python built-in of the same name; the name is
# kept here in case other cells reference it.
vars = [
    'pbp_idx', 'pitches_in_pa', 'runners_on_base', 'score_diff',
    'tying_run_on', 'opposite_hand', 'end_of_inning', 'previously_walk',
    'consec_walks', 'double_header', 'home_team', 'opposite_actual',
    'bats_left', 'throws_left', 'batter_order', 'field_out', 'strikeout',
    'single', 'walk', 'double', 'home_run', 'cum_post_runner_on_first',
    'cum_post_runner_on_second', 'inning_cum_post_runner_on_second',
    'cum_post_runner_on_third', 'inning_cum_post_runner_on_third',
    'inning_cum_bats_left', 'cum_opposite_actual',
    'inning_cum_opposite_actual', 'cum_field_out', 'inning_cum_field_out',
    'cum_strikeout', 'inning_cum_strikeout', 'inning_cum_single',
    'cum_walk', 'inning_cum_walk', 'cum_double', 'inning_cum_double',
    'cum_home_run', 'inning_cum_home_run', 'cum_points_allowed',
    'points_allowed', 'hot_cold_field_out', 'hot_cold_strikeout',
    'hot_cold_single', 'hot_cold_walk', 'hot_cold_double',
    'hot_cold_home_run', 'hot_cold_points_allowed',
    'pas_since_post_runner_on_first', 'has_have_post_runner_on_first',
    'pas_since_post_runner_on_second', 'has_have_post_runner_on_second',
    'pas_since_post_runner_on_third', 'has_have_post_runner_on_third',
    'pas_since_field_out', 'has_have_field_out', 'pas_since_strikeout',
    'has_have_strikeout', 'pas_since_single', 'has_have_single',
    'pas_since_walk', 'has_have_walk', 'pas_since_double',
    'pas_since_home_run', 'pas_since_points_allowed',
    'has_have_points_allowed',
]

# Apply the same column selection (and order) to both feature frames.
X_train2 = X_train2[vars]
X_test = X_test[vars]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

In [59]:
# Persist the downsampled training set and the test set (features and labels) as pickles.
for _frame, _path in (
    (X_train2, 'Datasets/X_train2.plk'),
    (X_test, 'Datasets/X_test.plk'),
    (y_train2, 'Datasets/Y_train2.plk'),
    (y_test, 'Datasets/Y_test.plk'),
):
    _frame.to_pickle(_path)