In this notebook, variables with high multicollinearity are carefully removed, one step at a time. The Variance Inflation Factor (VIF) is used to decide which variables to drop.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso
import statsmodels.api as sm

In [2]:
# Load the pickled dataset and preview its first five rows.
# NOTE(review): unpickling can execute arbitrary code -- only read files from a trusted source.
outliers_path = 'Datasets/Outliers.plk'
data = pd.read_pickle(outliers_path)
data.head(n=5)

Unnamed: 0,start_id,pbp_idx,game_id,top_bot,inning,batter_stands,pitcher_id,throws,EventType,postouts,...,pas_since_single,has_have_single,pas_since_walk,has_have_walk,pas_since_double,has_have_double,pas_since_home_run,has_have_home_run,pas_since_points_allowed,has_have_points_allowed
0,2019/04/01/anamlb-seamlb-1_433587,1,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,field_out,1,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
1,2019/04/01/anamlb-seamlb-1_433587,2,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,hit_by_pitch,1,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
2,2019/04/01/anamlb-seamlb-1_433587,3,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,force_out,2,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
3,2019/04/01/anamlb-seamlb-1_433587,4,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,field_out,3,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
4,2019/04/01/anamlb-seamlb-1_433587,5,2019/04/01/anamlb-seamlb-1,Y,2,R,433587,R,single,0,...,0.0,True,0.0,False,0.0,False,0.0,False,0.0,False


In [3]:
# Keep the label and the starter id as standalone Series before pruning columns.
target = data['last_batter'].copy()
starter = data['start_id'].copy()

# Id-like and raw categorical columns removed from the feature frame; the ids remain
# available in `data` / `starter` if they are needed in the future.
# 'pbp_idx' is deliberately NOT dropped: it is an important indicator of the current status.
# Note that 'last_batter' is still part of `df` here; it is removed from the feature
# frames in a later cell.
excluded_columns = [
    'start_id', 'game_id',
    'top_bot', 'batter_stands', 'pitcher_id', 'EventType',
    'PostVisTeamScore', 'PostHomeTeamScore', 'home_away',
    'bats_right', 'throws_right', 'throws', 'next_batter_hand',
]
df = data.drop(columns=excluded_columns)
df.shape

(49398, 86)

In [4]:
# Preview the first rows of the feature frame built in the previous cell.
df.head()

Unnamed: 0,pbp_idx,inning,postouts,post_runner_on_first,post_runner_on_second,post_runner_on_third,pitches_in_pa,pitch_total,er_total,runners_on_base,...,pas_since_single,has_have_single,pas_since_walk,has_have_walk,pas_since_double,has_have_double,pas_since_home_run,has_have_home_run,pas_since_points_allowed,has_have_points_allowed
0,1,1,1,0,0,0,1,1,0,0,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
1,2,1,1,1,0,0,5,6,0,1,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
2,3,1,2,1,0,0,4,10,0,1,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
3,4,1,3,1,0,0,4,14,0,1,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,False
4,5,2,0,0,1,0,3,17,0,1,...,0.0,True,0.0,False,0.0,False,0.0,False,0.0,False


In [5]:
# Add an explicit intercept column; it takes part in the VIF computations below and is
# dropped again before the final feature sets are saved.
df['constant'] = 1
# Adding 0 turns the boolean flag columns into integers so the frame is fully numeric.
df = df + 0

X_train, X_test, y_train, y_test = train_test_split(df, target, train_size=0.8, random_state=42)

# train_test_split hands back slices of `df`; work on an explicit copy so that the column
# assignment below does not raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()

# Only plate appearances with pbp_idx > 18 are used for training.
# NOTE(review): the filter is strict (> 18), so index 18 itself is excluded, and
# X_test / y_test are NOT filtered the same way -- confirm both are intended.
X_train['last_batter'] = y_train
X_train = X_train[X_train['pbp_idx'] > 18]
y_train = X_train['last_batter']

# Split the filtered training rows by class for the downsampling step.
changes = X_train.loc[X_train['last_batter'] == 1]
no_changes = X_train.loc[X_train['last_batter'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['last_batter']=y_train


In [6]:
# Show (rows, columns) for the "change" subset first, then the "no change" subset.
for subset in (changes, no_changes):
    print(subset.shape)

(1551, 87)
(7418, 87)


In [7]:
# Downsample: will keep 60% of rows from "normal" plays and 40% of "change" plays.
X_train2=pd.concat([changes, 
                    no_changes.sample(n=int(len(changes)*1.5), random_state=42)])

y_train2=X_train2.last_batter

X_train2.drop('last_batter', axis=1, inplace=True)
X_test.drop('last_batter', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [8]:
# Lift the row limit so the full VIF Series is printed without truncation.
pd.options.display.max_rows = None

In [9]:
def compute_vif(features):
    """Return the Variance Inflation Factor of every column in ``features``.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature frame; each column is scored against all the others.

    Returns
    -------
    pandas.Series
        One VIF per column, indexed by column name, in the frame's column order.
        A value of ``inf`` means the column is perfectly explained by the others
        (R^2 == 1, so 1 / (1 - R^2) divides by zero).
    """
    # Materialise the array once instead of rebuilding it for every column.
    design = features.values
    scores = [variance_inflation_factor(design, position)
              for position in range(design.shape[1])]
    return pd.Series(scores, index=features.columns)


vif = compute_vif(X_train2)

vif

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


pbp_idx                                    inf
inning                              351.485371
postouts                             62.003250
post_runner_on_first                       inf
post_runner_on_second                      inf
post_runner_on_third                       inf
pitches_in_pa                         1.334409
pitch_total                           2.800208
er_total                             13.062530
runners_on_base                            inf
score_diff                            1.713676
tying_run_on                          1.158029
total_outs_recorded                 332.524001
total_bases_allowed                  26.398067
opposite_hand                         1.292357
end_of_inning                         3.238299
previously_walk                       1.967720
consec_walks                          1.103528
double_header                         1.036361
home_team                             1.045444
opposite_actual                       2.081144
bats_left    

In [10]:
# The three per-base runner flags came out with infinite VIF in the previous run.
# Drop them (by reassignment, matching the later cells) and recompute the VIFs.
X_train2 = X_train2.drop(
    columns=['post_runner_on_first', 'post_runner_on_second', 'post_runner_on_third'])

# Build the array once; it is the same for every column being scored.
design = X_train2.values
vif = pd.Series([variance_inflation_factor(design, position)
                 for position in range(design.shape[1])],
                index=X_train2.columns)

vif

pbp_idx                                    inf
inning                              347.329389
postouts                             61.964696
pitches_in_pa                         1.332259
pitch_total                           2.799486
er_total                             13.061807
runners_on_base                      31.668820
score_diff                            1.713457
tying_run_on                          1.156204
total_outs_recorded                 328.650775
total_bases_allowed                  26.360111
opposite_hand                         1.292143
end_of_inning                         3.234596
previously_walk                       1.952263
consec_walks                          1.103297
double_header                         1.035146
home_team                             1.045180
opposite_actual                       2.078874
bats_left                             1.981266
throws_left                           3.885166
batter_order                          2.106208
field_out    

In [11]:
# Next elimination step: remove 'inning_cum_bats_right' and 'inning_pa'
# (by reassignment, matching the later cells) and recompute the VIFs.
X_train2 = X_train2.drop(columns=['inning_cum_bats_right', 'inning_pa'])

# Build the array once; it is the same for every column being scored.
design = X_train2.values
vif = pd.Series([variance_inflation_factor(design, position)
                 for position in range(design.shape[1])],
                index=X_train2.columns)

vif

pbp_idx                                    inf
inning                              324.199087
postouts                             37.220408
pitches_in_pa                         1.331227
pitch_total                           2.799465
er_total                             13.061792
runners_on_base                      18.043992
score_diff                            1.713427
tying_run_on                          1.155820
total_outs_recorded                 313.021260
total_bases_allowed                  26.350583
opposite_hand                         1.291676
end_of_inning                         3.218101
previously_walk                       1.943658
consec_walks                          1.103293
double_header                         1.034937
home_team                             1.045073
opposite_actual                       2.068024
bats_left                             1.978284
throws_left                           3.885074
batter_order                          2.106203
field_out    

In [12]:
# 'total_outs_recorded' still had a VIF above 300 in the previous run.
# Drop it (by reassignment, matching the later cells) and recompute the VIFs.
X_train2 = X_train2.drop(columns=['total_outs_recorded'])

# Build the array once; it is the same for every column being scored.
design = X_train2.values
vif = pd.Series([variance_inflation_factor(design, position)
                 for position in range(design.shape[1])],
                index=X_train2.columns)

vif

pbp_idx                                   inf
inning                              20.937132
postouts                            16.126162
pitches_in_pa                        1.331217
pitch_total                          2.798351
er_total                            12.996944
runners_on_base                     15.674353
score_diff                           1.713426
tying_run_on                         1.155812
total_bases_allowed                 25.890330
opposite_hand                        1.291172
end_of_inning                        3.217982
previously_walk                      1.908723
consec_walks                         1.100516
double_header                        1.034232
home_team                            1.045025
opposite_actual                      2.067031
bats_left                            1.975850
throws_left                          3.885031
batter_order                         2.104509
field_out                            6.357465
strikeout                         

In [13]:
# 'pbp_idx' was still reported with infinite VIF in the previous run; remove
# 'cum_bats_right' (by reassignment, matching the later cells) and recompute the VIFs.
X_train2 = X_train2.drop(columns=['cum_bats_right'])

# Build the array once; it is the same for every column being scored.
design = X_train2.values
vif = pd.Series([variance_inflation_factor(design, position)
                 for position in range(design.shape[1])],
                index=X_train2.columns)

vif

pbp_idx                             31.981824
inning                              20.937132
postouts                            16.126162
pitches_in_pa                        1.331217
pitch_total                          2.798351
er_total                            12.996944
runners_on_base                     15.674353
score_diff                           1.713426
tying_run_on                         1.155812
total_bases_allowed                 25.890330
opposite_hand                        1.291172
end_of_inning                        3.217982
previously_walk                      1.908723
consec_walks                         1.100516
double_header                        1.034232
home_team                            1.045025
opposite_actual                      2.067031
bats_left                            1.975850
throws_left                          3.885031
batter_order                         2.104509
field_out                            6.357465
strikeout                         

In [14]:
# Remove 'total_bases_allowed' (VIF of about 25.9 in the previous run), then rescore.
X_train2 = X_train2.drop(columns=['total_bases_allowed'])

feature_matrix = X_train2.values
vif_values = [variance_inflation_factor(feature_matrix, k)
              for k in range(feature_matrix.shape[1])]
vif = pd.Series(vif_values, index=X_train2.columns)

vif

pbp_idx                             26.671192
inning                              19.332278
postouts                            16.049718
pitches_in_pa                        1.331122
pitch_total                          2.798006
er_total                            12.033663
runners_on_base                     15.601491
score_diff                           1.710274
tying_run_on                         1.155407
opposite_hand                        1.291172
end_of_inning                        3.214873
previously_walk                      1.908485
consec_walks                         1.100477
double_header                        1.033798
home_team                            1.044339
opposite_actual                      2.066978
bats_left                            1.975814
throws_left                          3.881081
batter_order                         2.104504
field_out                            6.302214
strikeout                            4.953652
single                            

In [15]:
# Remove 'inning' (VIF of about 19.3 in the previous run), then rescore.
X_train2 = X_train2.drop(columns=['inning'])

feature_matrix = X_train2.values
vif_values = [variance_inflation_factor(feature_matrix, k)
              for k in range(feature_matrix.shape[1])]
vif = pd.Series(vif_values, index=X_train2.columns)

vif

pbp_idx                              8.631360
postouts                            14.254601
pitches_in_pa                        1.329694
pitch_total                          2.796582
er_total                            11.963453
runners_on_base                     14.983366
score_diff                           1.708128
tying_run_on                         1.154863
opposite_hand                        1.290859
end_of_inning                        3.212848
previously_walk                      1.906733
consec_walks                         1.099628
double_header                        1.033759
home_team                            1.036562
opposite_actual                      2.066546
bats_left                            1.975465
throws_left                          3.879972
batter_order                         2.104234
field_out                            6.266783
strikeout                            4.921763
single                               4.019676
walk                              

In [16]:
# Remove 'runners_on_base' (VIF of about 15.0 in the previous run), then rescore.
X_train2 = X_train2.drop(columns=['runners_on_base'])

feature_matrix = X_train2.values
vif_values = [variance_inflation_factor(feature_matrix, k)
              for k in range(feature_matrix.shape[1])]
vif = pd.Series(vif_values, index=X_train2.columns)

vif

pbp_idx                              8.476701
postouts                             9.823716
pitches_in_pa                        1.329644
pitch_total                          2.796559
er_total                            11.955095
score_diff                           1.707633
tying_run_on                         1.146964
opposite_hand                        1.289425
end_of_inning                        3.210703
previously_walk                      1.867428
consec_walks                         1.099221
double_header                        1.033707
home_team                            1.036516
opposite_actual                      2.061432
bats_left                            1.974512
throws_left                          3.879365
batter_order                         2.103268
field_out                            6.141757
strikeout                            4.834446
single                               3.937646
walk                                 3.564640
double                            

In [17]:
# Remove 'er_total' (VIF of about 12.0 in the previous run), then rescore.
X_train2 = X_train2.drop(columns=['er_total'])

feature_matrix = X_train2.values
vif_values = [variance_inflation_factor(feature_matrix, k)
              for k in range(feature_matrix.shape[1])]
vif = pd.Series(vif_values, index=X_train2.columns)

vif

pbp_idx                             8.325887
postouts                            9.823509
pitches_in_pa                       1.328017
pitch_total                         2.793942
score_diff                          1.706734
tying_run_on                        1.144677
opposite_hand                       1.288939
end_of_inning                       3.210592
previously_walk                     1.866510
consec_walks                        1.099127
double_header                       1.033666
home_team                           1.036511
opposite_actual                     2.061330
bats_left                           1.974505
throws_left                         3.879025
batter_order                        2.103219
field_out                           6.141567
strikeout                           4.833243
single                              3.936528
walk                                3.563556
double                              2.459258
home_run                            3.070387
cum_post_r

In [18]:
# Remove 'postouts' (VIF of about 9.8 in the previous run), then rescore.
X_train2 = X_train2.drop(columns=['postouts'])

feature_matrix = X_train2.values
vif_values = [variance_inflation_factor(feature_matrix, k)
              for k in range(feature_matrix.shape[1])]
vif = pd.Series(vif_values, index=X_train2.columns)

vif

pbp_idx                             8.175365
pitches_in_pa                       1.327844
pitch_total                         2.793885
score_diff                          1.706734
tying_run_on                        1.141700
opposite_hand                       1.285892
end_of_inning                       2.491893
previously_walk                     1.861810
consec_walks                        1.098312
double_header                       1.033658
home_team                           1.036333
opposite_actual                     2.054695
bats_left                           1.973265
throws_left                         3.878628
batter_order                        2.103208
field_out                           5.501348
strikeout                           4.353745
single                              3.239987
walk                                3.096091
double                              2.210150
home_run                            2.913732
cum_post_runner_on_first            3.755696
inning_cum

In [19]:
# Next elimination step: remove 'inning_cum_single', then rescore.
X_train2 = X_train2.drop(columns=['inning_cum_single'])

feature_matrix = X_train2.values
vif_values = [variance_inflation_factor(feature_matrix, k)
              for k in range(feature_matrix.shape[1])]
vif = pd.Series(vif_values, index=X_train2.columns)

vif

pbp_idx                             8.159231
pitches_in_pa                       1.327248
pitch_total                         2.793882
score_diff                          1.705492
tying_run_on                        1.137724
opposite_hand                       1.285063
end_of_inning                       2.476882
previously_walk                     1.826949
consec_walks                        1.097756
double_header                       1.033461
home_team                           1.036309
opposite_actual                     2.048826
bats_left                           1.973260
throws_left                         3.878616
batter_order                        2.101172
field_out                           5.498488
strikeout                           4.353602
single                              2.869202
walk                                2.927275
double                              2.177034
home_run                            2.911605
cum_post_runner_on_first            3.659145
inning_cum

In [20]:
# Final elimination step: remove 'has_have_home_run', then rescore.
X_train2 = X_train2.drop(columns=['has_have_home_run'])

feature_matrix = X_train2.values
vif_values = [variance_inflation_factor(feature_matrix, k)
              for k in range(feature_matrix.shape[1])]
vif = pd.Series(vif_values, index=X_train2.columns)

vif

pbp_idx                             8.083122
pitches_in_pa                       1.327095
pitch_total                         2.792169
score_diff                          1.704547
tying_run_on                        1.137535
opposite_hand                       1.284897
end_of_inning                       2.476856
previously_walk                     1.826933
consec_walks                        1.097732
double_header                       1.033079
home_team                           1.036308
opposite_actual                     2.048716
bats_left                           1.973253
throws_left                         3.877883
batter_order                        2.101094
field_out                           5.498336
strikeout                           4.353143
single                              2.868717
walk                                2.927198
double                              2.176061
home_run                            2.894318
cum_post_runner_on_first            3.658978
inning_cum

In [21]:
# Largest remaining VIF after the last elimination step.
# NOTE(review): the 'constant' column is still part of `vif` at this point -- confirm it
# should count towards the maximum.
vif.max()

8.083122385426854

In [22]:
# The intercept column was only needed for the VIF computations; remove it from the
# final training features (by reassignment, matching the preceding cells).
X_train2 = X_train2.drop(columns='constant')

In [23]:
# Restrict the test features to the surviving training columns, in the same order.
selected_columns = list(X_train2.columns)
X_test = X_test.loc[:, selected_columns]

In [24]:
# Persist the reduced feature sets and their labels for the modelling notebooks.
outputs = (
    (X_train2, 'Datasets/X_train.plk'),
    (X_test, 'Datasets/X_test.plk'),
    (y_train2, 'Datasets/Y_train.plk'),
    (y_test, 'Datasets/Y_test.plk'),
)
for dataset, destination in outputs:
    dataset.to_pickle(destination)