On this dataset, new variables are created, developed from the original dataset.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('MLBPitcherRemovalData.csv')
data.head()

Unnamed: 0,start_id,pbp_idx,game_id,top_bot,inning,batter_stands,pitcher_id,throws,EventType,postouts,...,score_diff,tying_run_on,total_outs_recorded,total_bases_allowed,next_batter_hand,opposite_hand,end_of_inning,previously_walk,consec_walks,last_batter
0,2019/04/01/anamlb-seamlb-1_433587,1,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,field_out,1,...,0,0,0,0,R,0.0,0,0,0,0
1,2019/04/01/anamlb-seamlb-1_433587,2,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,hit_by_pitch,1,...,0,0,1,0,L,1.0,0,0,0,0
2,2019/04/01/anamlb-seamlb-1_433587,3,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,force_out,2,...,0,0,1,1,R,0.0,0,0,0,0
3,2019/04/01/anamlb-seamlb-1_433587,4,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,field_out,3,...,0,0,2,1,R,0.0,1,0,0,0
4,2019/04/01/anamlb-seamlb-1_433587,5,2019/04/01/anamlb-seamlb-1,Y,2,R,433587,R,single,0,...,4,0,3,1,L,1.0,0,0,0,0


In [3]:
data.dtypes

start_id                  object
pbp_idx                    int64
game_id                   object
top_bot                   object
inning                     int64
batter_stands             object
pitcher_id                 int64
throws                    object
EventType                 object
postouts                   int64
post_runner_on_first       int64
post_runner_on_second      int64
post_runner_on_third       int64
PostVisTeamScore           int64
PostHomeTeamScore          int64
home_away                 object
pitches_in_pa              int64
pitch_total                int64
er_total                   int64
runners_on_base            int64
score_diff                 int64
tying_run_on               int64
total_outs_recorded        int64
total_bases_allowed        int64
next_batter_hand          object
opposite_hand            float64
end_of_inning              int64
previously_walk            int64
consec_walks               int64
last_batter                int64
dtype: obj

In [4]:
data['double_header']=data.game_id.apply(lambda x:x[-1])
data['double_header']=data['double_header'].astype('int32')
data['double_header'].value_counts()

1    48660
2      861
Name: double_header, dtype: int64

In [5]:
data['home_team']=LabelEncoder().fit_transform(data[['top_bot']])
data['home_team'].value_counts()

  y = column_or_1d(y, warn=True)


0    24890
1    24631
Name: home_team, dtype: int64

In [6]:
data['opposite_actual']=data['throws']!=data['batter_stands']
data['opposite_actual']

0         True
1        False
2         True
3        False
4        False
         ...  
49516     True
49517     True
49518     True
49519    False
49520     True
Name: opposite_actual, Length: 49521, dtype: bool

In [7]:
data['bats_right']=LabelEncoder().fit_transform(data[['batter_stands']])
data['bats_left']=np.abs(data['bats_right']-1)
data['bats_right'].value_counts()

1    29294
0    20227
Name: bats_right, dtype: int64

In [8]:
data['throws_right']=LabelEncoder().fit_transform(data[['throws']])
data['throws_left']=np.abs(data['throws_right']-1)
data['throws_right'].value_counts()

1    33113
0    16408
Name: throws_right, dtype: int64

In [9]:
data['throws'].value_counts()

R    33113
L    16408
Name: throws, dtype: int64

In [10]:
data['batter_order']=data['pbp_idx']%9
data['batter_order'].replace(0, 9, inplace=True)

In [11]:
events=['field_out', 'strikeout', 'single', 'walk', 'double', 'home_run']
for event in events:
    data[event]=(data['EventType']==event)+0

In [12]:
vars_to_cum=['post_runner_on_first', 'post_runner_on_second', 'post_runner_on_third', 'bats_right', 'bats_left', 'opposite_actual', 'field_out', 'strikeout', 'single', 'walk', 'double', 'home_run']
for var in vars_to_cum:
    data['cum_'+var]=data[['start_id', var]].groupby('start_id').cumsum()
    data['inning_cum_'+var]=data[['start_id', 'inning', var]].groupby(['start_id', 'inning']).cumsum()
data.head()

Unnamed: 0,start_id,pbp_idx,game_id,top_bot,inning,batter_stands,pitcher_id,throws,EventType,postouts,...,cum_strikeout,inning_cum_strikeout,cum_single,inning_cum_single,cum_walk,inning_cum_walk,cum_double,inning_cum_double,cum_home_run,inning_cum_home_run
0,2019/04/01/anamlb-seamlb-1_433587,1,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,field_out,1,...,0,0,0,0,0,0,0,0,0,0
1,2019/04/01/anamlb-seamlb-1_433587,2,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,hit_by_pitch,1,...,0,0,0,0,0,0,0,0,0,0
2,2019/04/01/anamlb-seamlb-1_433587,3,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,force_out,2,...,0,0,0,0,0,0,0,0,0,0
3,2019/04/01/anamlb-seamlb-1_433587,4,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,field_out,3,...,0,0,0,0,0,0,0,0,0,0
4,2019/04/01/anamlb-seamlb-1_433587,5,2019/04/01/anamlb-seamlb-1,Y,2,R,433587,R,single,0,...,0,0,1,1,0,0,0,0,0,0


In [13]:
data['cum_points_allowed']=0
data.loc[data['home_away']=='Home', 'cum_points_allowed']=data.loc[data['home_away']=='Home', 'PostVisTeamScore']
data.loc[data['home_away']=='Away', 'cum_points_allowed']=data.loc[data['home_away']=='Away', 'PostHomeTeamScore']
data.head()

Unnamed: 0,start_id,pbp_idx,game_id,top_bot,inning,batter_stands,pitcher_id,throws,EventType,postouts,...,inning_cum_strikeout,cum_single,inning_cum_single,cum_walk,inning_cum_walk,cum_double,inning_cum_double,cum_home_run,inning_cum_home_run,cum_points_allowed
0,2019/04/01/anamlb-seamlb-1_433587,1,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,field_out,1,...,0,0,0,0,0,0,0,0,0,0
1,2019/04/01/anamlb-seamlb-1_433587,2,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,hit_by_pitch,1,...,0,0,0,0,0,0,0,0,0,0
2,2019/04/01/anamlb-seamlb-1_433587,3,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,force_out,2,...,0,0,0,0,0,0,0,0,0,0
3,2019/04/01/anamlb-seamlb-1_433587,4,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,field_out,3,...,0,0,0,0,0,0,0,0,0,0
4,2019/04/01/anamlb-seamlb-1_433587,5,2019/04/01/anamlb-seamlb-1,Y,2,R,433587,R,single,0,...,0,1,1,0,0,0,0,0,0,0


In [14]:
data['points_allowed']=data['cum_points_allowed'].diff()
data['points_allowed'][0]=0
data['points_allowed'][data['points_allowed']<0]=0 #Different pitcher.

data['inning_cum_points_allowed']=data[['start_id', 'inning', 'points_allowed']].groupby(['start_id', 'inning']).cumsum()

data[['inning', 'cum_points_allowed','inning_cum_points_allowed']].head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['points_allowed'][0]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['points_allowed'][data['points_allowed']<0]=0 #Different pitcher.


Unnamed: 0,inning,cum_points_allowed,inning_cum_points_allowed
0,1,0,0.0
1,1,0,0.0
2,1,0,0.0
3,1,0,0.0
4,2,0,0.0
5,2,0,0.0
6,2,0,0.0
7,2,1,1.0
8,2,1,1.0
9,3,1,0.0


In [15]:
data['inning_pa']=1
data['inning_pa']=data[['start_id', 'inning', 'inning_pa']].groupby(['start_id', 'inning']).cumsum()

Add variables corresponding to opponent pitcher performance? Next batter performance? _hot_cold_ variables will indicate whether the next batter is hot or cold, and the 9 batters have the same order, the information from 10 batters ago will be pasted.

In [16]:
vars_hotcold=['field_out', 'strikeout', 'single', 'walk', 'double', 'home_run', 'points_allowed']

for var in vars_hotcold:
    data['hot_cold_'+var]=0
    for i in range(len(data)-10):
        if data['start_id'][i+10]==data['start_id'][i]:
            data['hot_cold_'+var][i+10]=data[var][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['hot_cold_'+var][i+10]=data[var][i]


No. of pitches between variables? Hypothesis: player will be pulled out when he is playing worse. If the frequency of "bad" events (Home-runs, allowd points, etc) is more common, and the opposite with "good" events (strikeout, field_out), then the playr is likely to be pulled.

In [17]:
vars_pas_since=['post_runner_on_first', 'post_runner_on_second', 'post_runner_on_third', 'field_out', 'strikeout', 'single', 'walk', 'double', 'home_run', 'points_allowed']

for var in vars_pas_since:
    data['pas_since_'+var]=data["pbp_idx"]-data["pbp_idx"].where(data[var]>=1).groupby(data["start_id"]).ffill()
    data['has_have_'+var]=~data['pas_since_'+var].isna()
data.head(50)

Unnamed: 0,start_id,pbp_idx,game_id,top_bot,inning,batter_stands,pitcher_id,throws,EventType,postouts,...,pas_since_single,has_have_single,pas_since_walk,has_have_walk,pas_since_double,has_have_double,pas_since_home_run,has_have_home_run,pas_since_points_allowed,has_have_points_allowed
0,2019/04/01/anamlb-seamlb-1_433587,1,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,field_out,1,...,,False,,False,,False,,False,,False
1,2019/04/01/anamlb-seamlb-1_433587,2,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,hit_by_pitch,1,...,,False,,False,,False,,False,,False
2,2019/04/01/anamlb-seamlb-1_433587,3,2019/04/01/anamlb-seamlb-1,Y,1,L,433587,R,force_out,2,...,,False,,False,,False,,False,,False
3,2019/04/01/anamlb-seamlb-1_433587,4,2019/04/01/anamlb-seamlb-1,Y,1,R,433587,R,field_out,3,...,,False,,False,,False,,False,,False
4,2019/04/01/anamlb-seamlb-1_433587,5,2019/04/01/anamlb-seamlb-1,Y,2,R,433587,R,single,0,...,0.0,True,,False,,False,,False,,False
5,2019/04/01/anamlb-seamlb-1_433587,6,2019/04/01/anamlb-seamlb-1,Y,2,L,433587,R,field_out,1,...,1.0,True,,False,,False,,False,,False
6,2019/04/01/anamlb-seamlb-1_433587,7,2019/04/01/anamlb-seamlb-1,Y,2,R,433587,R,field_out,2,...,2.0,True,,False,,False,,False,,False
7,2019/04/01/anamlb-seamlb-1_433587,8,2019/04/01/anamlb-seamlb-1,Y,2,L,433587,R,single,2,...,0.0,True,,False,,False,,False,0.0,True
8,2019/04/01/anamlb-seamlb-1_433587,9,2019/04/01/anamlb-seamlb-1,Y,2,R,433587,R,strikeout,3,...,1.0,True,,False,,False,,False,1.0,True
9,2019/04/01/anamlb-seamlb-1_433587,10,2019/04/01/anamlb-seamlb-1,Y,3,L,433587,R,strikeout,1,...,2.0,True,,False,,False,,False,2.0,True


Beware! What will happen when using the first of the batter? An interaction between pas_since and has_have?

## Missing Values Threatment

Finally, the missing values will be added, either with the correct value or a stand by value.

In [18]:
data.at[8388, 'next_batter_hand']='R'
data.at[8388, 'opposite_hand']=0

data.at[17989, 'next_batter_hand']='R'
data.at[17989, 'opposite_hand']=0

data.at[22880, 'next_batter_hand']='R'
data.at[22880, 'opposite_hand']=1

data.at[25444, 'next_batter_hand']='R'
data.at[25444, 'opposite_hand']=0

data.at[25776, 'next_batter_hand']='R'
data.at[25776, 'opposite_hand']=1

data.at[27663, 'next_batter_hand']='R'
data.at[27663, 'opposite_hand']=0

data.at[28309, 'next_batter_hand']='R'
data.at[28309, 'opposite_hand']=0

data.at[32986, 'next_batter_hand']='L'
data.at[32986, 'opposite_hand']=1

data.at[43471, 'next_batter_hand']='R'
data.at[43471, 'opposite_hand']=1

data.at[46615, 'next_batter_hand']='L'
data.at[46615, 'opposite_hand']=1

data.at[47965, 'next_batter_hand']='R'
data.at[47965, 'opposite_hand']=1

In [19]:
data.fillna(0, inplace=True)

In [20]:
data.to_pickle('New_variables.plk')