In [1]:
# dependencies
import pandas as pd
# Disable pandas' chained-assignment check, so no SettingWithCopyWarning is
# emitted when later cells assign into a frame that was derived from another
# frame (e.g. the column subset taken from `pitcher`).
pd.options.mode.chained_assignment = None

import numpy as np

In [2]:
# read the pitcher's csv into a dataframe
raw_path = 'data/zac_gallen.csv'
pitcher = pd.read_csv(raw_path)

# list every column name found in the file
print(pitcher.columns)

# show the first rows as a quick look at the contents
pitcher.head()

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,CH,2020-09-25,86.6,-3.25,5.58,Zac Gallen,641857,668678,grounded_into_double_play,hit_into_play,...,4,0,0,4,0,4,0,4,Standard,Standard
1,CH,2020-09-25,86.2,-3.32,5.56,Zac Gallen,641857,668678,,blocked_ball,...,4,0,0,4,0,4,0,4,Standard,Standard
2,FF,2020-09-25,95.1,-2.94,5.93,Zac Gallen,641857,668678,,swinging_strike,...,4,0,0,4,0,4,0,4,Standard,Standard
3,FF,2020-09-25,93.6,-2.92,5.98,Zac Gallen,658069,668678,single,hit_into_play_no_out,...,4,0,0,4,0,4,0,4,Standard,Standard
4,FF,2020-09-25,93.2,-2.99,5.97,Zac Gallen,658069,668678,,called_strike,...,4,0,0,4,0,4,0,4,Standard,Standard


In [3]:
# columns kept for the cleaning steps in the following cells
cols_needed = ['p_throws', 'pitch_type', 'release_pos_x', 'release_pos_z', 'release_speed', 
               'effective_speed', 'release_spin_rate', 'release_extension',
               'plate_x', 'plate_z', 'balls', 'strikes', 'woba_denom', 'events', 'description']

# subset columns. Take an explicit copy: the following cells edit this frame
# in place (.loc / column assignment), and without .copy() pandas still treats
# it as a slice of `pitcher`, which is exactly the situation that
# SettingWithCopyWarning exists to flag.
pitcher_data = pitcher[cols_needed].copy()

# preview data
pitcher_data.head()

Unnamed: 0,p_throws,pitch_type,release_pos_x,release_pos_z,release_speed,effective_speed,release_spin_rate,release_extension,plate_x,plate_z,balls,strikes,woba_denom,events,description
0,R,CH,-3.25,5.58,86.6,86.8,1523.0,6.7,-0.46,1.74,1,1,1.0,grounded_into_double_play,hit_into_play
1,R,CH,-3.32,5.56,86.2,86.5,1505.0,6.8,-0.85,0.58,0,1,,,blocked_ball
2,R,FF,-2.94,5.93,95.1,95.0,2406.0,6.8,-0.18,3.34,0,0,,,swinging_strike
3,R,FF,-2.92,5.98,93.6,94.1,2307.0,6.9,-0.42,2.18,1,1,1.0,single,hit_into_play_no_out
4,R,FF,-2.99,5.97,93.2,93.0,2423.0,6.7,0.24,2.09,1,0,,,called_strike


In [4]:
# collapse the detailed pitch descriptions into base types

# counts before recoding
print('Original Counts')
print(pitcher_data['description'].value_counts())
print()

# raw description values that count as a strike
strikes = ['called_strike', 'foul', 'swinging_strike', 'foul_tip', 'foul_bunt', 'swinging_strike_blocked']

# raw description values that count as a ball
balls = ['ball', 'blocked_ball']

# raw description values for a ball put in play
in_play = ['hit_into_play', 'hit_into_play_no_out', 'hit_into_play_score']

# recode one group at a time; descriptions that appear in none of the lists
# (hit_by_pitch in the counts below) are left unchanged
base_types = {'strike': strikes, 'ball': balls, 'in_play': in_play}
for base_label, raw_values in base_types.items():
    is_member = pitcher_data['description'].isin(raw_values)
    pitcher_data.loc[is_member, 'description'] = base_label

# counts after recoding
print('Updated Counts')
print(pitcher_data['description'].value_counts())

Original Counts
ball                       638
called_strike              364
foul                       295
swinging_strike            215
hit_into_play              193
hit_into_play_no_out        69
blocked_ball                39
hit_into_play_score         29
swinging_strike_blocked     24
foul_tip                    16
foul_bunt                    4
hit_by_pitch                 4
Name: description, dtype: int64

Updated Counts
strike          918
ball            677
in_play         291
hit_by_pitch      4
Name: description, dtype: int64


In [5]:
# counts before recoding
print('Original Counts')
print(pitcher_data['events'].value_counts())
print()

# event values that are all folded into a plain 'out'
outs = ['field_out', 'force_out', 'grounded_into_double_play', 'double_play', 'field_error', 'sac_fly', 'sac_bunt', 'fielders_choice', 'fielders_choice_out']

# caught-stealing variants, merged into a single 'cs' label
cs = ['caught_stealing_2b', 'caught_stealing_3b', 'caught_stealing_home']

# recode both groups
for event_label, raw_events in (('out', outs), ('cs', cs)):
    pitcher_data.loc[pitcher_data['events'].isin(raw_events), 'events'] = event_label

# rows with woba_denom == 1 whose description is 'strike' or 'ball' take that
# description as their event (the strikeout / walk case)
# NOTE(review): the condition tests woba_denom, not the event name, so every
# such row is overwritten -- in the recorded output the single interf_def row
# is gone and 'strike' is 136 against 135 strikeouts. Confirm this is intended.
counted_pa = pitcher_data['woba_denom'] == 1
for pitch_result in ('strike', 'ball'):
    overwrite = counted_pa & (pitcher_data['description'] == pitch_result)
    pitcher_data['events'] = np.where(overwrite, pitch_result, pitcher_data['events'])

# counts after recoding
print('Updated Counts')
print(pitcher_data['events'].value_counts())
print()

Original Counts
field_out                    169
strikeout                    135
single                        61
walk                          43
double                        16
home_run                      14
grounded_into_double_play     12
force_out                      9
hit_by_pitch                   4
field_error                    4
double_play                    2
interf_def                     1
triple                         1
caught_stealing_2b             1
sac_fly                        1
caught_stealing_home           1
sac_bunt                       1
fielders_choice_out            1
Name: events, dtype: int64

Updated Counts
out             199
strike          136
single           61
ball             43
double           16
home_run         14
hit_by_pitch      4
cs                2
triple            1
Name: events, dtype: int64



In [6]:
# rows without a recorded event take their description value as the event
pitcher_data['events'] = pitcher_data['events'].fillna(pitcher_data['description'])

# counts after filling
print('Updated Counts')
print(pitcher_data['events'].value_counts())
print()

# preview the result
pitcher_data.head()

Updated Counts
strike          917
ball            676
out             199
single           61
double           16
home_run         14
hit_by_pitch      4
cs                2
triple            1
Name: events, dtype: int64



Unnamed: 0,p_throws,pitch_type,release_pos_x,release_pos_z,release_speed,effective_speed,release_spin_rate,release_extension,plate_x,plate_z,balls,strikes,woba_denom,events,description
0,R,CH,-3.25,5.58,86.6,86.8,1523.0,6.7,-0.46,1.74,1,1,1.0,out,in_play
1,R,CH,-3.32,5.56,86.2,86.5,1505.0,6.8,-0.85,0.58,0,1,,ball,ball
2,R,FF,-2.94,5.93,95.1,95.0,2406.0,6.8,-0.18,3.34,0,0,,strike,strike
3,R,FF,-2.92,5.98,93.6,94.1,2307.0,6.9,-0.42,2.18,1,1,1.0,single,in_play
4,R,FF,-2.99,5.97,93.2,93.0,2423.0,6.7,0.24,2.09,1,0,,strike,strike


In [7]:
# drop the helper columns that were only needed to derive `events`.
# The frame is rebound instead of dropped with inplace=True, and
# errors='ignore' is passed, so re-running this cell no longer raises KeyError
# for columns that an earlier run already removed. The redundant axis=1 is
# omitted because `columns=` already names the axis.
pitcher_data = pitcher_data.drop(columns=['woba_denom', 'description'], errors='ignore')

# remove the caught-stealing rows
pitcher_dropcs = pitcher_data[pitcher_data['events'] != 'cs']

# export data
pitcher_dropcs.to_csv('data/pitcher_clean.csv', index = False)