In [1]:
import pandas as pd
import numpy as np
import pickle
import chardet

In [2]:
# Show every column when a frame is displayed, but cap the displayed rows at 50.
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", 50,
)

In [3]:
# Locations of the two raw inputs. They are defined once here and reused by the
# readers below so the paths cannot drift apart.
pbp_filepath = "pitches_folder/pitch_by_pitch_metadata.csv"
pitches_filepath = "pitches_folder/pitches"

# Guess the metadata file's text encoding from its first 100,000 bytes.
with open(pbp_filepath, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))

pitch_by_pitch_metadata = pd.read_csv(pbp_filepath, encoding=result['encoding'])
pitches = pd.read_csv(pitches_filepath)

  pitches = pd.read_csv("pitches_folder/pitches")


In [4]:
# Pitch-type codes that are retained; rows carrying any other code (or none) are dropped.
pitch_types_to_keep = ['FF', 'FT', 'SI', 'FC', 'FS', 'SL', 'CU', 'CH', 'KC', 'KN', 'EP', 'FO', 'SC']    # 13 seems to be common
pitches_modified = (
    pitches
    .loc[pitches["pitch_type"].isin(pitch_types_to_keep)]
    .copy(deep=True)            # keep the raw `pitches` frame untouched by later edits
    .reset_index(drop=True)
)

In [5]:
# Report the (rows, columns) shape of the filtered frame.
print(f"Size = {pitches_modified.shape}")

Size = (711841, 125)


In [6]:
# List every column name available in the filtered frame.
print(list(pitches_modified.columns))

['uid', 'game_pk', 'year', 'date', 'team_id_b', 'team_id_p', 'inning', 'top', 'at_bat_num', 'pcount_at_bat', 'pcount_pitcher', 'balls', 'strikes', 'fouls', 'outs', 'is_final_pitch', 'final_balls', 'final_strikes', 'final_outs', 'start_tfs', 'start_tfs_zulu', 'batter_id', 'stand', 'b_height', 'pitcher_id', 'p_throws', 'at_bat_des', 'event', 'event2', 'event3', 'event4', 'away_team_runs', 'home_team_runs', 'score', 'pitch_des', 'pitch_id', 'type', 'pitch_tfs', 'pitch_tfs_zulu', 'x', 'y', 'sv_id', 'start_speed', 'end_speed', 'sz_top', 'sz_bot', 'pfx_x', 'pfx_z', 'px', 'pz', 'x0', 'z0', 'y0', 'vx0', 'vz0', 'vy0', 'ax', 'az', 'ay', 'break_length', 'break_y', 'break_angle', 'pitch_type', 'type_confidence', 'zone', 'nasty', 'spin_dir', 'spin_rate', 'cc', 'on_1b', 'on_2b', 'on_3b', 'runner1_id', 'runner1_start', 'runner1_end', 'runner1_event', 'runner1_score', 'runner1_rbi', 'runner1_earned', 'runner2_id', 'runner2_start', 'runner2_end', 'runner2_event', 'runner2_score', 'runner2_rbi', 'runner

In [7]:
# leaving out time for now (start_tfs_zulu)

# Columns the model is meant to predict.
targets = ["pitch_type", "type_confidence"]

# Game-situation columns. 'score_diff' and 'bases_state' do not exist in the raw
# data; they are derived in later cells.
games_features = [
    "inning",
    "top",
    "score_diff",
    "at_bat_num",
    "p_throws",
    "pcount_pitcher",
    "bases_state",
    "pitcher_id",
]

# Ball/strike count columns.
count_features = ["pcount_at_bat", "balls", "strikes", "outs"]

# Batter columns. 'height_inches' is derived from 'b_height' in a later cell.
batter_features = ["stand", "height_inches"]

In [8]:
# Encode base occupancy as a three-character string, one character per base in
# the order 1st, 2nd, 3rd: '1' where the on_Xb value is present, '0' where it is NaN.
# NOTE(review): presumably the raw on_1b/on_2b/on_3b columns hold a runner id and are
# NaN for an empty base - confirm against the data dictionary.
#
# The columns are assigned back through the frame. The previous form,
# `pitches_modified.on_1b.where(..., inplace=True)`, mutated an attribute-accessed
# Series in place, which is not guaranteed to write through to the frame.
#
# This cell overwrites on_1b/on_2b/on_3b with strings, so run it only once on a
# freshly built `pitches_modified`: a second run would see no NaN and mark every base occupied.
base_cols = ['on_1b', 'on_2b', 'on_3b']
pitches_modified[base_cols] = pitches_modified[base_cols].notna().astype(int).astype(str)

pitches_modified['bases_state'] = pitches_modified.on_1b + pitches_modified.on_2b + pitches_modified.on_3b
# Categories (8, object): ['000', '001', '010', '011', '100', '101', '110', '111']
pitches_modified['bases_state'] = pitches_modified['bases_state'].astype('category')

In [9]:

# 'is_home' is a straight copy of 'top'.
# NOTE(review): presumably top == 1 means the home team is the one pitching - confirm.
pitches_modified['is_home'] = pitches_modified['top']

# Run margin with its sign flipped by is_home: (home - away) when is_home is 1 and
# (away - home) when it is 0. (2*is_home - 1) is +1 / -1, which equals the original
# is_home*d + (is_home - 1)*d.
run_margin = pitches_modified.home_team_runs - pitches_modified.away_team_runs
pitches_modified['score_diff'] = (2 * pitches_modified.is_home - 1) * run_margin

# 'b_height' is a "feet-inches" string such as "5-10"; convert it to total inches.
# The vectorised split leaves a missing height as NaN instead of raising as the
# row-wise lambda did; a non-numeric piece still raises in pd.to_numeric.
height_parts = pitches_modified['b_height'].str.split("-", expand=True).reindex(columns=[0, 1])
pitches_modified['height_inches'] = pd.to_numeric(height_parts[0]) * 12 + pd.to_numeric(height_parts[1])

In [10]:
# Per-pitch columns whose value from the preceding row is carried forward as a feature.
cols_for_previous = [
    'pitch_type', 'type', 'end_speed', 'break_length', 'break_angle',
    'break_y', 'zone', 'spin_dir', 'spin_rate',
]
cols_for_previous_names = [f"prev_{col}" for col in cols_for_previous]

# Shift by one row inside each (pitcher, game, at-bat) group, so the first row of
# every group has no previous value and gets NaN.
pitches_modified[cols_for_previous_names] = (
    pitches_modified
    .groupby(["pitcher_id", "game_pk", "at_bat_num"])[cols_for_previous]
    .shift(periods=1)
)

In [11]:
# Row identifier, targets, then each feature group, in output column order.
cols_to_keep = [
    'uid',
    *targets,
    *games_features,
    *count_features,
    *batter_features,
    *cols_for_previous_names,
]

In [12]:
# Basic modelling dataset: the selected columns, copied so later edits do not
# touch `pitches_modified`, with categorical/boolean dtypes set in one place.
dataset = pitches_modified[cols_to_keep].copy(deep=True)

dataset = dataset.astype({
    "pitch_type": "category",
    "inning": "category",
    "p_throws": "category",
    "stand": "category",
    "top": "boolean",
    "prev_pitch_type": "category",
    # Cast prev_type from itself. The original assigned `dataset.stand` here,
    # which replaced the previous-pitch outcome with the batter's stance.
    "prev_type": "category",
})

In [13]:
#dataset.to_pickle("data/dataset.pkl")

In [14]:
'''
Compile "advanced dataset" with better pitcher/batter profiles

Previous Pitch tendencies:
    Previous 5 Pitch Tendency Continuous
    Previous 10 Pitch Tendency Continuous
    Previous 20 Pitch Tendency Continuous
Previous Strike tendencies:
    Previous 5 Pitch Strike Tendency Continuous
    Previous 10 Pitch Strike Tendency Continuous
    Previous 20 Pitch Strike Tendency Continuous
Pitcher Historical Tendency Continuous
Pitcher Tendency vs. Batter Continuous

Batter Stats:
    Batter Strike Tendency Continuous
    Batter In-Play Tendency Continuous
    Batter Ball Tendency Continuous
'''

# Working frame for the rolling features: rows ordered by pitcher, then game, then
# the pitcher's pitch counter, with a fresh 0..n-1 index.
advanced_dataset = pitches_modified.sort_values(
    by=['pitcher_id', 'game_pk', 'pcount_pitcher'],
    ignore_index=True,
)
# Every distinct pitch-type code, in order of first appearance (not sorted).
all_type_names = pitches_modified["pitch_type"].unique()

In [23]:
def group_func(pitcher_specific_df, K, type_names=None):
    """Return, for every row, the pitch-type mix of the K rows immediately before it.

    Parameters
    ----------
    pitcher_specific_df : pandas.DataFrame
        Rows of one group (the notebook passes one pitcher at a time through
        ``groupby("pitcher_id").apply``). Must contain a ``pitch_type`` column.
        Row order defines what "previous" means.
    K : int
        Window length. The first ``K`` rows do not have ``K`` predecessors and
        are filled with NaN.
    type_names : sequence of str, optional
        Pitch-type labels to report. Defaults to the notebook-level
        ``all_type_names``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (fresh 0..n-1 index) and one column per pitch
        type, in sorted label order. Each non-NaN row holds the fraction of the
        previous ``K`` pitches that were of each type.

    Notes
    -----
    The values were always stored in sorted label order, but the columns used
    to be labelled with the unsorted ``all_type_names``; the labels now match
    the values. Callers that only read ``.values`` see the same numbers.
    """
    if type_names is None:
        type_names = all_type_names
    ordered_types = np.sort(np.asarray(type_names))
    n_rows, n_types = pitcher_specific_df.shape[0], len(ordered_types)

    # One-hot encode each row's pitch type; labels outside `ordered_types`
    # (code -1, e.g. NaN) contribute nothing.
    codes = pd.Categorical(pitcher_specific_df["pitch_type"], categories=ordered_types).codes
    one_hot = np.zeros((n_rows, n_types))
    known = codes >= 0
    one_hot[np.flatnonzero(known), codes[known]] = 1.0

    # counts_before[i] = per-type counts over rows 0 .. i-1.
    counts_before = np.cumsum(one_hot, axis=0) - one_hot

    k_previous_types_dist = np.full((n_rows, n_types), np.nan)
    if n_rows > K:
        # Counts over rows i-K .. i-1 for every i >= K, as a difference of prefix sums.
        window_counts = counts_before[K:] - counts_before[:n_rows - K]
        totals = window_counts.sum(axis=1, keepdims=True)
        # A window without any recognised pitch yields all zeros.
        k_previous_types_dist[K:] = np.divide(
            window_counts, totals, out=np.zeros_like(window_counts), where=totals > 0
        )

    return pd.DataFrame(k_previous_types_dist, columns=ordered_types)


In [26]:
# Pitch-type mix over each pitcher's previous 5 pitches. Column names follow sorted
# pitch-type order, which is the order group_func stores its values in.
prev_5_col_names = [f"prev_5_{name}_%" for name in np.sort(all_type_names).tolist()]
prev_5_new_cols = advanced_dataset.groupby("pitcher_id").apply(group_func, K=5)
# Positional assignment: relies on advanced_dataset already being ordered by pitcher_id.
advanced_dataset.loc[:, prev_5_col_names] = prev_5_new_cols.to_numpy()

In [64]:
# Pitch-type mix over each pitcher's previous 10 pitches. Column names follow sorted
# pitch-type order, which is the order group_func stores its values in.
prev_10_col_names = [f"prev_10_{name}_%" for name in np.sort(all_type_names).tolist()]
prev_10_new_cols = advanced_dataset.groupby("pitcher_id").apply(group_func, K=10)
# Positional assignment: relies on advanced_dataset already being ordered by pitcher_id.
advanced_dataset.loc[:, prev_10_col_names] = prev_10_new_cols.to_numpy()

In [65]:
# Pitch-type mix over each pitcher's previous 20 pitches. Column names follow sorted
# pitch-type order, which is the order group_func stores its values in.
prev_20_col_names = [f"prev_20_{name}_%" for name in np.sort(all_type_names).tolist()]
prev_20_new_cols = advanced_dataset.groupby("pitcher_id").apply(group_func, K=20)
# Positional assignment: relies on advanced_dataset already being ordered by pitcher_id.
advanced_dataset.loc[:, prev_20_col_names] = prev_20_new_cols.to_numpy()

In [154]:
# Distinct values of the 'type' column, in order of first appearance (not sorted).
strike_ball_hit_names = advanced_dataset["type"].unique()
def add_pitch_strike_tendencies(pitcher_specific_df, K, outcome_names=None):
    """Return, for every row, the mix of ``type`` values over the K rows before it.

    Parameters
    ----------
    pitcher_specific_df : pandas.DataFrame
        Rows of one group (the notebook passes one pitcher at a time through
        ``groupby("pitcher_id").apply``). Must contain a ``type`` column. Row
        order defines what "previous" means.
    K : int
        Window length. The first ``K`` rows do not have ``K`` predecessors and
        are filled with NaN.
    outcome_names : sequence of str, optional
        ``type`` labels to report. Defaults to the notebook-level
        ``strike_ball_hit_names``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (fresh 0..n-1 index) and one column per label, in
        sorted label order. Each non-NaN row holds the fraction of the previous
        ``K`` rows carrying each label.

    Notes
    -----
    The values were always stored in sorted label order, but the columns used
    to be labelled with the unsorted ``strike_ball_hit_names``; the labels now
    match the values. Callers that only read ``.values`` see the same numbers.
    """
    if outcome_names is None:
        outcome_names = strike_ball_hit_names
    ordered_outcomes = np.sort(np.asarray(outcome_names))
    n_rows, n_outcomes = pitcher_specific_df.shape[0], len(ordered_outcomes)

    # One-hot encode each row's label; labels outside `ordered_outcomes`
    # (code -1, e.g. NaN) contribute nothing.
    codes = pd.Categorical(pitcher_specific_df["type"], categories=ordered_outcomes).codes
    one_hot = np.zeros((n_rows, n_outcomes))
    known = codes >= 0
    one_hot[np.flatnonzero(known), codes[known]] = 1.0

    # counts_before[i] = per-label counts over rows 0 .. i-1.
    counts_before = np.cumsum(one_hot, axis=0) - one_hot

    k_previous_types_dist = np.full((n_rows, n_outcomes), np.nan)
    if n_rows > K:
        # Counts over rows i-K .. i-1 for every i >= K, as a difference of prefix sums.
        window_counts = counts_before[K:] - counts_before[:n_rows - K]
        totals = window_counts.sum(axis=1, keepdims=True)
        # A window without any recognised label yields all zeros.
        k_previous_types_dist[K:] = np.divide(
            window_counts, totals, out=np.zeros_like(window_counts), where=totals > 0
        )

    return pd.DataFrame(k_previous_types_dist, columns=ordered_outcomes)

In [188]:
# add previous strike tendencies
# One set of columns per window length; names follow sorted 'type' label order.
for k in (5, 10, 20):
    prev_k_col_names = [f"prev_{k}_{name}_%" for name in np.sort(strike_ball_hit_names).tolist()]
    prev_k_new_cols = advanced_dataset.groupby("pitcher_id").apply(add_pitch_strike_tendencies, K=k)
    # Positional assignment: relies on advanced_dataset already being ordered by pitcher_id.
    advanced_dataset.loc[:, prev_k_col_names] = prev_k_new_cols.to_numpy()

In [158]:
# don't start counting till K_start
def add_pitcher_historical_tendencies(pitcher_specific_df, K_start, type_names=None):
    """Return, for every row, the pitch-type mix over ALL rows that precede it.

    Parameters
    ----------
    pitcher_specific_df : pandas.DataFrame
        Rows of one group (the notebook passes one pitcher at a time through
        ``groupby("pitcher_id").apply``). Must contain a ``pitch_type`` column.
        Row order defines what "preceding" means.
    K_start : int
        Number of leading rows left as NaN; row ``i >= K_start`` summarises
        rows ``0 .. i-1``.
    type_names : sequence of str, optional
        Pitch-type labels to report. Defaults to the notebook-level
        ``all_type_names``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (fresh 0..n-1 index) and one column per pitch
        type, in sorted label order.

    Notes
    -----
    A running sum replaces the original re-count of the whole history on every
    row, which was quadratic in the group size. The values were always stored
    in sorted label order, but the columns used to be labelled with the
    unsorted ``all_type_names``; the labels now match the values.
    """
    if type_names is None:
        type_names = all_type_names
    ordered_types = np.sort(np.asarray(type_names))
    n_rows, n_types = pitcher_specific_df.shape[0], len(ordered_types)

    # One-hot encode each row's pitch type; labels outside `ordered_types`
    # (code -1, e.g. NaN) contribute nothing.
    codes = pd.Categorical(pitcher_specific_df["pitch_type"], categories=ordered_types).codes
    one_hot = np.zeros((n_rows, n_types))
    known = codes >= 0
    one_hot[np.flatnonzero(known), codes[known]] = 1.0

    # counts_before[i] = per-type counts over rows 0 .. i-1.
    counts_before = np.cumsum(one_hot, axis=0) - one_hot

    k_previous_types_dist = np.full((n_rows, n_types), np.nan)
    if n_rows > K_start:
        history_counts = counts_before[K_start:]
        totals = history_counts.sum(axis=1, keepdims=True)
        # An empty history (only possible with K_start == 0) yields all zeros.
        k_previous_types_dist[K_start:] = np.divide(
            history_counts, totals, out=np.zeros_like(history_counts), where=totals > 0
        )

    return pd.DataFrame(k_previous_types_dist, columns=ordered_types)

In [160]:
# Each pitcher's running pitch-type mix over all earlier pitches; the first 21 rows
# of every pitcher are left as NaN (K_start=21).
historical_col_names = [f"historical_{name}_%" for name in np.sort(all_type_names).tolist()]
historical_new_cols = advanced_dataset.groupby("pitcher_id").apply(
    add_pitcher_historical_tendencies, K_start=21
)
# Positional assignment: relies on advanced_dataset already being ordered by pitcher_id.
advanced_dataset.loc[:, historical_col_names] = historical_new_cols.to_numpy()

In [165]:
# For every row: the pitch-type mix of the (up to) 20 most recent EARLIER rows with
# the same pitcher_id and batter_id, in the current row order of advanced_dataset.
# Rows with no earlier pitch in that matchup are NaN. Values are stored in sorted
# pitch-type order.
#
# The frame is walked once, keeping each matchup's history in a dict. The previous
# version re-queried every preceding row for each pitch, which is quadratic in the
# number of rows.
# Assumes pitch_type has no missing values (a missing type is skipped here, whereas
# the original let it occupy one of the 20 slots) - TODO confirm.
type_column = {name: col for col, name in enumerate(np.sort(all_type_names).tolist())}
pit_v_bat_types_dist = np.zeros((advanced_dataset.shape[0], len(all_type_names)))
matchup_history = {}    # (pitcher_id, batter_id) -> column indices of pitches seen so far
matchup_rows = zip(
    advanced_dataset['pitcher_id'].tolist(),
    advanced_dataset['batter_id'].tolist(),
    advanced_dataset['pitch_type'].tolist(),
)
# only take last 20 instances
for i, (pitcher_id, batter_id, pitch_type) in enumerate(matchup_rows):
    history = matchup_history.setdefault((pitcher_id, batter_id), [])
    if not history:
        pit_v_bat_types_dist[i] = np.nan
    else:
        recent = history[-20:]
        counts = np.bincount(recent, minlength=len(all_type_names))
        pit_v_bat_types_dist[i] = counts / len(recent)   # rate each type was thrown
    # Record the current pitch only after its own row is filled, so a row never sees itself.
    if pitch_type in type_column:
        history.append(type_column[pitch_type])

In [168]:
# Attach the pitcher-vs-batter distribution; names follow sorted pitch-type order,
# matching the order the array was filled in.
pit_v_bat_col_names = [f"pit_v_bat_{name}_%" for name in np.sort(all_type_names).tolist()]
advanced_dataset.loc[:, pit_v_bat_col_names] = pit_v_bat_types_dist

In [184]:
# Distinct values of the 'type' column, in order of first appearance (not sorted).
strike_ball_hit_names = advanced_dataset["type"].unique()
def add_batter_strike_tendencies(batter_specific_df, K_start, outcome_names=None):
    """Return, for every row, the mix of ``type`` values over ALL rows that precede it.

    Parameters
    ----------
    batter_specific_df : pandas.DataFrame
        Rows of one group (the notebook passes one batter at a time through
        ``groupby("batter_id").apply``). Must contain a ``type`` column. Row
        order defines what "preceding" means.
    K_start : int
        Number of leading rows left as NaN; row ``i >= K_start`` summarises
        rows ``0 .. i-1``.
    outcome_names : sequence of str, optional
        ``type`` labels to report. Defaults to the notebook-level
        ``strike_ball_hit_names``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (fresh 0..n-1 index) and one column per label, in
        sorted label order.

    Notes
    -----
    A running sum replaces the original re-count of the whole history on every
    row, which was quadratic in the group size. The values were always stored
    in sorted label order, but the columns used to be labelled with the
    unsorted ``strike_ball_hit_names``; the labels now match the values.
    """
    if outcome_names is None:
        outcome_names = strike_ball_hit_names
    ordered_outcomes = np.sort(np.asarray(outcome_names))
    n_rows, n_outcomes = batter_specific_df.shape[0], len(ordered_outcomes)

    # One-hot encode each row's label; labels outside `ordered_outcomes`
    # (code -1, e.g. NaN) contribute nothing.
    codes = pd.Categorical(batter_specific_df["type"], categories=ordered_outcomes).codes
    one_hot = np.zeros((n_rows, n_outcomes))
    known = codes >= 0
    one_hot[np.flatnonzero(known), codes[known]] = 1.0

    # counts_before[i] = per-label counts over rows 0 .. i-1.
    counts_before = np.cumsum(one_hot, axis=0) - one_hot

    k_previous_types_dist = np.full((n_rows, n_outcomes), np.nan)
    if n_rows > K_start:
        history_counts = counts_before[K_start:]
        totals = history_counts.sum(axis=1, keepdims=True)
        # An empty history (only possible with K_start == 0) yields all zeros.
        k_previous_types_dist[K_start:] = np.divide(
            history_counts, totals, out=np.zeros_like(history_counts), where=totals > 0
        )

    return pd.DataFrame(k_previous_types_dist, columns=ordered_outcomes)

In [189]:
# Order every batter's pitches by game, then at-bat, then pitch within the at-bat.
# 'pcount_at_bat' alone is not enough: the previous sort omitted 'at_bat_num', so
# pitches from a batter's different at-bats in the same game were interleaved
# (every first pitch, then every second pitch, ...) and the running history mixed
# in pitches from later at-bats.
# NOTE(review): assumes game_pk increases with game date - confirm.
advanced_dataset = advanced_dataset.sort_values(['batter_id', 'game_pk', 'at_bat_num', 'pcount_at_bat']).reset_index(drop=True)
historical_batter_col_names = [f'historical_batter_' + f"{name}_%" for name in np.sort(strike_ball_hit_names).tolist()]
historical_batter_new_cols = advanced_dataset.groupby("batter_id").apply(add_batter_strike_tendencies, K_start=21)
# Positional assignment: valid because the frame is ordered by batter_id at this point.
advanced_dataset.loc[:,historical_batter_col_names] = historical_batter_new_cols.values
# Restore the pitcher-major row order used by the other cells.
advanced_dataset = advanced_dataset.sort_values(['pitcher_id', 'game_pk', 'pcount_pitcher']).reset_index(drop=True)

In [196]:
# The commented-out assignments below repeat definitions made in earlier cells.
# all_type_names = advanced_dataset.pitch_type.unique()
# strike_ball_hit_names = advanced_dataset.type.unique()

# targets = ['pitch_type', 'type_confidence']
# games_features = ['inning', 'top', 'score_diff', 'at_bat_num', 'p_throws', 'pcount_pitcher', 'bases_state', 'pitcher_id']
# count_features = ['pcount_at_bat', 'balls', 'strikes', 'outs']
# batter_features = ['stand', 'height_inches']
# cols_for_previous = ['pitch_type', 'type', 'end_speed', 'break_length', 'break_angle', 'break_y', 'zone', 'spin_dir', 'spin_rate']
# cols_for_previous_names = ['prev_' + x for x in cols_for_previous]

# prev_5_col_names = [f'prev_5_' + f"{name}_%" for name in np.sort(all_type_names).tolist()]
# prev_10_col_names = [f'prev_10_' + f"{name}_%" for name in np.sort(all_type_names).tolist()]
# prev_20_col_names = [f'prev_20_' + f"{name}_%" for name in np.sort(all_type_names).tolist()]
# The strike-tendency loop reuses one variable for all window sizes, so the three
# name lists are rebuilt here.
prev_5_col_names_strike = [f'prev_5_' + f"{name}_%" for name in np.sort(strike_ball_hit_names).tolist()]
prev_10_col_names_strike = [f'prev_10_' + f"{name}_%" for name in np.sort(strike_ball_hit_names).tolist()]
prev_20_col_names_strike = [f'prev_20_' + f"{name}_%" for name in np.sort(strike_ball_hit_names).tolist()]
# historical_col_names = [f'historical_' + f"{name}_%" for name in np.sort(all_type_names).tolist()]
# pit_v_bat_col_names = [f'pit_v_bat_' + f"{name}_%" for name in np.sort(all_type_names).tolist()]
# historical_batter_col_names = [f'historical_batter_' + f"{name}_%" for name in np.sort(strike_ball_hit_names).tolist()]

cols_to_keep = ['uid'] + targets + games_features + count_features + batter_features + cols_for_previous_names \
                + prev_5_col_names + prev_10_col_names + prev_20_col_names + prev_5_col_names_strike + prev_10_col_names_strike \
                + prev_20_col_names_strike + historical_col_names + pit_v_bat_col_names + historical_batter_col_names

advanced_dataset_to_keep = advanced_dataset[cols_to_keep].copy(deep=True)

advanced_dataset_to_keep = advanced_dataset_to_keep.astype({
    "pitch_type": "category",
    "inning": "category",
    "p_throws": "category",
    "stand": "category",
    "top": "boolean",
    "prev_pitch_type": "category",
    # Cast prev_type from itself. The original assigned the `stand` column here,
    # which replaced the previous-pitch outcome with the batter's stance.
    "prev_type": "category",
})
advanced_dataset_to_keep = advanced_dataset_to_keep.sort_values("uid") # so same order as dataset

In [197]:
# Persist the curated feature set (selected columns, sorted by uid).
advanced_dataset_to_keep.to_pickle("data/advanced_dataset_final.pkl")

In [169]:
# Snapshot of the full working frame (all columns). It is written to its own file:
# this cell used to target "data/advanced_dataset_final.pkl", the same path as the
# curated dataset saved in the cell above, so a top-to-bottom run replaced the
# curated file with the full frame.
advanced_dataset.to_pickle("data/advanced_dataset_full.pkl")

In [106]:
# Exploration: the last five rows of every pitcher, showing only pitcher_id and pitch_type.
pitches_modified.groupby(["pitcher_id"])[["pitcher_id",'pitch_type']].tail(5)#.value_counts(normalize=True).unstack().iloc[0:10]

Unnamed: 0,pitcher_id,pitch_type
9388,431148,CH
9389,431148,CH
9390,431148,SI
9391,431148,SL
9392,431148,SI
...,...,...
718956,435400,FF
718957,435400,FC
718958,435400,FF
718959,435400,FT


In [93]:
# Exploration: up to the last five rows of every (pitcher, game, at-bat) group; display the first ten rows of the result.
pitches_modified.groupby(["pitcher_id", "game_pk", "at_bat_num"])[["pitcher_id", "game_pk", "at_bat_num", 'pitch_type']].tail(5).iloc[0:10]

Unnamed: 0,pitcher_id,game_pk,at_bat_num,pitch_type
30,450308,286874,7,FF
31,450308,286874,7,SL
32,450308,286874,7,FF
33,450308,286874,7,FF
34,450308,286874,7,SL
35,450308,286874,8,FF
36,450308,286874,8,CU
37,450308,286874,8,CU
38,450308,286874,8,SL
39,450308,286874,9,FF


In [112]:
# Exploration: same ten rows as above, with pitch_type shifted down by one row.
# The shift runs over the flat ten-row slice, not per group, so the first row of a
# new at-bat receives the last pitch of the preceding at-bat.
pitches_modified.groupby(["pitcher_id", "game_pk", "at_bat_num"])[["pitcher_id", "game_pk", "at_bat_num", 'pitch_type']].tail(5).iloc[0:10].pitch_type.shift(1)

30    None
31      FF
32      SL
33      FF
34      FF
35      SL
36      FF
37      CU
38      CU
39      SL
Name: pitch_type, dtype: object

In [84]:
# Exploration: (rows, columns) of the working frame.
pitches_modified.shape

(718961, 126)

In [107]:
# Exploration: which seasons are present in the data.
pitches_modified.year.unique()

array([2011])

In [81]:
# Exploration: first five rows of the working frame.
pitches_modified.head()

Unnamed: 0,uid,game_pk,year,date,team_id_b,team_id_p,inning,top,at_bat_num,pcount_at_bat,pcount_pitcher,balls,strikes,fouls,outs,is_final_pitch,final_balls,final_strikes,final_outs,start_tfs,start_tfs_zulu,batter_id,stand,b_height,pitcher_id,p_throws,at_bat_des,event,event2,event3,event4,away_team_runs,home_team_runs,score,pitch_des,pitch_id,type,pitch_tfs,pitch_tfs_zulu,x,y,sv_id,start_speed,end_speed,sz_top,sz_bot,pfx_x,pfx_z,px,pz,x0,z0,y0,vx0,vz0,vy0,ax,az,ay,break_length,break_y,break_angle,pitch_type,type_confidence,zone,nasty,spin_dir,spin_rate,cc,on_1b,on_2b,on_3b,runner1_id,runner1_start,runner1_end,runner1_event,runner1_score,runner1_rbi,runner1_earned,runner2_id,runner2_start,runner2_end,runner2_event,runner2_score,runner2_rbi,runner2_earned,runner3_id,runner3_start,runner3_end,runner3_event,runner3_score,runner3_rbi,runner3_earned,runner4_id,runner4_start,runner4_end,runner4_event,runner4_score,runner4_rbi,runner4_earned,runner5_id,runner5_start,runner5_end,runner5_event,runner5_score,runner5_rbi,runner5_earned,runner6_id,runner6_start,runner6_end,runner6_event,runner6_score,runner6_rbi,runner6_earned,runner7_id,runner7_start,runner7_end,runner7_event,runner7_score,runner7_rbi,runner7_earned,created_at,added_at,modified_at,modified_by,bases_state
0,14143226,286874,2011,2011-03-31,108,118,1,1,1,1,1,0,0,0,0,0,2,1,1,201226,2011-03-31 20:12:26,430895,L,5-8,460024,R,"Maicer Izturis grounds out, second baseman Chr...",Groundout,,,,0,0,,Ball,3,B,201301.0,2011-03-31 20:13:01,105.58,180.46,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016-03-03 21:33:20,2016-03-03 21:33:20,2016-03-03 21:33:20,1,0
1,14143227,286874,2011,2011-03-31,108,118,1,1,1,2,2,1,0,0,0,0,2,1,1,201226,2011-03-31 20:12:26,430895,L,5-8,460024,R,"Maicer Izturis grounds out, second baseman Chr...",Groundout,,,,0,0,,Ball,4,B,201319.0,2011-03-31 20:13:19,99.57,170.96,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016-03-03 21:33:20,2016-03-03 21:33:20,2016-03-03 21:33:20,1,0
2,14143228,286874,2011,2011-03-31,108,118,1,1,1,3,3,2,0,0,0,0,2,1,1,201226,2011-03-31 20:12:26,430895,L,5-8,460024,R,"Maicer Izturis grounds out, second baseman Chr...",Groundout,,,,0,0,,Called Strike,5,S,201327.0,2011-03-31 20:13:27,95.28,152.83,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016-03-03 21:33:20,2016-03-03 21:33:20,2016-03-03 21:33:20,1,0
3,14143229,286874,2011,2011-03-31,108,118,1,1,1,4,4,2,1,0,0,1,2,1,1,201226,2011-03-31 20:12:26,430895,L,5-8,460024,R,"Maicer Izturis grounds out, second baseman Chr...",Groundout,,,,0,0,,"In play, out(s)",6,X,180441.0,2011-03-31 18:04:41,93.56,168.37,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016-03-03 21:33:20,2016-03-03 21:33:20,2016-03-03 21:33:20,1,0
4,14143230,286874,2011,2011-03-31,108,118,1,1,2,1,5,0,0,0,1,0,2,2,1,201354,2011-03-31 20:13:54,435062,R,5-10,460024,R,Howie Kendrick doubles (1) on a line drive to ...,Double,,,,0,0,,Called Strike,10,S,201404.0,2011-03-31 20:14:04,99.57,170.96,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016-03-03 21:33:20,2016-03-03 21:33:20,2016-03-03 21:33:20,1,0


In [None]:
''' Get Previous Pitch info '''