In [2]:
import pandas as pd
import pybaseball
from pybaseball import statcast
pd.set_option('display.max_columns', 500)

In [3]:
def fetch_statcast_data(start_date, end_date):
    # Fetch data from Statcast
    data = statcast(start_date, end_date)
    return data

In [4]:
def process_data(data):
    # Filter necessary columns
    data = data[['game_date', 'batter', 'pitcher', 'pitch_type']]
    
    # Drop rows where pitch_type is NaN (happens occasionally in real datasets)
    data = data.dropna(subset=['pitch_type'])

    # Group data by game, batter, pitcher, and at_bat_number, then sort by pitch_number
    # Use apply to ensure the group maintains DataFrame format after sorting
    grouped = data.groupby(['game_date', 'batter', 'pitcher', 'pitch_type']).apply(lambda x: x.sort_values('pitch_number')).reset_index(drop=True)

    # We can't directly iterate with multiple unpacks if using apply without specifying 'group_keys=False' in the groupby
    # However, by resetting the index, we revert to a single DataFrame and thus lose the direct tuple unpacking ability
    # Create a DataFrame to store the sequence, pitcher ID, and at-bat outcome
    at_bats = []
    for _, group in grouped.groupby(['game_date', 'batter', 'pitcher', 'at_bat_number']):
        pitch_sequence = ','.join(group['pitch_type'].tolist())
        count_sequence = ','.join(group['combined_count'].tolist())
        zone_sequence = ','.join(group['zone'].tolist())
        outcome = group['events'].iloc[-1] if not pd.isna(group['events'].iloc[-1]) else 'No event'
        pitcher_id = group['pitcher'].iloc[0]
        batter_id = group['batter'].iloc[0]
        at_bat_number = group['at_bat_number'].iloc[0]
        p_throws = group['p_throws'].iloc[0]
        stand = group['stand'].iloc[0]
        at_bats.append([pitch_sequence, count_sequence, zone_sequence, p_throws, stand, pitcher_id, batter_id, at_bat_number, outcome])

    # Convert list to DataFrame
    result_df = pd.DataFrame(at_bats, columns=['pitch_sequence', 'count_sequence', 'zone_sequence', 'p_throws', 'stand', 'pitcher_id', 'batter_id', 'at_bat_number', 'outcome'])
    return result_df


In [6]:
def consolidate_pitch_types_cur(data):
    # Mapping dictionary for pitch types
    pitch_type_map = {
        'CH': 'CH',  # Changeup
        'CU': 'CB',  # Curveball
        'FC': 'FC',  # Cutter
        'EP': 'CH',  # Eephus
        'FO': 'CH',  # Forkball
        'FF': 'FF',  # Four-Seam Fastball
        'KN': 'KN',  # Knuckleball
        'KC': 'CB',  # Knuckle-curve
        'SC': 'CH',  # Screwball
        'SI': 'SI',  # Sinker
        'SL': 'SL',  # Slider
        'SV': 'CB',  # Slurve
        'FS': 'FS',  # Splitter
        'ST': 'ST',  # Sweeper
        'FA': 'FF',  # Fastball
        'PO': 'PO',  # Pitch out
        'CS': 'CB'   # Slow Curveball
    }
    
    # Apply the mapping to the pitch_type column
    df1 = data.copy()
    df1['pitch_type'] = df1['pitch_type'].map(pitch_type_map).fillna(data['pitch_type'])
    return df1

In [7]:
def consolidate_pitch_types_new(data):
    # Mapping dictionary for pitch types
    pitch_type_map = {
        'CH': 'CH',  # Changeup
        'CU': 'CB',  # Curveball
        'FC': 'FC',  # Cutter
        'EP': 'CH',  # Eephus
        'FO': 'CH',  # Forkball
        'FF': 'FF',  # Four-Seam Fastball
        'KN': 'KN',  # Knuckleball
        'KC': 'CB',  # Knuckle-curve
        'SC': 'CH',  # Screwball
        'SI': 'SI',  # Sinker
        'SL': 'SL',  # Slider
        'SV': 'CB',  # Slurve
        'FS': 'CH',  # Splitter
        'ST': 'SL',  # Sweeper
        'FA': 'FF',  # Fastball
        'PO': 'PO',  # Pitch out
        'CS': 'CB'   # Slow Curveball
    }
    
    # Apply the mapping to the pitch_type column
    df2 = data.copy()
    df2['pitch_type'] = df2['pitch_type'].map(pitch_type_map).fillna(data['pitch_type'])
    return df2

In [8]:
#pybaseball.cache.enable()
pybaseball.cache.disable()

In [9]:
start_date = '2017-04-01'
end_date = '2023-10-30'
data = fetch_statcast_data(start_date, end_date)

This is a large query, it may take a moment to complete


That's a nice request you got there. It'd be a shame if something were to happen to it.
We strongly recommend that you enable caching before running this. It's as simple as `pybaseball.cache.enable()`.
Since the Statcast requests can take a *really* long time to run, if something were to happen, like: a disconnect;
gremlins; computer repair by associates of Rudy Giuliani; electromagnetic interference from metal trash cans; etc.;
you could lose a lot of progress. Enabling caching will allow you to immediately recover all the successful
subqueries if that happens.


Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates
Skipping offseason dates


  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_cop

In [10]:
data.sort_values('game_date', inplace=True)

In [11]:
data.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description',
       ...
       'n_thruorder_pitcher', 'n_priorpa_thisgame_player_at_bat',
       'pitcher_days_since_prev_game', 'batter_days_since_prev_game',
       'pitcher_days_until_next_game', 'batter_days_until_next_game',
       'api_break_z_with_gravity', 'api_break_x_arm', 'api_break_x_batter_in',
       'arm_angle'],
      dtype='object', length=113)

In [12]:
data.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length,estimated_slg_using_speedangle,delta_pitcher_run_exp,hyper_speed,home_score_diff,bat_score_diff,home_win_exp,bat_win_exp,age_pit_legacy,age_bat_legacy,age_pit,age_bat,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
909,FF,2017-04-02,94.9,-1.33,5.59,"Martinez, Carlos",656941,593372,,ball,,,,,12,Kyle Schwarber singles on a line drive to righ...,R,L,R,STL,CHC,B,,,0,0,2017,-0.78,1.1,0.28,4.33,,,,0,1,Top,,,,,,,5.773372,-137.72977,0.00795,-10.84801,29.457908,-18.784685,3.32,1.68,,,,93.3,2146,5.5,490099,425877,572761,576397,425509,649557,545341,451594,572039,55.07,,,,,,,,1,1,4-Seam Fastball,0,0,0,0,0,0,0,0,Standard,Standard,218,0.0,0.036,,,,-0.036,,0,0,0.5,0.5,25,24,26,24,1,0,,,7,2,1.52,0.78,-0.78,
757,FF,2017-04-02,92.6,-0.23,5.99,"Pruitt, Austin",592450,643493,fielders_choice,hit_into_play,,,,,5,"Aaron Judge reaches on a fielder's choice, fie...",R,R,R,TB,NYY,X,5.0,ground_ball,2,1,2017,-0.49,1.23,0.07,3.16,,,452104.0,0,9,Top,106.14,173.07,,,,,1.759204,-134.375724,-3.911116,-6.12212,31.487097,-17.020714,4.35,1.84,6.0,71.5,-29.0,91.0,2302,5.6,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.85,0.047,0.055,0.9,1.0,0.0,0.0,2.0,75,4,4-Seam Fastball,7,2,2,7,2,7,2,7,Infield shift,Standard,204,-0.018,0.668,,,0.055,-0.668,88.0,5,-5,0.988,0.012,27,25,28,25,1,3,,,2,2,1.53,0.49,0.49,
734,CU,2017-04-02,78.8,-0.14,6.08,"Pruitt, Austin",543305,643493,,called_strike,,,,,9,Aaron Hicks singles on a line drive to shortst...,R,L,R,TB,NYY,S,,,0,0,2017,0.44,-1.23,0.32,2.06,,452104.0,592450.0,0,9,Top,,,,,,,0.227042,-114.233302,0.294083,3.592839,23.356016,-42.626177,3.36,1.74,,,,76.0,2917,4.7,490106,519083,489149,543543,446334,542921,605480,595281,519306,55.81,,,,,,,,76,1,Curveball,7,2,2,7,2,7,2,7,Standard,Standard,17,0.0,-0.096,,,,0.096,,5,-5,0.97,0.03,27,27,28,28,1,1,,,2,5,5.19,-0.44,0.44,
723,CH,2017-04-02,84.2,-0.16,5.99,"Pruitt, Austin",543305,643493,,blocked_ball,,,,,14,Aaron Hicks singles on a line drive to shortst...,R,L,R,TB,NYY,B,,,0,1,2017,-1.04,0.73,0.71,0.52,,452104.0,592450.0,0,9,Top,,,,,,,3.905027,-122.014464,-7.483595,-10.848316,25.247823,-23.839578,3.54,1.59,,,,82.3,1926,5.4,490106,519083,489149,543543,446334,542921,605480,595281,519306,55.13,,,,,,,,76,2,Changeup,7,2,2,7,2,7,2,7,Standard,Standard,241,0.0,0.047,,,,-0.047,,5,-5,0.97,0.03,27,27,28,28,1,1,,,2,5,2.64,1.04,-1.04,
687,SI,2017-04-02,92.3,-0.17,5.95,"Pruitt, Austin",543305,643493,,ball,,,,,13,Aaron Hicks singles on a line drive to shortst...,R,L,R,TB,NYY,B,,,1,1,2017,-1.65,0.94,-1.78,1.91,,452104.0,592450.0,0,9,Top,,,,,,,-0.624709,-133.822511,-6.257732,-18.986299,32.394987,-20.023656,3.43,1.59,,,,90.5,2327,5.7,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.84,,,,,,,,76,3,Sinker,7,2,2,7,2,7,2,7,Standard,Standard,244,0.0,0.098,,,,-0.098,,5,-5,0.97,0.03,27,27,28,28,1,1,,,2,5,1.85,1.65,-1.65,


In [None]:
game_data = data[data['game_pk'] == 490106]
game_data.sort_values('at_bat_number', inplace=True)
game_data.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_data.sort_values('at_bat_number', inplace=True)


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length,estimated_slg_using_speedangle,delta_pitcher_run_exp,hyper_speed,home_score_diff,bat_score_diff,home_win_exp,bat_win_exp,age_pit_legacy,age_bat_legacy,age_pit,age_bat,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
823,FF,2017-04-02,93.2,-1.49,6.39,"Archer, Chris",458731,502042,,called_strike,,,,,1,Brett Gardner flies out to left fielder Mallex...,R,L,R,TB,NYY,S,,,2,1,2017,-0.71,1.56,-0.82,2.82,,,,0,1,Top,,,,,,,3.213753,-134.958708,-6.498109,-9.271776,34.359069,-12.347069,3.25,1.46,,,,91.4,2084.0,5.8,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.67,,,,,,,,1,4,4-Seam Fastball,0,0,0,0,0,0,0,0,Standard,Standard,207,0.0,-0.069,,,,0.069,,0,0,0.5,0.5,28,33,29,34,1,0,,,6,2,1.18,0.71,-0.71,
843,FF,2017-04-02,93.6,-1.58,6.32,"Archer, Chris",458731,502042,,ball,,,,,11,Brett Gardner flies out to left fielder Mallex...,R,L,R,TB,NYY,B,,,1,1,2017,-0.78,1.55,-0.95,2.88,,,,0,1,Top,,,,,,,3.249569,-135.629678,-6.269627,-10.147459,33.279117,-12.373795,3.21,1.46,,,,91.8,2096.0,5.7,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.77,,,,,,,,1,3,4-Seam Fastball,0,0,0,0,0,0,0,0,Standard,Standard,209,0.0,0.05,,,,-0.05,,0,0,0.5,0.5,28,33,29,34,1,0,,,6,2,1.16,0.78,-0.78,
876,FF,2017-04-02,92.4,-1.63,6.32,"Archer, Chris",458731,502042,,called_strike,,,,,2,Brett Gardner flies out to left fielder Mallex...,R,L,R,TB,NYY,S,,,1,0,2017,-0.51,1.4,0.14,2.93,,,,0,1,Top,,,,,,,5.567151,-133.954566,-5.695133,-7.41095,32.118223,-14.527665,3.23,1.64,,,,91.4,2119.0,6.1,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.38,,,,,,,,1,2,4-Seam Fastball,0,0,0,0,0,0,0,0,Standard,Standard,202,0.0,-0.046,,,,0.046,,0,0,0.5,0.5,28,33,29,34,1,0,,,6,2,1.34,0.51,-0.51,
908,FF,2017-04-02,92.1,-1.49,6.33,"Archer, Chris",458731,502042,,ball,,,,,14,Brett Gardner flies out to left fielder Mallex...,R,L,R,TB,NYY,B,,,0,0,2017,-0.56,1.27,1.0,1.3,,,,0,1,Top,,,,,,,7.445744,-133.322447,-9.522156,-8.219819,29.373083,-15.350977,3.22,1.67,,,,91.3,2106.0,6.1,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.38,,,,,,,,1,1,4-Seam Fastball,0,0,0,0,0,0,0,0,Standard,Standard,206,0.0,0.036,,,,-0.036,,0,0,0.5,0.5,28,33,29,34,1,0,,,6,2,1.48,0.56,-0.56,
756,SL,2017-04-02,87.9,-1.41,6.43,"Archer, Chris",458731,502042,field_out,hit_into_play,,,,,8,Brett Gardner flies out to left fielder Mallex...,R,L,R,TB,NYY,X,7.0,fly_ball,2,2,2017,0.24,-0.17,0.08,1.87,,,,0,1,Top,65.59,123.5,,,,,3.073256,-127.679036,-4.557735,1.981156,24.29787,-33.357414,3.48,1.46,236.0,76.2,38.0,86.6,,5.4,490106,519083,489149,543543,446334,542921,605480,595281,519306,55.12,0.047,0.042,0.0,1.0,0.0,0.0,3.0,1,6,Slider,0,0,0,0,0,0,0,0,Strategic,Standard,40,0.022,-0.216,,,0.058,0.216,88.0,0,0,0.5,0.5,28,33,29,34,1,0,,,6,2,3.22,-0.24,0.24,
790,SL,2017-04-02,88.0,-1.39,6.46,"Archer, Chris",458731,502042,,foul,,,,,9,Brett Gardner flies out to left fielder Mallex...,R,L,R,TB,NYY,S,,,2,2,2017,0.37,0.14,0.78,1.63,,,,0,1,Top,,,,,,,4.450516,-127.684319,-5.766872,3.054172,25.602945,-29.786894,3.48,1.46,113.0,100.9,5.0,86.5,2612.0,5.4,490106,519083,489149,543543,446334,542921,605480,595281,519306,55.07,,,,,,,,1,5,Slider,0,0,0,0,0,0,0,0,Strategic,Standard,92,0.0,0.0,,,,0.0,100.9,0,0,0.5,0.5,28,33,29,34,1,0,,,6,2,2.91,-0.37,0.37,
752,FF,2017-04-02,94.7,-1.45,6.38,"Archer, Chris",596142,502042,field_out,hit_into_play,,,,,8,"Gary Sanchez grounds out sharply, pitcher Chri...",R,R,R,TB,NYY,X,1.0,ground_ball,0,0,2017,-0.94,1.33,0.08,1.87,,,,1,1,Top,128.19,193.88,,,,,6.00912,-137.261161,-8.856631,-13.098427,31.020978,-13.887506,3.48,1.46,33.0,115.7,-1.0,93.7,2317.0,6.0,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.51,0.54,0.486,0.0,1.0,0.0,0.0,4.0,2,1,4-Seam Fastball,0,0,0,0,0,0,0,0,Standard,Standard,218,0.015,-0.172,,,0.559,0.172,115.7,0,0,0.522,0.478,28,24,29,25,1,0,,,6,2,1.27,0.94,0.94,
637,FF,2017-04-02,94.9,-1.43,6.32,"Archer, Chris",595885,502042,walk,ball,,,,,1,Greg Bird walks.,R,L,R,TB,NYY,B,,,3,1,2017,-0.38,1.44,-0.77,3.96,,,,2,1,Top,,,,,,,2.517989,-137.71678,-3.571002,-5.242036,31.711739,-13.740018,3.85,1.67,,,,93.5,2155.0,5.7,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.78,,0.692994,0.7,1.0,0.0,0.0,,3,5,4-Seam Fastball,0,0,0,0,0,0,0,0,Infield shade,Standard,196,-0.012,0.066,,,,-0.066,,0,0,0.537,0.463,28,24,29,25,1,0,,,6,2,1.17,0.38,-0.38,
720,FF,2017-04-02,94.9,-1.56,6.27,"Archer, Chris",595885,502042,,called_strike,,,,,7,Greg Bird walks.,R,L,R,TB,NYY,S,,,0,0,2017,-0.74,1.37,-0.56,1.89,,,,2,1,Top,,,,,,,4.171423,-137.421623,-8.582656,-10.085573,32.492852,-13.443625,3.48,1.8,,,,93.3,2207.0,5.8,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.73,,,,,,,,3,1,4-Seam Fastball,0,0,0,0,0,0,0,0,Strategic,Standard,211,0.0,-0.018,,,,0.018,,0,0,0.537,0.463,28,24,29,25,1,0,,,6,2,1.25,0.74,-0.74,
645,FF,2017-04-02,95.0,-1.56,6.26,"Archer, Chris",595885,502042,,ball,,,,,11,Greg Bird walks.,R,L,R,TB,NYY,B,,,2,1,2017,0.0,1.66,-0.95,4.05,,,,2,1,Top,,,,,,,1.577377,-137.769684,-3.658296,-0.321075,33.340106,-10.885719,3.85,1.67,,,,93.7,2200.0,5.9,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.59,,,,,,,,3,4,4-Seam Fastball,0,0,0,0,0,0,0,0,Strategic,Standard,180,0.0,0.038,,,,-0.038,,0,0,0.537,0.463,28,24,29,25,1,0,,,6,2,0.94,0.0,0.0,


In [20]:
data_with_pitch_id = data
data_with_pitch_id['pitch_id'] = data.apply(lambda row: f"{row['game_pk']}_{row['at_bat_number']}_{row['pitch_number']}", axis=1)
#data_with_pitch_id['pitch_id'] = f"{data['game_pk']}_{data['at_bat_number']}_{data['pitch_number']}"
data_with_pitch_id.head(10)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length,estimated_slg_using_speedangle,delta_pitcher_run_exp,hyper_speed,home_score_diff,bat_score_diff,home_win_exp,bat_win_exp,age_pit_legacy,age_bat_legacy,age_pit,age_bat,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,pitch_id
909,FF,2017-04-02,94.9,-1.33,5.59,"Martinez, Carlos",656941,593372,,ball,,,,,12,Kyle Schwarber singles on a line drive to righ...,R,L,R,STL,CHC,B,,,0,0,2017,-0.78,1.1,0.28,4.33,,,,0,1,Top,,,,,,,5.773372,-137.72977,0.00795,-10.84801,29.457908,-18.784685,3.32,1.68,,,,93.3,2146,5.5,490099,425877,572761,576397,425509,649557,545341,451594,572039,55.07,,,,,,,,1,1,4-Seam Fastball,0,0,0,0,0,0,0,0,Standard,Standard,218,0.0,0.036,,,,-0.036,,0,0,0.5,0.5,25,24,26,24,1,0,,,7,2,1.52,0.78,-0.78,,490099_1_1
757,FF,2017-04-02,92.6,-0.23,5.99,"Pruitt, Austin",592450,643493,fielders_choice,hit_into_play,,,,,5,"Aaron Judge reaches on a fielder's choice, fie...",R,R,R,TB,NYY,X,5.0,ground_ball,2,1,2017,-0.49,1.23,0.07,3.16,,,452104.0,0,9,Top,106.14,173.07,,,,,1.759204,-134.375724,-3.911116,-6.12212,31.487097,-17.020714,4.35,1.84,6.0,71.5,-29.0,91.0,2302,5.6,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.85,0.047,0.055,0.9,1.0,0.0,0.0,2.0,75,4,4-Seam Fastball,7,2,2,7,2,7,2,7,Infield shift,Standard,204,-0.018,0.668,,,0.055,-0.668,88.0,5,-5,0.988,0.012,27,25,28,25,1,3,,,2,2,1.53,0.49,0.49,,490106_75_4
734,CU,2017-04-02,78.8,-0.14,6.08,"Pruitt, Austin",543305,643493,,called_strike,,,,,9,Aaron Hicks singles on a line drive to shortst...,R,L,R,TB,NYY,S,,,0,0,2017,0.44,-1.23,0.32,2.06,,452104.0,592450.0,0,9,Top,,,,,,,0.227042,-114.233302,0.294083,3.592839,23.356016,-42.626177,3.36,1.74,,,,76.0,2917,4.7,490106,519083,489149,543543,446334,542921,605480,595281,519306,55.81,,,,,,,,76,1,Curveball,7,2,2,7,2,7,2,7,Standard,Standard,17,0.0,-0.096,,,,0.096,,5,-5,0.97,0.03,27,27,28,28,1,1,,,2,5,5.19,-0.44,0.44,,490106_76_1
723,CH,2017-04-02,84.2,-0.16,5.99,"Pruitt, Austin",543305,643493,,blocked_ball,,,,,14,Aaron Hicks singles on a line drive to shortst...,R,L,R,TB,NYY,B,,,0,1,2017,-1.04,0.73,0.71,0.52,,452104.0,592450.0,0,9,Top,,,,,,,3.905027,-122.014464,-7.483595,-10.848316,25.247823,-23.839578,3.54,1.59,,,,82.3,1926,5.4,490106,519083,489149,543543,446334,542921,605480,595281,519306,55.13,,,,,,,,76,2,Changeup,7,2,2,7,2,7,2,7,Standard,Standard,241,0.0,0.047,,,,-0.047,,5,-5,0.97,0.03,27,27,28,28,1,1,,,2,5,2.64,1.04,-1.04,,490106_76_2
687,SI,2017-04-02,92.3,-0.17,5.95,"Pruitt, Austin",543305,643493,,ball,,,,,13,Aaron Hicks singles on a line drive to shortst...,R,L,R,TB,NYY,B,,,1,1,2017,-1.65,0.94,-1.78,1.91,,452104.0,592450.0,0,9,Top,,,,,,,-0.624709,-133.822511,-6.257732,-18.986299,32.394987,-20.023656,3.43,1.59,,,,90.5,2327,5.7,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.84,,,,,,,,76,3,Sinker,7,2,2,7,2,7,2,7,Standard,Standard,244,0.0,0.098,,,,-0.098,,5,-5,0.97,0.03,27,27,28,28,1,1,,,2,5,1.85,1.65,-1.65,,490106_76_3
674,SI,2017-04-02,92.6,-0.1,5.97,"Pruitt, Austin",543305,643493,single,hit_into_play,,,,,4,Aaron Hicks singles on a line drive to shortst...,R,L,R,TB,NYY,X,6.0,line_drive,2,1,2017,-1.53,0.82,-0.81,2.37,,452104.0,592450.0,0,9,Top,111.0,145.3,,,,,1.399926,-134.311355,-4.965501,-18.297036,32.055985,-21.604822,3.85,1.59,191.0,85.0,12.0,90.9,2327,5.7,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.81,0.807,0.737,0.9,1.0,1.0,0.0,4.0,76,4,Sinker,7,2,2,7,2,7,2,7,Standard,Standard,246,-0.038,0.895,,,0.868,-0.895,88.0,5,-5,0.97,0.03,27,27,28,28,1,1,,,2,5,1.94,1.53,-1.53,,490106_76_4
887,FC,2017-04-02,89.9,-1.34,6.18,"Colomé, Alex",474892,517008,,foul,,,,,7,Chris Carter out on a sacrifice fly to left fi...,R,R,R,TB,NYY,S,,,0,0,2017,-0.43,0.8,-0.73,2.39,452104.0,592450.0,543305.0,0,9,Top,,,,,,,2.380513,-130.421646,-4.925028,-5.368851,30.62088,-22.362495,4.08,1.78,8.0,87.6,-16.0,88.7,2169,5.9,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.56,,,,,,,,77,1,Cutter,7,2,2,7,2,7,2,7,Infield shift,Strategic,213,0.0,-0.124,,,,0.124,88.0,5,-5,0.932,0.068,28,30,29,31,1,0,,,3,5,2.11,0.43,0.43,,490106_77_1
872,FC,2017-04-02,88.3,-1.16,6.22,"Colomé, Alex",474892,517008,,ball,,,,,13,Chris Carter out on a sacrifice fly to left fi...,R,R,R,TB,NYY,B,,,0,1,2017,-0.13,0.57,-0.2,1.45,452104.0,592450.0,543305.0,0,9,Top,,,,,,,2.563376,-128.074442,-6.550282,-1.938957,28.919765,-24.782773,3.64,1.74,,,,87.1,2168,5.9,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.61,,,,,,,,77,2,Cutter,7,2,2,7,2,7,2,7,Infield shift,Strategic,197,0.0,0.004,,,,-0.004,,5,-5,0.932,0.068,28,30,29,31,1,0,,,3,5,2.44,0.13,0.13,,490106_77_2
835,FC,2017-04-02,89.2,-1.08,6.3,"Colomé, Alex",474892,517008,,foul,,,,,14,Chris Carter out on a sacrifice fly to left fi...,R,R,R,TB,NYY,S,,,1,1,2017,-0.01,0.53,0.45,1.61,452104.0,592450.0,543305.0,0,9,Top,,,,,,,3.745212,-129.307935,-6.436178,-1.001437,29.560063,-25.119583,4.08,1.78,,,,87.9,2103,5.9,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.62,,,,,,,,77,3,Cutter,7,2,2,7,2,7,2,7,Infield shift,Standard,182,0.0,-0.173,,,,0.173,,5,-5,0.932,0.068,28,30,29,31,1,0,,,3,5,2.43,0.01,0.01,,490106_77_3
812,FC,2017-04-02,89.3,-1.09,6.28,"Colomé, Alex",474892,517008,,blocked_ball,,,,,14,Chris Carter out on a sacrifice fly to left fi...,R,R,R,TB,NYY,B,,,1,2,2017,0.12,0.45,0.48,0.07,452104.0,592450.0,543305.0,0,9,Top,,,,,,,3.573613,-129.321542,-9.979251,0.511388,28.007514,-25.366393,3.64,1.78,,,,87.8,2094,5.7,490106,519083,489149,543543,446334,542921,605480,595281,519306,54.81,,,,,,,,77,4,Cutter,7,2,2,7,2,7,2,7,Infield shift,Standard,160,0.0,0.112,,,,-0.112,,5,-5,0.932,0.068,28,30,29,31,1,0,,,3,5,2.52,-0.12,-0.12,,490106_77_4


In [21]:
len(data_with_pitch_id)

4811117

In [22]:
# Write full raw data to csv so we can easily add new features if needed.
# NOTE: this isn't able to be tracked by github due to size limits (file is > 2GB). So just write to GCS for now.
#data_with_pitch_id.to_csv('../../data/raw_statcast_data.csv', index=False)


In [None]:
data_with_pitch_id.to_csv('gs://pitch-sequencing/raw/statcast/2017-04-01_2023-10-31.csv')

In [12]:
data_pt_cur = consolidate_pitch_types_cur(data)

In [17]:
data_pt_new = consolidate_pitch_types_new(data)

In [16]:
data_pt_cur = data_pt_cur.iloc[2:]

In [17]:
data_pt_cur[['game_date', 'pitch_number','batter', 'pitcher', 'pitch_type', 'events', 'at_bat_number','zone', 'outs_when_up','p_throws','stand','type','bb_type','balls','strikes', 'player_name','on_3b', 'on_2b', 'on_1b']].head(10)

Unnamed: 0,game_date,pitch_number,batter,pitcher,pitch_type,events,at_bat_number,zone,outs_when_up,p_throws,stand,type,bb_type,balls,strikes,player_name,on_3b,on_2b,on_1b
786,2017-04-02,1,592178,593372,SI,,2,9,0,R,R,S,,0,0,"Martinez, Carlos",,,656941
760,2017-04-02,2,592178,593372,SI,,2,6,0,R,R,B,,0,1,"Martinez, Carlos",,,656941
737,2017-04-02,3,592178,593372,SL,,2,14,0,R,R,B,,1,1,"Martinez, Carlos",,,656941
727,2017-04-02,4,592178,593372,SL,,2,8,0,R,R,S,,2,1,"Martinez, Carlos",,,656941
687,2017-04-02,5,592178,593372,FF,strikeout,2,1,0,R,R,S,,2,2,"Martinez, Carlos",,,656941
663,2017-04-02,1,519203,593372,FF,,3,11,1,R,L,B,,0,0,"Martinez, Carlos",,,656941
647,2017-04-02,2,519203,593372,CH,,3,7,1,R,L,S,,1,0,"Martinez, Carlos",,,656941
639,2017-04-02,3,519203,593372,FF,single,3,6,1,R,L,X,line_drive,1,1,"Martinez, Carlos",,,656941
614,2017-04-02,1,450314,593372,CH,,4,14,1,R,L,B,,0,0,"Martinez, Carlos",,656941.0,519203
594,2017-04-02,2,450314,593372,CH,,4,14,1,R,L,S,,1,0,"Martinez, Carlos",,656941.0,519203


In [18]:
data_pt_new.pitch_type.value_counts(normalize=True)

pitch_type
FF    0.346329
SL    0.186511
SI    0.169469
CH    0.125275
CB    0.104744
FC    0.066763
KN    0.000822
PO    0.000086
Name: proportion, dtype: float64

In [19]:
data_pt_cur.pitch_type.value_counts(normalize=True)

pitch_type
FF    0.346329
SI    0.169469
SL    0.166628
CH    0.108219
CB    0.104744
FC    0.066763
ST    0.019883
FS    0.017057
KN    0.000822
PO    0.000086
Name: proportion, dtype: float64

In [20]:
data_toexport_cur = data_pt_cur[['game_date', 'pitch_number','batter', 'pitcher', 'pitch_type', 'events', 'at_bat_number','zone', 'outs_when_up','p_throws','stand','type','bb_type','balls','strikes', 'player_name','on_3b', 'on_2b', 'on_1b']]

In [21]:
data_toexport_new = data_pt_new[['game_date', 'pitch_number','batter', 'pitcher', 'pitch_type', 'events', 'at_bat_number','zone', 'outs_when_up','p_throws','stand','type','bb_type','balls','strikes', 'player_name','on_3b', 'on_2b', 'on_1b']]

In [22]:
data_toexport_cur.to_csv('../../data/raw_data_curmap.csv', index=False)

In [23]:
data_toexport_new.to_csv('../../data/raw_data_newmap.csv', index=False)

In [24]:
data_toexport_new.head()

Unnamed: 0,game_date,pitch_number,batter,pitcher,pitch_type,events,at_bat_number,zone,outs_when_up,p_throws,stand,type,bb_type,balls,strikes,player_name,on_3b,on_2b,on_1b
901,2017-04-02,1,656941,593372,FF,,1,12,0,R,L,B,,0,0,"Martinez, Carlos",,,
772,2017-04-02,4,592450,643493,FF,fielders_choice,75,5,0,R,R,X,ground_ball,2,1,"Pruitt, Austin",,,452104.0
744,2017-04-02,1,543305,643493,CB,,76,9,0,R,L,S,,0,0,"Pruitt, Austin",,452104.0,592450.0
725,2017-04-02,2,543305,643493,CH,,76,14,0,R,L,B,,0,1,"Pruitt, Austin",,452104.0,592450.0
706,2017-04-02,3,543305,643493,SI,,76,13,0,R,L,B,,1,1,"Pruitt, Austin",,452104.0,592450.0


In [43]:
data_toexport.head()

Unnamed: 0,game_date,pitch_number,batter,pitcher,balls,strikes,type,pitch_type,events,at_bat_number,player_name
743,2017-04-02,1,543305,643493,0,0,S,CB,,76,"Pruitt, Austin"
715,2017-04-02,2,543305,643493,0,1,B,CH,,76,"Pruitt, Austin"
692,2017-04-02,3,543305,643493,1,1,B,SI,,76,"Pruitt, Austin"
671,2017-04-02,4,543305,643493,2,1,X,SI,single,76,"Pruitt, Austin"
901,2017-04-02,1,474892,517008,0,0,S,FC,,77,"Colomé, Alex"


In [None]:
result_df = process_data(data_pt_cur)
print(result_df.head())

In [11]:
data_pt[data_pt['pitcher']==621107]['player_name']

83      Eflin, Zach
92      Eflin, Zach
96      Eflin, Zach
101     Eflin, Zach
107     Eflin, Zach
           ...     
3860    Eflin, Zach
3993    Eflin, Zach
4188    Eflin, Zach
4258    Eflin, Zach
4344    Eflin, Zach
Name: player_name, Length: 2652, dtype: object

In [12]:
result_df.shape

(185538, 3)

In [13]:
result_df.head()

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
0,SI,621107,field_error
1,"SI,CB,FC,SI,CB,SI,FF",621107,single
2,"ST,ST,SI,SI,ST,ST",676534,walk
3,"SI,ST,SI,ST,SI",687330,grounded_into_double_play
4,"FF,FF,FF,SL,FF,SL",477132,strikeout


In [14]:
result_df.to_csv('../../data/sequence_data_opt.csv', index=False)