In [2]:
import pandas as pd
from pybaseball import statcast
# Raise pandas' column display limit to 500 so wide frames are shown without column truncation.
pd.set_option('display.max_columns', 500)

In [3]:
def fetch_statcast_data(start_date, end_date):
    """Download Statcast data for a date range.

    Thin wrapper: both arguments are forwarded positionally to
    ``pybaseball.statcast`` and its result is returned untouched.

    Parameters
    ----------
    start_date, end_date
        Bounds of the query; this notebook passes 'YYYY-MM-DD' strings.

    Returns
    -------
    Whatever ``statcast`` returns (used as a pandas DataFrame in this notebook).
    """
    return statcast(start_date, end_date)

In [1]:
def process_data(data):
    """Collapse pitch-level rows into one row per at-bat group.

    Rows are grouped by ``game_date``, ``batter``, ``pitcher`` and
    ``at_bat_number``; within each group the pitches are ordered by
    ``pitch_number``.

    Parameters
    ----------
    data : pandas.DataFrame
        Pitch-level frame containing at least the columns ``game_date``,
        ``batter``, ``pitcher``, ``pitch_type``, ``events``,
        ``at_bat_number`` and ``pitch_number``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per group, ordered by the grouping keys, with columns:

        * ``Pitch Sequence`` - comma-joined ``pitch_type`` values in pitch order.
        * ``Pitcher ID`` - the group's ``pitcher`` value.
        * ``At-Bat Outcome`` - ``events`` of the last retained pitch, or
          ``'No event'`` when that value is missing.

        An empty input yields an empty frame with these three columns.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``data``.
    """
    # NOTE(review): game_date alone does not separate the two games of a
    # doubleheader; consider adding game_pk to the keys - confirm with the author.
    at_bat_keys = ['game_date', 'batter', 'pitcher', 'at_bat_number']

    # Keep only the columns needed to build the sequences.
    pitches = data[['game_date', 'batter', 'pitcher', 'pitch_type', 'events',
                    'at_bat_number', 'pitch_number']]

    # Rows without a pitch_type cannot contribute to a sequence string.
    pitches = pitches.dropna(subset=['pitch_type'])

    # One global sort puts every group's pitches in order. The previous
    # groupby(...).apply(sort_values).reset_index(drop=True) relied on pandas
    # passing the grouping columns into apply (deprecated since pandas 2.2);
    # once they are excluded, the keys are dropped with the index and the
    # groupby below fails with KeyError. It was also far slower.
    pitches = pitches.sort_values(at_bat_keys + ['pitch_number'])

    at_bats = []
    for _, group in pitches.groupby(at_bat_keys, sort=True):
        sequence = ','.join(group['pitch_type'].tolist())
        last_event = group['events'].iloc[-1]
        outcome = 'No event' if pd.isna(last_event) else last_event
        pitcher_id = group['pitcher'].iloc[0]
        at_bats.append([sequence, pitcher_id, outcome])

    return pd.DataFrame(at_bats, columns=['Pitch Sequence', 'Pitcher ID', 'At-Bat Outcome'])


In [4]:
def consolidate_pitch_types(data):
    """Return a copy of ``data`` with ``pitch_type`` collapsed into broader groups.

    Codes listed in the mapping below are replaced by their consolidated
    code; codes not listed, and missing values, are left as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``pitch_type`` column. It is NOT modified: the function
        previously assigned into the caller's frame, which silently overwrote
        the raw codes and could raise ``SettingWithCopyWarning`` on a slice.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``data`` and the
        consolidated ``pitch_type`` column.

    Raises
    ------
    KeyError
        If ``data`` has no ``pitch_type`` column.
    """
    # Mapping dictionary for pitch types (identity entries are kept so the
    # full set of known codes is documented in one place).
    pitch_type_map = {
        'CH': 'CH',  # Changeup
        'CU': 'CB',  # Curveball
        'FC': 'FC',  # Cutter
        'EP': 'CH',  # Eephus
        'FO': 'CH',  # Forkball
        'FF': 'FF',  # Four-Seam Fastball
        'KN': 'KN',  # Knuckleball
        'KC': 'CB',  # Knuckle-curve
        'SC': 'CH',  # Screwball
        'SI': 'SI',  # Sinker
        'SL': 'SL',  # Slider
        'SV': 'CB',  # Slurve
        'FS': 'FS',  # Splitter
        'ST': 'ST',  # Sweeper
        'FA': 'FF',  # Fastball
        'PO': 'PO',  # Pitch out
        'CS': 'CB'   # Slow Curveball
    }

    raw_codes = data['pitch_type']
    # map() yields NaN for unmapped codes; fillna() restores the original value there.
    consolidated = raw_codes.map(pitch_type_map).fillna(raw_codes)

    # assign() builds a new frame, so the caller's DataFrame keeps its raw codes.
    return data.assign(pitch_type=consolidated)

In [None]:
# Enable pybaseball's cache before the large Statcast pull so completed sub-queries
# can be recovered after a failure. Only `statcast` was imported from pybaseball, so
# the bare name `pybaseball` is undefined here; import the cache module explicitly.
from pybaseball import cache

cache.enable()

In [5]:
# Date range for the Statcast pull, given as 'YYYY-MM-DD' strings.
start_date = '2023-04-01'
end_date = '2023-10-30'
# Long-running download: the recorded run issued 213 sub-queries and took a little over two minutes.
data = fetch_statcast_data(start_date, end_date)

This is a large query, it may take a moment to complete


That's a nice request you got there. It'd be a shame if something were to happen to it.
We strongly recommend that you enable caching before running this. It's as simple as `pybaseball.cache.enable()`.
Since the Statcast requests can take a *really* long time to run, if something were to happen, like: a disconnect;
gremlins; computer repair by associates of Rudy Giuliani; electromagnetic interference from metal trash cans; etc.;
you could lose a lot of progress. Enabling caching will allow you to immediately recover all the successful
subqueries if that happens.
100%|████████████████████████████████████████████████████████████████████████████████| 213/213 [02:10<00:00,  1.63it/s]


In [6]:
# List the columns available in the downloaded Statcast frame.
data.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

In [7]:
# Collapse the raw pitch_type codes into the broader groups defined in consolidate_pitch_types.
data_pt = consolidate_pitch_types(data)

In [8]:
# Preview the columns used to build sequences (plus the pitcher's name) for the first rows of `data`.
data[['game_date', 'batter', 'pitcher', 'pitch_type', 'events', 'at_bat_number','pitch_number','player_name']].head(11)

Unnamed: 0,game_date,batter,pitcher,pitch_type,events,at_bat_number,pitch_number,player_name
114,2023-10-30,502054,600917,FF,strikeout,66,4,"Leclerc, José"
116,2023-10-30,502054,600917,SL,,66,3,"Leclerc, José"
124,2023-10-30,502054,600917,SL,,66,2,"Leclerc, José"
131,2023-10-30,502054,600917,SI,,66,1,"Leclerc, José"
142,2023-10-30,572233,600917,SL,strikeout,65,6,"Leclerc, José"
145,2023-10-30,572233,600917,FC,,65,5,"Leclerc, José"
162,2023-10-30,572233,600917,SL,,65,4,"Leclerc, José"
168,2023-10-30,572233,600917,FF,,65,3,"Leclerc, José"
177,2023-10-30,572233,600917,SL,,65,2,"Leclerc, José"
183,2023-10-30,572233,600917,FC,,65,1,"Leclerc, José"


In [9]:
# Relative frequency of each pitch type after consolidation (fractions, not counts).
data_pt.pitch_type.value_counts(normalize=True)

FF    0.324816
SL    0.169411
SI    0.152475
CH    0.110712
CB    0.091870
FC    0.078273
ST    0.049728
FS    0.022389
KN    0.000263
PO    0.000065
Name: pitch_type, dtype: float64

In [10]:
# Build one row per (game_date, batter, pitcher, at_bat_number) group: pitch sequence, pitcher id, outcome.
result_df = process_data(data_pt)
# Plain-text preview of the first rows.
print(result_df.head())

         Pitch Sequence  Pitcher ID             At-Bat Outcome
0                    SI      621107                field_error
1  SI,CB,FC,SI,CB,SI,FF      621107                     single
2     ST,ST,SI,SI,ST,ST      676534                       walk
3        SI,ST,SI,ST,SI      687330  grounded_into_double_play
4     FF,FF,FF,SL,FF,SL      477132                  strikeout


In [11]:
# Look up the player_name recorded for pitcher id 621107, the pitcher in the first rows of result_df.
data_pt[data_pt['pitcher']==621107]['player_name']

83      Eflin, Zach
92      Eflin, Zach
96      Eflin, Zach
101     Eflin, Zach
107     Eflin, Zach
           ...     
3860    Eflin, Zach
3993    Eflin, Zach
4188    Eflin, Zach
4258    Eflin, Zach
4344    Eflin, Zach
Name: player_name, Length: 2652, dtype: object

In [12]:
# (rows, columns) of the sequence table: one row per pitch sequence.
result_df.shape

(185538, 3)

In [13]:
# Rich-display preview of the first sequences.
result_df.head()

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
0,SI,621107,field_error
1,"SI,CB,FC,SI,CB,SI,FF",621107,single
2,"ST,ST,SI,SI,ST,ST",676534,walk
3,"SI,ST,SI,ST,SI",687330,grounded_into_double_play
4,"FF,FF,FF,SL,FF,SL",477132,strikeout


In [14]:
# Write the sequences to CSV without the index column; the path is relative to the current working directory.
result_df.to_csv('../../data/sequence_data_opt.csv', index=False)