In [76]:
import pandas as pd
import numpy as np
from pybaseball import statcast_pitcher, playerid_lookup

In [78]:
ids = pd.read_csv("data/mlbids.csv")

In [195]:
# get pitcher data on yamamoto in just the 2025 season
START = "2025-03-01"
END   = "2025-11-20"
player = playerid_lookup('yamamoto', 'yoshinobu')
print(player)

  name_last name_first  key_mlbam key_retro  key_bbref  key_fangraphs  \
0  yamamoto  yoshinobu     808967  yamay001  yamamyo01          33825   

   mlb_played_first  mlb_played_last  
0            2024.0           2025.0  


In [197]:
df = statcast_pitcher(START, END, 808967)

Gathering Player Data


In [198]:
# get only regular and post season games
if "game_type" in df.columns:
    df = df[df["game_type"].isin(["R", "P"])].copy()

In [199]:
df = pd.merge(df, ids, left_on='batter', right_on='MLBID', how='left').copy()

In [200]:
# rename pitcher team score and opponent team score
df["pitcher_team"] = df["fld_score"]
df["opponent"] = df["bat_score"]

In [201]:
# change types
df["balls"]   = df["balls"].astype(int)
df["strikes"] = df["strikes"].astype(int)
df["outs"]    = df["outs_when_up"].astype(int)
df["inning"]  = df["inning"].astype(int)
df["top_bot"] = (df["inning_topbot"] == "Top").astype(int)

# change name of batter name and id column
df["batter_name"] = df["PLAYERNAME"]
df["batter_id"] = df["batter"]

In [202]:
# change pitcher and batter hand column names to be more intuitive
df["p_hand"] = df["p_throws"]
df["b_hand"] = df["stand"]

df["same_hand_matchup"] = (df["p_hand"] == df["b_hand"]).astype(int)

In [203]:
# total number of runners on bases
df["num_runners_onb"] = df[["on_1b", "on_2b", "on_3b"]].sum(axis=1)

df[['on_3b', 'on_2b', 'on_1b']] = df[['on_3b', 'on_2b', 'on_1b']].notna()

In [204]:
# get previous pitch data
df["prev_pitch_type"] = (
    df.groupby(["game_pk", "at_bat_number"])["pitch_type"].shift(1)
)
df["prev_pitch_result"] = (
    df.groupby(["game_pk", "at_bat_number"])["description"]
      .shift(1)
)

In [205]:
# get number of pitches thrown in that game
df["p_pitch_count"] = df.groupby(["game_pk"])["pitch_type"].cumcount() + 1

In [206]:
cols = [
    'game_pk', 'game_date', 'at_bat_number',
    'pitch_number', 'pitch_type',
    'batter_name', 'batter_id',
    'p_hand', 'b_hand',
    'same_hand_matchup',
    'balls', 'strikes',
    'outs', 'on_3b',
    'on_2b','on_1b',
    'inning', 'top_bot',
    'p_pitch_count',
    'pitcher_team', 'opponent',
    'prev_pitch_type', 'prev_pitch_result'
]

In [207]:
df = df[cols].copy()
df

Unnamed: 0,game_pk,game_date,at_bat_number,pitch_number,pitch_type,batter_name,batter_id,p_hand,b_hand,same_hand_matchup,...,on_3b,on_2b,on_1b,inning,top_bot,p_pitch_count,pitcher_team,opponent,prev_pitch_type,prev_pitch_result
0,776185,2025-09-25,57,6,SL,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,1,8,0,,
1,776185,2025-09-25,57,5,SI,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,2,8,0,SL,hit_into_play
2,776185,2025-09-25,57,4,FF,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,3,8,0,SI,ball
3,776185,2025-09-25,57,3,FF,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,4,8,0,FF,foul
4,776185,2025-09-25,57,2,FS,Blaze Alexander,677942,R,R,1,...,False,False,True,6,0,5,8,0,FF,ball
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2784,778563,2025-03-18,4,6,FF,Ian Happ,664023,R,L,0,...,False,False,False,1,0,68,0,0,,
2785,778563,2025-03-18,4,5,FF,Ian Happ,664023,R,L,0,...,False,False,False,1,0,69,0,0,FF,ball
2786,778563,2025-03-18,4,4,FF,Ian Happ,664023,R,L,0,...,False,False,False,1,0,70,0,0,FF,called_strike
2787,778563,2025-03-18,4,3,FF,Ian Happ,664023,R,L,0,...,False,False,False,1,0,71,0,0,FF,called_strike


In [179]:
df[[df['game_pk'] == 776185]]

KeyError: "None of [Index([(True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, ...)], dtype='object')] are in the [columns]"