In [20]:
import pandas as pd
import numpy as np
from pybaseball import statcast_pitcher, playerid_lookup, statcast_batter

In [21]:
ids = pd.read_csv("data/mlbids.csv")

In [22]:
# get pitcher data on yamamoto in just the 2025 season
START = "2025-03-01"
END   = "2025-11-20"
player = playerid_lookup('yamamoto', 'yoshinobu')
print(player)

  name_last name_first  key_mlbam key_retro  key_bbref  key_fangraphs  \
0  yamamoto  yoshinobu     808967  yamay001  yamamyo01          33825   

   mlb_played_first  mlb_played_last  
0            2024.0           2025.0  


In [23]:
df = statcast_pitcher(START, END, 808967)

Gathering Player Data


In [24]:
df = df[~df["pitch_type"].isna()] # remove all rows where pitch type is NaN
df = df.sort_values(["game_pk", "at_bat_number", "pitch_number"]).reset_index(drop=True) # sort by game, at bat, pitch number

In [25]:
group_cols = ['game_pk', 'at_bat_number']
df["pitch_in_pa"] = df.groupby(group_cols).cumcount() # group by plate appearances (PA)

In [None]:
# create mapping from pitch types to indices
pitch_types = sorted(df['pitch_type'].unique())
pitchtoIndex = {pt: i for i, pt in enumerate(pitch_types)}
indexToPitch = {i: pt for pt, i in enumerate(pitch_types)}

df["pitch_type_idx"] = df["pitch_type"].map(pitchtoIndex)

In [None]:
# create mapping from batter handedness to indices
stands = sorted(df['stand'].dropna().unique())
standToIndex = {st: i for i, st in enumerate(stands)}
indexToStand = {i: st for st, i in enumerate(stands)}

df["batter_hand_idx"] = df["stand"].map(standToIndex)

In [28]:
# get previous and prior pitch types
df["prev_pitch_type"] = df.groupby(group_cols)["pitch_type"].shift(1).fillna("START")
df["prev_prev_pitch_type"] = df.groupby(group_cols)["pitch_type"].shift(2).fillna("START")

prev_types = ["START"] + pitch_types
prevtoIndex = {pt: i for i, pt in enumerate(prev_types)}
df["prev_pitch_type_index"] = df["prev_pitch_type"].map(prevtoIndex)

In [29]:
# get only regular and post season games
if "game_type" in df.columns:
    df = df[df["game_type"].isin(["R", "P"])].copy()

In [30]:
# get batter mlbid numbers
df = pd.merge(df, ids, left_on='batter', right_on='MLBID', how='left').copy()

In [31]:
# rename for more intuitive understanding
df["dodgers_score"] = df["fld_score"]
df["opponent_score"] = df["bat_score"]

# score diff in perspective of the Dodgers
df["score_diff"] = df["dodgers_score"] - df["opponent_score"]

In [32]:
# change types
df["balls"]   = df["balls"].astype(int)
df["strikes"] = df["strikes"].astype(int)
df["outs"]    = df["outs_when_up"].astype(int)
df["inning"]  = df["inning"].astype(int)
df["is_top_inning"] = (df["inning_topbot"] == "Top").astype(int)
df["home_score"] = df["home_score"].astype(int)
df["away_score"] = df["away_score"].astype(int)

In [33]:
# change name of batter name and id column
df["batter_name"] = df["PLAYERNAME"]
df["batter_id"] = df["batter"]

In [34]:
# change on 1-3 base to be numerical values rather than player ids
df["on_1b"] = df["on_1b"].notna().astype(int)
df["on_2b"] = df["on_2b"].notna().astype(int)
df["on_3b"] = df["on_3b"].notna().astype(int)

In [39]:
cols = [
    "game_pk",
    "at_bat_number",
    "pitch_in_pa",
    "batter_name",
    "batter_id",
    "batter_hand_idx",

    "pitch_type",
    "pitch_type_idx",
    "prev_pitch_type",
    "prev_pitch_type_index",
    "prev_prev_pitch_type",

    "balls",
    "strikes",
    "outs",
    "inning",
    "is_top_inning",
    "on_1b",
    "on_2b",
    "on_3b",
    "score_diff"
]

In [41]:
df = df[cols].copy()
df

Unnamed: 0,game_pk,at_bat_number,pitch_in_pa,batter_name,batter_id,batter_hand_idx,pitch_type,pitch_type_idx,prev_pitch_type,prev_pitch_type_index,prev_prev_pitch_type,balls,strikes,outs,inning,is_top_inning,on_1b,on_2b,on_3b,score_diff
0,776185,4,0,Geraldo Perdomo,672695,0,FC,1,START,0,START,0,0,0,1,0,0,0,0,0
1,776185,4,1,Geraldo Perdomo,672695,0,FS,3,FC,2,START,0,1,0,1,0,0,0,0,0
2,776185,4,2,Geraldo Perdomo,672695,0,FS,3,FS,4,FC,1,1,0,1,0,0,0,0,0
3,776185,4,3,Geraldo Perdomo,672695,0,FS,3,FS,4,FS,1,2,0,1,0,0,0,0,0
4,776185,5,0,Ketel Marte,606466,0,CU,0,START,0,START,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2784,778563,42,2,Ian Happ,664023,0,FF,2,CU,1,FF,1,1,2,5,0,0,0,0,2
2785,778563,42,3,Ian Happ,664023,0,FF,2,FF,3,CU,2,1,2,5,0,0,0,0,2
2786,778563,42,4,Ian Happ,664023,0,FF,2,FF,3,FF,3,1,2,5,0,0,0,0,2
2787,778563,42,5,Ian Happ,664023,0,CU,0,FF,3,FF,3,2,2,5,0,0,0,0,2


In [20]:
all_rows = []
unique_batters = df['batter_id'].unique()
for bid in unique_batters: #Get all unique batters Yamamoto faced
    bf = statcast_batter('2025-03-01', '2025-10-01', bid)
    if bf.empty:
        continue

    bf = bf[bf['events'].notna()].copy() #Remove entries that had no results

    #Mark hits, etc.
    tb_map = {'single':1,'double':2,'triple':3,'home_run':4}

    bf['is_hit'] = bf['events'].isin(tb_map.keys())
    bf['total_bases'] = bf['events'].map(tb_map).fillna(0)
    bf['is_walk'] = bf['events'].isin(['walk','intent_walk'])
    bf['is_hbp']  = bf['events'] == 'hit_by_pitch'
    bf['is_sf']   = bf['events'] == 'sac_fly'

    bf['AB'] = (~bf['is_walk'] & ~bf['is_hbp'] & ~bf['is_sf']).astype(int)

    grouped = bf.groupby('pitch_type').agg( #Calculate stats vs Yamamoto
        AB=('AB','sum'),
        H=('is_hit','sum'),
        BB=('is_walk','sum'),
        HBP=('is_hbp','sum'),
        SF=('is_sf','sum'),
        TB=('total_bases','sum')
    ).reset_index()

    grouped['batter_id'] = bid
    all_rows.append(grouped)

#Combine results
full = pd.concat(all_rows, ignore_index=True)

full['AVG'] = full['H'] / full['AB'].replace(0, pd.NA)
full['OBP'] = (full['H'] + full['BB'] + full['HBP']) / \
              (full['AB'] + full['BB'] + full['HBP'] + full['SF']).replace(0, pd.NA)
full['SLG'] = full['TB'] / full['AB'].replace(0, pd.NA)

final_df = full.pivot(
    index='batter_id',
    columns='pitch_type',
    values=['AVG','OBP','SLG']
)

final_df.columns = [f"{stat}_{ptype}" for (stat, ptype) in final_df.columns]
final_df = final_df.reset_index()

#Add the names to table, given batter ID
name_map = (
    df[['batter_id','batter_name']]
      .drop_duplicates()
      .set_index('batter_id')['batter_name']
      .to_dict()
)

final_df['batter_name'] = final_df['batter_id'].map(name_map)

cols = ['batter_name','batter_id'] + [c for c in final_df.columns if c not in ['batter_name','batter_id']]
final_df = final_df[cols]




Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering 

In [21]:
final_df

Unnamed: 0,batter_name,batter_id,AVG_CH,AVG_CS,AVG_CU,AVG_EP,AVG_FA,AVG_FC,AVG_FF,AVG_FO,...,SLG_FO,SLG_FS,SLG_KC,SLG_KN,SLG_PO,SLG_SC,SLG_SI,SLG_SL,SLG_ST,SLG_SV
0,Martin Maldonado,455117,0.0,1.0,0.090909,,,0.272727,0.2,,...,,0.5,0.0,,,,0.285714,0.275862,0.5,
1,Andrew McCutchen,457705,0.289474,,0.236842,1.0,,0.384615,0.184713,,...,,0.692308,0.5,,,,0.311927,0.371795,0.170732,
2,Tommy Pham,502054,0.310345,0.0,0.142857,,0.0,0.272727,0.239316,,...,,0.727273,0.0,,,,0.354167,0.416667,0.125,0.5
3,Paul Goldschmidt,502671,0.293103,,0.285714,,1.0,0.28,0.308725,,...,,0.3125,0.0,,,,0.444444,0.426667,0.225806,0.333333
4,Travis D'Arnaud,518595,0.1875,,0.0,,,0.058824,0.191781,,...,,0.0,0.0,,,,0.361111,0.485714,0.647059,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,Chase Meidroth,805367,0.152174,,0.315789,,0.0,0.263158,0.304636,,...,,0.333333,0.4,,,,0.320388,0.231707,0.166667,0.0
205,Jacob Wilson,805779,0.410256,0.0,0.166667,,0.0,0.314286,0.315315,,...,,0.461538,1.0,,,,0.371069,0.641304,0.176471,0.0
206,Bryce Eldridge,805811,0.0,,0.333333,,,0.0,0.083333,,...,,0.0,0.0,,,,2.0,0.0,0.0,
207,Matt Shaw,807713,0.261905,,0.190476,0.0,0.0,0.304348,0.201835,,...,,1.714286,0.0,,,,0.25,0.4,0.363636,0.4


In [155]:
final_df[:24]

Unnamed: 0,batter_name,batter_id,AVG_CH,AVG_CS,AVG_CU,AVG_EP,AVG_FA,AVG_FC,AVG_FF,AVG_FO,...,SLG_FO,SLG_FS,SLG_KC,SLG_KN,SLG_PO,SLG_SC,SLG_SI,SLG_SL,SLG_ST,SLG_SV
0,Martin Maldonado,455117,0.0,1.0,0.090909,,,0.272727,0.2,,...,,0.5,0.0,,,,0.285714,0.275862,0.5,
1,Andrew McCutchen,457705,0.289474,,0.236842,1.0,,0.384615,0.184713,,...,,0.692308,0.5,,,,0.311927,0.367089,0.175,
2,Tommy Pham,502054,0.310345,0.0,0.142857,,0.0,0.272727,0.239316,,...,,0.727273,0.0,,,,0.354167,0.42623,0.102564,0.5
3,Paul Goldschmidt,502671,0.293103,,0.285714,,1.0,0.28,0.308725,,...,,0.3125,0.0,,,,0.444444,0.426667,0.225806,0.333333
4,Travis D'Arnaud,518595,0.1875,,0.0,,,0.058824,0.191781,,...,,0.0,0.0,,,,0.361111,0.485714,0.647059,0.0
5,DJ LeMahieu,518934,0.153846,,0.142857,0.0,,0.272727,0.361111,,...,,0.5,0.0,,,,0.095238,0.466667,0.428571,0.0
6,Wilmer Flores,527038,0.195122,,0.190476,,,0.125,0.266234,,...,,0.0,0.375,,,,0.451613,0.333333,0.387097,0.0
7,Marcell Ozuna,542303,0.137255,,0.137931,,,0.26087,0.247934,0.0,...,0.0,0.238095,0.142857,,,,0.481481,0.37037,0.409091,0.0
8,Jon Berti,542932,0.0,,0.2,0.5,1.0,0.2,0.25,,...,,0.0,0.0,,,,0.142857,0.125,0.555556,
9,Marcus Semien,543760,0.159091,,0.235294,,,0.255814,0.228916,,...,,0.0,0.777778,,,,0.408602,0.329412,0.416667,1.0


In [22]:
# Yamamoto pitch types
keep_pitch_types = ['FF', 'FS', 'CU', 'FC', 'SI', 'SL']

# metrics you computed
metrics = ['AVG', 'OBP', 'SLG']

# build list of desired column names
keep_cols = ['batter_name', 'batter_id']  # always keep identifiers
for m in metrics:
    for pt in keep_pitch_types:
        col = f"{m}_{pt}"
        if col in final_df.columns:
            keep_cols.append(col)

# filter the dataframe
final_df_filtered = final_df[keep_cols]

final_df_filtered #Keep stats with only the pitch types Yamamoto throws

Unnamed: 0,batter_name,batter_id,AVG_FF,AVG_FS,AVG_CU,AVG_FC,AVG_SI,AVG_SL,OBP_FF,OBP_FS,OBP_CU,OBP_FC,OBP_SI,OBP_SL,SLG_FF,SLG_FS,SLG_CU,SLG_FC,SLG_SI,SLG_SL
0,Martin Maldonado,455117,0.2,0.25,0.090909,0.272727,0.228571,0.172414,0.238095,0.25,0.090909,0.466667,0.25,0.2,0.333333,0.5,0.181818,0.545455,0.285714,0.275862
1,Andrew McCutchen,457705,0.184713,0.307692,0.236842,0.384615,0.247706,0.230769,0.267045,0.357143,0.25641,0.5,0.344,0.354839,0.350318,0.692308,0.315789,0.461538,0.311927,0.371795
2,Tommy Pham,502054,0.239316,0.272727,0.142857,0.272727,0.270833,0.233333,0.387755,0.333333,0.137931,0.305556,0.345794,0.328571,0.410256,0.727273,0.142857,0.484848,0.354167,0.416667
3,Paul Goldschmidt,502671,0.308725,0.25,0.285714,0.28,0.252525,0.266667,0.371951,0.314286,0.347826,0.357143,0.292453,0.303797,0.489933,0.3125,0.52381,0.36,0.444444,0.426667
4,Travis D'Arnaud,518595,0.191781,0.0,0.0,0.058824,0.25,0.257143,0.259259,0.0,0.181818,0.157895,0.289474,0.333333,0.328767,0.0,0.0,0.058824,0.361111,0.485714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,Chase Meidroth,805367,0.304636,0.333333,0.315789,0.263158,0.242718,0.207317,0.40678,0.428571,0.315789,0.317073,0.327586,0.266667,0.384106,0.333333,0.578947,0.289474,0.320388,0.231707
205,Jacob Wilson,805779,0.315315,0.384615,0.166667,0.314286,0.289308,0.347826,0.387097,0.357143,0.230769,0.368421,0.323353,0.368421,0.486486,0.461538,0.25,0.371429,0.371069,0.641304
206,Bryce Eldridge,805811,0.083333,0.0,0.333333,0.0,1.0,0.0,0.266667,0.0,0.5,0.0,1.0,0.166667,0.166667,0.0,0.333333,0.0,2.0,0.0
207,Matt Shaw,807713,0.201835,0.428571,0.190476,0.304348,0.197368,0.2625,0.276423,0.428571,0.25,0.392857,0.314607,0.313953,0.376147,1.714286,0.238095,0.521739,0.25,0.4


In [33]:
combined_df = df.merge(final_df_filtered, on="batter_name", how='left')

In [35]:
combined_df

Unnamed: 0,game_pk,game_date,at_bat_number,pitch_number,pitch_type,batter_name,batter_id_x,p_hand,b_hand,same_hand_matchup,...,OBP_CU,OBP_FC,OBP_SI,OBP_SL,SLG_FF,SLG_FS,SLG_CU,SLG_FC,SLG_SI,SLG_SL
0,776185,2025-09-25,57,5,SI,Blaze Alexander,677942,R,R,1,...,0.333333,0.526316,0.355556,0.227273,0.47619,0.333333,0.3125,0.588235,0.25,0.268293
1,776185,2025-09-25,57,4,FF,Blaze Alexander,677942,R,R,1,...,0.333333,0.526316,0.355556,0.227273,0.47619,0.333333,0.3125,0.588235,0.25,0.268293
2,776185,2025-09-25,57,3,FF,Blaze Alexander,677942,R,R,1,...,0.333333,0.526316,0.355556,0.227273,0.47619,0.333333,0.3125,0.588235,0.25,0.268293
3,776185,2025-09-25,57,2,FS,Blaze Alexander,677942,R,R,1,...,0.333333,0.526316,0.355556,0.227273,0.47619,0.333333,0.3125,0.588235,0.25,0.268293
4,776185,2025-09-25,57,1,SI,Blaze Alexander,677942,R,R,1,...,0.333333,0.526316,0.355556,0.227273,0.47619,0.333333,0.3125,0.588235,0.25,0.268293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2086,778563,2025-03-18,5,1,FF,Seiya Suzuki,673548,R,R,1,...,0.306122,0.384615,0.366412,0.265625,0.493151,1.0,0.365854,0.470588,0.525862,0.344828
2087,778563,2025-03-18,4,5,FF,Ian Happ,664023,R,L,0,...,0.377049,0.313725,0.520548,0.166667,0.324324,0.727273,0.555556,0.413043,0.803571,0.32
2088,778563,2025-03-18,4,4,FF,Ian Happ,664023,R,L,0,...,0.377049,0.313725,0.520548,0.166667,0.324324,0.727273,0.555556,0.413043,0.803571,0.32
2089,778563,2025-03-18,4,3,FF,Ian Happ,664023,R,L,0,...,0.377049,0.313725,0.520548,0.166667,0.324324,0.727273,0.555556,0.413043,0.803571,0.32
