In [1]:
import pandas as pd
import boxball_loader as bbl
import baseball_stats_utils as bsu

In [2]:
# The three runner-id columns (one per base); reused later to find specific runners.
runner_cols = [f'base{n}_run_id' for n in (1, 2, 3)]
cols = ['bat_id', 'pit_id', 'bat_hand_cd', 'pit_hand_cd', 'ob_fl', 'tb_ct', *runner_cols]

seasons = bbl.Seasons(1979, 2003)
ev = bbl.load_event_data(seasons, cols, pa_only=True)

# Row-wise flag: True when at least one runner column holds a truthy value.
# NOTE(review): presumably this means "any runner on base"; it relies on empty
# bases being loaded as missing/falsy values -- confirm against the loader.
ev['baserunner_fl'] = ev.loc[:, runner_cols].any(axis=1)
ev.sample(10)

Unnamed: 0,h_fl,yr,ob_fl,event_cd,game_type,date,pit_hand_cd,pit_id,base1_run_id,game_id,tb_ct,base3_run_id,ab_fl,base2_run_id,bat_event_fl,bat_id,bat_hand_cd,baserunner_fl
5441513,1,1999,1,20,RS,1999-09-28,L,ramir002,,ARI199909280,1,,True,,True,benea001,R,False
12955290,1,1992,1,20,RS,1992-06-27,L,myerr001,,SDN199206270,1,,True,,True,feldm001,R,False
7887318,0,2000,0,2,RS,2000-08-29,R,lidlc001,,TBA200008290,0,,True,,True,garcn001,R,False
3815010,0,1990,0,3,RS,1990-04-29,R,reusr001,thomm001,SFN199004290,0,guerp001,True,,True,zeilt001,R,True
13520796,0,1987,0,2,RS,1987-08-04,L,honer001,larkb001,CIN198708040,0,,True,,True,bellb001,R,True
5315314,0,2002,0,2,RS,2002-05-24,R,alfoa001,berkl001,HOU200205240,0,,True,biggc001,True,bagwj001,R,True
6807228,0,1996,1,14,RS,1996-08-18,L,leita001,gallm001,SLN199608180,0,,False,,True,jordb001,R,True
419677,0,1989,0,3,RS,1989-07-27,R,harnp001,,MIN198907270,0,,True,,True,bushr001,L,False
410820,0,1989,1,14,RS,1989-05-26,R,wegmb001,,MIL198905260,0,,False,,True,valld001,R,False
418555,0,1989,1,14,RS,1989-06-27,L,snydb001,hrbek001,MIN198906270,0,gaetg001,False,bushr001,True,backw001,R,True


In [3]:
# Id(s) of the baserunner(s) whose presence on base we want to study.
runner_ids = ['hendr001']

# True for every event in which one of runner_ids appears in any runner column.
rickey_fl = (
    ev.loc[:, runner_cols]
    .isin(runner_ids)
    .any(axis=1)
)
rickey_fl.value_counts()

False    4121871
True        9839
dtype: int64

In [4]:
# The baseline for a batter is his aggregate line over whatever events are passed in
# (the whole loaded span when called with the full `ev` frame).
def generate_batter_baselines(ev):
    """Return per-plate-appearance rates for each batter.

    Only events whose `baserunner_fl` is set are used. Those events are
    summarized per `bat_id` by `bsu.summarize_events`, and each counting
    column is divided by that batter's `pa` total.

    Parameters
    ----------
    ev : DataFrame
        Event rows with at least `bat_id`, `baserunner_fl`, and whatever
        columns `bsu.summarize_events` needs.

    Returns
    -------
    DataFrame
        One row per batter with columns ob, ab, h, tb, k, bb, ibb, hr, each
        expressed per plate appearance.
    """
    runners_on = ev.query('baserunner_fl')
    totals = bsu.summarize_events(runners_on, ['bat_id'])

    stat_names = ['ob', 'ab', 'h', 'tb', 'k', 'bb', 'ibb', 'hr']
    # The dict keys become the column labels of the concatenated frame.
    per_pa = {stat: totals[stat] / totals['pa'] for stat in stat_names}
    return pd.concat(per_pa, axis=1)

batter_rates = generate_batter_baselines(ev)
batter_rates.sample(10)

Unnamed: 0_level_0,ob,ab,h,tb,k,bb,ibb,hr
bat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ronam001,0.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0
figun001,0.181818,0.681818,0.136364,0.136364,0.272727,0.045455,0.0,0.0
halaj001,0.0,0.666667,0.0,0.0,0.444444,0.0,0.0,0.0
karcm001,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0
cox-j101,0.246914,0.802469,0.185185,0.197531,0.123457,0.061728,0.0,0.0
wardt001,0.342923,0.826237,0.2313,0.363636,0.133487,0.103567,0.012658,0.024166
palmd002,0.323788,0.871325,0.224131,0.425353,0.233677,0.087056,0.008018,0.051546
mankp101,0.216867,0.843373,0.180723,0.216867,0.156627,0.036145,0.048193,0.0
bonnb101,0.169811,0.886792,0.150943,0.226415,0.132075,0.018868,0.0,0.0
delov001,0.125,0.625,0.0,0.0,0.375,0.125,0.0,0.0


In [5]:
def gen_rolling_baseline(pa, yr):
    """Compute batter baselines over the three-season window centred on `yr`.

    Parameters
    ----------
    pa : DataFrame
        Event rows to draw the window from; must have a `yr` column plus
        everything `generate_batter_baselines` needs.
    yr : int
        Centre season. Rows with `yr - 1 <= yr column <= yr + 1` are used, so
        the window is shorter when the data does not reach a neighbouring season.

    Returns
    -------
    DataFrame
        The rates from `generate_batter_baselines`, indexed by (`bat_id`, `yr`)
        where the `yr` level is the centre season for every row.
    """
    # Filter the frame that was passed in rather than a notebook global.
    # Inside the query string, `@yr` is the local argument and bare `yr` is the column.
    window = pa.query('@yr-1 <= yr <= @yr+1')
    rates = generate_batter_baselines(window)
    rates['yr'] = yr
    return rates.reset_index().set_index(['bat_id', 'yr'])

gen_rolling_baseline(ev, 1983)

Unnamed: 0_level_0,Unnamed: 1_level_0,ob,ab,h,tb,k,bb,ibb,hr
bat_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
adamg101,1983,0.363636,0.848485,0.303030,0.363636,0.000000,0.060606,0.000000,0.000000
adamr001,1983,0.203390,0.847458,0.118644,0.118644,0.101695,0.033898,0.000000,0.000000
adduj001,1983,0.000000,1.000000,0.000000,0.000000,0.285714,0.000000,0.000000,0.000000
agual001,1983,0.301587,0.873016,0.238095,0.396825,0.126984,0.063492,0.047619,0.031746
aikew001,1983,0.321181,0.876736,0.237847,0.385417,0.154514,0.079861,0.029514,0.032986
...,...,...,...,...,...,...,...,...,...
younm002,1983,0.359813,0.799065,0.205607,0.355140,0.233645,0.126168,0.009346,0.028037
younr001,1983,0.362487,0.858799,0.271865,0.444679,0.081138,0.088514,0.015806,0.031612
zachp001,1983,0.066667,0.866667,0.066667,0.066667,0.266667,0.000000,0.000000,0.000000
ziskr101,1983,0.322581,0.888337,0.243176,0.367246,0.176179,0.076923,0.017370,0.029777


In [6]:
# One rolling baseline table per season in `seasons`, stacked into a single
# frame indexed by (bat_id, yr).
rolling_baselines = pd.concat(
    [gen_rolling_baseline(ev, season) for season in seasons]
)
rolling_baselines

Unnamed: 0_level_0,Unnamed: 1_level_0,ob,ab,h,tb,k,bb,ibb,hr
bat_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
adamg101,1979,0.346749,0.882353,0.287926,0.383901,0.068111,0.049536,0.003096,0.015480
agual001,1979,0.407407,0.925926,0.370370,0.555556,0.037037,0.037037,0.000000,0.037037
aikew001,1979,0.365631,0.837294,0.250457,0.409506,0.159049,0.106033,0.020110,0.040219
aingd101,1979,0.220339,0.909605,0.192090,0.231638,0.209040,0.022599,0.005650,0.000000
alexd001,1979,0.151515,0.909091,0.151515,0.181818,0.333333,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
zeilt001,2003,0.357298,0.834423,0.224401,0.370370,0.137255,0.124183,0.006536,0.034858
zerbc001,2003,0.166667,0.666667,0.166667,0.166667,0.333333,0.000000,0.000000,0.000000
zinta001,2003,0.166667,1.000000,0.166667,0.416667,0.500000,0.000000,0.000000,0.083333
zitob001,2003,0.000000,0.666667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [7]:
# Number of flagged events (runner of interest on base) per batter.
rickey_ct = ev.loc[rickey_fl, 'bat_id'].value_counts()
rickey_ct

murpd002    898
mattd001    636
lansc001    479
randw001    411
cansj001    328
           ... 
bergs001      1
willg002      1
youne002      1
noboj001      1
rossd001      1
Name: bat_id, Length: 247, dtype: int64

In [8]:
# This function encapsulates how we find a baseline for a set of PAs,
# using the rolling three-year baselines.
def get_baseline_for_pas(pas, baselines=None):
    """Return the expected per-PA batting line for a set of plate appearances.

    Each (bat_id, yr) pair in `pas` is weighted by how many rows it has, and
    the matching baseline rates are averaged with those weights.

    Parameters
    ----------
    pas : DataFrame
        Plate-appearance rows with at least `bat_id` and `yr` columns.
    baselines : DataFrame, optional
        Per-PA rates indexed by (`bat_id`, `yr`). Defaults to the notebook-level
        `rolling_baselines` frame, so existing one-argument calls behave as before.

    Returns
    -------
    Series
        The weighted-average rate columns plus derived `pa`, `ba`, `obp`,
        `slg` and `woba` entries.
    """
    if baselines is None:
        # Fall back to the notebook global only when no table is supplied.
        baselines = rolling_baselines

    batter_pa_counts = pas[['bat_id', 'yr']].value_counts()

    # Align the counts to the baseline rows on the (bat_id, yr) index and weight
    # every rate column at once. Rows without a match become NaN and are skipped
    # by sum().
    weighted = baselines.mul(batter_pa_counts, axis='index')
    # The denominator is every PA in `pas`; a PA whose (bat_id, yr) has no baseline
    # row adds nothing to the numerator but still counts here.
    baseline = weighted.sum() / len(pas)

    # The rates are per plate appearance, so the implied PA count is 1.
    baseline['pa'] = 1
    baseline['ba'] = baseline['h'] / baseline['ab']
    baseline['obp'] = baseline['ob']
    baseline['slg'] = baseline['tb'] / baseline['ab']
    baseline['woba'] = bsu.get_woba(baseline)
    return baseline

teammate_baseline = get_baseline_for_pas(ev[rickey_fl])
teammate_baseline

ob      0.339016
ab      0.850243
h       0.241801
tb      0.376349
k       0.132696
bb      0.091480
ibb     0.017365
hr      0.027896
pa      1.000000
ba      0.284391
obp     0.339016
slg     0.442638
woba    0.319824
dtype: float64

In [9]:
# Summarize all events split by the runner-on-base flag, then keep the row
# for the flagged (True) group.
with_rickey = (
    bsu.summarize_events(ev, rickey_fl)
    .loc[True]
)
with_rickey

pa      9839.000000
ob      3327.000000
ab      8190.000000
h       2395.000000
tb      3688.000000
k       1260.000000
bb       881.000000
ibb      181.000000
hr       248.000000
ba         0.292430
obp        0.338144
slg        0.450305
woba       0.317882
k%         0.128062
bb%        0.089542
hr%        0.025206
Name: True, dtype: float64

In [10]:
rate_cols = ['ba', 'obp', 'slg']

# Two-row comparison table: expected line ('baseline') vs. actual line with the
# runner on base ('with_runner'). Only the actual row carries a PA count.
comp = pd.concat(
    [
        teammate_baseline[rate_cols].rename('baseline'),
        with_rickey[[*rate_cols, 'pa']].rename('with_runner'),
    ],
    axis=1,
).T.assign(
    # NOTE(review): r27 = 31 * OBP * SLG is presumably a runs-per-27-outs
    # estimate; the source of the constant 31 is not shown here -- TODO document it.
    r27=lambda table: 31 * table['obp'] * table['slg']
)
comp

Unnamed: 0,ba,obp,slg,pa,r27
baseline,0.284391,0.339016,0.442638,,4.651905
with_runner,0.29243,0.338144,0.450305,9839.0,4.72031


In [11]:
# Estimate the size of the difference in runs: the r27 gap between the two rows,
# scaled by the number of PAs with the runner on base.
# NOTE(review): the divisor 36 presumably converts PAs into 27-out units -- TODO confirm.
(comp.loc['with_runner', 'r27'] - comp.loc['baseline', 'r27']) * comp.loc['with_runner', 'pa'] / 36

18.69550469399568