In [1]:
# Quick and dirty Marcel the Monkey batting projections

import pandas as pd
import boxball_loader as bbl
import numpy as np
from sklearn.metrics import mean_absolute_error


In [2]:
# Projecting 2019 from the prior three seasons
prior_yrs = bbl.Seasons(2016, 2018)
proj_yr = bbl.Seasons(2019)

# Weight applied to each prior season's stats; the most recent season counts the most
yearly_weights = {
    2016: 3,
    2017: 4,
    2018: 5,
}

# Counting stats that are projected on a per-PA basis
proj_stats = [
    "ab", "r",
    "_1b", "_2b", "_3b", "hr",
    "rbi", "sb", "cs",
    "bb", "so", "ibb", "hbp",
    "sh", "sf", "gidp",
]

In [3]:
# Some common computations
def compute_pa(df):
    """Return plate appearances as the sum of the 'ab', 'bb', 'hbp', 'sh' and 'sf' entries of `df`."""
    plate_appearances = df['ab']
    # Accumulate left to right so the arithmetic matches a plain chained sum
    for component in ('bb', 'hbp', 'sh', 'sf'):
        plate_appearances = plate_appearances + df[component]
    return plate_appearances

def compute_1b(df):
    """Return singles: the 'h' entry of `df` minus its '_2b', '_3b' and 'hr' entries."""
    singles = df['h']
    # Strip out each kind of extra-base hit in turn
    for extra_base_hit in ('_2b', '_3b', 'hr'):
        singles = singles - df[extra_base_hit]
    return singles

In [4]:
# Load and clean up prior-year data
# Derive PA and singles on the freshly loaded frame, then keep only PA plus the projected stats
priors = (
    bbl.load_batting(prior_yrs, coalesce_type=bbl.CoalesceMode.PLAYER_SEASON)
    .assign(pa=compute_pa, _1b=compute_1b)
    [['pa'] + proj_stats]
)
priors

Unnamed: 0_level_0,Unnamed: 1_level_0,pa,ab,r,_1b,_2b,_3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
player_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
abadfe01,2016,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
abadfe01,2017,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
abreujo02,2016,695,624,67,125,32,1,25,100,0,2,47,125,7,15,0,9,21
abreujo02,2017,675,621,95,107,43,6,33,102,3,0,35,119,6,15,0,4,21
abreujo02,2018,553,499,68,73,36,1,22,78,2,0,37,109,7,11,0,6,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuninmi01,2016,192,164,16,15,7,0,12,31,0,0,21,65,0,6,0,1,0
zuninmi01,2017,435,387,52,47,25,0,25,64,1,0,39,160,0,8,0,1,8
zuninmi01,2018,405,373,37,37,18,0,20,44,0,0,24,150,0,6,0,2,7
zychto01,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Scale every prior season by its weight, then divide each player's summed line by the total weight
priors_weighted = pd.concat(
    [priors.query('yr==@yr') * season_weight for yr, season_weight in yearly_weights.items()]
)
prior_weighted_avg = priors_weighted.groupby('player_id').sum().div(sum(yearly_weights.values()))
prior_weighted_avg


Unnamed: 0_level_0,pa,ab,r,_1b,_2b,_3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
abadfe01,0.250000,0.250000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000
abreujo02,629.166667,570.916667,76.75,97.333333,37.333333,2.666667,26.416667,91.500000,1.833333,0.500000,38.833333,116.333333,6.666667,13.333333,0.000000,6.083333,18.083333
achteaj01,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ackledu01,17.500000,15.250000,1.50,2.250000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,2.000000,2.250000,0.000000,0.000000,0.000000,0.250000,0.000000
acunaro01,202.916667,180.416667,32.50,29.583333,10.833333,1.666667,10.833333,26.666667,6.666667,2.083333,18.750000,51.250000,0.833333,2.500000,0.000000,1.250000,1.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zimmejo02,4.083333,3.833333,0.00,0.583333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.666667,0.000000,0.000000,0.250000,0.000000,0.000000
zimmery01,443.333333,401.416667,58.75,61.416667,24.250000,1.083333,21.166667,68.750000,1.750000,0.666667,34.416667,90.916667,1.000000,3.500000,0.000000,4.000000,12.500000
zobribe01,539.750000,465.333333,70.75,85.750000,26.083333,3.000000,12.250000,59.833333,3.416667,3.333333,64.916667,69.166667,2.583333,2.500000,2.083333,4.916667,11.916667
zuninmi01,361.750000,325.416667,36.75,34.833333,17.583333,0.000000,19.666667,47.416667,0.333333,0.000000,28.250000,132.083333,0.000000,6.666667,0.000000,1.416667,5.583333


In [6]:
# Compute league averages for the prior years
# Column totals over every player-season, expressed per plate appearance
lg_totals = priors.sum()
lg_avg = lg_totals.loc[proj_stats].div(lg_totals.loc['pa'])
lg_avg

ab      0.894882
r       0.118863
_1b     0.145577
_2b     0.044901
_3b     0.004532
hr      0.031177
rbi     0.113372
sb      0.013585
cs      0.005214
bb      0.083986
so      0.216788
ibb     0.005102
hbp     0.009616
sh      0.004997
sf      0.006518
gidp    0.019788
dtype: float64

In [7]:
# For any player whose weighted average is under 400, regress to the mean by "filling them in" to 400 PA
# Do this in a function, so I can re-use it later
def regress_players_to_level(df, regress_level, pa_threshold=400):
    """Pad low-playing-time players with ballast plate appearances at a given per-PA rate.

    Every row of `df` whose 'pa' is below `pa_threshold` receives
    `pa_threshold - pa` extra plate appearances, each contributing
    `regress_level` of every projected stat.  Rows at or above the threshold
    get zero ballast.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'pa' column and every column named in the module-level
        `proj_stats`.
    regress_level : sequence of float
        Per-PA rate for each stat, in the same order as `proj_stats`.
    pa_threshold : number, optional
        PA level that players are filled up to.  Defaults to 400.

    Returns
    -------
    DataFrame
        A new frame with 'pa' (recomputed by `compute_pa` from the regressed
        components) followed by the `proj_stats` columns.  `df` itself is not
        modified.
    """
    pas_to_add = np.where(df['pa'] < pa_threshold, pa_threshold - df['pa'], 0)

    # Multiplying a column (pas_to_add) by a row (regress_level) is an outer product;
    # np.outer() returns a bare array, so re-attach the player index and stat labels.
    ballast = pd.DataFrame(
        np.outer(pas_to_add, regress_level),
        index=df.index,
        columns=proj_stats,
    )
    priors_regressed = df[proj_stats] + ballast
    priors_regressed['pa'] = compute_pa(priors_regressed)
    return priors_regressed[['pa'] + proj_stats]

# Fill every sub-threshold player up with league-average plate appearances
priors_regressed = regress_players_to_level(df=prior_weighted_avg, regress_level=lg_avg)
priors_regressed

Unnamed: 0_level_0,pa,ab,r,_1b,_2b,_3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
abadfe01,400.000000,357.978951,47.515649,58.194351,17.949124,1.811842,12.463168,45.320547,5.430483,2.084159,33.573470,86.910804,2.039493,3.844131,1.997709,2.605739,7.910150
abreujo02,629.166667,570.916667,76.750000,97.333333,37.333333,2.666667,26.416667,91.500000,1.833333,0.500000,38.833333,116.333333,6.666667,13.333333,0.000000,6.083333,18.083333
achteaj01,400.000000,357.952672,47.545365,58.230745,17.960349,1.812975,12.470963,45.348890,5.433880,2.085462,33.594467,86.715001,2.040769,3.846535,1.998958,2.607368,7.915097
ackledu01,400.000000,357.542242,46.965255,57.933150,17.174584,1.733658,11.925358,44.364876,5.196147,1.994223,34.124709,85.171219,1.951485,3.678249,1.911504,2.743296,7.568811
acunaro01,400.000000,356.782931,55.925998,58.274107,19.682547,2.559935,16.977881,49.010442,9.343984,3.110858,35.302274,93.975203,1.838837,4.395220,0.984903,2.534672,5.566501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zimmejo02,400.000000,358.131905,47.060006,58.219640,17.777004,1.794468,12.343655,44.885953,5.378409,2.064173,33.251523,87.496452,2.019936,3.807268,2.228552,2.580751,7.834297
zimmery01,443.333333,401.416667,58.750000,61.416667,24.250000,1.083333,21.166667,68.750000,1.750000,0.666667,34.416667,90.916667,1.000000,3.500000,0.000000,4.000000,12.500000
zobribe01,539.750000,465.333333,70.750000,85.750000,26.083333,3.000000,12.250000,59.833333,3.416667,3.333333,64.916667,69.166667,2.583333,2.500000,2.083333,4.916667,11.916667
zuninmi01,400.000000,359.645891,41.296526,40.401648,19.300792,0.173366,20.859202,51.753154,0.852948,0.199422,31.462471,140.375455,0.195148,7.034492,0.191150,1.665996,6.340214


In [8]:
# These are pretty good projections right here.  We could add age adjustments, but I'm stopping here for now
# Let's add some rate stats

def get_woba(stats):
    """Return wOBA from counting stats.

    `stats` is indexed by stat name and must provide 'ob', 'ibb', 'tb', 'h'
    and 'pa'.  The numerator is a fixed linear-weights sum of the first four;
    the denominator is 'pa' minus 'ibb'.
    """
    woba_weights = {'ob': .702, 'ibb': -.702, 'tb': .37, 'h': -.21}
    numerator = sum(stats[stat] * weight for stat, weight in woba_weights.items())
    denominator = stats['pa'] - stats['ibb']
    return numerator / denominator

def add_batting_rate_stats(df):
    """Return a copy of `df` extended with derived counting and rate stats.

    `df` must provide 'ab', 'bb', 'hbp', 'sf', 'sh', '_1b', '_2b', '_3b',
    'hr', 'cs' and 'ibb'.  The returned frame adds (or overwrites) 'pa', 'h',
    'outs', 'tb', 'ob', 'ba', 'obp', 'slg', 'ops' and 'woba'.

    The input is copied first, so the caller's frame is never mutated.  This
    matters when it is a filtered slice of another frame or is still
    referenced under another name.
    """
    df = df.copy()
    df['pa'] = df['ab'] + df['bb'] + df['hbp'] + df['sf'] + df['sh']
    df['h'] = df['_1b'] + df['_2b'] + df['_3b'] + df['hr']
    df['outs'] = df['ab'] - df['h'] + df['cs'] + df['sf']
    # Every hit is worth one base; doubles, triples and homers add 1, 2 and 3 more
    df['tb'] = df['h'] + df['_2b'] + 2*df['_3b'] + 3*df['hr']
    df['ob'] = df['h'] + df['bb'] + df['hbp']
    df['ba'] = df['h'] / df['ab']
    df['obp'] = df['ob'] / df['pa']
    df['slg'] = df['tb'] / df['ab']
    df['ops'] = df['obp'] + df['slg']
    df['woba'] = get_woba(df)
    return df

projections = add_batting_rate_stats(priors_regressed)

# Headline rate stats, best projected wOBA first
projections[['pa', 'ba', 'obp', 'slg', 'woba']].sort_values('woba', ascending=False)

Unnamed: 0_level_0,pa,ba,obp,slg,woba
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
troutmi01,592.166667,0.311230,0.449057,0.605704,0.432177
martijd02,562.666667,0.316983,0.388033,0.624895,0.413588
judgeaa01,457.250000,0.275585,0.399854,0.565663,0.401234
vottojo01,664.500000,0.307525,0.434663,0.508646,0.398751
freemfr01,639.166667,0.306634,0.394915,0.543600,0.388242
...,...,...,...,...,...
mccanja02,414.000000,0.230383,0.283615,0.354754,0.275420
maldoma01,400.000000,0.221136,0.283294,0.359829,0.274213
hechaad01,400.000000,0.247404,0.283085,0.353669,0.268954
engelad01,400.000000,0.220381,0.272475,0.340127,0.263185


In [9]:
# Now let's evaluate the projections, for players who had at least 200 PA in 2019

actuals = bbl.load_batting(proj_yr, coalesce_type=bbl.CoalesceMode.PLAYER_SEASON)
actuals['pa'] = compute_pa(actuals)
actuals['_1b'] = compute_1b(actuals)
# Take an explicit copy of the filtered rows: add_batting_rate_stats() assigns new columns,
# and assigning into the result of a row filter is what triggers pandas' SettingWithCopyWarning.
actuals = actuals.query('pa>=200').copy()
actuals = add_batting_rate_stats(actuals)
actuals

Unnamed: 0_level_0,Unnamed: 1_level_0,g,ab,r,h,_2b,_3b,hr,rbi,sb,cs,...,pa,_1b,outs,tb,ob,ba,obp,slg,ops,woba
player_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
abreujo02,2019,159,634,85,180,38,1,33,123,2,2,...,693,108,466,319,229,0.283912,0.330447,0.503155,0.833602,0.345689
acunaro01,2019,156,626,127,175,22,2,41,101,37,9,...,712,110,461,324,260,0.279553,0.365169,0.517572,0.882740,0.371246
adamewi01,2019,152,531,69,135,25,1,20,52,4,2,...,584,89,399,222,184,0.254237,0.315068,0.418079,0.733148,0.312617
adamsma01,2019,111,310,42,70,14,0,20,56,0,0,...,333,36,241,144,92,0.225806,0.276276,0.464516,0.740792,0.308620
adriaeh01,2019,84,202,34,55,8,3,5,22,0,2,...,234,39,153,84,81,0.272277,0.346154,0.415842,0.761995,0.324850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wolteto01,2019,121,359,42,94,17,2,1,42,0,1,...,411,74,272,118,138,0.261838,0.335766,0.328691,0.664457,0.288882
wongko01,2019,148,478,61,136,25,4,11,59,24,4,...,549,96,351,202,196,0.284519,0.357013,0.422594,0.779607,0.331364
yastrmi01,2019,107,371,64,101,22,3,21,55,2,4,...,411,55,277,192,137,0.272237,0.333333,0.517520,0.850854,0.354395
yelicch01,2019,130,489,100,161,29,3,44,97,30,2,...,580,85,333,328,249,0.329243,0.429310,0.670757,1.100067,0.445241


In [10]:
comp_stats = ['ba', 'obp', 'slg', 'woba']
# Inner join on player: projected values get the _x suffix, actual values the _y suffix
comparisons = projections[comp_stats].merge(actuals[comp_stats], on='player_id', how='inner')
comparisons


Unnamed: 0_level_0,ba_x,obp_x,slg_x,woba_x,ba_y,obp_y,slg_y,woba_y
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abreujo02,0.286819,0.343179,0.500365,0.350526,0.283912,0.330447,0.503155,0.345689
acunaro01,0.273260,0.342980,0.485535,0.348199,0.279553,0.365169,0.517572,0.371246
adamewi01,0.261155,0.328861,0.413778,0.316168,0.254237,0.315068,0.418079,0.312617
adamsma01,0.253749,0.313329,0.481793,0.330612,0.225806,0.276276,0.464516,0.308620
adriaeh01,0.253933,0.310154,0.395797,0.299371,0.272277,0.346154,0.415842,0.324850
...,...,...,...,...,...,...,...,...
winkeje01,0.273364,0.355145,0.434771,0.337309,0.269231,0.356771,0.473373,0.353037
wolteto01,0.233001,0.317856,0.356558,0.290444,0.261838,0.335766,0.328691,0.288882
wongko01,0.259411,0.343157,0.389070,0.313051,0.284519,0.357013,0.422594,0.331364
yelicch01,0.303908,0.384366,0.514546,0.379113,0.329243,0.429310,0.670757,0.445241


In [11]:
# Compute MAEs
for stat in comp_stats:
    projected = comparisons[f'{stat}_x']
    observed = comparisons[f'{stat}_y']
    error = mean_absolute_error(projected, observed)
    print('MAE', stat, error)


MAE ba 0.0227880351680305
MAE obp 0.02305004053640025
MAE slg 0.05140403355823716
MAE woba 0.02682500190103908


In [12]:
# OK, I've never liked regressing to the mean.
# Instead of regressing players with little playing time to the mean, let's regress them to *their* mean,
# i.e., the mean of players who don't average 400 PA/season

players_under_400 = prior_weighted_avg.query('pa<400')
# Per-PA rates of the low-playing-time group: its summed stats over its summed PA
regress_level = players_under_400.sum().loc[proj_stats].div(players_under_400['pa'].sum())
regress_level

ab      0.897834
r       0.108381
_1b     0.138832
_2b     0.041524
_3b     0.004479
hr      0.026081
rbi     0.101531
sb      0.012327
cs      0.004962
bb      0.077494
so      0.242444
ibb     0.003674
hbp     0.009941
sh      0.008812
sf      0.005918
gidp    0.018922
dtype: float64

In [13]:
# This is just as before, but replacing lg_avg with regress_level
priors_regressed_2 = regress_players_to_level(prior_weighted_avg, regress_level)
projections_new = add_batting_rate_stats(priors_regressed_2)

# Let's pick 10 players with <400 PA and see their projections before and after.
# A fixed random_state keeps the sample, and so this cell's output, the same on every re-run.
plyrs = prior_weighted_avg.query('pa<400').sample(10, random_state=0).index
projections.loc[plyrs, comp_stats], projections_new.loc[plyrs, comp_stats]

(                 ba       obp       slg      woba
 player_id                                        
 jenseky01  0.251476  0.318620  0.418317  0.313014
 burnsbi02  0.248404  0.308293  0.390015  0.297595
 mcneije01  0.272625  0.334053  0.431551  0.324668
 gustaja01  0.252757  0.319790  0.417581  0.313274
 tsengje01  0.252522  0.319524  0.417192  0.313011
 wrighda04  0.251874  0.318591  0.416122  0.312093
 nolasri01  0.251047  0.319842  0.414755  0.312511
 barrija01  0.251287  0.318125  0.415152  0.311634
 vinceza01  0.252721  0.320676  0.415394  0.313104
 broadmi01  0.252757  0.319590  0.417581  0.313077,
                  ba       obp       slg      woba
 player_id                                        
 jenseky01  0.234024  0.297637  0.379872  0.289554
 burnsbi02  0.234499  0.291410  0.359438  0.278726
 mcneije01  0.259381  0.318153  0.402455  0.306875
 gustaja01  0.234917  0.298352  0.378290  0.289302
 tsengje01  0.234699  0.298103  0.377939  0.289060
 wrighda04  0.234099  0.297233

In [14]:
# Note the OBPs and SLG are mostly down.  The extent will vary based on how much playing time a player had

# OK now to the MAEs:
comparisons_new = projections_new[comp_stats].merge(actuals[comp_stats], on='player_id', how='inner')
for stat in comp_stats:
    projected = comparisons_new[f'{stat}_x']
    observed = comparisons_new[f'{stat}_y']
    error = mean_absolute_error(projected, observed)
    print('MAE', stat, error)

MAE ba 0.02436793257935354
MAE obp 0.02445024829352272
MAE slg 0.05539311280354459
MAE woba 0.029048858037889393


In [15]:
# OK, that's actually worse!  So maybe regressing to league average isn't perfect, but it's working
# better than regressing to this lower mean.  Of course, there's selective sampling by only
# evaluating the players who got to 200 PA in 2019.