In [1]:
# Quick and dirty Marcel the Monkey batting projections

import pandas as pd
import boxball_loader as bbl
import numpy as np
from sklearn.metrics import mean_absolute_error


In [2]:
# Projecting 2019 from the prior three seasons
prior_yrs = bbl.Seasons(2016, 2018)
proj_yr = bbl.Seasons(2019)
# Per-season weights: the more recent the season, the heavier it counts
yearly_weights = dict(zip((2016, 2017, 2018), (3, 4, 5)))

# I'll project these stats on a per-PA basis
proj_stats = [
    "ab", "r",
    "_1b", "_2b", "_3b", "hr",
    "rbi", "sb", "cs",
    "bb", "so", "ibb", "hbp",
    "sh", "sf", "gidp",
]

In [3]:
# Some common computations
def compute_pa(df):
    """Return plate appearances as ab + bb + hbp + sh + sf.

    `df` is anything indexable by those five keys (e.g. a DataFrame);
    the result has whatever type adding those entries produces.
    """
    total = df['ab']
    for component in ('bb', 'hbp', 'sh', 'sf'):
        total = total + df[component]
    return total

def compute_1b(df):
    """Return singles: hits minus doubles, triples and home runs.

    `df` is anything indexable by 'h', '_2b', '_3b' and 'hr'.
    """
    singles = df['h']
    for extra_base_hit in ('_2b', '_3b', 'hr'):
        singles = singles - df[extra_base_hit]
    return singles

In [4]:
# Load the prior seasons, derive PA and singles, and keep only the columns we project
priors = (
    bbl.load_batting(prior_yrs, coalesce_type=bbl.CoalesceMode.PLAYER_SEASON)
    .assign(pa=compute_pa, _1b=compute_1b)
    .loc[:, ['pa'] + proj_stats]
)
priors

Unnamed: 0_level_0,Unnamed: 1_level_0,pa,ab,r,_1b,_2b,_3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
player_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
abadfe01,2016,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
abadfe01,2017,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
abreujo02,2016,695,624,67,125,32,1,25,100,0,2,47,125,7,15,0,9,21
abreujo02,2017,675,621,95,107,43,6,33,102,3,0,35,119,6,15,0,4,21
abreujo02,2018,553,499,68,73,36,1,22,78,2,0,37,109,7,11,0,6,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuninmi01,2016,192,164,16,15,7,0,12,31,0,0,21,65,0,6,0,1,0
zuninmi01,2017,435,387,52,47,25,0,25,64,1,0,39,160,0,8,0,1,8
zuninmi01,2018,405,373,37,37,18,0,20,44,0,0,24,150,0,6,0,2,7
zychto01,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Scale each season's rows by that season's weight, then take the
# weighted average per player (a missing season contributes zero)
yr_level = priors.index.get_level_values('yr')
priors_weighted = pd.concat(
    [priors[yr_level == season] * season_weight
     for season, season_weight in yearly_weights.items()]
)
total_weight = sum(yearly_weights.values())
prior_weighted_total = priors_weighted.groupby('player_id').sum() / total_weight
prior_weighted_total


Unnamed: 0_level_0,pa,ab,r,_1b,_2b,_3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
abadfe01,0.250000,0.250000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000
abreujo02,629.166667,570.916667,76.75,97.333333,37.333333,2.666667,26.416667,91.500000,1.833333,0.500000,38.833333,116.333333,6.666667,13.333333,0.000000,6.083333,18.083333
achteaj01,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ackledu01,17.500000,15.250000,1.50,2.250000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,2.000000,2.250000,0.000000,0.000000,0.000000,0.250000,0.000000
acunaro01,202.916667,180.416667,32.50,29.583333,10.833333,1.666667,10.833333,26.666667,6.666667,2.083333,18.750000,51.250000,0.833333,2.500000,0.000000,1.250000,1.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zimmejo02,4.083333,3.833333,0.00,0.583333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.666667,0.000000,0.000000,0.250000,0.000000,0.000000
zimmery01,443.333333,401.416667,58.75,61.416667,24.250000,1.083333,21.166667,68.750000,1.750000,0.666667,34.416667,90.916667,1.000000,3.500000,0.000000,4.000000,12.500000
zobribe01,539.750000,465.333333,70.75,85.750000,26.083333,3.000000,12.250000,59.833333,3.416667,3.333333,64.916667,69.166667,2.583333,2.500000,2.083333,4.916667,11.916667
zuninmi01,361.750000,325.416667,36.75,34.833333,17.583333,0.000000,19.666667,47.416667,0.333333,0.000000,28.250000,132.083333,0.000000,6.666667,0.000000,1.416667,5.583333


In [6]:
# League-wide per-PA rates over the prior years (all seasons pooled, unweighted)
lg_totals = priors.sum()
lg_avg = lg_totals.loc[proj_stats].div(lg_totals.loc['pa'])
lg_avg

ab      0.894882
r       0.118863
_1b     0.145577
_2b     0.044901
_3b     0.004532
hr      0.031177
rbi     0.113372
sb      0.013585
cs      0.005214
bb      0.083986
so      0.216788
ibb     0.005102
hbp     0.009616
sh      0.004997
sf      0.006518
gidp    0.019788
dtype: float64

In [7]:
# For any player whose weighted average is under 400, regress to the mean by "filling them in" to 400 PA
weighted_pa = prior_weighted_total['pa']
players_400 = prior_weighted_total[weighted_pa >= 400]
players_under_400 = prior_weighted_total[weighted_pa < 400]
# How many PA of ballast each part-time player needs to reach 400
players_under_400['pa'].rsub(400)

player_id
abadfe01     399.750000
achteaj01    400.000000
ackledu01    382.500000
acunaro01    197.083333
adamecr01    331.333333
                ...    
zieglbr01    400.000000
zimmebr01    241.833333
zimmejo02    395.916667
zuninmi01     38.250000
zychto01     400.000000
Name: pa, Length: 1773, dtype: float64

In [8]:
# Ballast = (PA still missing to reach 400) x (league-average per-PA rates),
# one row per part-time player and one column per projected stat
ballast = pd.DataFrame(
    np.outer(400 - players_under_400['pa'], lg_avg),
    index=players_under_400.index,
    columns=proj_stats,
)
# Add the ballast to each player's own weighted stats, then rebuild PA from its components
players_under_400_regressed = (
    players_under_400[proj_stats]
    .add(ballast)
    .assign(pa=compute_pa)
    .loc[:, ['pa'] + proj_stats]
)
players_under_400_regressed

Unnamed: 0_level_0,pa,ab,r,_1b,_2b,_3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
abadfe01,400.0,357.978951,47.515649,58.194351,17.949124,1.811842,12.463168,45.320547,5.430483,2.084159,33.573470,86.910804,2.039493,3.844131,1.997709,2.605739,7.910150
achteaj01,400.0,357.952672,47.545365,58.230745,17.960349,1.812975,12.470963,45.348890,5.433880,2.085462,33.594467,86.715001,2.040769,3.846535,1.998958,2.607368,7.915097
ackledu01,400.0,357.542242,46.965255,57.933150,17.174584,1.733658,11.925358,44.364876,5.196147,1.994223,34.124709,85.171219,1.951485,3.678249,1.911504,2.743296,7.568811
acunaro01,400.0,356.782931,55.925998,58.274107,19.682547,2.559935,16.977881,49.010442,9.343984,3.110858,35.302274,93.975203,1.838837,4.395220,0.984903,2.534672,5.566501
adamecr01,400.0,357.087463,45.966744,57.484467,16.627156,2.251748,10.830114,41.813997,5.001064,2.477458,34.160750,85.578926,1.690437,4.186213,2.405804,2.159770,7.806338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zieglbr01,400.0,357.952672,47.545365,58.230745,17.960349,1.812975,12.470963,45.348890,5.433880,2.085462,33.594467,86.715001,2.040769,3.846535,1.998958,2.607368,7.915097
zimmebr01,400.0,360.245553,48.245135,57.955338,17.941861,1.762761,11.039736,44.167183,10.951900,2.010836,31.893988,103.759777,1.567148,4.075551,1.208537,2.576371,6.868669
zimmejo02,400.0,358.131905,47.060006,58.219640,17.777004,1.794468,12.343655,44.885953,5.378409,2.064173,33.251523,87.496452,2.019936,3.807268,2.228552,2.580751,7.834297
zuninmi01,400.0,359.645891,41.296526,40.401648,19.300792,0.173366,20.859202,51.753154,0.852948,0.199422,31.462471,140.375455,0.195148,7.034492,0.191150,1.665996,6.340214


In [9]:
# Players at 400+ weighted PA are kept as-is; everyone else comes in regressed
regressed_parts = [players_400, players_under_400_regressed]
priors_regressed = pd.concat(regressed_parts, axis=0)
priors_regressed

Unnamed: 0_level_0,pa,ab,r,_1b,_2b,_3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
abreujo02,629.166667,570.916667,76.750000,97.333333,37.333333,2.666667,26.416667,91.500000,1.833333,0.500000,38.833333,116.333333,6.666667,13.333333,0.000000,6.083333,18.083333
alonsyo01,545.833333,485.833333,63.666667,79.833333,23.750000,0.000000,20.666667,70.916667,1.416667,0.250000,55.166667,109.083333,2.666667,1.750000,0.000000,3.083333,10.916667
altuvjo01,649.500000,579.166667,99.333333,134.000000,35.583333,3.416667,19.416667,76.416667,25.250000,6.166667,57.250000,78.416667,5.416667,7.250000,2.333333,3.500000,17.166667
anderti01,562.250000,534.416667,70.333333,89.833333,25.833333,4.083333,16.250000,52.833333,18.333333,4.166667,20.083333,145.333333,0.833333,2.916667,3.000000,1.833333,14.333333
andruel01,550.000000,505.416667,74.166667,97.750000,30.750000,4.333333,11.166667,60.333333,16.416667,6.583333,36.083333,78.666667,0.500000,3.250000,1.333333,3.916667,13.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zieglbr01,400.000000,357.952672,47.545365,58.230745,17.960349,1.812975,12.470963,45.348890,5.433880,2.085462,33.594467,86.715001,2.040769,3.846535,1.998958,2.607368,7.915097
zimmebr01,400.000000,360.245553,48.245135,57.955338,17.941861,1.762761,11.039736,44.167183,10.951900,2.010836,31.893988,103.759777,1.567148,4.075551,1.208537,2.576371,6.868669
zimmejo02,400.000000,358.131905,47.060006,58.219640,17.777004,1.794468,12.343655,44.885953,5.378409,2.064173,33.251523,87.496452,2.019936,3.807268,2.228552,2.580751,7.834297
zuninmi01,400.000000,359.645891,41.296526,40.401648,19.300792,0.173366,20.859202,51.753154,0.852948,0.199422,31.462471,140.375455,0.195148,7.034492,0.191150,1.665996,6.340214


In [10]:
# These are pretty good projections right here.  We could add age adjustments, but I'm stopping here for now
# Let's add some rate stats

def get_woba(stats):
    """Return a wOBA-style rate computed from counting stats.

    The value is (.702*ob - .702*ibb + .37*tb - .21*h) / (pa - ibb).

    Parameters
    ----------
    stats : mapping-like (DataFrame, Series, dict, ...)
        Must provide the keys 'ob', 'ibb', 'tb', 'h' and 'pa'.

    Returns
    -------
    Whatever arithmetic on those entries yields (a Series when `stats`
    is a DataFrame, a scalar when it holds scalars).
    """
    woba_weights = {'ob': .702, 'ibb': -.702, 'tb': .37, 'h': -.21}
    numerator = sum(stats[stat] * weight for stat, weight in woba_weights.items())
    denominator = stats['pa'] - stats['ibb']
    return numerator / denominator

def add_batting_rate_stats(df):
    """Return a copy of `df` with derived counting and rate stats added.

    The input frame is left untouched: all new columns are written to a
    copy, so a frame that was displayed earlier (or a filtered slice of
    another frame) is never modified behind the reader's back.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ab', 'bb', 'hbp', 'sf', 'cs', 'ibb', '_1b', '_2b',
        '_3b' and 'hr'.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with 'pa', 'h', 'outs', 'tb', 'ob', 'ba', 'obp',
        'slg', 'ops' and 'woba' added (or overwritten if already present).
    """
    df = df.copy()
    # NOTE: 'pa' is redefined here WITHOUT sacrifice hits ('sh'), unlike
    # compute_pa(); it serves as the denominator for 'obp' and 'woba', and
    # it replaces any 'pa' column already present in the returned frame.
    df['pa'] = df['ab'] + df['bb'] + df['hbp'] + df['sf']
    df['h'] = df['_1b'] + df['_2b'] + df['_3b'] + df['hr']
    df['outs'] = df['ab'] - df['h'] + df['cs'] + df['sf']
    # Every hit is worth one base; doubles/triples/homers add 1/2/3 more
    df['tb'] = df['h'] + df['_2b'] + 2*df['_3b'] + 3*df['hr']
    df['ob'] = df['h'] + df['bb'] + df['hbp']
    df['ba'] = df['h'] / df['ab']
    df['obp'] = df['ob'] / df['pa']
    df['slg'] = df['tb'] / df['ab']
    df['ops'] = df['obp'] + df['slg']
    df['woba'] = get_woba(df)
    return df

projections = add_batting_rate_stats(priors_regressed)

# Best projected hitters first, showing only the headline rate stats
headline_cols = ['pa', 'ba', 'obp', 'slg', 'woba']
projections[headline_cols].sort_values('woba', ascending=False)

Unnamed: 0_level_0,pa,ba,obp,slg,woba
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
troutmi01,592.166667,0.311230,0.449057,0.605704,0.432177
martijd02,562.666667,0.316983,0.388033,0.624895,0.413588
judgeaa01,457.250000,0.275585,0.399854,0.565663,0.401234
vottojo01,664.500000,0.307525,0.434663,0.508646,0.398751
freemfr01,639.166667,0.306634,0.394915,0.543600,0.388242
...,...,...,...,...,...
maldoma01,395.692946,0.221136,0.286378,0.359829,0.277218
mccanja02,413.416667,0.230383,0.284015,0.354754,0.275809
hechaad01,397.932535,0.247404,0.284555,0.353669,0.270362
engelad01,393.941498,0.220381,0.276666,0.340127,0.267238


In [11]:
# Now let's evaluate the projections, for players who had at least 200 PA in 2019

actuals = bbl.load_batting(proj_yr, coalesce_type=bbl.CoalesceMode.PLAYER_SEASON)
actuals['pa'] = compute_pa(actuals)
actuals['_1b'] = compute_1b(actuals)
# Take an explicit copy of the filtered rows: add_batting_rate_stats writes new
# columns into the frame it is given, and writing into a filtered slice is the
# pattern pandas flags with SettingWithCopyWarning.
actuals = actuals.query('pa>=200').copy()
actuals = add_batting_rate_stats(actuals)
actuals

Unnamed: 0_level_0,Unnamed: 1_level_0,g,ab,r,h,_2b,_3b,hr,rbi,sb,cs,...,pa,_1b,outs,tb,ob,ba,obp,slg,ops,woba
player_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
abreujo02,2019,159,634,85,180,38,1,33,123,2,2,...,693,108,466,319,229,0.283912,0.330447,0.503155,0.833602,0.345689
acunaro01,2019,156,626,127,175,22,2,41,101,37,9,...,712,110,461,324,260,0.279553,0.365169,0.517572,0.882740,0.371246
adamewi01,2019,152,531,69,135,25,1,20,52,4,2,...,581,89,399,222,184,0.254237,0.316695,0.418079,0.734774,0.314234
adamsma01,2019,111,310,42,70,14,0,20,56,0,0,...,333,36,241,144,92,0.225806,0.276276,0.464516,0.740792,0.308620
adriaeh01,2019,84,202,34,55,8,3,5,22,0,2,...,232,39,153,84,81,0.272277,0.349138,0.415842,0.764980,0.327662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wolteto01,2019,121,359,42,94,17,2,1,42,0,1,...,409,74,272,118,138,0.261838,0.337408,0.328691,0.666099,0.290312
wongko01,2019,148,478,61,136,25,4,11,59,24,4,...,543,96,351,202,196,0.284519,0.360958,0.422594,0.783552,0.335059
yastrmi01,2019,107,371,64,101,22,3,21,55,2,4,...,410,55,277,192,137,0.272237,0.334146,0.517520,0.851667,0.355262
yelicch01,2019,130,489,100,161,29,3,44,97,30,2,...,580,85,333,328,249,0.329243,0.429310,0.670757,1.100067,0.445241


In [12]:
comp_stats = ['ba', 'obp', 'slg', 'woba']
# Inner join on player: projected rates get the _x suffix, 2019 actuals get _y
comparisons = projections[comp_stats].merge(
    actuals[comp_stats],
    on='player_id',
    how='inner',
)
comparisons


Unnamed: 0_level_0,ba_x,obp_x,slg_x,woba_x,ba_y,obp_y,slg_y,woba_y
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abreujo02,0.286819,0.343179,0.500365,0.350526,0.283912,0.330447,0.503155,0.345689
alonsyo01,0.255746,0.331908,0.432247,0.325709,0.198630,0.295522,0.345890,0.278861
altuvjo01,0.332230,0.396987,0.506043,0.381124,0.298000,0.352834,0.550000,0.376501
anderti01,0.254483,0.284309,0.409325,0.292632,0.335341,0.357143,0.508032,0.363726
andruel01,0.284913,0.334143,0.429184,0.325390,0.275000,0.313272,0.393333,0.300578
...,...,...,...,...,...,...,...,...
wilkest01,0.248546,0.316426,0.408050,0.309264,0.224924,0.286111,0.382979,0.287183
winkeje01,0.273364,0.356773,0.434771,0.338866,0.269231,0.356771,0.473373,0.353037
wolteto01,0.233001,0.319843,0.356558,0.292284,0.261838,0.337408,0.328691,0.290312
wongko01,0.259411,0.345618,0.389070,0.315328,0.284519,0.360958,0.422594,0.335059


In [13]:
# Mean absolute error between projected (_x) and actual (_y) values, per rate stat
for stat in comp_stats:
    projected = comparisons[f'{stat}_x']
    observed = comparisons[f'{stat}_y']
    error = mean_absolute_error(projected, observed)
    print('MAE', stat, error)


MAE ba 0.022788035168030497
MAE obp 0.022965440266282538
MAE slg 0.05140403355823716
MAE woba 0.026649677121432347


In [14]:
# OK, I've never liked regressing to the mean.
# Instead of regressing players with little playing time to the league mean, let's regress them to *their* mean,
# i.e., the pooled per-PA rates of the players who don't average 400 PA/season

under_400_totals = players_under_400[proj_stats].sum()
under_400_pa = players_under_400['pa'].sum()
regress_level = under_400_totals / under_400_pa
regress_level

ab      0.897834
r       0.108381
_1b     0.138832
_2b     0.041524
_3b     0.004479
hr      0.026081
rbi     0.101531
sb      0.012327
cs      0.004962
bb      0.077494
so      0.242444
ibb     0.003674
hbp     0.009941
sh      0.008812
sf      0.005918
gidp    0.018922
dtype: float64

In [15]:
# This is just as before, but replacing lg_avg with regress_level
ballast = pd.DataFrame(
    np.outer(400 - players_under_400['pa'], regress_level),
    index=players_under_400.index,
    columns=proj_stats,
)
players_under_400_regressed = players_under_400[proj_stats] + ballast
players_under_400_regressed['pa'] = compute_pa(players_under_400_regressed)
players_under_400_regressed = players_under_400_regressed[['pa'] + proj_stats]
priors_regressed = pd.concat([players_400, players_under_400_regressed])
projections_new = add_batting_rate_stats(priors_regressed)

# Let's pick 10 players and see their projections before and after.
# The sample is seeded so the same 10 players come up on every run and the
# before/after comparison below is reproducible.
plyrs = players_under_400.sample(10, random_state=0).index
projections.loc[plyrs, comp_stats], projections_new.loc[plyrs, comp_stats]

(                 ba       obp       slg      woba
 player_id                                        
 ramosed02  0.252521  0.321553  0.417192  0.314835
 flahery01  0.240630  0.313047  0.376647  0.296072
 baderha01  0.256274  0.324066  0.416347  0.315002
 escobed02  0.252404  0.320992  0.416998  0.314458
 loughda01  0.252102  0.322407  0.412695  0.313657
 pireljo01  0.258323  0.314583  0.396295  0.304773
 wietema01  0.235243  0.305303  0.374269  0.291361
 gilleco01  0.252136  0.319715  0.415979  0.313418
 bettich01  0.237252  0.305903  0.388149  0.297450
 brachsi01  0.252580  0.321194  0.417289  0.314656,
                  ba       obp       slg      woba
 player_id                                        
 ramosed02  0.234698  0.301192  0.377939  0.291897
 flahery01  0.228752  0.299491  0.350519  0.280740
 baderha01  0.247706  0.314337  0.397493  0.304015
 escobed02  0.234590  0.300625  0.377764  0.291514
 loughda01  0.235098  0.303034  0.375257  0.291817
 pireljo01  0.254908  0.310624

In [16]:
# Note the OBPs and SLGs are mostly down.  The extent will vary based on how much playing time a player had

# OK now to the MAEs (projected values carry the _x suffix, 2019 actuals _y):
comparisons_new = projections_new[comp_stats].merge(
    actuals[comp_stats],
    on='player_id',
    how='inner',
)
for stat in comp_stats:
    projected = comparisons_new[f'{stat}_x']
    observed = comparisons_new[f'{stat}_y']
    error = mean_absolute_error(projected, observed)
    print('MAE', stat, error)

MAE ba 0.02436793257935354
MAE obp 0.024161411138986982
MAE slg 0.05539311280354459
MAE woba 0.028625530915013728


In [17]:
# OK, that's actually worse!  So maybe regressing to league average isn't perfect, but it's working
# better than regressing to this lower mean.  Of course, there's selective sampling by only
# evaluating the players who got to 200 PA in 2019.