In [1]:
import pandas as pd
from pathlib import Path
import statsmodels.formula.api as sm
import numpy as np

# Directory holding the intermediate pickles, given relative to this notebook.
interim = '../../data/interim'
# NOTE(review): unpickling can execute arbitrary code, so only load pickles
# produced by this project's own pipeline.
# presumably season-level batting totals keyed by player and year — TODO confirm
br = pd.read_pickle(Path(interim) / 'batting_records.pkl')
# Event-level rows; later cells read GAME_ID, Date, year and BAT_FLD_CD from it.
events = pd.read_pickle(Path(interim) / 'events.pkl')
# Player table; merge_data reads its 'birthYear' column and joins on 'PlayerID'.
people = pd.read_pickle(Path(interim) / 'people.pkl')

In [2]:
def fill_gaps(records, player_id):
    """Add zero-filled rows for seasons a player skipped in mid-career.

    For every player, the result has one row for each observed season between
    that player's first and last season (inclusive). Seasons the player has
    no record for are inserted with every value column set to 0.

    This replaces an ``unstack(fill_value=0).stack(dropna=False)`` round trip:
    the ``dropna`` argument of ``stack`` is deprecated in recent pandas, and
    the dense players x years pivot wasted memory. Reindexing onto the wanted
    (player, year) pairs gives the same rows and keeps each column's dtype.

    Parameters
    ----------
    records : pandas.DataFrame
        Season records. After ``reset_index()`` it must expose ``player_id``
        and ``'year'`` columns, and each (player, year) pair must be unique.
    player_id : str
        Name of the player identifier column / index level.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``[player_id, 'year']``, ordered by player and then year.
    """
    records = records.reset_index()
    if records.empty:
        # Nothing to fill; still hand back the documented index layout.
        return records.set_index([player_id, 'year'])

    # Only seasons that occur somewhere in the data are candidate gap years,
    # which is what the pivot-based version produced as well.
    observed_years = np.sort(records['year'].unique())

    # First and last season per player (groupby sorts the players).
    career = records.groupby(player_id)['year'].agg(['min', 'max'])
    start = np.searchsorted(observed_years, career['min'].to_numpy(), side='left')
    stop = np.searchsorted(observed_years, career['max'].to_numpy(), side='right')

    # One (player, year) pair for every observed season inside each career.
    full_index = pd.MultiIndex.from_arrays(
        [
            np.repeat(career.index.to_numpy(), stop - start),
            np.concatenate([observed_years[a:b] for a, b in zip(start, stop)]),
        ],
        names=[player_id, 'year'],
    )

    # Existing rows are kept untouched; newly created rows are filled with 0.
    return records.set_index([player_id, 'year']).reindex(index=full_index, fill_value=0)

In [3]:
def merge_data(records, people, player_id):
    """Attach each player's age in a season to the season records.

    ``people['birthYear']`` is left-joined onto ``records`` (matching
    ``player_id`` against ``'PlayerID'`` on the people side), and ``Age`` is
    computed as ``year - birthYear``. The birth year itself is not kept.

    Returns a frame indexed by ``[player_id, 'year']`` and sorted on those keys.
    """
    keys = [player_id, 'year']

    with_birth = pd.merge(
        records.reset_index(),
        people['birthYear'],
        how='left',
        left_on=[player_id],
        right_on=['PlayerID'],
    )
    with_birth = with_birth.set_index(keys).sort_values(keys)

    # Remove birthYear first, then append Age, so Age ends up as the last column.
    season = with_birth.index.get_level_values('year')
    with_birth['Age'] = season - with_birth.pop('birthYear')

    return with_birth.sort_values(keys)

In [4]:
def gen_history(merged, metric, base, player_id=None):
    """Replace ``metric`` and ``base`` with their values 1, 2 and 3 rows back.

    For each lag ``k`` in 1..3 the columns ``L{k}_{metric}`` and
    ``L{k}_{base}`` are added, holding the value ``k`` rows earlier within the
    same player. Rows without such a predecessor get 0, and the lag columns
    are cast to int. The original ``metric`` and ``base`` columns are removed.

    Parameters
    ----------
    merged : pandas.DataFrame
        Per-player, per-season rows, already ordered by season within player.
        ``shift`` counts rows, so lag ``k`` means "``k`` seasons ago" only if
        every season of a career has a row (``fill_gaps`` provides that).
        NOTE: this frame is modified in place and also returned.
    metric, base : str
        Names of the columns to lag.
    player_id : str, optional
        Column or index level identifying a player. When omitted, the
        notebook-level ``player_id`` variable is used, as older calls relied on.

    Returns
    -------
    pandas.DataFrame
        The same ``merged`` object with the lag columns added.
    """
    if player_id is None:
        # Backward compatibility: this function used to read the global directly.
        try:
            player_id = globals()['player_id']
        except KeyError:
            raise NameError(
                "player_id was not passed and no notebook-level 'player_id' is defined"
            ) from None

    by_player = merged.groupby([player_id])

    for lag in range(1, 4):
        # Metric first, then base, so the column order matches earlier output.
        for column in (metric, base):
            lag_name = 'L' + str(lag) + '_' + column
            merged[lag_name] = by_player[column].shift(lag).fillna(0).astype('int')

    del merged[base]
    del merged[metric]

    return merged

In [5]:
def gen_player_log(events, metric, base, player_id=None):
    """Build a per-game log with each player's season totals entering the game.

    Events are summed per (``GAME_ID``, player) and ordered by player, year
    and ``Date``. ``{metric}_cum`` and ``{base}_cum`` hold the running totals
    within the same player and year, excluding the current game.

    Parameters
    ----------
    events : pandas.DataFrame
        Event-level rows with ``GAME_ID``, ``Date``, ``year``, the player
        column, and the ``metric`` and ``base`` columns.
    metric, base : str
        Names of the columns to total.
    player_id : str, optional
        Column identifying a player. When omitted, the notebook-level
        ``player_id`` variable is used, as older calls relied on.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``['GAME_ID', player_id]`` with columns ``metric``,
        ``base``, ``Date``, ``year``, ``{metric}_cum`` and ``{base}_cum``.
    """
    if player_id is None:
        # Backward compatibility: this function used to read the global directly.
        try:
            player_id = globals()['player_id']
        except KeyError:
            raise NameError(
                "player_id was not passed and no notebook-level 'player_id' is defined"
            ) from None

    bg = events.groupby(['GAME_ID', player_id]).agg({
        metric: 'sum',
        base: 'sum',
        'Date': 'first',
        'year': 'first',
    })

    bg = bg.sort_values([player_id, 'year', 'Date'])

    season = bg.groupby([player_id, 'year'])
    for column in (metric, base):
        # cumsum includes the current game; subtract it to get the totals
        # the player had accumulated before this game started.
        bg[column + '_cum'] = season[column].cumsum() - bg[column]

    return bg

In [6]:
def est_naive_rate(df, w, metric, base):
    """Weighted ``metric`` per ``base`` over the current and three prior seasons.

    ``w[0]`` weights the season-to-date ``*_cum`` columns and ``w[1]``..``w[3]``
    weight the ``L1_``..``L3_`` columns. The result is the weighted ``metric``
    total divided by the weighted ``base`` total; no guard is applied when the
    denominator is zero.
    """
    def weighted_total(stat):
        # Start with the season-to-date term, then add the lags in order.
        total = w[0] * df[stat + '_cum']
        for lag in (1, 2, 3):
            total = total + w[lag] * df['L' + str(lag) + '_' + stat]
        return total

    return weighted_total(metric) / weighted_total(base)

In [7]:
def gen_league(events, w, metric, base):
    """Yearly league ``metric``/``base`` rate with lags and a weighted average.

    Rows whose ``BAT_FLD_CD`` equals 1 are left out of the league totals.
    The result is indexed by ``year`` with columns ``L_rate``, ``L1_L_rate``,
    ``L2_L_rate``, ``L3_L_rate`` (the rate 1-3 rows earlier) and
    ``L_avg_rate`` (the lagged rates weighted by ``w[1]``..``w[3]``).
    """
    counted = events.loc[events['BAT_FLD_CD'] != 1]
    totals = counted.groupby('year')[[metric, base]].sum()
    totals.columns = ['L_count', 'L_base']
    totals = totals.sort_values('year')

    rate = totals['L_count'] / totals['L_base']

    league = rate.to_frame('L_rate')
    for lag in (1, 2, 3):
        league['L' + str(lag) + '_L_rate'] = rate.shift(lag)

    weighted = (
        w[1] * league['L1_L_rate']
        + w[2] * league['L2_L_rate']
        + w[3] * league['L3_L_rate']
    )
    league['L_avg_rate'] = weighted / sum(w[1:4])

    return league

In [8]:
def reg_to_league_mean(df, w, metric, base, regression_base=1200):
    """Shrink each row's naive rate toward the league rate.

    The league rate for a row is the average of ``L1_L_rate``..``L3_L_rate``
    weighted by ``w[k] * L{k}_{base}``. The blend weight is
    ``reliability = sum_base / (regression_base + sum_base)``, where
    ``sum_base`` is the weighted prior-season ``base`` total, and the result is
    ``league * (1 - reliability) + naive_rate * reliability``.

    Rows with no weighted prior-season ``base`` (``sum_base <= 0``) get
    ``L_avg_rate`` instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``L{k}_L_rate`` and ``L{k}_{base}`` for k in 1..3, plus
        ``L_avg_rate`` and ``naive_rate``.
    w : sequence
        Weights; only ``w[1]``..``w[3]`` are used here.
    metric : str
        Not used by this function; kept so the call signature stays the same.
    base : str
        Name of the playing-time column the lag columns were built from.
    regression_base : number, optional
        Amount added to ``sum_base`` in the reliability denominator. The
        default of 1200 is the value that used to be hard-coded.

    Returns
    -------
    pandas.Series
        The adjusted rate, aligned with ``df``.
    """
    League_base_weighted_sum = (
        w[1]*df['L1_L_rate']*df['L1_' + base] +
        w[2]*df['L2_L_rate']*df['L2_' + base] +
        w[3]*df['L3_L_rate']*df['L3_' + base]
    )

    sum_base = w[1]*df['L1_' + base] + w[2]*df['L2_' + base] + w[3]*df['L3_' + base]

    # 0/0 yields NaN for rows without history; those rows are replaced below.
    League_mean_rate = League_base_weighted_sum / sum_base
    League_mean_rate = League_mean_rate.where(sum_base > 0, df['L_avg_rate'])

    reliability = sum_base / (regression_base + sum_base)

    adj_rate = League_mean_rate * (1 - reliability) + df['naive_rate'] * reliability
    # naive_rate may be NaN when there is no history, so fall back again here.
    adj_rate = adj_rate.where(sum_base > 0, df['L_avg_rate'])

    return adj_rate

In [9]:
def adjust_age(df):
    """Scale ``adj_rate`` by a factor that depends on ``Age`` relative to 29.

    The factor is ``1 + (29 - Age) * 0.006`` for ages up to and including 29
    and ``1 + (29 - Age) * 0.003`` for ages above 29. Returns the scaled rate
    as a float Series.
    """
    age = df.Age
    # Pick the per-year slope for each row, then apply it to the distance from 29.
    per_year = np.where(age <= 29, .006, .003)
    age_adj = np.asarray(1 + (29 - age) * per_year)

    return (df['adj_rate'] * age_adj).astype('float')

In [10]:
def est_base(df, base):
    """Project ``base`` as ``0.5 * L1 + 0.1 * L2 + 200``, returned as float.

    ``L1`` and ``L2`` are the ``L1_{base}`` and ``L2_{base}`` columns of ``df``.
    """
    one_back = df['L1_' + base]
    two_back = df['L2_' + base]
    projected = .5 * one_back + .1 * two_back + 200

    return projected.astype('float')

In [11]:
# Columns being projected: `metric` is the numerator, `base` the denominator,
# and `player_id` identifies a player in both `br` and `events`.
# `player_id` must be defined before gen_history / gen_player_log are called
# below, because those calls do not pass it explicitly.
metric = 'HR'
base = 'PA'
player_id = 'BAT_ID'
records = br.loc[:, [metric, base]]
# Insert zero rows for seasons missing inside a career so later lags line up.
records = fill_gaps(records, player_id)

# Weights in the order [season to date, 1 season back, 2 back, 3 back];
# gen_league and reg_to_league_mean only use w[1]..w[3].
w = [8, 5, 4, 3]

lr = gen_league(events, w, metric, base)

merged = merge_data(records, people, player_id)
# presumably the metric column is not integer-typed at this point — TODO confirm
merged[metric] = merged[metric].astype('int')
# merge_data already returns rows sorted on these keys; this repeats that sort.
merged = merged.sort_values([player_id, 'year'])

# gen_history modifies `merged` in place, so `history` and `merged` are the
# same object after this line and the metric/base columns are gone from both.
history = gen_history(merged, metric, base)
bg = gen_player_log(events, metric, base)

In [12]:
# Attach the prior-season lag columns (and Age) to every game row of the same
# player and year. how='left' keeps every game row even without a history row.
test = pd.merge(
    bg.reset_index(), 
    history.reset_index(),
    on=[player_id, 'year'],
    how='left'
)

In [13]:
# Projected `base` for the season, from the two previous seasons.
test['pred_base'] = est_base(test, base)
# Weighted rate from season-to-date totals plus the three previous seasons.
test['naive_rate'] = est_naive_rate(test, w, metric, base)

# Drop league columns left behind by an earlier run of this cell. Without this,
# re-running the cell merges `lr` a second time, pandas renames the clashing
# columns with _x/_y suffixes, and reg_to_league_mean fails on 'L1_L_rate'.
# On a first run nothing is dropped, so the result is unchanged.
test = pd.merge(
    test.drop(columns=lr.columns, errors='ignore'),
    lr,
    on=['year'],
    how='left',
)

# Shrink toward the league rate, then apply the age factor.
test['adj_rate'] = reg_to_league_mean(test, w, metric, base)
test['pred_rate'] = adjust_age(test)

# Projected season total = projected playing time x projected rate.
test['pred_amt'] = (test['pred_base'] * test['pred_rate']).astype('float')

In [15]:
# Spot check: every game row for player 'harpb003' in 2016.
# NOTE(review): 'BAT_ID' is hard-coded here instead of using `player_id`.
test[(test['BAT_ID'] == 'harpb003') & (test['year'] == 2016)]

Unnamed: 0,GAME_ID,BAT_ID,HR,PA,Date,year,HR_cum,PA_cum,Age,L1_HR,...,pred_base,naive_rate,L_rate,L1_L_rate,L2_L_rate,L3_L_rate,L_avg_rate,adj_rate,pred_rate,pred_amt
1380407,ATL201604040,harpb003,1,5,2016-04-04,2016,0,0,24.0,42,...,566.5,0.050781,0.031169,0.027398,0.023378,0.025869,0.025676,0.046843,0.048248,27.332769
1380408,ATL201604060,harpb003,0,4,2016-04-06,2016,1,5,24.0,42,...,566.5,0.051716,0.031169,0.027398,0.023378,0.025869,0.025676,0.04763,0.049059,27.791716
1380409,WAS201604070,harpb003,1,5,2016-04-07,2016,1,9,24.0,42,...,566.5,0.051458,0.031169,0.027398,0.023378,0.025869,0.025676,0.047413,0.048835,27.665103
1380410,WAS201604100,harpb003,0,4,2016-04-10,2016,2,14,24.0,42,...,566.5,0.052379,0.031169,0.027398,0.023378,0.025869,0.025676,0.048187,0.049633,28.116869
1380411,WAS201604110,harpb003,0,5,2016-04-11,2016,2,18,24.0,42,...,566.5,0.05212,0.031169,0.027398,0.023378,0.025869,0.025676,0.04797,0.049409,27.990056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1380549,PIT201609240,harpb003,0,5,2016-09-24,2016,24,607,24.0,42,...,566.5,0.045905,0.031169,0.027398,0.023378,0.025869,0.025676,0.042744,0.044026,24.940647
1380550,PIT201609250,harpb003,0,2,2016-09-25,2016,24,612,24.0,42,...,566.5,0.045742,0.031169,0.027398,0.023378,0.025869,0.025676,0.042606,0.043884,24.860472
1380551,WAS201609300,harpb003,0,4,2016-09-30,2016,24,614,24.0,42,...,566.5,0.045677,0.031169,0.027398,0.023378,0.025869,0.025676,0.042551,0.043828,24.828562
1380552,WAS201610010,harpb003,0,4,2016-10-01,2016,24,618,24.0,42,...,566.5,0.045547,0.031169,0.027398,0.023378,0.025869,0.025676,0.042443,0.043716,24.765013
