In [2]:
from pybaseball import batting
from pybaseball import playerid_reverse_lookup
import pandas as pd

In [113]:
# Read the Statcast CSV into a DataFrame; the path is resolved relative to the
# notebook's working directory.
statcast = pd.read_csv(filepath_or_buffer='../hmwk/statcast2016.csv')

In [None]:
# Tally how often each value appears in the 'events' column, most frequent first
# (rows whose event is missing are left out by value_counts' default).
statcast['events'].value_counts()

field_out                       75296
strikeout                       39457
single                          27865
walk                            14331
double                           8349
home_run                         5677
force_out                        3922
grounded_into_double_play        3759
hit_by_pitch                     1678
field_error                      1614
sac_fly                          1206
sac_bunt                         1037
intent_walk                       949
triple                            883
double_play                       452
fielders_choice_out               290
caught_stealing_2b                229
strikeout_double_play             141
fielders_choice                    98
pickoff_caught_stealing_2b         49
other_out                          42
catcher_interf                     40
pickoff_1b                         34
sac_fly_double_play                24
caught_stealing_3b                 19
pickoff_2b                         13
run         

In [None]:
def calc_slg_by_game(group):
    """Summarise one batter's results within a single game.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to one (game, batter) pair. Must provide the
        columns 'events', 'stand', 'batter' and 'home_team'.

    Returns
    -------
    pandas.Series
        Entries, in order: 'total_bases', 'hits', 'at_bats', followed by
        'stand', 'batter' and 'home_team' copied from the first row of
        ``group``.
    """
    event_counts = group['events'].value_counts()

    # Bases credited for each kind of hit; every other event is worth nothing.
    bases_per_hit = {'single': 1, 'double': 2, 'triple': 3, 'home_run': 4}

    # Outcomes that end a plate appearance without charging an official
    # at-bat: walks (including intentional), hit-by-pitch, sacrifices and
    # catcher's interference.
    non_at_bat_events = (
        'walk',
        'intent_walk',
        'hit_by_pitch',
        'sac_fly',
        'sac_bunt',
        'sac_fly_double_play',
        'sac_bunt_double_play',
        'catcher_interf',
    )

    hits = sum(event_counts.get(event, 0) for event in bases_per_hit)
    total_bases = sum(
        bases * event_counts.get(event, 0)
        for event, bases in bases_per_hit.items()
    )

    # value_counts() ignores missing values, so summing it counts only rows
    # that actually recorded an event; rows with no event cannot be at-bats.
    at_bats = event_counts.sum() - sum(
        event_counts.get(event, 0) for event in non_at_bat_events
    )

    ret_obj = {
        'total_bases': total_bases,
        'hits': hits,
        'at_bats': at_bats,
        # Descriptive fields are read from the first row of the group.
        'stand': group['stand'].iloc[0],
        'batter': group['batter'].iloc[0],
        'home_team': group['home_team'].iloc[0]
    }

    return pd.Series(ret_obj)

# Run calc_slg_by_game once for every (game_pk, batter) combination.
bat_stats_by_game = (
    statcast
    .groupby(by=['game_pk', 'batter'])
    .apply(calc_slg_by_game)
)

In [None]:
# Display the per-game, per-batter summary table.
bat_stats_by_game

In [None]:
# List the distinct home-team codes in sorted order; these codes form the
# prefix of every park-factor key.
sorted(bat_stats_by_game['home_team'].unique())

In [None]:
# Park factors taken from: https://swishanalytics.com/mlb/mlb-park-factors
# NOTE(review): presumably these are the slugging (SLG) factors from that page — confirm.

# Key format: <home team code> + field side ('L' or 'R').
# 'L' stands for left field: right-handed batters (stand == 'R') are matched to
# the 'L' factor and every other batter to the 'R' factor.
# Each value is the multiplier applied to a batter's total bases for games
# played in that park.

slg_pfs = {
    
    'ARIL': 1.05,
    'ARIR': 1.07,
    
    'ATLL': 0.97,
    'ATLR': 0.98,
    
    'BALL': 1.05,
    'BALR': 1.03,
    
    'BOSL': 1.02,
    'BOSR': 1.02,
    
    'CHCL': 1.02,
    'CHCR': 1.01,
    
    'CINL': 1.08,
    'CINR': 1.02,
    
    'CLEL': 0.93,
    'CLER': 1.09,
    
    'COLL': 1.21,
    'COLR': 1.27,
    
    'CWSL': 0.99,
    'CWSR': 0.97,
    
    'DETL': 1.08,
    'DETR': 0.95,
    
    'HOUL': 0.98,
    'HOUR': 0.95,
    
    'KCL': 0.98,
    'KCR': 1.00,
    
    'LAAL': 0.98,
    'LAAR': 1.00,
    
    'LADL': 0.96,
    'LADR': 0.98,
    
    'MIAL': 0.87,
    'MIAR': 0.87,
    
    'MILL': 1.02,
    'MILR': 1.05,
    
    'MINL': 1.03,
    'MINR': 1.02,
    
    'NYML': 0.88,
    'NYMR': 0.87,
    
    'NYYL': 1.03,
    'NYYR': 1.08,
    
    'OAKL': 0.96,
    'OAKR': 0.92,
    
    'PHIL': 1.07,
    'PHIR': 1.03,
    
    'PITL': 0.94,
    'PITR': 1.02,
    
    'SDL': 0.92,
    'SDR': 0.98,
    
    'SEAL': 0.93,
    'SEAR': 0.91,
    
    'SFL': 0.93,
    'SFR': 0.89,
    
    'STLL': 0.94,
    'STLR': 0.94,
    
    'TBL': 0.96,
    'TBR': 0.89,
    
    'TEXL': 1.12,
    'TEXR': 1.17,
    
    'TORL': 1.03,
    'TORR': 0.99,
    
    'WSHL': 1.08,
    'WSHR': 1.08
    
}

In [None]:

# Season-long running totals, keyed by batter id. Layout of the finished mapping:
#
# batter_bat_stats = {
#     <player_id>: {
#         'total_bases': ...,
#         'hits': ...,
#         'adjusted_total_bases': ...,
#         'at_bats': ...,
#     }
# }

batter_bat_stats = {}

for idx, row in bat_stats_by_game.iterrows():

    player_id = row['batter']

    # Right-handed batters are paired with the left-field ('L') factor;
    # every other batter gets the right-field ('R') factor.
    field_side = 'R' if row['stand'] != 'R' else 'L'
    weighted_bases = row['total_bases'] * slg_pfs[row['home_team'] + field_side]

    season = batter_bat_stats.get(player_id)
    if season is None:
        # First game seen for this batter: seed the totals with this game.
        batter_bat_stats[player_id] = {
            'total_bases': row['total_bases'],
            'hits': row['hits'],
            'adjusted_total_bases': weighted_bases,
            'at_bats': row['at_bats'],
        }
        continue

    # Batter already present: fold this game into the running totals.
    season['total_bases'] += row['total_bases']
    season['adjusted_total_bases'] += weighted_bases
    season['at_bats'] += row['at_bats']
    season['hits'] += row['hits']
        

In [None]:
# Display the accumulated per-batter totals.
batter_bat_stats

In [None]:
# Column-oriented container for the final table: one list per output column,
# all lists kept in the same batter order.
slgs = {
    'batter': [],
    'slg': [],
    'aslg': [],
    'total_bases': [],
    'hits': [],
    'at_bats': [],
}

for batter, stats in batter_bat_stats.items():
    at_bats = stats['at_bats']
    if at_bats:
        # slg is raw total bases per at-bat; aslg uses the park-adjusted total bases.
        slg = stats['total_bases'] / at_bats
        aslg = stats['adjusted_total_bases'] / at_bats
    else:
        # A batter with no official at-bats has no defined slugging figure;
        # store NaN rather than dividing by zero.
        slg = aslg = float('nan')

    slgs['batter'].append(batter)
    slgs['slg'].append(slg)
    slgs['aslg'].append(aslg)
    slgs['hits'].append(stats['hits'])
    slgs['at_bats'].append(at_bats)
    slgs['total_bases'].append(stats['total_bases'])


In [None]:
# Turn the dict of equal-length lists into a table, one column per key.
final_df = pd.DataFrame(data=slgs)

In [None]:
# Display the per-batter slugging table.
final_df

In [None]:
# Plain Python ints for every batter id, in table order.
batter_ids = [int(raw_id) for raw_id in final_df['batter']]

In [None]:
# Display the list of batter ids.
batter_ids

In [None]:
# Look the ids up through pybaseball, treating them as 'mlbam' keys.
# NOTE(review): the result is later joined on 'key_mlbam' and read for
# 'name_first' / 'name_last' — confirm the lookup returns those columns.
ids_names = playerid_reverse_lookup(batter_ids, key_type='mlbam')

In [None]:
# Display the id-to-name lookup result.
ids_names

In [None]:
# Left join: every batter row is kept even when the lookup has no match for it.
joined = final_df.merge(
    ids_names,
    how='left',
    left_on='batter',
    right_on='key_mlbam',
)

In [None]:
# Build a "first last" display name; adds the column to `joined` in place.
joined['full_name'] = joined['name_first'] + ' ' + joined['name_last']

In [None]:
# Keep only the columns that go into the exported results, in output order.
cleaned_df = joined.loc[
    :,
    ['full_name', 'slg', 'aslg', 'total_bases', 'hits', 'at_bats'],
]

In [None]:
# Display the table that is about to be exported.
cleaned_df

In [None]:
# Write the results as a JSON array with one object per table row.
cleaned_df.to_json(
    path_or_buf='./2016_ASLG_results.json',
    orient='records',
)