In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
interim = '../data/interim'

# Load the cached intermediate tables from the interim data directory.
# NOTE(review): `bg` is rebuilt from `events` in a later cell, so this cached
# copy looks redundant when the notebook is run top to bottom - confirm.
br = pd.read_pickle(Path(interim, 'batting_records.pkl'))
bg = pd.read_pickle(Path(interim, 'batting_games.pkl'))
gl = pd.read_pickle(Path(interim, 'game_logs.pkl'))
pr = pd.read_pickle(Path(interim, 'park_records.pkl'))
panel = pd.read_pickle(Path(interim, 'panel.pkl'))
events = pd.read_pickle(Path(interim, 'events.pkl'))

In [None]:
# Collapse the event log to one row per (game, batter): total hits, total of
# the AB_FL column (renamed to AB), plus the first lineup slot and year seen.
# `Win` is 1 when the batter recorded at least one hit in the game, else 0.
bg = (
    events
    .groupby(['GAME_ID', 'BAT_ID'])
    .agg({'H': 'sum', 'AB_FL': 'sum', 'BAT_LINEUP_ID': 'first', 'year': 'first'})
    .rename(columns={'AB_FL': 'AB'})
    .assign(Win=lambda games: (games['H'] > 0).astype('int'))
)

In [None]:
# Attach the park and date of each game to the per-game batting rows.
bg_adj = bg.reset_index().merge(gl[['GAME_ID', 'ParkID', 'Date']], on='GAME_ID')

In [None]:
# Bring in the per-park, per-year `factor`.
# NOTE(review): only columns are selected from `pr`, so the join keys
# ParkID/year are presumably index levels of `pr` - confirm.
bg_adj2 = bg_adj.merge(pr[['factor_year', 'factor']], on=['ParkID', 'year'])

In [None]:
# Scale each game's hit count by the factor of the park it was played in.
bg_adj2['H_adj'] = bg_adj2['H'].div(bg_adj2['factor'])

In [None]:
# Season-level summary per batter:
#   WPG     - share of games with at least one hit (mean of Win)
#   H_adj   - sum of factor-adjusted hits
#   AB      - sum of at-bats
#   G       - number of distinct games
#   BA_adj  - adjusted hits per at-bat
#   Win_adj - adjusted hits per game
WinRate = (
    bg_adj2
    .groupby(['BAT_ID', 'year'])
    .agg({'Win': 'mean', 'H_adj': 'sum', 'AB': 'sum', 'GAME_ID': 'nunique'})
    .rename(columns={'Win': 'WPG', 'GAME_ID': 'G'})
    .assign(
        BA_adj=lambda season: season['H_adj'] / season['AB'],
        Win_adj=lambda season: season['H_adj'] / season['G'],
    )
)

In [None]:
# Join each game row with its batter's season rates, keep batter-seasons with
# at least 100 games, and index by (game, batter).
# pred_BA rescales the adjusted average by the game's factor; pred_Win is the
# adjusted hits-per-game rate as-is (it is not multiplied by the factor).
test = (
    bg_adj2.reset_index()
    .merge(WinRate[['G', 'BA_adj', 'Win_adj']].reset_index(), on=['BAT_ID', 'year'])
    .loc[lambda games: games['G'] >= 100]
    .set_index(['GAME_ID', 'BAT_ID'])
    .assign(
        pred_BA=lambda games: games['BA_adj'] * games['factor'],
        pred_Win=lambda games: games['Win_adj'],
    )
)

In [None]:
# For every date keep the two rows with the highest pred_Win, order them
# deterministically (date, then prediction descending, then game id) and
# number them 1, 2 within the date.
test4 = (
    test.groupby('Date')['pred_Win']
    .nlargest(2)
    .to_frame()
    .sort_values(by=['Date', 'pred_Win', 'GAME_ID'], ascending=[True, False, True])
)
test4['pick_order'] = test4.groupby('Date').cumcount() + 1

# Re-attach the actual outcome and year for each pick, keyed by date and rank.
selection_data = pd.merge(
    test4, test[['Date', 'Win', 'year']], on=['GAME_ID', 'BAT_ID']
).set_index(['Date', 'pick_order'])

In [None]:
# plt.style.use('fivethirtyeight')

In [None]:
wr_plot = selection_data.groupby('year')[['Win']].mean().plot.line()
adjust_plot(wr_plot)

In [None]:
selection_data.loc[selection_data.year >= 2010, 'Win'].mean()

In [None]:
br['BA'] = br['H'] / br['AB']

In [None]:
merged = pd.merge(br, WinRate, on=['BAT_ID', 'year'])

In [None]:
def adjust_plot(plt):
    plt.xaxis.grid(False)
    plt.yaxis.grid(True, linestyle='dashed', color='lightgrey')
    plt.set_axisbelow(True)
    plt.xaxis.set_ticks_position('none') 
    plt.yaxis.set_ticks_position('none') 

    plt.spines['left'].set_visible(False)
    plt.spines['right'].set_visible(False)
    plt.spines['top'].set_visible(False)

In [None]:
# Histogram of ABPG over batter-seasons with at least 100 games.
plot = merged.loc[merged['G'] >= 100, 'ABPG'].hist(rwidth=0.9)
adjust_plot(plot)

In [None]:
# Histogram of WPG over batter-seasons with at least 100 games.
plot = merged.loc[merged['G'] >= 100, 'WPG'].hist(rwidth=0.9)
adjust_plot(plot)

In [None]:
# Batter-seasons with at least 120 games and a WPG above 0.8.
merged[(merged['G'] >= 120) & (merged['WPG'] > 0.8)]

In [None]:
# Same filter restricted to 1990 onwards (year is read from the index),
# showing only the descriptive columns and ordered by year.
merged.loc[
    lambda stats: (
        (stats['G'] >= 120)
        & (stats['WPG'] > 0.8)
        & (stats.index.get_level_values('year') >= 1990)
    ),
    ['FirstName', 'LastName', 'G', 'ABPG', 'BA', 'HPG', 'WPG'],
].sort_values(by='year')

In [None]:
# Pair every per-game batting row with that batter's season record.
# Note: this rebinds `test`, replacing the frame built in an earlier cell.
test = bg.reset_index().merge(
    br[['G', 'BA', 'HPG', 'ABPG']].reset_index(),
    on=['BAT_ID', 'year'],
)

In [None]:
# Add the park and date of each game.
test2 = test.merge(gl[['GAME_ID', 'ParkID', 'Date']], on='GAME_ID')

In [None]:
# Add the per-park, per-year `factor`.
# NOTE(review): the join keys ParkID/year are presumably index levels of `pr`,
# since only columns are selected here - confirm.
test3 = test2.merge(pr[['factor_year', 'factor']], on=['ParkID', 'year'])

In [None]:
# Keep batter-seasons with at least 100 games, index by (game, batter) and
# scale the season batting average by the game's factor.
test3 = (
    test3.loc[lambda games: games['G'] >= 100]
    .set_index(['GAME_ID', 'BAT_ID'])
    .assign(adj_BA=lambda games: games['BA'] * games['factor'])
)

# For every date keep the two rows with the highest adj_BA, order them
# (date, then adj_BA descending, then game id) and number them 1, 2.
test4 = (
    test3.groupby('Date')['adj_BA']
    .nlargest(2)
    .to_frame()
    .sort_values(by=['Date', 'adj_BA', 'GAME_ID'], ascending=[True, False, True])
)
test4['pick_order'] = test4.groupby('Date').cumcount() + 1

# Re-attach the actual outcome and year for each pick, keyed by date and rank.
selection_data = pd.merge(
    test4, test3[['Date', 'Win', 'year']], on=['GAME_ID', 'BAT_ID']
).set_index(['Date', 'pick_order'])

In [None]:
# Share of successful picks per year, drawn as a line chart.
wr_plot = selection_data.groupby('year')[['Win']].mean().plot(kind='line')
adjust_plot(wr_plot)

In [None]:
# Overall success rate of the picks from 2010 onwards.
selection_data.loc[selection_data['year'].ge(2010), 'Win'].mean()

In [None]:
def streak_counter(df):
    """Annotate each row with the length of the winning streak it belongs to.

    Rows are processed in their existing order. A new streak starts whenever
    ``Win`` differs from the previous row or ``year`` differs from the
    previous row, so streaks never carry over from one year to the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Win`` and ``year`` columns and already be sorted in
        the order in which streaks should be counted.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns added: ``L_Win`` and ``L_year``
        (previous row's values), ``start_of_streak`` (bool), ``streak_id``
        (running streak number) and ``streak_counter`` (1, 2, 3, ... within
        a run of wins; 0 on rows where ``Win == 0``).
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    df['L_Win'] = df['Win'].shift(1)
    df['L_year'] = df['year'].shift(1)

    # The first row compares against NaN and therefore always opens a streak.
    df['start_of_streak'] = (df['Win'] != df['L_Win']) | (df['year'] != df['L_year'])
    df['streak_id'] = df['start_of_streak'].cumsum()
    df['streak_counter'] = df.groupby('streak_id').cumcount() + 1

    # Runs of losses get their own streak_id but do not count as streaks.
    df.loc[df.Win == 0, 'streak_counter'] = 0

    return df

In [None]:
# Store Win as int8, tag every pick with its streak information, then report
# the longest streak found.
selection_data = streak_counter(selection_data.astype({'Win': 'int8'}))
streaks = selection_data.groupby('streak_id')[['streak_counter']].max()
streaks['streak_counter'].max()

In [None]:
# Per-year success rate and longest streak; plot the longest streak by year.
year_results = selection_data.groupby('year').agg(
    {'Win': 'mean', 'streak_counter': 'max'}
)
st_plot = year_results.loc[:, 'streak_counter'].plot()
adjust_plot(st_plot)

In [None]:
# Attach the per-game batting outcome to each (game, batter) row of the panel.
fp = panel.merge(bg, on=['GAME_ID', 'BAT_ID'])

In [None]:
# Keep only the outcome column and the identifier / context columns.
fp = fp[[
    'Win',
    'GAME_ID',
    'ParkID',
    'BAT_ID',
    'PIT_ID',
    'home',
]]

In [None]:
# Show the dtype of each remaining column.
fp.dtypes

In [None]:
# Cast with a frame-level astype rather than assigning into a column: `fp`
# comes from a column selection of another frame, and writing a column back
# into such a frame can raise pandas' SettingWithCopyWarning.
fp = fp.astype({'Win': 'int'})