In [None]:
from numpy.random import binomial
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Folder with the intermediate (pre-parsed) data, relative to this notebook.
interim = Path('../../data/interim')

# NOTE: unpickling can execute arbitrary code, so only load trusted files.
events_file = interim / 'events.pkl'
events = pd.read_pickle(events_file)

In [None]:
# Cast the ordering columns to integers so the sort below is numeric, then put
# every play in game / inning / half-inning / event order.
play_order_keys = ['GAME_ID', 'INN_CT', 'BAT_HOME_ID', 'EVENT_ID', 'BAT_LINEUP_ID']
events = (
    events
    .astype({'EVENT_ID': 'int', 'INN_CT': 'int'})
    .sort_values(play_order_keys)
)

In [None]:
# Keep plays in game / inning / half-inning / event order.
play_order = ['GAME_ID', 'INN_CT', 'BAT_HOME_ID', 'EVENT_ID', 'BAT_LINEUP_ID']
events = events.sort_values(play_order)

# Outs on the board once the play has finished.
events['FINAL_OUTS'] = events['OUTS_CT'] + events['EVENT_OUTS_CT']

# Total outs recorded per half-inning and per batting side of each game.
outs_by_half_inning = events.groupby(['GAME_ID', 'BAT_HOME_ID', 'INN_CT'])['EVENT_OUTS_CT']
outs_by_team_game = events.groupby(['GAME_ID', 'BAT_HOME_ID'])['EVENT_OUTS_CT']
events['outs_in_inn'] = outs_by_half_inning.transform('sum')
events['outs_in_game'] = outs_by_team_game.transform('sum')

# Only keep half-innings with exactly 3 outs that belong to a batting side
# with exactly 27 outs in the game.
is_full_half_inning = events.outs_in_inn == 3
is_full_game = events.outs_in_game == 27
events = events.loc[is_full_half_inning & is_full_game]

# Flag the play that records the third out, and the one that ends the 9th.
third_out = events['FINAL_OUTS'] == 3
events['INN_LAST_PLAY'] = np.where(third_out, 1, 0)
ends_ninth = (events['INN_LAST_PLAY'] == 1) & (events['INN_CT'] == 9)
events['GAME_LAST_PLAY'] = np.where(ends_ninth, 1, 0)

# The lineup advances when the batter's plate appearance ends; it is also
# forced to advance on the last play of a half-inning.
batter_event = events['BAT_EVENT_FL'].astype('int')
events['NEXT_BATTER'] = np.where(events['INN_LAST_PLAY'] == 1, 1, batter_event)

## Prep data by cleaning states and events

In [None]:
# Build one row per play holding the state before the play
# (INN_CT, outs, bases, BAT_LINEUP_ID) and the state after it (*_new).
# Every derived column is computed from `states_data` itself, so the result
# does not depend on `events` and `states_data` sharing row order and length.

# Limit to main event types and relevant variables for simplicity
sort_keys = ['GAME_ID', 'INN_CT', 'BAT_HOME_ID', 'EVENT_ID', 'BAT_LINEUP_ID']
keep_cols = [
    'EVENT_CD', 'BASE1_RUN_ID', 'BASE2_RUN_ID',
    'BASE3_RUN_ID', 'BAT_DEST_ID', 'RUN1_DEST_ID', 'RUN2_DEST_ID', 'RUN3_DEST_ID',
    'OUTS_CT', 'EVENT_OUTS_CT', 'EVENT_RUNS_CT',
    'NEXT_BATTER', 'INN_LAST_PLAY', 'GAME_LAST_PLAY',
    'GAME_ID', 'BAT_HOME_ID', 'INN_CT', 'BAT_EVENT_FL', 'BAT_LINEUP_ID', 'EVENT_ID',
]
complete_rows = (events.outs_in_inn == 3) & (events.outs_in_game == 27)
states_data = events.loc[complete_rows, keep_cols].sort_values(sort_keys)

# Come up with starting base state for each event.
# A base counts as occupied when its runner id is not missing.
# NOTE(review): assumes empty bases are stored as NaN/None rather than '' - confirm.
for base_no, runner_col in enumerate(['BASE1_RUN_ID', 'BASE2_RUN_ID', 'BASE3_RUN_ID'], start=1):
    states_data[f'{base_no}b'] = np.where(states_data[runner_col].isna(), 0, 1)
# Encode the three bases as a 3-bit number (1st = 1, 2nd = 2, 3rd = 4), giving 0..7.
states_data['bases'] = states_data['1b'] + states_data['2b']*2 + states_data['3b']*4

# Come up with ending base state for each event.
# A base is occupied afterwards if the batter or any runner ends up on it.
dest_cols = ['BAT_DEST_ID', 'RUN1_DEST_ID', 'RUN2_DEST_ID', 'RUN3_DEST_ID']
for base_no in (1, 2, 3):
    reached = states_data[dest_cols].eq(base_no).any(axis=1)
    states_data[f'{base_no}b_new'] = np.where(reached, 1, 0)

# Clean up outs and events
states_data = states_data.rename(columns={'OUTS_CT': 'outs', 'EVENT_RUNS_CT': 'runs'})

# The last play of a half-inning resets outs and bases for the next one.
inning_over = states_data['INN_LAST_PLAY'] == 1
states_data['outs_new'] = np.where(
    inning_over, 0, states_data['outs'] + states_data['EVENT_OUTS_CT']
)

# Cleanup lineup events: advance the batting slot, wrapping 9 -> 1.
next_slot = states_data['BAT_LINEUP_ID'] + states_data['NEXT_BATTER']
states_data['BAT_LINEUP_ID_new'] = np.where(next_slot == 10, 1, next_slot)

# The inning number advances after the last play of a half-inning, except on
# the play flagged as the last of the game.
states_data['INN_CT_new'] = np.where(
    inning_over & (states_data['GAME_LAST_PLAY'] != 1),
    states_data['INN_CT'] + 1,
    states_data['INN_CT']
)

states_data['bases_new'] = np.where(
    inning_over,
    0,
    states_data['1b_new'] + states_data['2b_new']*2 + states_data['3b_new']*4
)

# Only keep base variables
states = ['INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID', 'INN_CT_new', 'INN_LAST_PLAY', 'outs_new', 'bases_new', 'BAT_LINEUP_ID_new']
states_data = states_data.loc[:, states + ['EVENT_CD', 'BAT_EVENT_FL', 'runs'] + ['GAME_ID', 'BAT_HOME_ID', 'EVENT_ID']]

## New State Probabilities conditional on starting state and event

In [None]:
def _repeat_for_each(frame, column, values):
    """Repeat every row of `frame` once per entry of `values`.

    Each row is repeated consecutively and `column` cycles through `values`
    within every run of repeats. Original index labels are kept, so they are
    duplicated in the result.
    """
    values = np.asarray(values)
    positions = np.repeat(np.arange(len(frame)), len(values))
    expanded = frame.iloc[positions].copy()
    expanded[column] = np.tile(values, len(frame))
    return expanded


# P(new outs/bases | starting outs/bases, event): count each observed outcome
# and divide by the number of times the (event, starting state) pair occurred.
outcome_keys = ['outs', 'bases', 'INN_LAST_PLAY', 'outs_new', 'bases_new', 'EVENT_CD', 'BAT_EVENT_FL']
new_state_prob = states_data.groupby(outcome_keys).size().to_frame()
new_state_prob.columns = ['freq']
new_state_prob['totals'] = new_state_prob.groupby(['EVENT_CD', 'BAT_EVENT_FL', 'outs', 'bases'])['freq'].transform('sum')
new_state_prob['new_state_prob'] = new_state_prob['freq'] / new_state_prob['totals']
new_state_prob = new_state_prob.drop(columns=['totals', 'freq'])

new_state_prob = new_state_prob.reset_index()
new_state_prob = new_state_prob.sort_values(['outs', 'bases', 'EVENT_CD', 'BAT_EVENT_FL', 'INN_LAST_PLAY', 'outs_new', 'bases_new'])

# The probabilities above do not depend on who is batting, so copy every row
# for each of the nine lineup slots.
new_state_prob = _repeat_for_each(new_state_prob, 'BAT_LINEUP_ID', np.arange(1, 10))

# Same lineup rule as for the observed plays: advance on a batter event or at
# the end of a half-inning, wrapping 9 -> 1.
new_state_prob['NEXT_BATTER'] = np.where(
    new_state_prob['INN_LAST_PLAY'] == 1, 1, new_state_prob['BAT_EVENT_FL'].astype('int')
)
next_slot = new_state_prob['BAT_LINEUP_ID'] + new_state_prob['NEXT_BATTER']
new_state_prob['BAT_LINEUP_ID_new'] = np.where(next_slot == 10, 1, next_slot)

# drop=True: the old (duplicated) labels carry no information and would
# otherwise leak into the frame as a stray 'index' column.
new_state_prob = new_state_prob.reset_index(drop=True)

# ...and likewise copy every row for each of the nine innings.
new_state_prob = _repeat_for_each(new_state_prob, 'INN_CT', np.arange(1, 10))

# An inning-ending play moves to the next inning. From inning 9 this gives
# inning 10, which is not one of the modelled innings.
new_state_prob['INN_CT_new'] = np.where(
    new_state_prob['INN_LAST_PLAY'] == 1,
    new_state_prob['INN_CT'] + 1,
    new_state_prob['INN_CT']
)

## Calculate Event Odds Conditional on State

In [None]:
# P(event | outs, bases, lineup slot): share of each event code among all
# plays observed from the same starting state.
event_state_keys = ['outs', 'bases', 'BAT_LINEUP_ID']
event_counts = states_data.groupby(event_state_keys + ['EVENT_CD']).size()
plays_per_state = event_counts.groupby(event_state_keys).transform('sum')

event_prob = (
    (event_counts / plays_per_state)
    .to_frame('event_prob')
    .reset_index()
    .sort_values(event_state_keys + ['EVENT_CD'])
)

In [None]:
# Event probabilities do not depend on the inning, so copy every row once per
# inning (1..9) to line up with the INN_CT key used in the merge below.
# The repeat is driven by event_prob's own rows. Using another frame's index
# would make the row set depend on that frame's size.
innings = np.arange(1, 10)
n_event_rows = len(event_prob)
event_prob = event_prob.iloc[np.repeat(np.arange(n_event_rows), len(innings))].reset_index(drop=True)
event_prob['INN_CT'] = np.tile(innings, n_event_rows)

## Calculate Transition Probabilities

In [None]:
# Combine the two conditional tables: for each starting state,
# P(next state) = sum over events of P(event | state) * P(next state | state, event).
event_join_keys = ['INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID', 'EVENT_CD']
state_event_pairs = new_state_prob.merge(event_prob, on=event_join_keys)
state_event_pairs['transition_prob'] = (
    state_event_pairs['new_state_prob'] * state_event_pairs['event_prob']
)
transition_prob = state_event_pairs.groupby(states)['transition_prob'].sum().to_frame()

## Calculate Reward Matrix

In [None]:
# Average runs scored on a play, for every observed move between base/out states.
reward_group_keys = ['outs', 'bases', 'INN_LAST_PLAY', 'outs_new', 'bases_new']
runs_by_move = states_data.groupby(reward_group_keys)[['runs']]
rewards = runs_by_move.mean()

In [None]:
# Expected immediate runs per starting state:
# Q[s] = sum over next states of P(s -> s') * mean runs on that move.
merged = transition_prob.reset_index().merge(rewards, on=['outs', 'bases', 'INN_LAST_PLAY', 'outs_new', 'bases_new'])
merged['prod'] = merged['transition_prob'] * merged['runs']

# Lay Q out on the complete (inning, outs, bases, lineup) grid in sorted order.
# A state that never occurs in the data gets 0, so Q always has
# 9 * 3 * 8 * 9 rows regardless of which states were observed.
q_state_keys = ['INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID']
all_states = pd.MultiIndex.from_product(
    [range(1, 10), range(3), range(8), range(1, 10)],
    names=q_state_keys
)
Q = (
    merged.groupby(q_state_keys)[['prod']].sum()
    .reindex(all_states, fill_value=0)
    .to_numpy()
)

## Set up matrices

In [None]:
# Values each state component can take.
inns_mat = list(range(1, 10))    # innings 1..9
outs_mat = list(range(3))        # 0, 1 or 2 outs
bases_mat = list(range(8))       # base-occupancy codes 0..7
lineup_mat = list(range(1, 10))  # batting slots 1..9

# Every (current state, next state) pair: the current-state levels come first,
# followed by the same four levels suffixed with "_new".
state_levels = [inns_mat, outs_mat, bases_mat, lineup_mat]
state_names = ['INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID']
mind = pd.MultiIndex.from_product(
    state_levels + state_levels,
    names=state_names + [f'{name}_new' for name in state_names]
)

In [None]:
# Re-key the transitions by (current state, next state) only; any other former
# index level stays behind as an ordinary column.
pair_index_cols = [
    'INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID',
    'INN_CT_new', 'outs_new', 'bases_new', 'BAT_LINEUP_ID_new',
]
transition_prob = transition_prob.reset_index()
transition_prob = transition_prob.set_index(pair_index_cols)

In [None]:
# Expand to every (current state, next state) pair in `mind`. Pairs missing
# from the table get 0 in every column, and pairs not in `mind` are dropped.
transition_prob = transition_prob.reindex(index=mind, fill_value=0)

In [None]:
# Reshape the long table into a square matrix: one row per current state,
# one column per next state.
current_state_cols = ['INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID']
next_state_cols = ['INN_CT_new', 'outs_new', 'bases_new', 'BAT_LINEUP_ID_new']

transition_long = transition_prob.reset_index()
transition_prob_wide = transition_long.pivot(
    index=current_state_cols,
    columns=next_state_cols,
    values='transition_prob'
)
P = transition_prob_wide.to_numpy()

In [None]:
# Reward matrix R[s, s']: mean runs scored when moving from state s to s'.
#
# `rewards` is keyed by (outs, bases, INN_LAST_PLAY, outs_new, bases_new) and
# has no inning or lineup levels, so it cannot be reindexed against the 8-level
# `mind` directly. Instead, expand `mind` into a frame, derive INN_LAST_PLAY
# from the inning change, and join the mean runs onto it.
# `rewards` itself is left untouched so the cell can be re-run.
reward_keys = ['outs', 'bases', 'INN_LAST_PLAY', 'outs_new', 'bases_new']

reward_grid = mind.to_frame(index=False)
# In the transition table the inning advances by one exactly on inning-ending plays.
reward_grid['INN_LAST_PLAY'] = (reward_grid['INN_CT_new'] == reward_grid['INN_CT'] + 1).astype('int')

rewards_long = reward_grid.merge(rewards.reset_index(), on=reward_keys, how='left')

# Zero reward for pairs never observed, and for inning changes other than
# "same inning" or "next inning".
inning_step = rewards_long['INN_CT_new'] - rewards_long['INN_CT']
rewards_long['runs'] = rewards_long['runs'].where(inning_step.isin([0, 1]), 0).fillna(0)

rewards_wide = rewards_long.pivot(
    index=['INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID'],
    columns=['INN_CT_new', 'outs_new', 'bases_new', 'BAT_LINEUP_ID_new'],
    values='runs'
)
R = rewards_wide.to_numpy()

## Calculate Expected Runs

In [None]:
# Identity matrix with one row/column per state, matching P.
n_states = P.shape[0]
I = np.eye(n_states)

In [None]:
# Expected runs from each state onward satisfy v = Q + P v, i.e. (I - P) v = Q.
# Solve the linear system directly rather than inverting (I - P).
system_matrix = I - P
v = np.linalg.solve(system_matrix, Q)

In [None]:
# Label each entry of v with the state it belongs to.
current_state_levels = [inns_mat, outs_mat, bases_mat, lineup_mat]
current_state_names = ['INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID']
mind2 = pd.MultiIndex.from_product(current_state_levels, names=current_state_names)

v_pd = pd.DataFrame(v, index=mind2)

## Compare with Actual Run Values

In [None]:
# Empirical counterpart of v: runs the batting side actually scored from each
# play (inclusive) to the end of the game.
chronological = ['GAME_ID', 'INN_CT', 'BAT_HOME_ID', 'EVENT_ID', 'BAT_LINEUP_ID']
states_data = states_data.sort_values(chronological)

runs_by_side = states_data.groupby(['GAME_ID', 'BAT_HOME_ID'])['runs']

# Runs scored so far, including the current play.
states_data['current_game_runs'] = runs_by_side.cumsum()

# Final run total of that side in the game.
states_data['game_runs'] = runs_by_side.transform('sum')

# Add the current play's runs back, since the running total already counts them.
runs_after_play = states_data['game_runs'] - states_data['current_game_runs']
states_data['add_runs'] = runs_after_play + states_data['runs']

In [None]:
# Observed mean of remaining runs per state, to compare with the model values.
observed_state_keys = ['INN_CT', 'outs', 'bases', 'BAT_LINEUP_ID']
observed_run_values = states_data.groupby(observed_state_keys)['add_runs'].mean().to_frame()
observed_run_values.head(15)

In [None]:
# Model-based expected runs for the first 20 states.
v_pd.head(n=20)

In [None]:
# Cross-tabulate runs per inning against runs scored on the individual play.
# NOTE(review): `inning_runs` is not created in any cell shown here; presumably
# it comes with the pickled events frame - confirm before relying on this cell.
pd.crosstab(index=events['inning_runs'], columns=events['EVENT_RUNS_CT'])