In [None]:
from numpy.random import binomial
import pandas as pd
import numpy as np
from pathlib import Path

## Events Prep

In [None]:
# Directory holding the interim artefacts, relative to the working directory.
interim = Path('../../data/interim')

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
events = pd.read_pickle(interim.joinpath('events.pkl'))

In [None]:
# Cast the two identifier columns to integers before they are used as sort keys.
for id_col in ('EVENT_ID', 'INN_CT'):
    events[id_col] = events[id_col].astype('int')

# Order rows by game, inning, batting side, event and lineup slot.
sort_keys = ['GAME_ID', 'INN_CT', 'BAT_HOME_ID', 'EVENT_ID', 'BAT_LINEUP_ID']
events = events.sort_values(by=sort_keys)

###### States
- Outs DONE
- Bases, DONE
- ScoreDiff, DONE
- TopVBottom, DONE
- Home Lineup, DONE
- Away Lineup, DONE

### Outs

In [None]:
# Outs on the board once the event has been played (outs before + outs made).
events['TOTAL_OUTS_CT'] = events['OUTS_CT'].add(events['EVENT_OUTS_CT'])

# Outs in the following state: three outs wraps back to zero.
events['OUTS_CT_new'] = events['TOTAL_OUTS_CT'] % 3

### Bases

In [None]:
# Come up with starting base state for each event
events['1b'] = np.where(events['BASE1_RUN_ID'].isna(), 0, 1)
events['2b'] = np.where(events['BASE2_RUN_ID'].isna(), 0, 1)
events['3b'] = np.where(events['BASE3_RUN_ID'].isna(), 0, 1)
events['bases'] = events['1b'] + events['2b']*2 + events['3b']*4

# Come up with ending base state for each event
events['1b_new'] = np.where(
    (events['BAT_DEST_ID'] == 1) |
    (events['RUN1_DEST_ID'] == 1) |
    (events['RUN2_DEST_ID'] == 1) |
    (events['RUN3_DEST_ID'] == 1),
    1, 0
)

events['2b_new'] = np.where(
    (events['BAT_DEST_ID'] == 2) |
    (events['RUN1_DEST_ID'] == 2) |
    (events['RUN2_DEST_ID'] == 2) |
    (events['RUN3_DEST_ID'] == 2),
    1, 0
)

events['3b_new'] = np.where(
    (events['BAT_DEST_ID'] == 3) |
    (events['RUN1_DEST_ID'] == 3) |
    (events['RUN2_DEST_ID'] == 3) |
    (events['RUN3_DEST_ID'] == 3),
    1, 0
)

events['bases_new'] = events['1b_new'] + events['2b_new']*2 + events['3b_new']*4

### Top Vs. Bottom

In [None]:
# The batting side flips (0 <-> 1) when the event produces the third out.
third_out_made = events['TOTAL_OUTS_CT'].eq(3)
events['BAT_HOME_ID_new'] = np.where(
    third_out_made,
    1 - events['BAT_HOME_ID'],
    events['BAT_HOME_ID'],
)

### Score Diff

In [None]:
# Score differential from the home side's point of view, capped to [-8, 8].
events['SCORE_DIFF'] = (events['HOME_SCORE_CT'] - events['AWAY_SCORE_CT']).clip(lower=-8, upper=8)

# Credit the runs scored on the event to whichever side is batting.
home_is_batting = events['BAT_HOME_ID'] == 1
away_is_batting = events['BAT_HOME_ID'] == 0

events['HOME_SCORE_CT_new'] = np.where(
    home_is_batting,
    events['HOME_SCORE_CT'] + events['EVENT_RUNS_CT'],
    events['HOME_SCORE_CT'],
)
events['AWAY_SCORE_CT_new'] = np.where(
    away_is_batting,
    events['AWAY_SCORE_CT'] + events['EVENT_RUNS_CT'],
    events['AWAY_SCORE_CT'],
)

# Differential after the event, capped to the same [-8, 8] window.
events['SCORE_DIFF_new'] = (
    events['HOME_SCORE_CT_new'] - events['AWAY_SCORE_CT_new']
).clip(lower=-8, upper=8)

### Batting Lineups

In [None]:
# Track, for every event, which lineup slot is "up" for BOTH teams.
# The batting side's slot is read directly from BAT_LINEUP_ID; the fielding
# side's slot is carried over from its previous half-inning.
#
# NOTE(review): assumes EVENT_ID increases through the whole game so that this
# sort interleaves the two half-innings chronologically - confirm.
events = events.sort_values(['GAME_ID', 'INN_CT', 'EVENT_ID', 'BAT_LINEUP_ID'])

# Home slot is known whenever the home side is batting.
events['HOME_BAT_LINEUP_ID'] = np.where(
    events['BAT_HOME_ID'] == 1,
    events['BAT_LINEUP_ID'],
    np.nan
)

# While the away side bats in the first inning, the home slot is set to 1.
events['HOME_BAT_LINEUP_ID'] = np.where(
    (events['INN_CT'] == 1) & (events['BAT_HOME_ID'] == 0),
    1,
    events['HOME_BAT_LINEUP_ID']
)

# Away slot is known whenever the away side is batting.
events['AWAY_BAT_LINEUP_ID'] = np.where(
    events['BAT_HOME_ID'] == 0,
    events['BAT_LINEUP_ID'],
    np.nan
)

# 1-based position of each event inside its half-inning.
events['half_inning_event_ct'] = events.groupby(['GAME_ID', 'INN_CT', 'BAT_HOME_ID']).cumcount() + 1

# On the first event of a half-inning, the fielding side's slot is the one
# after the slot on the previous row of the same game.
# Lineup slots are 1..9, so the wrap-around must be (slot mod 9) + 1:
# 8 -> 9 and 9 -> 1. The previous (slot + 1) mod 9 turned slot 8 into 0.
events['HOME_BAT_LINEUP_ID'] = np.where(
    (events['half_inning_event_ct'] == 1) & (events['HOME_BAT_LINEUP_ID'].isna()),
    events.groupby(['GAME_ID'])['HOME_BAT_LINEUP_ID'].shift(1).mod(9).add(1),
    events['HOME_BAT_LINEUP_ID']
)

events['AWAY_BAT_LINEUP_ID'] = np.where(
    (events['half_inning_event_ct'] == 1) & (events['AWAY_BAT_LINEUP_ID'].isna()),
    events.groupby(['GAME_ID'])['AWAY_BAT_LINEUP_ID'].shift(1).mod(9).add(1),
    events['AWAY_BAT_LINEUP_ID']
)

# Carry the fielding side's slot through the rest of the half-inning.
events['HOME_BAT_LINEUP_ID'] = events.groupby(['GAME_ID'])['HOME_BAT_LINEUP_ID'].ffill()
events['AWAY_BAT_LINEUP_ID'] = events.groupby(['GAME_ID'])['AWAY_BAT_LINEUP_ID'].ffill()

In [None]:
home_at_bat = events['BAT_HOME_ID'] == 1
away_at_bat = events['BAT_HOME_ID'] == 0

# Walk-off: home side batting in the 9th or later and ahead after the event.
took_the_lead_late = home_at_bat & (events['INN_CT'] >= 9) & (events['SCORE_DIFF_new'] > 0)
events['WALKOFF'] = np.where(took_the_lead_late, 1, 0)

# The half-inning ends on the third out or on a walk-off.
half_inning_ends = (events['WALKOFF'] == 1) | (events['TOTAL_OUTS_CT'] == 3)
events['INN_LAST_PLAY'] = np.where(half_inning_ends, 1, 0)

# The lineup advances on the last play of a half-inning; otherwise it
# advances exactly when the event is flagged as a batting event.
events['NEXT_BATTER'] = np.where(
    events['INN_LAST_PLAY'] == 1,
    1,
    events['BAT_EVENT_FL'].astype('int'),
)

# Only the batting side's lineup moves.
events['HOME_NEXT_BATTER'] = np.where(home_at_bat, events['NEXT_BATTER'], 0)
events['AWAY_NEXT_BATTER'] = np.where(away_at_bat, events['NEXT_BATTER'], 0)

In [None]:
# Lineup slot of each side after the event.
# Slots are 1-based (1..9), so wrap with ((slot + step - 1) mod 9) + 1.
# The previous (slot + step) mod 9 mapped slot 9 to 0, a slot that does not exist.
events['HOME_BAT_LINEUP_ID_new'] = (
    events['HOME_BAT_LINEUP_ID'] + events['HOME_NEXT_BATTER'] - 1
).mod(9) + 1

events['AWAY_BAT_LINEUP_ID_new'] = (
    events['AWAY_BAT_LINEUP_ID'] + events['AWAY_NEXT_BATTER'] - 1
).mod(9) + 1

# Slot of whichever side bats in the next state.
events['BAT_LINEUP_ID_new'] = np.where(
    events['BAT_HOME_ID_new'] == 1,
    events['HOME_BAT_LINEUP_ID_new'],
    events['AWAY_BAT_LINEUP_ID_new']
)

## Prep data by cleaning states and events

In [None]:
# Limit to main event types and relevant variables for simplicity
# states_data = events.loc[
#     : , # events.outs_in_inn == 3,
#     ['EVENT_CD', 'BASE1_RUN_ID', 'BASE2_RUN_ID', 'BASE3_RUN_ID', 
#     'BAT_DEST_ID', 'RUN1_DEST_ID', 'RUN2_DEST_ID', 'RUN3_DEST_ID',
#     'OUTS_CT', 'EVENT_OUTS_CT', 'NEXT_BATTER',
#     'GAME_ID', 'BAT_HOME_ID', 'INN_CT', 'BAT_EVENT_FL', 'EVENT_ID', 
#     'BAT_LINEUP_ID', 'HOME_BAT_LINEUP_ID', 'AWAY_BAT_LINEUP_ID', 
#     'EVENT_RUNS_CT', 'AWAY_SCORE_CT', 'HOME_SCORE_CT', 'SCORE_DIFF', 'SCORE_DIFF_new', 'WALKOFF']
# ]

In [None]:
# Only keep base variables
states = [
    'OUTS_CT', 'bases', 'SCORE_DIFF', 'BAT_HOME_ID', 
    'HOME_BAT_LINEUP_ID', 'AWAY_BAT_LINEUP_ID', 
    'OUTS_CT_new', 'bases_new', 'SCORE_DIFF_new', 'BAT_HOME_ID_new', 
    'HOME_BAT_LINEUP_ID_new', 'AWAY_BAT_LINEUP_ID_new'
]

states_data = events.loc[:, states + ['EVENT_CD', 'BAT_EVENT_FL', 'TOTAL_OUTS_CT', 'EVENT_RUNS_CT'] + ['GAME_ID', 'INN_CT', 'EVENT_ID']]

states_data = states_data.sort_values(['GAME_ID', 'INN_CT', 'BAT_HOME_ID', 'EVENT_ID'])


In [None]:
# Drop the notebook-level reference to the full events frame; later cells
# work from states_data only.
del events

## New State Probabilities conditional on starting state and event

In [None]:
# Count every observed (start outs/bases, event, outcome) combination.
outcome_keys = [
    'OUTS_CT', 'bases', 'TOTAL_OUTS_CT', 'bases_new',
    'EVENT_CD', 'BAT_EVENT_FL', 'EVENT_RUNS_CT'
]
outcome_counts = states_data.groupby(outcome_keys).size()

# Denominator: all outcomes that share the same event and starting outs/bases.
condition_keys = ['EVENT_CD', 'BAT_EVENT_FL', 'OUTS_CT', 'bases']
condition_totals = outcome_counts.groupby(level=condition_keys).transform('sum')

# Share of each outcome within its (event, starting outs/bases) group.
new_state_prob = (
    (outcome_counts / condition_totals)
    .rename('new_state_prob')
    .reset_index()
    .sort_values(by=[
        'OUTS_CT', 'bases', 'EVENT_CD', 'BAT_EVENT_FL',
        'TOTAL_OUTS_CT', 'bases_new', 'EVENT_RUNS_CT'
    ])
)

In [None]:
# Cross every outcome row with each value of the remaining state variables.
# Each step repeats every row `copies` times and numbers the copies
# 0..copies-1 within the (so far unique) row key, shifted by `offset`.
row_key = [
    'OUTS_CT', 'bases', 'EVENT_CD', 'BAT_EVENT_FL',
    'TOTAL_OUTS_CT', 'bases_new', 'EVENT_RUNS_CT'
]
grid_steps = [
    ('HOME_BAT_LINEUP_ID', 9, 1),   # slots 1..9
    ('AWAY_BAT_LINEUP_ID', 9, 1),   # slots 1..9
    ('BAT_HOME_ID', 2, 0),          # 0 / 1
    ('SCORE_DIFF', 17, -8),         # -8..8
]

for column, copies, offset in grid_steps:
    new_state_prob = new_state_prob.reindex(np.repeat(new_state_prob.index, copies))
    new_state_prob[column] = new_state_prob.groupby(row_key).cumcount() + offset
    new_state_prob = new_state_prob.reset_index(drop=True)
    # The column just added becomes part of the row key for the next step.
    row_key = row_key + [column]

In [None]:
# Outs in the following state: three outs wraps back to zero.
new_state_prob['OUTS_CT_new'] = new_state_prob['TOTAL_OUTS_CT'] % 3

In [None]:
# The batting side flips (0 <-> 1) when the event produces the third out.
grid_third_out = new_state_prob['TOTAL_OUTS_CT'].eq(3)
new_state_prob['BAT_HOME_ID_new'] = np.where(
    grid_third_out,
    1 - new_state_prob['BAT_HOME_ID'],
    new_state_prob['BAT_HOME_ID'],
)

In [None]:
# Score differential (home minus away) after the event.
# Runs scored by the home side raise it; runs scored by the away side lower it.
# The earlier version ran two consecutive np.where calls, and the second one
# reset every home-batting row back to SCORE_DIFF, so home runs were never added.
batting_sign = np.where(new_state_prob['BAT_HOME_ID'] == 1, 1, -1)

new_state_prob['SCORE_DIFF_new'] = (
    new_state_prob['SCORE_DIFF'] + batting_sign * new_state_prob['EVENT_RUNS_CT']
).clip(lower=-8, upper=8)  # same [-8, 8] window used for SCORE_DIFF itself

In [None]:
# The state grid has no inning column, so a walk-off cannot be identified here.
# The earlier version flagged WALKOFF whenever the home side was ahead while
# batting, in any inning. That forced NEXT_BATTER to 1 on non-batting events
# and advanced the home lineup when it should not move.
# The column is kept and set to 0 so the frame keeps the same shape.
new_state_prob['WALKOFF'] = 0

# Without walk-offs, a half-inning ends only on the third out.
new_state_prob['INN_LAST_PLAY'] = np.where(
    new_state_prob['TOTAL_OUTS_CT'] == 3, 1, 0
)

# The lineup advances on the last play of a half-inning; otherwise it
# advances exactly when the event is flagged as a batting event.
new_state_prob['NEXT_BATTER'] = np.where(
    new_state_prob['INN_LAST_PLAY'] == 1, 1, new_state_prob['BAT_EVENT_FL'].astype('int')
)

# Only the batting side's lineup moves.
new_state_prob['HOME_NEXT_BATTER'] = np.where(
    new_state_prob['BAT_HOME_ID'] == 1, new_state_prob['NEXT_BATTER'], 0
)

new_state_prob['AWAY_NEXT_BATTER'] = np.where(
    new_state_prob['BAT_HOME_ID'] == 0, new_state_prob['NEXT_BATTER'], 0
)

In [None]:
# Lineup slot of each side after the event.
# Slots are 1-based (1..9), so wrap with ((slot + step - 1) mod 9) + 1.
# The previous (slot + step) mod 9 produced slot 0, which is not part of the
# 1..9 lineup range used to build the state grid.
new_state_prob['HOME_BAT_LINEUP_ID_new'] = (
    new_state_prob['HOME_BAT_LINEUP_ID'] + new_state_prob['HOME_NEXT_BATTER'] - 1
).mod(9) + 1

new_state_prob['AWAY_BAT_LINEUP_ID_new'] = (
    new_state_prob['AWAY_BAT_LINEUP_ID'] + new_state_prob['AWAY_NEXT_BATTER'] - 1
).mod(9) + 1

# Slot of whichever side bats in the next state.
new_state_prob['BAT_LINEUP_ID_new'] = np.where(
    new_state_prob['BAT_HOME_ID_new'] == 1,
    new_state_prob['HOME_BAT_LINEUP_ID_new'],
    new_state_prob['AWAY_BAT_LINEUP_ID_new']
)

## Calculate Event Odds Conditional on State

In [None]:
# Relative frequency of each event code within every observed starting state.
state_keys = [
    'OUTS_CT', 'bases', 'SCORE_DIFF', 'BAT_HOME_ID',
    'HOME_BAT_LINEUP_ID', 'AWAY_BAT_LINEUP_ID'
]
state_event_keys = state_keys + ['EVENT_CD']

event_counts = states_data.groupby(state_event_keys).size()
state_totals = event_counts.groupby(level=state_keys).transform('sum')

event_prob = (
    (event_counts / state_totals)
    .rename('event_prob')
    .reset_index()
    .sort_values(by=state_event_keys)
)

# Calculate Transition Probabilities

In [None]:
# Attach the event probability to every (state, event, outcome) row.
join_keys = [
    'OUTS_CT', 'bases', 'SCORE_DIFF', 'BAT_HOME_ID',
    'HOME_BAT_LINEUP_ID', 'AWAY_BAT_LINEUP_ID', 'EVENT_CD'
]
state_event_outcomes = new_state_prob.merge(event_prob, on=join_keys)

# Probability of the outcome given the event, times probability of the event.
state_event_outcomes['transition_prob'] = (
    state_event_outcomes['new_state_prob'] * state_event_outcomes['event_prob']
)

# Add up all events that lead from the same start state to the same end state.
transition_prob = state_event_outcomes.groupby(states)[['transition_prob']].sum()

## Calculate Reward Matrix

In [None]:
# Average runs scored on the event for each observed (start state, end state) pair.
rewards = states_data.groupby(states).agg({'EVENT_RUNS_CT': 'mean'})

In [None]:
# Pair every transition probability with the mean runs of that transition.
merged = transition_prob.merge(rewards, on=states).assign(
    prod=lambda frame: frame['transition_prob'] * frame['EVENT_RUNS_CT']
)

# Probability-weighted runs, summed over end states for each start state.
q_group_keys = [
    'OUTS_CT', 'bases', 'SCORE_DIFF', 'BAT_HOME_ID',
    'HOME_BAT_LINEUP_ID', 'AWAY_BAT_LINEUP_ID'
]
Q = merged.groupby(q_group_keys)[['prod']].sum().to_numpy()

## Set up matrices

In [None]:
# Value range of each state variable.
outs_mat = list(range(3))          # 0, 1, 2
bases_mat = list(range(8))         # 0..7, packed base occupancy
lineup_mat = list(range(1, 10))    # 1..9
top_mat = list(range(2))           # 0, 1
diff_mat = list(range(-8, 9))      # -8..8

# Full cartesian state space, in the same level order as the state columns.
state_levels = {
    'OUTS_CT': outs_mat,
    'bases': bases_mat,
    'SCORE_DIFF': diff_mat,
    'BAT_HOME_ID': top_mat,
    'HOME_BAT_LINEUP_ID': lineup_mat,
    'AWAY_BAT_LINEUP_ID': lineup_mat,
}
mind = pd.MultiIndex.from_product(
    list(state_levels.values()),
    names=list(state_levels.keys()),
)

In [None]:
# Store the probabilities sparsely, with 0 as the implicit fill value.
sparse_float = pd.SparseDtype("float", fill_value=0)
transition_prob = transition_prob.astype(sparse_float)

In [None]:
# Reshape the long (start state, end state) table into a matrix:
# one row per start state, one column per end state.
# A (start, end) pair that never occurs has probability 0. pivot() leaves such
# cells as NaN, which would propagate through the linear solve, so they are
# filled with 0 here.
transition_prob_wide = transition_prob.reset_index().pivot(
    index=['OUTS_CT', 'bases', 'SCORE_DIFF', 'BAT_HOME_ID', 'HOME_BAT_LINEUP_ID', 'AWAY_BAT_LINEUP_ID'], 
    columns=['OUTS_CT_new', 'bases_new', 'SCORE_DIFF_new', 'BAT_HOME_ID_new', 'HOME_BAT_LINEUP_ID_new', 'AWAY_BAT_LINEUP_ID_new'], 
    values='transition_prob'
).fillna(0)

# NOTE(review): P is extracted before transition_prob_wide is reindexed to the
# full state space, so its rows/columns are only the observed start/end
# states. `I - P` further down needs P to be square with rows and columns in
# the same state order - confirm, or build P after aligning both axes.
P = transition_prob_wide.to_numpy()

In [None]:
# Store the wide matrix sparsely, with 0 as the implicit fill value.
wide_sparse_float = pd.SparseDtype("float", fill_value=0)
transition_prob_wide = transition_prob_wide.astype(wide_sparse_float)

In [None]:
# Align the rows to the full state space; start states never observed get 0.
transition_prob_wide = transition_prob_wide.reindex(index=mind, fill_value=0)

In [None]:
# Reshape mean runs per (start state, end state) into a matrix R.
# The earlier version pivoted on column names that do not exist in `rewards`
# ('outs', 'outs_new', 'runs'). It also reindexed the 12-level long table with
# the 6-level `mind` index before pivoting. Here the pivot uses the real
# column names, and the rows are aligned to `mind` afterwards.
reward_start_cols = [
    'OUTS_CT', 'bases', 'SCORE_DIFF', 'BAT_HOME_ID',
    'HOME_BAT_LINEUP_ID', 'AWAY_BAT_LINEUP_ID'
]
reward_end_cols = [
    'OUTS_CT_new', 'bases_new', 'SCORE_DIFF_new', 'BAT_HOME_ID_new',
    'HOME_BAT_LINEUP_ID_new', 'AWAY_BAT_LINEUP_ID_new'
]

rewards_wide = (
    rewards.reset_index()
    .pivot(index=reward_start_cols, columns=reward_end_cols, values='EVENT_RUNS_CT')
    .reindex(mind)   # one row per state in the full state space
    .fillna(0)       # unobserved transitions carry no reward
)
R = rewards_wide.to_numpy()

## Calculate Expected Runs

In [None]:
# Identity matrix with one row per row of P.
n_states = P.shape[0]
I = np.eye(n_states)

In [None]:
# Solve (I - P) v = Q for v.
system_matrix = I - P
v = np.linalg.solve(system_matrix, Q)

In [None]:
# Label the solved values with the state they belong to.
# The earlier version passed three iterables but six level names to
# from_product, which raises ValueError, and still used the stale name 'outs'.
# The index now has one level per state variable, in the same order as `mind`.
# NOTE(review): assumes v has one row per state of the full cartesian product,
# in this order - confirm against how P and Q were built.
mind2 = pd.MultiIndex.from_product(
    [outs_mat, bases_mat, diff_mat, top_mat, lineup_mat, lineup_mat],
    names = ['OUTS_CT', 'bases', 'SCORE_DIFF', 'BAT_HOME_ID', 'HOME_BAT_LINEUP_ID', 'AWAY_BAT_LINEUP_ID']
)
v_pd = pd.DataFrame(v, index=mind2)

## Compare with Actual Run Values

In [None]:
# Runs scored from each event (inclusive) to the end of its half-inning.
# The earlier version referenced 'runs' and 'BAT_LINEUP_ID', neither of which
# is a column of states_data; the per-event run count there is 'EVENT_RUNS_CT'.
states_data = states_data.sort_values(['GAME_ID', 'INN_CT', 'BAT_HOME_ID', 'EVENT_ID'])

half_inning_keys = ['GAME_ID', 'BAT_HOME_ID', 'INN_CT']
runs_so_far = states_data.groupby(half_inning_keys)['EVENT_RUNS_CT'].cumsum()
runs_in_half_inning = states_data.groupby(half_inning_keys)['EVENT_RUNS_CT'].transform('sum')

# Running total within the half-inning, up to and including this event.
states_data['current_inning_runs'] = runs_so_far

# Total runs of the half-inning, repeated on every one of its events.
states_data['inning_runs'] = runs_in_half_inning

# Remaining runs = total - running total, plus this event's own runs.
states_data['add_runs'] = (
    states_data['inning_runs'] - states_data['current_inning_runs'] + states_data['EVENT_RUNS_CT']
)

In [None]:
# Mean remaining runs by outs, bases and the lineup slot at bat.
# states_data has no 'outs' or 'BAT_LINEUP_ID' column. Outs are stored as
# 'OUTS_CT', and the batter's slot is the batting side's lineup column.
batter_slot = pd.Series(
    np.where(
        states_data['BAT_HOME_ID'] == 1,
        states_data['HOME_BAT_LINEUP_ID'],
        states_data['AWAY_BAT_LINEUP_ID'],
    ),
    index=states_data.index,
    name='BAT_LINEUP_ID',
)
states_data.groupby(['OUTS_CT', 'bases', batter_slot])['add_runs'].agg('mean').to_frame().head(15)

In [None]:
# Show the first 15 solved state values.
v_pd.iloc[:15]