# Generate Field Outcome
 - Given the batter has made contact, and the hit characteristics are given, what happens on the field?
 - Is the ball caught, do runners advance, is it a home run?
 - Model must factor in current game state, i.e., bases, etc.

## Potential Difficulties:
 - Maybe need to adjust for outfielders?
 - How to factor stealing in, this also needs to be worked on in the bat outcome as it can happen on strikes/balls?

In [1]:
from src.data.data_utils import query_mlb_db

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)


In [61]:
df = query_mlb_db("""
select game_pk, inning, inning_topbot, at_bat_number, pitch_number, outs_when_up, post_bat_score - bat_score as runs_scored, stand,
CASE
	WHEN events IN ('single') THEN 'single'
	WHEN events IN ('double') THEN 'double'
	WHEN events IN ('triple') THEN 'triple'
	WHEN events IN ('home_run') THEN 'home_run'
	WHEN events IN ('field_out') THEN 'field_out'
	WHEN events IN ('ground_out', 'force_out') THEN 'ground_out'
	WHEN events IN ('fly_out', 'sac_fly') THEN 'fly_out'
	WHEN events IN ('double_play', 'grounded_into_double_play', 'sac_fly_double_play') THEN 'double_play'
	WHEN events IN ('triple_play') THEN 'triple_play'
	WHEN events IN ('field_error') THEN 'fielding_error'
	WHEN events IN ('fielders_choice') THEN 'fielders_choice'
	ELSE NULL
END AS simplified_outcome,
batter, on_1b, on_2b, on_3b
from Statcast 
where game_year > 2020
and inning < 9 
order by game_date, game_pk, inning, at_bat_number, pitch_number
;
""")

In [45]:
df[df['simplified_outcome']=='ground_out'][['outs_when_up','stand','on_1b', 'on_2b', 'on_3b']]

Unnamed: 0,outs_when_up,stand,on_1b,on_2b,on_3b
9,0,R,660670.0,,
29,2,R,595909.0,650333.0,
91,2,R,467055.0,,
229,0,R,670156.0,656484.0,
302,1,L,519299.0,,
...,...,...,...,...,...
2551415,0,R,621493.0,,
2551543,1,L,663656.0,,
2551928,1,R,669134.0,,
2551994,1,L,606115.0,621566.0,


In [69]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 409521 entries, 1 to 2305676
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   game_pk             409521 non-null  int64  
 1   inning              409521 non-null  int64  
 2   inning_topbot       409521 non-null  object 
 3   at_bat_number       409521 non-null  int64  
 4   pitch_number        409521 non-null  int64  
 5   outs_when_up        409521 non-null  int64  
 6   runs_scored         409521 non-null  int64  
 7   stand               409521 non-null  object 
 8   simplified_outcome  409521 non-null  object 
 9   batter              409521 non-null  int64  
 10  on_1b               127155 non-null  float64
 11  on_2b               76051 non-null   float64
 12  on_3b               38990 non-null   float64
 13  outs_after          409521 non-null  int64  
 14  on_1b_after         119232 non-null  float64
 15  on_2b_after         68476 non-null   f

In [74]:
df

Unnamed: 0,game_pk,inning,inning_topbot,at_bat_number,pitch_number,outs_when_up,runs_scored,stand,simplified_outcome,batter,on_1b,on_2b,on_3b,outs_after,on_1b_after,on_2b_after,on_3b_after,initial_mapping
1,641542,1,Top,1,2,0,0,L,single,650333,,,,0,650333.0,,,"{650333: 'batter', nan: '1b', nan: '2b', nan: ..."
2,641542,1,Top,2,1,0,0,L,field_out,595909,650333.0,,,1,650333.0,,,"{595909: 'batter', 650333.0: '1b', nan: '2b', ..."
3,641542,1,Top,3,1,1,0,R,field_out,680777,650333.0,,,2,650333.0,,,"{680777: 'batter', 650333.0: '1b', nan: '2b', ..."
4,641542,1,Top,4,1,2,0,L,field_out,666135,650333.0,,,3,,,,"{666135: 'batter', 650333.0: '1b', nan: '2b', ..."
9,641542,1,Bot,6,1,0,0,R,ground_out,645277,660670.0,,,1,645277.0,,,"{645277: 'batter', 660670.0: '1b', nan: '2b', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305649,747121,8,Top,59,1,2,0,L,single,595777,650333.0,,,2,595777.0,650333.0,,"{595777: 'batter', 650333.0: '1b', nan: '2b', ..."
2305655,747121,8,Top,60,6,2,0,L,field_out,630105,595777.0,650333.0,,3,,,,"{630105: 'batter', 595777.0: '1b', 650333.0: '..."
2305672,747121,8,Bot,63,6,1,0,R,single,518595,542303.0,,,1,518595.0,542303.0,,"{518595: 'batter', 542303.0: '1b', nan: '2b', ..."
2305674,747121,8,Bot,64,2,1,0,R,ground_out,594807,518595.0,542303.0,,2,594807.0,,542303.0,"{594807: 'batter', 518595.0: '1b', 542303.0: '..."


In [66]:
df['outs_after'] = df.groupby(['game_pk', 'inning', 'inning_topbot'])['outs_when_up'].shift(-1, fill_value=3)
for col in ('on_1b_after', 'on_2b_after', 'on_3b_after'):
    df[col] = df.groupby(['game_pk', 'inning', 'inning_topbot'])[col[:5]].shift(-1, fill_value=np.nan)

In [71]:
#only care about hit outcome events
df = df[~df['simplified_outcome'].isna()]

In [73]:
def create_initial_mapping(row):
    mapping = {
        row['batter']: 'batter',
        row['on_1b']: '1b',
        row['on_2b']: '2b',
        row['on_3b']: '3b'
    }
    return mapping

df['initial_mapping'] = df.apply(create_initial_mapping, axis=1)

In [75]:
def encode_after_state(row):
    mapping = row['initial_mapping']
    after_state = {
        '1b_after': mapping.get(row['on_1b_after'], False),
        '2b_after': mapping.get(row['on_2b_after'], False),
        '3b_after': mapping.get(row['on_3b_after'], False)
    }
    return after_state

# Apply the function to each row to get the after state encoding
after_states = df.apply(encode_after_state, axis=1)kjjjj

In [76]:
after_states_df = pd.DataFrame(after_states.tolist())

In [95]:
# Merge the after states with the original data
data = pd.concat([df.reset_index(drop=True), after_states_df], axis=1)

In [96]:
data

Unnamed: 0,game_pk,inning,inning_topbot,at_bat_number,pitch_number,outs_when_up,runs_scored,stand,simplified_outcome,batter,on_1b,on_2b,on_3b,outs_after,on_1b_after,on_2b_after,on_3b_after,initial_mapping,1b_after,2b_after,3b_after
0,641542,1,Top,1,2,0,0,L,single,650333,,,,0,650333.0,,,"{650333: 'batter', nan: '1b', nan: '2b', nan: ...",batter,False,False
1,641542,1,Top,2,1,0,0,L,field_out,595909,650333.0,,,1,650333.0,,,"{595909: 'batter', 650333.0: '1b', nan: '2b', ...",1b,False,False
2,641542,1,Top,3,1,1,0,R,field_out,680777,650333.0,,,2,650333.0,,,"{680777: 'batter', 650333.0: '1b', nan: '2b', ...",1b,False,False
3,641542,1,Top,4,1,2,0,L,field_out,666135,650333.0,,,3,,,,"{666135: 'batter', 650333.0: '1b', nan: '2b', ...",False,False,False
4,641542,1,Bot,6,1,0,0,R,ground_out,645277,660670.0,,,1,645277.0,,,"{645277: 'batter', 660670.0: '1b', nan: '2b', ...",batter,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409516,747121,8,Top,59,1,2,0,L,single,595777,650333.0,,,2,595777.0,650333.0,,"{595777: 'batter', 650333.0: '1b', nan: '2b', ...",batter,1b,False
409517,747121,8,Top,60,6,2,0,L,field_out,630105,595777.0,650333.0,,3,,,,"{630105: 'batter', 595777.0: '1b', 650333.0: '...",False,False,False
409518,747121,8,Bot,63,6,1,0,R,single,518595,542303.0,,,1,518595.0,542303.0,,"{518595: 'batter', 542303.0: '1b', nan: '2b', ...",batter,1b,False
409519,747121,8,Bot,64,2,1,0,R,ground_out,594807,518595.0,542303.0,,2,594807.0,,542303.0,"{594807: 'batter', 518595.0: '1b', 542303.0: '...",batter,False,2b


In [97]:
data['on_1b'] = data['on_1b'].notnull().astype(bool)
data['on_2b'] = data['on_2b'].notnull().astype(bool)
data['on_3b'] = data['on_3b'].notnull().astype(bool)

In [98]:
def represent_state(row):
    state_before = (row['outs_when_up'], (row['on_1b'], row['on_2b'], row['on_3b']))
    state_after = (row['outs_after'], (row['1b_after'], row['2b_after'], row['3b_after']), row['runs_scored'])
    return state_before, state_after

# Apply the function to get the state representations
data['state_before'], data['state_after'] = zip(*data.apply(represent_state, axis=1))
state_transition_df = data[['simplified_outcome', 'stand', 'state_before', 'state_after']]

In [99]:
state_transition_df

Unnamed: 0,simplified_outcome,stand,state_before,state_after
0,single,L,"(0, (False, False, False))","(0, (batter, False, False), 0)"
1,field_out,L,"(0, (True, False, False))","(1, (1b, False, False), 0)"
2,field_out,R,"(1, (True, False, False))","(2, (1b, False, False), 0)"
3,field_out,L,"(2, (True, False, False))","(3, (False, False, False), 0)"
4,ground_out,R,"(0, (True, False, False))","(1, (batter, False, False), 0)"
...,...,...,...,...
409516,single,L,"(2, (True, False, False))","(2, (batter, 1b, False), 0)"
409517,field_out,L,"(2, (True, True, False))","(3, (False, False, False), 0)"
409518,single,R,"(1, (True, False, False))","(1, (batter, 1b, False), 0)"
409519,ground_out,R,"(1, (True, True, False))","(2, (batter, False, 2b), 0)"


In [100]:
from collections import defaultdict

def compute_state_transition_probs(df):
    transition_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(int))))

    for row in df.itertuples():
        #event, cur_state, next_state
        transition_counts[row[1]][row[2]][row[3]][row[4]] += 1

    # Convert transition counts to probabilities
    transition_probs = defaultdict(lambda: defaultdict(dict))
    
    #for event, cur_state_dict in transition_counts.items():
    #    for cur_state, next_state_dict in cur_state_dict.items():
    #        total_transitions = sum(next_state_dict.values())
    #        transition_probs[event][cur_state] = {next_state: count / total_transitions for next_state, count in next_state_dict.items()}

    for event, stand_dict in transition_counts.items():
        for stand, cur_state_dict in stand_dict.items():
            for cur_state, next_state_dict in cur_state_dict.items():
                total_transitions = sum(next_state_dict.values())
                transition_probs[event][stand][cur_state] = {next_state: count / total_transitions for next_state, count in next_state_dict.items()}

    return transition_probs


t_probs = compute_state_transition_probs(state_transition_df)

In [285]:
t_probs['field_out']['R'][(1,(False, False, False))]

{(2, (False, False, False), 0): 1.0}

In [274]:
from pprint import pprint
def sample_next_state(event, stand, cur_state, transition_probs, print_probs=False):
    if event not in transition_probs or stand not in transition_probs[event] or cur_state not in transition_probs[event][stand]:
        print('ERROR: KEY NOT FOUND')
        return None  # No transitions available for this event and current state

    next_states = list(transition_probs[event][stand][cur_state].keys())
    probabilities = list(transition_probs[event][stand][cur_state].values())

    if print_probs:
        pprint(transition_probs[event][stand][cur_state])
        
    next_state = np.random.choice(len(next_states), p=probabilities)
    return next_states[next_state]

# Example of using the sample_next_state function
event = 'single'
cur_state = (1, (True, False, True))
stand = 'R'
next_state = sample_next_state(event, stand, cur_state, t_probs)
print(next_state)

(1, ('batter', '1b', False), 1)


In [213]:
import pickle

with open('game_state_t_probs.pkl', 'wb') as f:
    pickle.dump(dict(t_probs), f)

In [2]:
import pickle
with open('game_state_t_probs.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

In [22]:
bases = {'1b': 1, '2b': None, '3b': None}
outs=2

In [29]:
state_encoded = (outs,tuple(bases[base] is not None for base in ['1b', '2b', '3b']))

In [30]:
state_encoded

(2, (True, False, False))

In [38]:
list(loaded_dict['single']['L'][state_encoded].keys())[0]

(2, ('batter', '1b', False), 0)

In [275]:
sample_next_state('groundout', stand, cur_state, loaded_dict, True)

ERROR: KEY NOT FOUND


In [None]:
def encode_after_state(row):
    mapping = row['initial_mapping']
    after_state = {
        '1b_after': mapping.get(row['on_1b_after'], np.nan),
        '2b_after': mapping.get(row['on_2b_after'], np.nan),
        '3b_after': mapping.get(row['on_3b_after'], np.nan)
    }
    return after_state

df['initial_mapping'] = df.apply(initial_state_map, axis=1)
after_states = df.apply(encode_after_state, axis=1)
after_states_df = pd.DataFrame(after_states.tolist())

# Merge the after states with the original data
data = pd.concat([df, after_states_df], axis=1)
test = data.drop('initial_mapping', axis=1)[['game_pk','inning', 'inning_topbot', 'at_bat_number', 'pitch_number',
                                             'simplified_outcome',
                                             'runs_scored', 'outs_when_up', 'outs_after',
                                            'batter', 'on_1b', 'on_2b', 'on_3b',
                                            '1b_after', '2b_after', '3b_after', 
                                            ]]
test[~test['simplified_outcome'].isna()].to_csv('test.csv', index=False)

In [60]:
test2 = test[~test['simplified_outcome'].isna()]
test2['before_state'] = test2.apply(encode_before_state,axis=1)
#test2['after_state'] = test2.apply(encode_after_state, axis=1)
test2[['simplified_outcome', 'before_state']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test2['before_state'] = test2.apply(encode_before_state,axis=1)


Unnamed: 0,simplified_outcome,before_state
3,single,"(0, [1b, 2b, 3b])"
11,single,"(1, [1b, 2b, 3b])"
15,double_play,"(1, [1b, 2b, 3b])"
18,field_out,"(0, [1b, 2b, 3b])"
20,double,"(1, [1b, 2b, 3b])"
...,...,...
9981,field_out,"(1, [1b, 2b, 3b])"
9986,double,"(2, [1b, 2b, 3b])"
9988,field_out,"(2, [1b, 2b, 3b])"
9993,field_out,"(0, [1b, 2b, 3b])"


In [39]:
test[~test['simplified_outcome'].isna()]

Unnamed: 0,game_pk,inning,inning_topbot,at_bat_number,pitch_number,outs_when_up,bat_score,batter,stand,simplified_outcome,on_1b,on_2b,on_3b,outs_after,bat_score_after,on_1b_after,on_2b_after,on_3b_after,1b_after,2b_after,3b_after
3,490099,1,Top,1,4,0,0,656941,L,single,,,,0,0,656941.0,,,batter,,
11,490099,1,Top,3,3,1,0,519203,L,single,656941.0,,,1,1,519203.0,656941.0,,batter,1b,
15,490099,1,Top,4,4,1,0,450314,L,double_play,519203.0,656941.0,,3,3,,,,,,
18,490099,1,Bot,5,3,0,0,451594,R,field_out,,,,1,1,,,,,,
20,490099,1,Bot,6,2,1,0,649557,R,double,,,,1,1,,649557.0,,,batter,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9981,490133,2,Bot,14,4,1,0,493329,R,field_out,,,,2,2,,,,,,
9986,490133,2,Bot,15,5,2,0,594828,R,double,,,,2,2,,594828.0,,,batter,
9988,490133,2,Bot,16,2,2,0,502210,L,field_out,,594828.0,,3,3,,,,,,
9993,490133,3,Top,17,5,0,0,547982,L,field_out,,,,1,1,,,,,,


In [21]:
df.head(50)

Unnamed: 0,game_pk,inning,inning_topbot,at_bat_number,pitch_number,outs_when_up,bat_score,batter,stand,simplified_outcome,on_1b,on_2b,on_3b,outs_after,bat_score_after,on_1b_after,on_2b_after,on_3b_after
0,490099,1,Top,1,1,0,0,656941,L,,,,,0,0,,,
1,490099,1,Top,1,2,0,0,656941,L,,,,,0,0,,,
2,490099,1,Top,1,3,0,0,656941,L,,,,,0,0,,,
3,490099,1,Top,1,4,0,0,656941,L,single,,,,0,0,656941.0,,
4,490099,1,Top,2,1,0,0,592178,R,,656941.0,,,0,0,656941.0,,
5,490099,1,Top,2,2,0,0,592178,R,,656941.0,,,0,0,656941.0,,
6,490099,1,Top,2,3,0,0,592178,R,,656941.0,,,0,0,656941.0,,
7,490099,1,Top,2,4,0,0,592178,R,,656941.0,,,0,0,656941.0,,
8,490099,1,Top,2,5,0,0,592178,R,,656941.0,,,1,1,656941.0,,
9,490099,1,Top,3,1,1,0,519203,L,,656941.0,,,1,1,656941.0,,


In [None]:
import pandas as pd
import numpy as np

# Sample game states representation
# (outs, [first_base, second_base, third_base])
initial_state = (0, [1, 0, 0])  # 0 outs, runner on first

# Load your Statcast data
data = pd.read_csv('statcast_data.csv')

# Assuming you have the necessary columns in your data
# 'launch_speed', 'launch_angle', 'spray_angle', 'batter_speed', 'runner_speed', 'event', 'outs_before', 'runners_before', 'outs_after', 'runners_after'

# Define a function to compute transition probabilities
def compute_transition_probabilities(data):
    transition_counts = {}
    transition_probs = {}
    
    for idx, row in data.iterrows():
        state_before = (row['outs_before'], tuple(row['runners_before']))
        state_after = (row['outs_after'], tuple(row['runners_after']))
        event = row['event']
        
        if state_before not in transition_counts:
            transition_counts[state_before] = {}
        if state_after not in transition_counts[state_before]:
            transition_counts[state_before][state_after] = 0
        
        transition_counts[state_before][state_after] += 1
    
    # Convert counts to probabilities
    for state_before in transition_counts:
        total_transitions = sum(transition_counts[state_before].values())
        transition_probs[state_before] = {state_after: count / total_transitions
                                          for state_after, count in transition_counts[state_before].items()}
    
    return transition_probs

transition_probs = compute_transition_probabilities(data)

# Define a function to simulate game state transitions
def simulate_game_state(initial_state, event, transition_probs):
    state = initial_state
    
    if state in transition_probs:
        possible_transitions = transition_probs[state]
        next_states = list(possible_transitions.keys())
        probabilities = list(possible_transitions.values())
        
        next_state = np.random.choice(next_states, p=probabilities)
        return next_state
    else:
        # Handle unseen states or edge cases
        return state

# Example simulation
current_state = initial_state
event = 'single'  # Example event

# Assuming batter speed and runner speeds are taken into account when generating probabilities
next_state = simulate_game_state(current_state, event, transition_probs)
print("Next State:", next_state)
