In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import typing

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import defaultdict

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [19]:
raw_df = pd.read_csv('../../data/raw_data_curmap.csv')
exploded_train_df = pd.read_csv("gs://pitch-sequencing/sequence_data/full_sequence_data/v2/exploded/large_cur_train.csv")
exploded_valid_df = pd.read_csv('gs://pitch-sequencing/sequence_data/full_sequence_data/v2/exploded/large_cur_validation.csv')
exploded_test_df = pd.read_csv('gs://pitch-sequencing/sequence_data/full_sequence_data/v2/exploded/large_cur_test.csv')

In [20]:
exploded_test_df.head(10)

Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,game_date
0,"CB,SL","0-0,0-1",913,L,R,666129,621028,35,2023-05-14
1,"CB,SL,SI","0-0,0-1,0-2",9132,L,R,666129,621028,35,2023-05-14
2,"CB,SL,SI,SL","0-0,0-1,0-2,0-2",913214,L,R,666129,621028,35,2023-05-14
3,"CB,SL,SI,SL,SI","0-0,0-1,0-2,0-2,1-2",9132144,L,R,666129,621028,35,2023-05-14
4,"SI,CH","0-0,0-1",411,R,L,453562,520471,13,2018-08-12
5,"SI,CH,SI","0-0,0-1,1-1",4113,R,L,453562,520471,13,2018-08-12
6,"SI,CH,SI,CB","0-0,0-1,1-1,1-2",41138,R,L,453562,520471,13,2018-08-12
7,"FF,CB","0-0,0-1",914,R,L,656546,606466,19,2019-06-20
8,"FF,CB,FF","0-0,0-1,0-2",91414,R,L,656546,606466,19,2019-06-20
9,"FF,CB,FF,CB","0-0,0-1,0-2,1-2",9141411,R,L,656546,606466,19,2019-06-20


In [5]:
def add_pitch_number_to_df(df: pd.DataFrame):
    df['at_bat_pitch_number'] = df['pitch_sequence'].apply(lambda seq: len(seq.split(',')))

def add_model_inputs(df: pd.DataFrame):
    df['target_pitch'] = df['pitch_sequence'].apply(lambda x: x.split(',')[-1])
    df['setup_count'] = df['count_sequence'].apply(lambda x: x.split(',')[-1])
    df['input_pitch_sequence'] = df['pitch_sequence'].apply(lambda x: ','.join(x.split(',')[:-1]))

In [21]:
add_pitch_number_to_df(exploded_train_df)
add_pitch_number_to_df(exploded_valid_df)
add_pitch_number_to_df(exploded_test_df)

In [22]:
add_model_inputs(exploded_train_df)
add_model_inputs(exploded_valid_df)
add_model_inputs(exploded_test_df)

In [23]:
exploded_test_df.head(10)

Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,game_date,at_bat_pitch_number,target_pitch,setup_count,input_pitch_sequence
0,"CB,SL","0-0,0-1",913,L,R,666129,621028,35,2023-05-14,2,SL,0-1,CB
1,"CB,SL,SI","0-0,0-1,0-2",9132,L,R,666129,621028,35,2023-05-14,3,SI,0-2,"CB,SL"
2,"CB,SL,SI,SL","0-0,0-1,0-2,0-2",913214,L,R,666129,621028,35,2023-05-14,4,SL,0-2,"CB,SL,SI"
3,"CB,SL,SI,SL,SI","0-0,0-1,0-2,0-2,1-2",9132144,L,R,666129,621028,35,2023-05-14,5,SI,1-2,"CB,SL,SI,SL"
4,"SI,CH","0-0,0-1",411,R,L,453562,520471,13,2018-08-12,2,CH,0-1,SI
5,"SI,CH,SI","0-0,0-1,1-1",4113,R,L,453562,520471,13,2018-08-12,3,SI,1-1,"SI,CH"
6,"SI,CH,SI,CB","0-0,0-1,1-1,1-2",41138,R,L,453562,520471,13,2018-08-12,4,CB,1-2,"SI,CH,SI"
7,"FF,CB","0-0,0-1",914,R,L,656546,606466,19,2019-06-20,2,CB,0-1,FF
8,"FF,CB,FF","0-0,0-1,0-2",91414,R,L,656546,606466,19,2019-06-20,3,FF,0-2,"FF,CB"
9,"FF,CB,FF,CB","0-0,0-1,0-2,1-2",9141411,R,L,656546,606466,19,2019-06-20,4,CB,1-2,"FF,CB,FF"


In [24]:
raw_df.head(10)

Unnamed: 0,game_date,pitch_number,batter,pitcher,pitch_type,events,at_bat_number,zone,outs_when_up,p_throws,stand,type,bb_type,balls,strikes,player_name,on_3b,on_2b,on_1b
0,2017-04-02,1,656941,593372,FF,,1,12.0,0,R,L,B,,0,0,"Martinez, Carlos",,,
1,2017-04-02,4,592450,643493,FF,fielders_choice,75,5.0,0,R,R,X,ground_ball,2,1,"Pruitt, Austin",,,452104.0
2,2017-04-02,1,543305,643493,CB,,76,9.0,0,R,L,S,,0,0,"Pruitt, Austin",,452104.0,592450.0
3,2017-04-02,2,543305,643493,CH,,76,14.0,0,R,L,B,,0,1,"Pruitt, Austin",,452104.0,592450.0
4,2017-04-02,3,543305,643493,SI,,76,13.0,0,R,L,B,,1,1,"Pruitt, Austin",,452104.0,592450.0
5,2017-04-02,4,543305,643493,SI,single,76,4.0,0,R,L,X,line_drive,2,1,"Pruitt, Austin",,452104.0,592450.0
6,2017-04-02,1,474892,517008,FC,,77,7.0,0,R,R,S,,0,0,"Colomé, Alex",452104.0,592450.0,543305.0
7,2017-04-02,2,474892,517008,FC,,77,13.0,0,R,R,B,,0,1,"Colomé, Alex",452104.0,592450.0,543305.0
8,2017-04-02,3,474892,517008,FC,,77,14.0,0,R,R,S,,1,1,"Colomé, Alex",452104.0,592450.0,543305.0
9,2017-04-02,4,474892,517008,FC,,77,14.0,0,R,R,B,,1,2,"Colomé, Alex",452104.0,592450.0,543305.0


In [32]:
raw_df.rename(columns={
    'pitcher': 'pitcher_id',
    'batter': 'batter_id',
    'pitch_number': 'at_bat_pitch_number',
    'pitch_type': 'target_pitch'
}, inplace=True)

cleaned_raw_df = raw_df.drop(['p_throws', 'stand', 'balls', 'strikes', 'player_name'], axis=1)
cleaned_raw_df.head(2)

Unnamed: 0,game_date,at_bat_pitch_number,batter_id,pitcher_id,target_pitch,events,at_bat_number,zone,outs_when_up,type,bb_type,on_3b,on_2b,on_1b
0,2017-04-02,1,656941,593372,FF,,1,12.0,0,B,,,,
1,2017-04-02,4,592450,643493,FF,fielders_choice,75,5.0,0,X,ground_ball,,,452104.0


In [46]:
result_test_df = pd.merge(exploded_test_df, cleaned_raw_df, on=['game_date', 'pitcher_id', 'batter_id', 'at_bat_number', 'at_bat_pitch_number', 'target_pitch'], how='left')
print(len(exploded_test_df))
print(len(result_test_df))
result_test_df.head(5)

339233
339233


Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,game_date,at_bat_pitch_number,...,setup_count,input_pitch_sequence,events,zone,outs_when_up,type,bb_type,on_3b,on_2b,on_1b
0,"CB,SL","0-0,0-1",913,L,R,666129,621028,35,2023-05-14,2,...,0-1,CB,,13.0,1.0,S,,,571976.0,
1,"CB,SL,SI","0-0,0-1,0-2",9132,L,R,666129,621028,35,2023-05-14,3,...,0-2,"CB,SL",,2.0,1.0,S,,,571976.0,
2,"CB,SL,SI,SL","0-0,0-1,0-2,0-2",913214,L,R,666129,621028,35,2023-05-14,4,...,0-2,"CB,SL,SI",,14.0,1.0,B,,,571976.0,
3,"CB,SL,SI,SL,SI","0-0,0-1,0-2,0-2,1-2",9132144,L,R,666129,621028,35,2023-05-14,5,...,1-2,"CB,SL,SI,SL",field_out,4.0,1.0,X,ground_ball,,571976.0,
4,"SI,CH","0-0,0-1",411,R,L,453562,520471,13,2018-08-12,2,...,0-1,SI,,11.0,0.0,B,,,,595978.0


In [34]:
result_train_df = pd.merge(exploded_train_df, cleaned_raw_df, on=['game_date', 'pitcher_id', 'batter_id', 'at_bat_number', 'at_bat_pitch_number', 'target_pitch'], how='left')
print(len(exploded_train_df))
print(len(result_train_df))
result_train_df.head(5)

2375112
2375113


Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,game_date,at_bat_pitch_number,...,setup_count,input_pitch_sequence,events,zone,outs_when_up,type,bb_type,on_3b,on_2b,on_1b
0,"FF,CB","0-0,0-1",214,R,L,572955,656976,54,2021-04-27,2,...,0-1,FF,,14.0,2.0,B,,,,
1,"FF,CB,FF","0-0,0-1,1-1",21413,R,L,572955,656976,54,2021-04-27,3,...,1-1,"FF,CB",,13.0,2.0,S,,,,
2,"FF,CB,FF,FF","0-0,0-1,1-1,1-2",2141312,R,L,572955,656976,54,2021-04-27,4,...,1-2,"FF,CB,FF",,12.0,2.0,B,,,,
3,"FF,CB,FF,FF,CB","0-0,0-1,1-1,1-2,2-2",214131214,R,L,572955,656976,54,2021-04-27,5,...,2-2,"FF,CB,FF,FF",,14.0,2.0,B,,,,
4,"FF,CB,FF,FF,CB,CB","0-0,0-1,1-1,1-2,2-2,3-2",2141312148,R,L,572955,656976,54,2021-04-27,6,...,3-2,"FF,CB,FF,FF,CB",strikeout,8.0,2.0,S,,,,


In [44]:
result_validation_df = pd.merge(exploded_valid_df, cleaned_raw_df, on=['game_date', 'pitcher_id', 'batter_id', 'at_bat_number', 'at_bat_pitch_number', 'target_pitch'], how='left')
print(len(exploded_valid_df))
print(len(result_validation_df))
result_validation_df.head(5)

678753
678753


game_date
2020-09-04    893
2023-04-18    853
2020-09-18    816
2018-09-26    816
2022-05-17    815
             ... 
2019-03-22     22
2021-10-23     21
2019-10-22     20
2017-10-24     16
2019-10-23     16
Name: count, Length: 1375, dtype: int64

In [36]:
# why is 
duplicate_train_rows = result_train_df.groupby(['game_date', 'pitcher_id', 'batter_id', 'at_bat_number', 'at_bat_pitch_number', 'target_pitch']).size()
duplicate_groups = duplicate_train_rows[duplicate_train_rows > 1].reset_index()
duplicate_groups.head(10)

Unnamed: 0,game_date,pitcher_id,batter_id,at_bat_number,at_bat_pitch_number,target_pitch,0
0,2022-07-16,571710,624413,76,3,FF,2


In [43]:
cleaned_raw_df[(cleaned_raw_df['game_date'] == '2022-07-16') & (cleaned_raw_df['batter_id'] == 624413) & (cleaned_raw_df['at_bat_number'] == 76)]

Unnamed: 0,game_date,at_bat_pitch_number,batter_id,pitcher_id,target_pitch,events,at_bat_number,zone,outs_when_up,type,bb_type,on_3b,on_2b,on_1b
3719689,2022-07-16,4,624413,571710,FF,sac_fly,76,4.0,0,X,fly_ball,641645.0,,596019.0
3719690,2022-07-16,3,624413,571710,FF,,76,1.0,0,S,,641645.0,,596019.0
3719691,2022-07-16,2,624413,571710,FF,,76,11.0,0,B,,641645.0,,596019.0
3719786,2022-07-16,1,624413,571710,FF,,76,11.0,0,B,,641645.0,,596019.0
3719973,2022-07-16,3,624413,571710,FF,hit_by_pitch,76,11.0,0,B,,607043.0,516782.0,596019.0
3719974,2022-07-16,2,624413,571710,SL,,76,5.0,0,S,,607043.0,516782.0,596019.0
3719975,2022-07-16,1,624413,571710,FF,,76,5.0,0,S,,607043.0,516782.0,596019.0


In [47]:
result_train_df.to_csv('gs://pitch-sequencing/sequence_data/full_sequence_data/v2/kitchensink/exploded/large_cur_train.csv', index=False)
result_validation_df.to_csv('gs://pitch-sequencing/sequence_data/full_sequence_data/v2/kitchensink/exploded/large_cur_validation.csv', index=False)
result_test_df.to_csv('gs://pitch-sequencing/sequence_data/full_sequence_data/v2/kitchensink/exploded/large_cur_test.csv', index=False)