In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import typing

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import defaultdict

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [3]:
# Load the raw pitch-level CSV and preview the first ten rows.
RAW_DATA_PATH = '../../data/raw_data_curmap.csv'
raw_df = pd.read_csv(RAW_DATA_PATH)
raw_df.head(10)

Unnamed: 0,game_date,pitch_number,batter,pitcher,pitch_type,events,at_bat_number,zone,outs_when_up,p_throws,stand,type,bb_type,balls,strikes,player_name,on_3b,on_2b,on_1b
0,2017-04-02,1,656941,593372,FF,,1,12.0,0,R,L,B,,0,0,"Martinez, Carlos",,,
1,2017-04-02,4,592450,643493,FF,fielders_choice,75,5.0,0,R,R,X,ground_ball,2,1,"Pruitt, Austin",,,452104.0
2,2017-04-02,1,543305,643493,CB,,76,9.0,0,R,L,S,,0,0,"Pruitt, Austin",,452104.0,592450.0
3,2017-04-02,2,543305,643493,CH,,76,14.0,0,R,L,B,,0,1,"Pruitt, Austin",,452104.0,592450.0
4,2017-04-02,3,543305,643493,SI,,76,13.0,0,R,L,B,,1,1,"Pruitt, Austin",,452104.0,592450.0
5,2017-04-02,4,543305,643493,SI,single,76,4.0,0,R,L,X,line_drive,2,1,"Pruitt, Austin",,452104.0,592450.0
6,2017-04-02,1,474892,517008,FC,,77,7.0,0,R,R,S,,0,0,"Colomé, Alex",452104.0,592450.0,543305.0
7,2017-04-02,2,474892,517008,FC,,77,13.0,0,R,R,B,,0,1,"Colomé, Alex",452104.0,592450.0,543305.0
8,2017-04-02,3,474892,517008,FC,,77,14.0,0,R,R,S,,1,1,"Colomé, Alex",452104.0,592450.0,543305.0
9,2017-04-02,4,474892,517008,FC,,77,14.0,0,R,R,B,,1,2,"Colomé, Alex",452104.0,592450.0,543305.0


In [27]:
# Drop rows that have no recorded pitch_type; everything below needs a label.
raw_df = raw_df.dropna(subset=['pitch_type'])

# One row per pitcher, holding the distinct pitch types seen for that pitcher
# (in order of first appearance, as returned by Series.unique()).
unique_pitches = (
    raw_df.groupby('pitcher')['pitch_type']
    .agg(lambda pitches: list(pitches.unique()))
    .reset_index()
    .rename(columns={'pitch_type': 'pitch_arsenal'})
)

# Convenience columns: comma-joined arsenal string and number of distinct pitch types.
unique_pitches['pitch_arsenal_csv'] = unique_pitches['pitch_arsenal'].map(','.join)
unique_pitches['arsenal_size'] = unique_pitches['pitch_arsenal'].map(len)

In [28]:
# Largest number of distinct pitch types used by any single pitcher.
# arsenal_size already stores len(pitch_arsenal) for every row.
max_length = unique_pitches['arsenal_size'].max()
print(max_length)

9


In [30]:
# Locate the row with the biggest arsenal (first one on ties, per idxmax)
# and report that pitcher's id together with the pitch types they threw.
max_index = unique_pitches['arsenal_size'].idxmax()
pitcher_with_largest_arsenal = unique_pitches.at[max_index, 'pitcher']
pitch_types = unique_pitches.at[max_index, 'pitch_arsenal']
print(pitcher_with_largest_arsenal, pitch_types, sep='\n')

506433
['FC', 'SI', 'FF', 'ST', 'CB', 'CH', 'SL', 'FS', 'PO']


In [31]:
# Show every pitch thrown by the pitcher with the largest arsenal.
# Uses the id computed in the previous cell rather than a hard-coded literal,
# so this view stays in sync if the underlying data (and thus the pitcher) changes.
raw_df[raw_df['pitcher'] == pitcher_with_largest_arsenal]

Unnamed: 0,game_date,pitch_number,batter,pitcher,pitch_type,events,at_bat_number,zone,outs_when_up,p_throws,stand,type,bb_type,balls,strikes,player_name,on_3b,on_2b,on_1b
2483,2017-04-03,1,571980,506433,FC,,51,4.0,0,R,L,B,,0,0,"Darvish, Yu",,,
2484,2017-04-03,2,571980,506433,SI,field_out,51,13.0,0,R,L,X,line_drive,1,0,"Darvish, Yu",,,
2485,2017-04-03,1,650490,506433,FF,,52,5.0,1,R,R,S,,0,0,"Darvish, Yu",,,
2486,2017-04-03,2,650490,506433,ST,,52,14.0,1,R,R,B,,0,1,"Darvish, Yu",,,
2487,2017-04-03,3,650490,506433,FF,double,52,14.0,1,R,R,X,line_drive,1,1,"Darvish, Yu",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4649822,2023-08-25,2,467793,506433,FF,,8,13.0,2,R,L,B,,0,1,"Darvish, Yu",,,
4649823,2023-08-25,3,467793,506433,FS,,8,14.0,2,R,L,B,,1,1,"Darvish, Yu",,,
4649824,2023-08-25,4,467793,506433,FS,,8,6.0,2,R,L,S,,2,1,"Darvish, Yu",,,
4649826,2023-08-25,5,467793,506433,FS,strikeout,8,7.0,2,R,L,S,,2,2,"Darvish, Yu",,,


In [38]:
# Count rows for each (pitcher, pitch_type) pair, then pivot pitch types into
# columns so each pitcher is one row; combinations that never occur become 0.
pitcher_pitch_groups = raw_df.groupby(['pitcher', 'pitch_type'])
pitch_counts = pitcher_pitch_groups.size().unstack(level='pitch_type', fill_value=0)
pitch_counts.head(10)

pitch_type,CB,CH,FC,FF,FS,KN,PO,SI,SL,ST
pitcher,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
112526,0,470,69,815,0,0,0,2824,330,0
276351,0,20,0,485,0,0,0,24,269,0
276520,322,111,86,270,0,0,0,120,238,0
276542,1,240,0,423,0,0,0,89,108,0
279571,153,6,0,698,0,0,0,139,522,0
282332,0,857,2680,54,0,0,0,1199,2175,0
285079,0,51,0,488,0,2329,1,0,0,0
405395,8,0,0,19,0,0,0,0,0,0
407793,211,128,1006,1111,0,0,0,368,0,0
407822,89,0,232,852,706,0,0,4,0,0


In [45]:
# Collapse each pitcher's row of counts into a single {pitch_type: count} dict.
pitch_dict = pitch_counts.apply(lambda row: row.to_dict(), axis=1)

# Build a two-column frame (pitch_counts, pitcher) with a fresh 0..n-1 index;
# the pitcher id comes from the index of pitch_counts.
pitch_counts_df = (
    pitch_dict
    .rename('pitch_counts')
    .reset_index()
    [['pitch_counts', 'pitcher']]
)
pitch_counts_df.head(10)

Unnamed: 0,pitch_counts,pitcher
0,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526
1,"{'CB': 0, 'CH': 20, 'FC': 0, 'FF': 485, 'FS': ...",276351
2,"{'CB': 322, 'CH': 111, 'FC': 86, 'FF': 270, 'F...",276520
3,"{'CB': 1, 'CH': 240, 'FC': 0, 'FF': 423, 'FS':...",276542
4,"{'CB': 153, 'CH': 6, 'FC': 0, 'FF': 698, 'FS':...",279571
5,"{'CB': 0, 'CH': 857, 'FC': 2680, 'FF': 54, 'FS...",282332
6,"{'CB': 0, 'CH': 51, 'FC': 0, 'FF': 488, 'FS': ...",285079
7,"{'CB': 8, 'CH': 0, 'FC': 0, 'FF': 19, 'FS': 0,...",405395
8,"{'CB': 211, 'CH': 128, 'FC': 1006, 'FF': 1111,...",407793
9,"{'CB': 89, 'CH': 0, 'FC': 232, 'FF': 852, 'FS'...",407822


In [48]:
# Sanity check: both per-pitcher tables should have the same number of rows
# before they are joined on 'pitcher'.
for frame in (unique_pitches, pitch_counts_df):
    print(frame.shape[0])

2258
2258


In [46]:
# Join per-pitcher counts with the per-pitcher arsenal summary; the inner join
# keeps only pitchers present in both tables.
final_pitch_arsenal_df = pitch_counts_df.merge(
    unique_pitches,
    how='inner',
    on='pitcher',
)

In [49]:
# Preview the first ten rows of the merged per-pitcher table.
final_pitch_arsenal_df.iloc[:10]

Unnamed: 0,pitch_counts,pitcher,pitch_arsenal,pitch_arsenal_csv,arsenal_size
0,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"[SI, CH, FF, SL, FC]","SI,CH,FF,SL,FC",5
1,"{'CB': 0, 'CH': 20, 'FC': 0, 'FF': 485, 'FS': ...",276351,"[SL, FF, CH, SI]","SL,FF,CH,SI",4
2,"{'CB': 322, 'CH': 111, 'FC': 86, 'FF': 270, 'F...",276520,"[SL, CB, SI, FF, FC, CH]","SL,CB,SI,FF,FC,CH",6
3,"{'CB': 1, 'CH': 240, 'FC': 0, 'FF': 423, 'FS':...",276542,"[FF, SL, CH, SI, CB]","FF,SL,CH,SI,CB",5
4,"{'CB': 153, 'CH': 6, 'FC': 0, 'FF': 698, 'FS':...",279571,"[FF, SL, SI, CB, CH]","FF,SL,SI,CB,CH",5
5,"{'CB': 0, 'CH': 857, 'FC': 2680, 'FF': 54, 'FS...",282332,"[SL, FC, SI, CH, FF]","SL,FC,SI,CH,FF",5
6,"{'CB': 0, 'CH': 51, 'FC': 0, 'FF': 488, 'FS': ...",285079,"[KN, FF, CH, PO]","KN,FF,CH,PO",4
7,"{'CB': 8, 'CH': 0, 'FC': 0, 'FF': 19, 'FS': 0,...",405395,"[FF, CB]","FF,CB",2
8,"{'CB': 211, 'CH': 128, 'FC': 1006, 'FF': 1111,...",407793,"[FF, CH, FC, CB, SI]","FF,CH,FC,CB,SI",5
9,"{'CB': 89, 'CH': 0, 'FC': 232, 'FF': 852, 'FS'...",407822,"[FF, FS, FC, CB, SI]","FF,FS,FC,CB,SI",5


In [50]:
# Persist the merged per-pitcher arsenal table as CSV, without the row index.
# NOTE(review): pitch_counts holds dicts and pitch_arsenal holds lists; these are
# presumably written as their string representations, so consumers will need to
# parse them back -- confirm with downstream readers.
ARSENAL_OUTPUT_URI = 'gs://pitch-sequencing/arsenal_data/pitch_arsenal_data.csv'
final_pitch_arsenal_df.to_csv(ARSENAL_OUTPUT_URI, index=False)