In [1]:
import numpy as np
import pandas as pd
import tensorflow
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
pd.set_option('display.max_columns', 50)
from IPython.display import display, HTML
import ast
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
data = pd.read_csv('../data/sequence_data.csv')
print(data.shape)
data.head()

(181872, 15)


Unnamed: 0,game_pk,pitcher,pitch_number,batter,stand,p_throws,balls,strikes,outs_when_up,inning,fielder_2,bat_score,fld_score,pitch_sequence,strikeout
0,661032,543238,1,660271,L,L,0,0,0,8,595978,4,0,['FF'],0
1,661032,543238,6,545361,R,L,3,2,1,8,595978,4,0,"['FF', 'SL', 'SL', 'FF', 'SL', 'FF']",0
2,661032,543238,1,665120,L,L,0,0,1,8,595978,4,0,['FF'],0
3,661032,543238,6,543685,R,L,3,2,2,8,595978,4,0,"['SL', 'FF', 'FF', 'SL', 'SL', 'SL']",0
4,661032,571901,3,676391,R,L,1,1,0,8,435559,0,4,"['SI', 'SI', 'SI']",0


In [3]:
data['strikeout'].value_counts()

0    141181
1     40691
Name: strikeout, dtype: int64

In [4]:
data['pitch_number'].value_counts()

4     34154
3     32626
5     30577
2     27154
6     21693
1     20486
7      9215
8      3734
9      1410
10      525
11      177
12       89
13       22
14        6
15        3
16        1
Name: pitch_number, dtype: int64

In [5]:
# length of pitch sequence should be equal to or more than 3
# otherwise it won't even stand a chance to have an outcome of strikeout
data = data[~((data['pitch_number']==1)|(data['pitch_number']==2))].reset_index()
data

Unnamed: 0,index,game_pk,pitcher,pitch_number,batter,stand,p_throws,balls,strikes,outs_when_up,inning,fielder_2,bat_score,fld_score,pitch_sequence,strikeout
0,1,661032,543238,6,545361,R,L,3,2,1,8,595978,4,0,"['FF', 'SL', 'SL', 'FF', 'SL', 'FF']",0
1,3,661032,543238,6,543685,R,L,3,2,2,8,595978,4,0,"['SL', 'FF', 'FF', 'SL', 'SL', 'SL']",0
2,4,661032,571901,3,676391,R,L,1,1,0,8,435559,0,4,"['SI', 'SI', 'SI']",0
3,5,661032,571901,6,595978,R,L,2,2,1,8,435559,0,4,"['FC', 'FC', 'SI', 'SI', 'FC', 'SI']",1
4,6,661032,571901,6,665926,L,L,3,2,2,8,435559,0,4,"['SI', 'FC', 'SI', 'SI', 'SI', 'SI']",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134227,181866,663463,669211,6,596142,R,L,0,2,0,8,621532,5,2,"['SL', 'SL', 'FF', 'FF', 'CH', 'FF']",1
134228,181867,663463,669211,8,663616,L,L,3,2,1,8,621532,5,2,"['SL', 'SL', 'FF', 'FF', 'SL', 'SL', 'SL', 'SL']",1
134229,181868,663463,669211,3,680777,R,L,0,2,2,8,621532,5,2,"['FF', 'FF', 'SL']",0
134230,181870,663463,669211,3,621439,R,L,2,0,0,9,621532,5,2,"['SL', 'CH', 'CH']",0


In [6]:
data['strikeout'].value_counts() #43.5% are strikeouts

0    93541
1    40691
Name: strikeout, dtype: int64

In [7]:
# checking all the types of pitches in the data
# FF    235175 Four-Seam Fastball
# SL    126136 Slider
# SI    109385 Sinker (synonymous with FT)
# CH     79425 Changeup
# CU     54955 Curveball
# FC     51010 Cutter
# ST     21100 ?  (not in statcast pitch abv)
# KC     15378 Knuckle Curve
# FS     11209 Splitter
# SV      2456 Slurve (not in statcast pitch abv)
# FA      1269 ?  (not in statcast pitch abv)
# EP       514 Eephus
# CS        96 ?  (not in statcast pitch abv)
# PO        40 Pitchout
# KN        19 Knuckleball

In [8]:
pitch_to_int = {
    'FF': 1,
    'SL': 2,
    'SI': 3,
    'CH': 4,
    'CU': 5,
    'FC': 6,
    'ST': 7,
    'KC': 8,
    'FS': 9,
    'SV': 10,
    'FA': 11, 
    'EP': 12,
    'CS': 13,
    'PO': 14,
    'KN': 15,
}
pitch_vocab_size = len(pitch_to_int)
data['pitch_seq_int'] = data['pitch_sequence'].apply(lambda x: [pitch_to_int.get(pitch,-1) for pitch in ast.literal_eval(x)])
data = data[~data['pitch_seq_int'].apply(lambda x: -1 in x)].reset_index(drop=True)

max_sequence_length = 6  # Adjust as needed
data['padded_seq_int'] = pad_sequences(data['pitch_seq_int'], maxlen=max_sequence_length, padding='post',value=0).tolist()

# Split the data into training and testing sets
X = np.array(data['padded_seq_int'].tolist())
y = np.array(data['strikeout'].tolist())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an LSTM-based model
model = Sequential()
model.add(Embedding(input_dim=pitch_vocab_size+1, output_dim=64, input_length=max_sequence_length))
model.add(LSTM(64, dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.5905, Test accuracy: 0.7024
