In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K
from tensorflow.keras.layers import (
    Input, Embedding, Layer, MultiHeadAttention, LayerNormalization, Dropout,
    Dense, GlobalAveragePooling1D, Lambda, Concatenate, Softmax,
)
from tensorflow.keras.models import Model

In [4]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization. Because of the residual connections, the last
    axis of the input must have size ``embed_dim``.

    Args:
        embed_dim: Width of the block's input and output; also used as the
            ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after the attention and feed-forward sub-layers.
        **kwargs: Standard Keras ``Layer`` arguments such as ``name`` or ``dtype``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the hyper-parameters so get_config() can rebuild the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Dense(ff_dim, activation="relu")
        self.out = Dense(embed_dim)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers. ``None`` (the default)
                lets Keras decide from the surrounding fit/predict context.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence attends to itself (query == value).
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Position-wise feed-forward network, projected back to embed_dim
        # so it can be added to the residual stream.
        ffn_output = self.ffn(out1)
        ffn_output = self.out(ffn_output)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [None]:
class DataPreprocessor:
    """Tokenize the pitch-sequence and pitcher-ID columns of a DataFrame.

    Two independent Keras ``Tokenizer`` instances are fitted at construction
    time: one on ``df['Pitch Sequence']`` and one on ``df['Pitcher ID']``
    (converted to strings first).

    Args:
        df: DataFrame with at least the columns ``'Pitch Sequence'`` and
            ``'Pitcher ID'``.
    """

    def __init__(self, df):
        self.df = df
        self.pitch_tokenizer = Tokenizer()
        self.pitch_tokenizer.fit_on_texts(df['Pitch Sequence'])
        self.pitcher_tokenizer = Tokenizer()
        self.pitcher_tokenizer.fit_on_texts(df['Pitcher ID'].astype(str))

    def get_sequences(self):
        """Return the pitch sequences encoded as lists of integer token ids."""
        return self.pitch_tokenizer.texts_to_sequences(self.df['Pitch Sequence'])

    def get_pitcher_ids(self):
        """Return the pitcher IDs encoded as lists of integer token ids."""
        return self.pitcher_tokenizer.texts_to_sequences(self.df['Pitcher ID'].astype(str))

    def pad_sequences(self, sequences, max_len=None):
        """Right-pad ``sequences`` with zeros to a common length.

        Args:
            sequences: Iterable of integer lists.
            max_len: Target length. When ``None`` the length of the longest
                sequence is used (0 when ``sequences`` is empty).

        Returns:
            The array produced by the Keras ``pad_sequences`` utility with
            ``padding='post'``.
        """
        if max_len is None:
            # default=0 keeps an empty batch from raising ValueError in max().
            max_len = max((len(seq) for seq in sequences), default=0)
        # The bare name below resolves to the module-level Keras function,
        # not to this method (methods are only reachable through ``self``).
        return pad_sequences(sequences, maxlen=max_len, padding='post')

In [7]:
class PitchPredictionModel:
    """Masked next-pitch classifier built from a Transformer block and a pitcher embedding.

    The compiled Keras model is stored in ``self.model`` and takes three inputs:

    * ``sequence`` - integer pitch-token ids, shape ``(batch, seq_len)``;
    * ``pitcher``  - integer pitcher-token id, shape ``(batch, 1)``;
    * ``mask``     - float mask over output classes, shape ``(batch, num_tokens)``,
      where 1 marks an allowed class and 0 a forbidden one.

    It outputs one probability distribution over ``num_tokens`` classes per
    input sequence.

    Args:
        num_tokens: Size of the pitch vocabulary (including index 0).
        num_pitchers: Size of the pitcher vocabulary (including index 0).
        embedding_dim: Width of both embeddings and of the Transformer block.
        num_heads: Number of attention heads in the Transformer block.
        ff_dim: Hidden width of the Transformer feed-forward network.
    """

    def __init__(self, num_tokens, num_pitchers, embedding_dim=64, num_heads=2, ff_dim=128):
        self.num_tokens = num_tokens
        self.num_pitchers = num_pitchers
        self.embedding_dim = embedding_dim
        self.build_model(embedding_dim, num_heads, ff_dim)

    def build_model(self, embed_dim, num_heads, ff_dim):
        """Build and compile the Keras model, storing it in ``self.model``."""
        # Inputs
        sequence_input = Input(shape=(None,), dtype="int64", name="sequence")
        pitcher_input = Input(shape=(1,), dtype="int64", name="pitcher")
        mask_input = Input(shape=(self.num_tokens,), dtype="float32", name="mask")

        # Embeddings and transformer
        # NOTE(review): padded positions (token 0) are not masked out of the
        # attention or the average pooling below - confirm this is intended.
        embedded_sequence = Embedding(self.num_tokens, embed_dim)(sequence_input)
        transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        transformed_sequence = transformer_block(embedded_sequence)

        # The pitcher axis has length 1, so pooling just drops that axis.
        embedded_pitcher = Embedding(self.num_pitchers, embed_dim)(pitcher_input)
        flat_pitcher = GlobalAveragePooling1D()(embedded_pitcher)

        # Concatenate and apply mask
        concat = Concatenate()([GlobalAveragePooling1D()(transformed_sequence), flat_pitcher])
        logits = Dense(self.num_tokens, activation=None)(concat)
        # mask == 1 leaves the logit untouched; mask == 0 pushes it to -1e9.
        masked_logits = Lambda(lambda x: x[0] + (x[1] - 1) * 1e9)([logits, mask_input])
        # Softmax is applied directly to the masked logits. A trainable Dense
        # layer here would re-mix the -1e9 entries and defeat the mask.
        output = Softmax()(masked_logits)

        # Model setup
        self.model = Model(inputs=[sequence_input, pitcher_input, mask_input], outputs=output)
        self.model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    @staticmethod
    def _as_model_inputs(sequences, pitcher_ids, masks):
        """Coerce the three model inputs to NumPy arrays in the order the model expects."""
        return [
            np.asarray(sequences),
            np.asarray(pitcher_ids),
            np.asarray(masks, dtype="float32"),
        ]

    def train_test(self, sequences, pitcher_ids, masks, targets, epochs=10, batch_size=32, val_split=0.1):
        """Train on a random split of the data and validate on the remainder.

        Args:
            sequences, pitcher_ids, masks: Model inputs (array-likes, one row per sample).
            targets: Integer class id per sample.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
            val_split: Fraction of samples held out for validation.

        Returns:
            The Keras ``History`` object returned by ``fit``.
        """
        sequences, pitcher_ids, masks = self._as_model_inputs(sequences, pitcher_ids, masks)
        targets = np.asarray(targets)

        # Splitting data into training and validation sets (fixed seed for reproducibility)
        train_seq, val_seq, train_pitcher, val_pitcher, train_mask, val_mask, train_target, val_target = \
            train_test_split(sequences, pitcher_ids, masks, targets, test_size=val_split, random_state=42)

        # Training the model
        return self.model.fit(
            [train_seq, train_pitcher, train_mask], train_target,
            validation_data=([val_seq, val_pitcher, val_mask], val_target),
            epochs=epochs,
            batch_size=batch_size
        )

    def evaluate(self, sequences, pitcher_ids, masks, targets):
        """Evaluate the model on a provided holdout set and return Keras' results."""
        inputs = self._as_model_inputs(sequences, pitcher_ids, masks)
        results = self.model.evaluate(inputs, np.asarray(targets))
        return results

    def train(self, sequences, pitcher_ids, masks, targets, epochs=10, batch_size=32):
        """Train on all provided samples (no validation split).

        Returns:
            The Keras ``History`` object returned by ``fit``.
        """
        inputs = self._as_model_inputs(sequences, pitcher_ids, masks)
        return self.model.fit(inputs, np.asarray(targets), epochs=epochs, batch_size=batch_size)

    def predict(self, sequence, pitcher_id, mask):
        """Return the most probable class id for each row of the batch."""
        prediction = self.model.predict(self._as_model_inputs(sequence, pitcher_id, mask))
        return np.argmax(prediction, axis=-1)


In [10]:
# Load the pitch-sequence data into a DataFrame.
# The path is relative to the notebook's working directory. Later cells read
# the 'Pitch Sequence' and 'Pitcher ID' columns from this frame.
df = pd.read_csv("../../data/sequence_data_opt.csv")

In [11]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
0,SI,621107,field_error
1,"SI,CB,FC,SI,CB,SI,FF",621107,single
2,"ST,ST,SI,SI,ST,ST",676534,walk
3,"SI,ST,SI,ST,SI",687330,grounded_into_double_play
4,"FF,FF,FF,SL,FF,SL",477132,strikeout


In [None]:
# Fit the pitch and pitcher tokenizers on the loaded DataFrame.
preprocessor = DataPreprocessor(df)

In [None]:


# Preprocess data: integer-encode every pitch sequence and pitcher ID.

sequences = preprocessor.get_sequences()
pitcher_ids = preprocessor.get_pitcher_ids()

# Prepare training samples (predicting the next pitch type in a sequence).
# The model emits ONE distribution per input sequence, so every sequence is
# expanded into (prefix -> next pitch) pairs; single-pitch sequences have no
# "next pitch" and therefore contribute no sample.
prefixes, next_pitches, sample_pitcher_ids = [], [], []
for seq, pitcher in zip(sequences, pitcher_ids):
    for cut in range(1, len(seq)):
        prefixes.append(seq[:cut])
        next_pitches.append(seq[cut])
        sample_pitcher_ids.append(pitcher)

padded_sequences = preprocessor.pad_sequences(prefixes)
sample_pitcher_ids = np.asarray(sample_pitcher_ids, dtype="int64")
targets = np.asarray(next_pitches, dtype="int64")

# Output mask: every real pitch type is allowed, the padding index 0 is not.
num_tokens = len(preprocessor.pitch_tokenizer.word_index) + 1
masks = np.ones((len(targets), num_tokens), dtype="float32")
masks[:, 0] = 0.0

assert len(padded_sequences) == len(sample_pitcher_ids) == len(masks) == len(targets)
assert sample_pitcher_ids.shape == (len(targets), 1), "expected exactly one token per pitcher ID"

# Initialize and train the model
model = PitchPredictionModel(num_tokens=num_tokens,
                             num_pitchers=len(preprocessor.pitcher_tokenizer.word_index) + 1,
                             embedding_dim=64, num_heads=2, ff_dim=128)
history = model.train(padded_sequences, sample_pitcher_ids, masks, targets, epochs=10, batch_size=32)
