In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input,
    Embedding,
    Layer,
    MultiHeadAttention,
    LayerNormalization,
    Dropout,
    Dense,
    GlobalAveragePooling1D,
    Lambda,
    Concatenate,
    Softmax,
)
import tensorflow.keras.backend as K

In [4]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalized.

    Args:
        embed_dim: Width of the last axis of the inputs. Also used as the
            attention ``key_dim`` and as the output width of the feed-forward
            network so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after attention and after the feed-forward network.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can rebuild the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        # Let an incoming Keras mask (e.g. from Embedding(mask_zero=True)) flow
        # through this layer so downstream layers can also ignore padding.
        self.supports_masking = True

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Dense(ff_dim, activation="relu")
        self.out = Dense(embed_dim)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None, mask=None):
        """Apply the block.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``
                (presumably (batch, seq_len, embed_dim) - confirm against caller).
            training: Forwarded to the dropout layers; ``None`` lets Keras decide.
            mask: Optional boolean padding mask of shape (batch, seq_len).

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # (batch, seq) -> (batch, 1, seq): every query position ignores padded keys.
        attention_mask = None if mask is None else mask[:, None, :]
        attn_output = self.att(inputs, inputs, attention_mask=attention_mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.out(ffn_output)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [24]:
class DataPreprocessor:
    """Tokenizes pitch sequences and pitcher ids and derives a per-pitcher pitch mask.

    Two Keras ``Tokenizer`` objects are fitted at construction time: one on the
    comma-separated pitch sequences and one on the stringified pitcher ids.

    Args:
        sequences: Iterable of comma-separated pitch-code strings (e.g. "SI,CB,FF").
        pitcher_ids: Iterable of pitcher identifiers; each is converted with ``str``.
    """

    def __init__(self, sequences, pitcher_ids):
        # filters='' disables character filtering; split=',' makes every
        # comma-separated code its own token.
        self.pitch_tokenizer = Tokenizer(filters='', split=',')
        self.pitch_tokenizer.fit_on_texts(sequences)
        self.pitcher_tokenizer = Tokenizer()
        self.pitcher_tokenizer.fit_on_texts(map(str, pitcher_ids))

    def tokenize_sequences(self, sequences):
        """Return one list of integer pitch-token ids per input sequence string."""
        return self.pitch_tokenizer.texts_to_sequences(sequences)

    def tokenize_pitchers(self, pitcher_ids):
        """Return one list of integer pitcher-token ids per pitcher id."""
        return self.pitcher_tokenizer.texts_to_sequences(map(str, pitcher_ids))

    def pad_sequences(self, sequences):
        """Right-pad tokenized sequences to the length of the longest one.

        An empty input yields an empty (0, 0) array instead of raising.
        """
        sequences = list(sequences)
        # default=0 keeps max() from raising ValueError on an empty input.
        max_len = max((len(seq) for seq in sequences), default=0)
        return pad_sequences(sequences, maxlen=max_len, padding='post')

    def create_pitcher_pitch_mask(self, sequences, pitcher_ids):
        """Build a 0/1 matrix of the pitch tokens each pitcher appears with.

        Args:
            sequences: Iterable of comma-separated pitch-code strings.
            pitcher_ids: Iterable of pitcher ids, aligned with ``sequences``.

        Returns:
            Array of shape (num_pitchers, num_tokens); entry [p, t] is 1 when
            pitch token ``t`` occurs in any sequence paired with pitcher token
            ``p``. There is one row per pitcher, not one row per sample.

        Raises:
            IndexError: If a pitcher id is not in the fitted pitcher vocabulary.
        """
        pitcher_ids = list(pitcher_ids)
        sequences = list(sequences)

        num_pitchers = len(self.pitcher_tokenizer.word_index) + 1
        num_tokens = len(self.pitch_tokenizer.word_index) + 1
        pitcher_pitch_mask = np.zeros((num_pitchers, num_tokens))

        # Tokenize everything in two batched calls instead of two calls per row.
        pitcher_tokens = self.pitcher_tokenizer.texts_to_sequences([str(p) for p in pitcher_ids])
        pitch_tokens = self.pitch_tokenizer.texts_to_sequences(sequences)

        for pitcher_id, pitcher_tok, pitches in zip(pitcher_ids, pitcher_tokens, pitch_tokens):
            if not pitcher_tok:
                raise IndexError(
                    f"pitcher id {pitcher_id!r} is not in the fitted pitcher vocabulary"
                )
            if pitches:
                pitcher_pitch_mask[pitcher_tok[0], pitches] = 1
        return pitcher_pitch_mask

In [34]:
# Pitch classifier built on the custom TransformerBlock defined earlier in this notebook.
class PitchPredictionModel:
    """Builds, trains and queries a Keras model that predicts a pitch token.

    The underlying model takes three inputs (see ``build_model``):
        sequence: integer pitch-token ids, shape (batch, seq_len); 0 is padding.
        pitcher:  integer pitcher index, shape (batch, 1).
        mask:     float 0/1 vector per sample, shape (batch, num_tokens); tokens
                  whose entry is 0 receive (numerically) zero probability.

    Args:
        num_tokens: Size of the pitch vocabulary (including the padding id 0).
        num_pitchers: Size of the pitcher vocabulary.
        embedding_dim: Embedding width for both pitches and pitchers.
        num_heads: Number of attention heads in the Transformer block.
        ff_dim: Hidden width of the Transformer feed-forward network.
    """

    def __init__(self, num_tokens, num_pitchers, embedding_dim=64, num_heads=2, ff_dim=128):
        self.num_tokens = num_tokens
        self.num_pitchers = num_pitchers
        self.embedding_dim = embedding_dim
        self.build_model(embedding_dim, num_heads, ff_dim)

    @staticmethod
    def _to_arrays(*inputs):
        """Convert each input (list, Series, ndarray) to a NumPy array for Keras."""
        return [np.asarray(item) for item in inputs]

    def build_model(self, embed_dim, num_heads, ff_dim):
        """Create and compile ``self.model``."""
        sequence_input = Input(shape=(None,), dtype="int64", name="sequence")
        pitcher_input = Input(shape=(1,), dtype="int64", name="pitcher")
        mask_input = Input(shape=(self.num_tokens,), dtype="float32", name="mask")

        embedded_sequence = Embedding(self.num_tokens, embed_dim, mask_zero=True)(sequence_input)
        transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        transformed_sequence = transformer_block(embedded_sequence)

        embedded_pitcher = Embedding(self.num_pitchers, embed_dim)(pitcher_input)
        # The pitcher input has length 1, so pooling just drops that axis.
        flat_pitcher = GlobalAveragePooling1D()(embedded_pitcher)

        concat = Concatenate()([GlobalAveragePooling1D()(transformed_sequence), flat_pitcher])
        logits = Dense(self.num_tokens, activation=None)(concat)
        # mask == 1 leaves the logit untouched; mask == 0 subtracts 1e9 from it.
        masked_logits = Lambda(lambda x: x[0] + (x[1] - 1) * 1e9)([logits, mask_input])
        # Plain softmax, not another Dense layer: a trainable projection here would
        # mix the -1e9 offsets across classes and defeat the mask.
        output = Softmax()(masked_logits)

        self.model = Model(inputs=[sequence_input, pitcher_input, mask_input], outputs=output)
        self.model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    def train_test(self, sequences, pitcher_ids, masks, targets, epochs=10, batch_size=32, val_split=0.1):
        """Fit the model on a random train split and validate on the remainder.

        Args:
            sequences, pitcher_ids, masks: Model inputs, aligned row by row.
            targets: Labels for ``sparse_categorical_crossentropy``
                (presumably one token id per sample - confirm against target prep).
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
            val_split: Fraction of rows held out for validation.

        Returns:
            The Keras ``History`` object returned by ``fit``.
        """
        sequences, pitcher_ids, masks, targets = self._to_arrays(sequences, pitcher_ids, masks, targets)

        # Fixed random_state keeps the split reproducible across runs.
        train_seq, val_seq, train_pitcher, val_pitcher, train_mask, val_mask, train_target, val_target = \
            train_test_split(sequences, pitcher_ids, masks, targets, test_size=val_split, random_state=42)

        return self.model.fit(
            [train_seq, train_pitcher, train_mask], train_target,
            validation_data=([val_seq, val_pitcher, val_mask], val_target),
            epochs=epochs,
            batch_size=batch_size
        )

    def evaluate(self, sequences, pitcher_ids, masks, targets):
        """Evaluate on the given data and return Keras' loss/metric values."""
        sequences, pitcher_ids, masks, targets = self._to_arrays(sequences, pitcher_ids, masks, targets)
        results = self.model.evaluate([sequences, pitcher_ids, masks], targets)
        return results

    def train(self, sequences, pitcher_ids, masks, targets, epochs=10, batch_size=32):
        """Fit on all provided data (no validation split) and return the ``History``."""
        sequences, pitcher_ids, masks, targets = self._to_arrays(sequences, pitcher_ids, masks, targets)
        return self.model.fit([sequences, pitcher_ids, masks], targets, epochs=epochs, batch_size=batch_size)

    def predict(self, sequence, pitcher_id, mask):
        """Return the highest-probability token id for each input row."""
        sequence, pitcher_id, mask = self._to_arrays(sequence, pitcher_id, mask)
        prediction = self.model.predict([sequence, pitcher_id, mask])
        return np.argmax(prediction, axis=-1)

In [26]:
# Load the pitch-sequence dataset into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../../data/sequence_data_opt.csv")

In [27]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
0,SI,621107,field_error
1,"SI,CB,FC,SI,CB,SI,FF",621107,single
2,"ST,ST,SI,SI,ST,ST",676534,walk
3,"SI,ST,SI,ST,SI",687330,grounded_into_double_play
4,"FF,FF,FF,SL,FF,SL",477132,strikeout


In [28]:
# Fit the pitch and pitcher tokenizers on the full dataset columns.
preprocessor = DataPreprocessor(df['Pitch Sequence'], df['Pitcher ID'])

In [29]:
# Turn each comma-separated pitch string into a list of integer token ids.
tokenized_sequences = preprocessor.tokenize_sequences(df['Pitch Sequence'])

In [30]:
# Right-pad every tokenized sequence to the length of the longest one.
padded_sequences = preprocessor.pad_sequences(tokenized_sequences)

In [31]:
# Map each pitcher id to its tokenizer index; the result is a list with one
# (presumably single-element) list per row, not a NumPy array.
tokenized_pitchers = preprocessor.tokenize_pitchers(df['Pitcher ID'])

In [32]:
# Build the (num_pitchers, num_tokens) 0/1 matrix of pitch tokens seen for each
# pitcher. Rows are indexed by pitcher token id, not by dataset row.
pitcher_masks = preprocessor.create_pitcher_pitch_mask(df['Pitch Sequence'], df['Pitcher ID'])

In [35]:
# Inspect the padded token matrix (trailing zeros are padding).
padded_sequences

array([[3, 0, 0, ..., 0, 0, 0],
       [3, 5, 6, ..., 0, 0, 0],
       [7, 7, 3, ..., 0, 0, 0],
       ...,
       [7, 7, 7, ..., 0, 0, 0],
       [1, 8, 2, ..., 0, 0, 0],
       [5, 1, 1, ..., 0, 0, 0]])

In [36]:
# Prepare the targets for training: the input shifted left by one step, with a
# padding id (0) appended so each row keeps the same length as its input.
# `padded_sequences` is a NumPy array, so the previous `seq[1:] + [0]` added 0
# element-wise (dropping the last column) instead of appending a token.
targets = np.concatenate(
    [
        padded_sequences[:, 1:],
        np.zeros((len(padded_sequences), 1), dtype=padded_sequences.dtype),
    ],
    axis=1,
)
# NOTE(review): the model pools the sequence and emits one distribution per
# sample, while these targets hold one label per position. They will not match
# `sparse_categorical_crossentropy` as the model is currently built - TODO decide
# between a single next-pitch label per sample and a per-timestep output head.

In [None]:
# Instantiate the model.
# Vocabulary sizes are len(word_index) + 1, matching the sizes used when the
# pitcher/pitch mask is built; id 0 is the padding value in `padded_sequences`.
model = PitchPredictionModel(
    num_tokens=len(preprocessor.pitch_tokenizer.word_index) + 1,
    num_pitchers=len(preprocessor.pitcher_tokenizer.word_index) + 1,
    embedding_dim=64, num_heads=2, ff_dim=128
)

In [None]:
# Train the model, holding out 20% of the rows for validation.
# `train` has no `val_split` parameter (the old call raised TypeError);
# `train_test` is the method that performs the split.
# texts_to_sequences returns a list of one-element lists -> (n_samples, 1) array.
pitcher_index = np.asarray(tokenized_pitchers)
# `pitcher_masks` has one row per pitcher; look up each sample's row so the
# mask input is (n_samples, num_tokens) like the other inputs.
sample_masks = pitcher_masks[pitcher_index[:, 0]]
model.train_test(padded_sequences, pitcher_index, sample_masks, targets, epochs=10, batch_size=32, val_split=0.2)

In [None]:
# Evaluate the model.
# NOTE(review): this scores the same rows used for training; use a held-out
# set for an unbiased estimate.
# texts_to_sequences returns a list of one-element lists -> (n_samples, 1) array.
pitcher_index = np.asarray(tokenized_pitchers)
# `pitcher_masks` has one row per pitcher; pick each sample's row so the mask
# input has one row per sample.
sample_masks = pitcher_masks[pitcher_index[:, 0]]
evaluation_results = model.evaluate(padded_sequences, pitcher_index, sample_masks, targets)
print("Evaluation results:", evaluation_results)