In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [73]:
class PitchDataPreprocessor:
    def __init__(self, df):
        self.df = df
        self.pitch_to_idx = {'<pad>': 0, '<start>': 1, '<arsenal>': 2}
        self.idx_to_pitch = {0: '<pad>', 1: '<start>', 2: '<arsenal>'}
        self.pitcher_arsenals = defaultdict(set)
        self.max_seq_length = 0
        self.max_arsenal_length = 8

    def preprocess(self):
        # Create pitch type vocabulary
        unique_pitches = set()
        for seq in self.df['Pitch Sequence']:
            unique_pitches.update(seq.split(','))
        
        for pitch in unique_pitches:
            if pitch not in self.pitch_to_idx:
                idx = len(self.pitch_to_idx)
                self.pitch_to_idx[pitch] = idx
                self.idx_to_pitch[idx] = pitch

        # Create pitcher arsenals
        for _, row in self.df.iterrows():
            pitcher_id = row['Pitcher ID']
            pitches = row['Pitch Sequence'].split(',')
            self.pitcher_arsenals[pitcher_id].update(pitches)

        # Find max sequence length (including <start> token)
        # TODO(kaelen): temporarily added 9 padding spaces for arsenal length for now. Think of better way to do this.
        self.max_seq_length = max(len(seq.split(',')) for seq in self.df['Pitch Sequence']) + 10

    def encode_input(self, sequence, pitcher_id):
        encoded_arsenals = self.encode_arsenal_for_pitcher(pitcher_id)
        encoded_sequence = self.encode_sequence(sequence)
        return encoded_arsenals + encoded_sequence
    
    def encode_arsenal_for_pitcher(self, pitcher_id):
        arsenal = self.pitcher_arsenals[pitcher_id]
        arsenal_ids = [self.pitch_to_idx[pitch] for pitch in arsenal]
        # [<arsenal>, <pitches in arsenal...>, <pad (if needed)>]
        encoded_arsenals = [2] + sorted(arsenal_ids) + [0] * (self.max_arsenal_length - len(arsenal))
        return encoded_arsenals

    def encode_sequence(self, sequence):
        return [1] + [self.pitch_to_idx[pitch] for pitch in sequence.split(',')]

    def pad_sequence(self, sequence):
        padded = sequence + [0] * (self.max_seq_length - len(sequence))
        return padded[:self.max_seq_length]

    def get_pitcher_arsenal_mask(self, pitcher_id):
        arsenal = self.pitcher_arsenals[pitcher_id]
        mask = [1 if pitch in arsenal or idx < 3 else 0 for idx, pitch in self.idx_to_pitch.items()]
        return torch.tensor(mask, dtype=torch.float)

In [74]:
class PitchSequenceDataset(Dataset):
    def __init__(self, df, preprocessor):
        self.df = df
        self.preprocessor = preprocessor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sequence = self.preprocessor.encode_sequence(row['Pitch Sequence'])
        
        input_seq = torch.tensor(sequence[:-1], dtype=torch.long)
        target = torch.tensor(sequence[-1], dtype=torch.long)
        
        padded_input = self.preprocessor.pad_sequence(input_seq.tolist())
        input_seq = torch.tensor(padded_input, dtype=torch.long)

        return input_seq, target

In [75]:
class PitchSequenceAndPitcherDataset(Dataset):
    def __init__(self, df, preprocessor):
        self.df = df
        self.preprocessor = preprocessor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        pitcher_id = row['Pitcher ID']
        raw_sequence = row['Pitch Sequence']
        encoded_input = self.preprocessor.encode_input(raw_sequence, pitcher_id)
        
        # Strip off the last pitch from our sequence and make it the target pitch we want to predict. 
        input_seq = torch.tensor(encoded_input[:-1], dtype=torch.long)
        target = torch.tensor(encoded_input[-1], dtype=torch.long)
        
        padded_input = self.preprocessor.pad_sequence(input_seq.tolist())
        input_seq = torch.tensor(padded_input, dtype=torch.long)
        
        return input_seq, target

In [76]:
class PitchTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dropout=0.1):
        super(PitchTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Embedding(1000, d_model)  # Assuming max sequence length < 1000
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dropout=dropout),
            num_layers
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, src_mask=None):
        src = self.embedding(src)
        pos = torch.arange(0, src.size(1), dtype=torch.long, device=src.device).unsqueeze(0)
        src = src + self.pos_encoder(pos)
        output = self.transformer(src, src_key_padding_mask=src_mask)
        return self.fc(output[:, -1, :])  # Only use the last position for prediction

In [77]:
class PitchPredictor:
    def __init__(self, model, preprocessor):
        self.model = model
        self.preprocessor = preprocessor

    def get_input_sequence_for_pitcher_prediction(self, sequence, pitcher_id):
        encoded_input = self.preprocessor.encode_input(sequence, pitcher_id)
        padded_input = self.preprocessor.pad_sequence(encoded_input)
        input_seq = torch.tensor(padded_input, dtype=torch.long).unsqueeze(0)
        return input_seq
    
    def get_next_pitch_probs_for_pitcher(self, sequence, pitcher_id, should_mask=True):
        input_seq = self.get_input_sequence_for_pitcher_prediction(sequence, pitcher_id)

        self.model.eval()
        with torch.no_grad():
            logits = self.model(input_seq)
            # The output shape should be [1, vocab_size]
            logits = logits.squeeze(0)  # Remove batch dimension if present
            if logits.dim() > 1:
                logits = logits[-1]  # Take the last prediction if multiple outputs
            
            if should_mask:
                arsenal_mask = self.preprocessor.get_pitcher_arsenal_mask(pitcher_id)
                logits[~arsenal_mask.bool()] = float('-inf')

            probabilities = torch.softmax(logits, dim=0)
        
        return probabilities


    def predict_next_pitch_for_pitcher(self, sequence, pitcher_id, should_mask=True):
        probabilities = self.get_next_pitch_probs_for_pitcher(sequence, pitcher_id, should_mask)
        predicted_idx = torch.argmax(probabilities).item()

        return self.preprocessor.idx_to_pitch[predicted_idx]
    
    def get_next_pitch_probs(self, sequence, pitcher_id, should_mask=True):
        encoded_seq = self.preprocessor.encode_sequence(sequence)
        padded_seq = self.preprocessor.pad_sequence(encoded_seq)
        input_seq = torch.tensor(padded_seq, dtype=torch.long).unsqueeze(0)
        

        self.model.eval()  # Ensure the model is in evaluation mode
        with torch.no_grad():
            logits = self.model(input_seq)
            # The output shape should be [1, vocab_size]
            logits = logits.squeeze(0)  # Remove batch dimension if present
            if logits.dim() > 1:
                logits = logits[-1]  # Take the last prediction if multiple outputs
            
            if should_mask:
                arsenal_mask = self.preprocessor.get_pitcher_arsenal_mask(pitcher_id)
                logits[~arsenal_mask.bool()] = float('-inf')

            probabilities = torch.softmax(logits, dim=0)

            return probabilities


    def predict_next_pitch(self, sequence, pitcher_id, should_mask=True):
        probabilities = self.get_next_pitch_probs(sequence, pitcher_id, should_mask)
        predicted_idx = torch.argmax(probabilities).item()
        return self.preprocessor.idx_to_pitch[predicted_idx]


In [78]:
def train_model(model, train_loader, val_loader, num_epochs, lr, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            # Don't use arsenal mask for now.
            input_seq, target = [b.to(device) for b in batch]
            optimizer.zero_grad()
            output = model(input_seq)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                # Don't use arsenal mask for now.
                input_seq, target = [b.to(device) for b in batch]
                output = model(input_seq)
                loss = criterion(output, target)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

    return model

In [79]:
df = pd.read_csv('../../data/sequence_data_opt.csv')

In [80]:
df.head(10)

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
0,SI,621107,field_error
1,"SI,CB,FC,SI,CB,SI,FF",621107,single
2,"ST,ST,SI,SI,ST,ST",676534,walk
3,"SI,ST,SI,ST,SI",687330,grounded_into_double_play
4,"FF,FF,FF,SL,FF,SL",477132,strikeout
5,"FF,FF,SL,FF,SL,SL",477132,strikeout
6,"FF,SL,CH,FF,CH,SL,FF,CH,CH,FF",656578,field_error
7,"FC,CH,CH,CH",608379,double_play
8,"FC,CH,FC",608379,field_out
9,"FC,FC,CH",608379,double


In [81]:
# Preprocess data
preprocessor = PitchDataPreprocessor(df)
preprocessor.preprocess()

In [82]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [83]:
train_df.shape

(148430, 3)

In [84]:
train_df.head()

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
101708,"FF,SL,SL,FF,SL",600917,strikeout
88766,"SL,FF,CB,CB",670102,field_out
177432,SL,623352,force_out
146963,"FF,FS,FS",592332,field_out
41087,"SL,FF,SL,FF,SL,CB,FF,SL",674072,field_out


In [85]:
# Create datasets and dataloaders
train_dataset = PitchSequenceDataset(train_df.head(1000), preprocessor)
test_dataset = PitchSequenceDataset(test_df.head(1000), preprocessor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [86]:
# Initialize model
vocab_size = len(preprocessor.pitch_to_idx)
model = PitchTransformer(vocab_size, d_model=64, nhead=4, num_layers=2)



In [87]:
# Train model
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = train_model(model, train_loader, test_loader, num_epochs=25, lr=0.001, device="cpu")

Epoch 1/25, Train Loss: 2.0549, Val Loss: 1.8795
Epoch 2/25, Train Loss: 1.9274, Val Loss: 1.8778
Epoch 3/25, Train Loss: 1.9339, Val Loss: 1.8629
Epoch 4/25, Train Loss: 1.9144, Val Loss: 1.8517
Epoch 5/25, Train Loss: 1.9020, Val Loss: 1.8681
Epoch 6/25, Train Loss: 1.9094, Val Loss: 1.8569
Epoch 7/25, Train Loss: 1.8985, Val Loss: 1.8557
Epoch 8/25, Train Loss: 1.8912, Val Loss: 1.8765
Epoch 9/25, Train Loss: 1.9180, Val Loss: 1.8496
Epoch 10/25, Train Loss: 1.9075, Val Loss: 1.8483
Epoch 11/25, Train Loss: 1.9060, Val Loss: 1.8649
Epoch 12/25, Train Loss: 1.9049, Val Loss: 1.8474
Epoch 13/25, Train Loss: 1.9068, Val Loss: 1.8646
Epoch 14/25, Train Loss: 1.9082, Val Loss: 1.8492
Epoch 15/25, Train Loss: 1.8987, Val Loss: 1.8500
Epoch 16/25, Train Loss: 1.9016, Val Loss: 1.8496
Epoch 17/25, Train Loss: 1.8916, Val Loss: 1.8456
Epoch 18/25, Train Loss: 1.8927, Val Loss: 1.8493
Epoch 19/25, Train Loss: 1.8955, Val Loss: 1.8445
Epoch 20/25, Train Loss: 1.8907, Val Loss: 1.8474
Epoch 21/

In [88]:
predictor = PitchPredictor(trained_model, preprocessor)

In [89]:
# Example prediction
sequence = "SL,CH"
pitcher_id = 623352
next_pitch_masked = predictor.predict_next_pitch(sequence, pitcher_id, should_mask=True)
next_pitch_unmasked = predictor.predict_next_pitch(sequence, pitcher_id, should_mask=False)
print(f"Predicted Next Pitch Probs Unmasked: {predictor.get_next_pitch_probs(sequence, pitcher_id, should_mask=False)}")
print(f"Predicted next pitch Unmasked: {next_pitch_unmasked}")
print(f"Predicted Next Pitch Probs Masked: {predictor.get_next_pitch_probs(sequence, pitcher_id)}")
print(f"Predicted next pitch Masked: {next_pitch_masked}")


Predicted Next Pitch Probs Unmasked: tensor([1.0534e-04, 7.3921e-05, 1.2012e-04, 1.1442e-01, 4.7350e-02, 1.7856e-01,
        5.9544e-05, 1.4699e-01, 8.0721e-02, 8.5688e-04, 3.3405e-01, 2.9481e-02,
        6.7204e-02])
Predicted next pitch Unmasked: FF
Predicted Next Pitch Probs Masked: tensor([2.3926e-04, 1.6790e-04, 2.7282e-04, 2.5989e-01, 0.0000e+00, 4.0557e-01,
        0.0000e+00, 3.3386e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00])
Predicted next pitch Masked: SL


## Try with adding arsenal as encoded input to our model
Input sequence will look like 

```<arsenal><pitch id><pitch_id><pitch id>...<pad up to max arsenal len> <start> <pitch ids in sequence> <pad to max length>```

In [90]:
# Create datasets and dataloaders
p_train_dataset = PitchSequenceAndPitcherDataset(train_df.head(1000), preprocessor)
p_test_dataset = PitchSequenceAndPitcherDataset(test_df.head(1000), preprocessor)
p_train_loader = DataLoader(p_train_dataset, batch_size=32, shuffle=True)
p_test_loader = DataLoader(p_test_dataset, batch_size=32)

In [91]:
# Initialize model
vocab_size = len(preprocessor.pitch_to_idx)
arsenal_model = PitchTransformer(vocab_size, d_model=64, nhead=4, num_layers=2)

In [92]:
# Train model
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_arsenal_model = train_model(arsenal_model, p_train_loader, p_test_loader, num_epochs=25, lr=0.001, device="cpu")

Epoch 1/25, Train Loss: 2.0131, Val Loss: 1.8754
Epoch 2/25, Train Loss: 1.9246, Val Loss: 1.8593
Epoch 3/25, Train Loss: 1.9052, Val Loss: 1.8595
Epoch 4/25, Train Loss: 1.9067, Val Loss: 1.8610
Epoch 5/25, Train Loss: 1.9008, Val Loss: 1.8574
Epoch 6/25, Train Loss: 1.9039, Val Loss: 1.8756
Epoch 7/25, Train Loss: 1.9060, Val Loss: 1.8658
Epoch 8/25, Train Loss: 1.9127, Val Loss: 1.8569
Epoch 9/25, Train Loss: 1.9131, Val Loss: 1.8684
Epoch 10/25, Train Loss: 1.8974, Val Loss: 1.8681
Epoch 11/25, Train Loss: 1.9032, Val Loss: 1.8552
Epoch 12/25, Train Loss: 1.9067, Val Loss: 1.8709
Epoch 13/25, Train Loss: 1.9124, Val Loss: 1.8582
Epoch 14/25, Train Loss: 1.8872, Val Loss: 1.8561
Epoch 15/25, Train Loss: 1.8993, Val Loss: 1.8467
Epoch 16/25, Train Loss: 1.9183, Val Loss: 1.8522
Epoch 17/25, Train Loss: 1.8978, Val Loss: 1.8529
Epoch 18/25, Train Loss: 1.8986, Val Loss: 1.8542
Epoch 19/25, Train Loss: 1.8948, Val Loss: 1.8475
Epoch 20/25, Train Loss: 1.8949, Val Loss: 1.8516
Epoch 21/

In [93]:
arsenal_predictor = PitchPredictor(trained_arsenal_model, preprocessor)

In [94]:
preprocessor.pitcher_arsenals[623352]

{'CH', 'SI', 'SL'}

In [95]:
# Example prediction
sequence = "SL,CH"
pitcher_id = 623352
next_pitch_masked = arsenal_predictor.predict_next_pitch_for_pitcher(sequence, pitcher_id, should_mask=True)
next_pitch_unmasked = arsenal_predictor.predict_next_pitch_for_pitcher(sequence, pitcher_id, should_mask=False)
print(f"Predicted Next Pitch Probs Unmasked: {arsenal_predictor.get_next_pitch_probs_for_pitcher(sequence, pitcher_id, should_mask=False)}")
print(f"Predicted next pitch Unmasked: {next_pitch_unmasked}")
print(f"Predicted Next Pitch Probs Masked: {arsenal_predictor.get_next_pitch_probs_for_pitcher(sequence, pitcher_id)}")
print(f"Predicted next pitch Masked: {next_pitch_masked}")

Predicted Next Pitch Probs Unmasked: tensor([1.2956e-04, 1.4132e-04, 1.2171e-04, 1.0859e-01, 5.2791e-02, 1.7378e-01,
        1.0648e-04, 1.9605e-01, 7.2606e-02, 1.0102e-03, 2.9750e-01, 2.9522e-02,
        6.7646e-02])
Predicted next pitch Unmasked: FF
Predicted Next Pitch Probs Masked: tensor([2.7058e-04, 2.9513e-04, 2.5418e-04, 2.2679e-01, 0.0000e+00, 3.6293e-01,
        0.0000e+00, 4.0945e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00])
Predicted next pitch Masked: SI


## some sanity output checks for arsenal encoded input

In [96]:
sequence = "SL,CH"
pitcher_id = 623352
predictor.get_input_sequence_for_pitcher_prediction(sequence, pitcher_id)

tensor([[2, 3, 5, 7, 0, 0, 0, 0, 0, 1, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])