In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [16]:
class PitchDataPreprocessor:
    def __init__(self, df):
        self.df = df
        self.pitch_to_idx = {'<pad>': 0, '<start>': 1}
        self.idx_to_pitch = {0: '<pad>', 1: '<start>'}
        self.pitcher_arsenals = defaultdict(set)
        self.max_seq_length = 0

    def preprocess(self):
        # Create pitch type vocabulary
        unique_pitches = set()
        for seq in self.df['Pitch Sequence']:
            unique_pitches.update(seq.split(','))
        
        for pitch in unique_pitches:
            if pitch not in self.pitch_to_idx:
                idx = len(self.pitch_to_idx)
                self.pitch_to_idx[pitch] = idx
                self.idx_to_pitch[idx] = pitch

        # Create pitcher arsenals
        for _, row in self.df.iterrows():
            pitcher_id = row['Pitcher ID']
            pitches = row['Pitch Sequence'].split(',')
            self.pitcher_arsenals[pitcher_id].update(pitches)

        # Find max sequence length (including <start> token)
        self.max_seq_length = max(len(seq.split(',')) for seq in self.df['Pitch Sequence']) + 1

    def encode_sequence(self, sequence):
        return [1] + [self.pitch_to_idx[pitch] for pitch in sequence.split(',')]

    def pad_sequence(self, sequence):
        padded = sequence + [0] * (self.max_seq_length - len(sequence))
        return padded[:self.max_seq_length]

    def get_pitcher_arsenal_mask(self, pitcher_id):
        arsenal = self.pitcher_arsenals[pitcher_id]
        mask = [1 if pitch in arsenal or idx < 2 else 0 for idx, pitch in self.idx_to_pitch.items()]
        return torch.tensor(mask, dtype=torch.float)

In [17]:
class PitchSequenceDataset(Dataset):
    def __init__(self, df, preprocessor):
        self.df = df
        self.preprocessor = preprocessor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sequence = self.preprocessor.encode_sequence(row['Pitch Sequence'])
        
        input_seq = torch.tensor(sequence[:-1], dtype=torch.long)
        target = torch.tensor(sequence[-1], dtype=torch.long)
        
        padded_input = self.preprocessor.pad_sequence(input_seq.tolist())
        input_seq = torch.tensor(padded_input, dtype=torch.long)
        
        pitcher_id = row['Pitcher ID']
        arsenal_mask = self.preprocessor.get_pitcher_arsenal_mask(pitcher_id)

        return input_seq, target, arsenal_mask

In [18]:
class PitchTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dropout=0.1):
        super(PitchTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = nn.Embedding(1000, d_model)  # Assuming max sequence length < 1000
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dropout=dropout),
            num_layers
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, src_mask=None):
        src = self.embedding(src)
        pos = torch.arange(0, src.size(1), dtype=torch.long, device=src.device).unsqueeze(0)
        src = src + self.pos_encoder(pos)
        output = self.transformer(src, src_key_padding_mask=src_mask)
        return self.fc(output[:, -1, :])  # Only use the last position for prediction

In [32]:
class PitchPredictor:
    def __init__(self, model, preprocessor):
        self.model = model
        self.preprocessor = preprocessor

    def predict_next_pitch(self, sequence, pitcher_id):
        encoded_seq = self.preprocessor.encode_sequence(sequence)
        padded_seq = self.preprocessor.pad_sequence(encoded_seq)
        input_seq = torch.tensor(padded_seq, dtype=torch.long).unsqueeze(0)
        arsenal_mask = self.preprocessor.get_pitcher_arsenal_mask(pitcher_id)

        self.model.eval()  # Ensure the model is in evaluation mode
        with torch.no_grad():
            logits = self.model(input_seq)
            # The output shape should be [1, vocab_size]
            logits = logits.squeeze(0)  # Remove batch dimension if present
            if logits.dim() > 1:
                logits = logits[-1]  # Take the last prediction if multiple outputs
            logits[~arsenal_mask.bool()] = float('-inf')
            probabilities = torch.softmax(logits, dim=0)
            predicted_idx = torch.argmax(probabilities).item()

        return self.preprocessor.idx_to_pitch[predicted_idx]

In [20]:
def train_model(model, train_loader, val_loader, num_epochs, lr, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            # Don't use arsenal mask for now.
            input_seq, target, _ = [b.to(device) for b in batch]
            optimizer.zero_grad()
            output = model(input_seq)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                # Don't use arsenal mask for now.
                input_seq, target, _ = [b.to(device) for b in batch]
                output = model(input_seq)
                loss = criterion(output, target)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

    return model

In [39]:
df = pd.read_csv('../../data/sequence_data_opt.csv')

In [40]:
df.head(10)

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
0,SI,621107,field_error
1,"SI,CB,FC,SI,CB,SI,FF",621107,single
2,"ST,ST,SI,SI,ST,ST",676534,walk
3,"SI,ST,SI,ST,SI",687330,grounded_into_double_play
4,"FF,FF,FF,SL,FF,SL",477132,strikeout
5,"FF,FF,SL,FF,SL,SL",477132,strikeout
6,"FF,SL,CH,FF,CH,SL,FF,CH,CH,FF",656578,field_error
7,"FC,CH,CH,CH",608379,double_play
8,"FC,CH,FC",608379,field_out
9,"FC,FC,CH",608379,double


In [41]:
# Preprocess data
preprocessor = PitchDataPreprocessor(df)
preprocessor.preprocess()

In [42]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [43]:
train_df.shape

(148430, 3)

In [44]:
train_df.head()

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
101708,"FF,SL,SL,FF,SL",600917,strikeout
88766,"SL,FF,CB,CB",670102,field_out
177432,SL,623352,force_out
146963,"FF,FS,FS",592332,field_out
41087,"SL,FF,SL,FF,SL,CB,FF,SL",674072,field_out


In [45]:
# Create datasets and dataloaders
train_dataset = PitchSequenceDataset(train_df.head(1000), preprocessor)
test_dataset = PitchSequenceDataset(test_df.head(1000), preprocessor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [46]:
# Initialize model
vocab_size = len(preprocessor.pitch_to_idx)
model = PitchTransformer(vocab_size, d_model=64, nhead=4, num_layers=2)

In [47]:
# Train model
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = train_model(model, train_loader, test_loader, num_epochs=10, lr=0.001, device="cpu")

Epoch 1/10, Train Loss: 1.5602, Val Loss: 1.4849
Epoch 2/10, Train Loss: 1.5114, Val Loss: 1.4756
Epoch 3/10, Train Loss: 1.4982, Val Loss: 1.5178
Epoch 4/10, Train Loss: 1.5100, Val Loss: 1.4950
Epoch 5/10, Train Loss: 1.4947, Val Loss: 1.4845
Epoch 6/10, Train Loss: 1.4963, Val Loss: 1.4702
Epoch 7/10, Train Loss: 1.4946, Val Loss: 1.4828
Epoch 8/10, Train Loss: 1.4861, Val Loss: 1.4764
Epoch 9/10, Train Loss: 1.4907, Val Loss: 1.4719
Epoch 10/10, Train Loss: 1.4915, Val Loss: 1.4740


In [48]:
predictor = PitchPredictor(trained_model, preprocessor)

In [62]:
# Example prediction
sequence = "SL,FF"
pitcher_id = 623352
next_pitch = predictor.predict_next_pitch(sequence, pitcher_id)
print(f"Predicted next pitch: {next_pitch}")

Predicted next pitch: SI
