In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [3]:
class PitchDataPreprocessor:
    def __init__(self, df):
        self.df = df
        self.pitch_to_idx = {'<PAD>': 0, '<START>': 1}  # Add start token
        self.idx_to_pitch = {0: '<PAD>', 1: '<START>'}  # Add start token
        self.pitcher_arsenals = defaultdict(set)
        self.max_seq_length = 0
        self.encoded_sequences = []

    def preprocess(self):
        # Create pitch type vocabulary
        unique_pitches = set()
        for seq in self.df['Pitch Sequence']:
            unique_pitches.update(seq.split(','))
        
        for idx, pitch in enumerate(unique_pitches, start=1):  # Start from 1 because 0 is <START>
            self.pitch_to_idx[pitch] = idx
            self.idx_to_pitch[idx] = pitch
        
        # Start tokenization from 1
        self.pitch_to_idx = {pitch: idx + 1 for idx, pitch in enumerate(unique_pitches)}
        self.idx_to_pitch = {idx: pitch for pitch, idx in self.pitch_to_idx.items()}

        # Create pitcher arsenals and encode sequences
        for _, row in self.df.iterrows():
            pitcher_id = row['Pitcher ID']
            pitches = row['Pitch Sequence'].split(',')
            self.pitcher_arsenals[pitcher_id].update(pitches)
            
            encoded_seq = self.encode_sequence(row['Pitch Sequence'])
            self.encoded_sequences.append(encoded_seq)
            self.max_seq_length = max(self.max_seq_length, len(encoded_seq))

        # Update the DataFrame with encoded sequences
        self.df['Encoded Sequence'] = self.encoded_sequences

    def encode_sequence(self, sequence):
        pitches = sequence.split(',')
        return [self.pitch_to_idx[pitch] for pitch in pitches]

    def get_pitcher_arsenal_mask(self, pitcher_id):
        arsenal = self.pitcher_arsenals[pitcher_id]
        mask = [1 if pitch in arsenal else 0 for pitch in self.idx_to_pitch.values()]
        return torch.tensor(mask, dtype=torch.float)

    # We'll keep this method, but it won't be used in the current implementation
    def pad_sequence(self, sequence):
        padded = sequence + [len(self.pitch_to_idx)] * (self.max_seq_length - len(sequence))
        return padded[:self.max_seq_length]

In [4]:
class PitchSequenceDataset(Dataset):
    def __init__(self, df, preprocessor):
        self.df = df
        self.preprocessor = preprocessor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sequence = self.preprocessor.encode_sequence(row['Pitch Sequence'])
        #print(sequence)
        padded_sequence = self.preprocessor.pad_sequence(sequence)
        #print(padded_sequence)
        pitcher_id = row['Pitcher ID']
        #print(pitcher_id)
        arsenal_mask = self.preprocessor.get_pitcher_arsenal_mask(pitcher_id)

        input_seq = torch.tensor(padded_sequence[:-1], dtype=torch.long)
        target = torch.tensor(padded_sequence[-1], dtype=torch.long)
        print(input_seq)
        print(target)
        return input_seq, target, arsenal_mask

In [4]:
class PitchTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dropout=0.1):
        super(PitchTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 2, d_model)  # +1 for padding, +1 for start token
        self.pos_encoder = nn.Embedding(1000, d_model)  # Assuming max sequence length < 1000
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dropout=dropout),
            num_layers
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, src_mask=None):
        src = self.embedding(src)
        pos = torch.arange(0, src.size(1), dtype=torch.long, device=src.device).unsqueeze(0)
        src = src + self.pos_encoder(pos)
        output = self.transformer(src, src_key_padding_mask=src_mask)
        return self.fc(output[:, -1, :])
    
    def init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

In [6]:
class PitchPredictor:
    def __init__(self, model, preprocessor):
        self.model = model
        self.preprocessor = preprocessor

    def predict_next_pitch(self, sequence, pitcher_id):
        encoded_seq = self.preprocessor.encode_sequence(sequence)
        padded_seq = self.preprocessor.pad_sequence(encoded_seq)
        input_seq = torch.tensor(padded_seq[:-1], dtype=torch.long).unsqueeze(0)
        arsenal_mask = self.preprocessor.get_pitcher_arsenal_mask(pitcher_id).unsqueeze(0)

        with torch.no_grad():
            logits = self.model(input_seq)
            masked_logits = apply_arsenal_mask(logits, arsenal_mask)
            probabilities = torch.exp(masked_logits)
            predicted_idx = torch.argmax(probabilities, dim=-1).item()

        return self.preprocessor.idx_to_pitch[predicted_idx]

In [7]:
def train_model(model, train_loader, val_loader, num_epochs, lr, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            input_seq, target, arsenal_mask = [b.to(device) for b in batch]
            optimizer.zero_grad()
            output = model(input_seq)
            output = output.view(-1, output.size(-1))  # Reshape output
            output[~arsenal_mask.bool()] = float('-inf')
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_seq, target, arsenal_mask = [b.to(device) for b in batch]
                output = model(input_seq)
                masked_output = apply_arsenal_mask(output, arsenal_mask)
                loss = criterion(masked_output, target)
                val_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs")
                model.load_state_dict(torch.load('best_model.pth'))
                break

    return model

def apply_arsenal_mask(output, arsenal_mask, epsilon=1e-10):
    # Add small epsilon to prevent log(0)
    masked_output = output.clone()
    masked_output[~arsenal_mask.bool()] = float('-inf')
    return F.log_softmax(masked_output + epsilon, dim=-1)

In [6]:
df = pd.read_csv('../../data/sequence_data_opt.csv')

In [7]:
df.head(10)

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
0,SI,621107,field_error
1,"SI,CB,FC,SI,CB,SI,FF",621107,single
2,"ST,ST,SI,SI,ST,ST",676534,walk
3,"SI,ST,SI,ST,SI",687330,grounded_into_double_play
4,"FF,FF,FF,SL,FF,SL",477132,strikeout
5,"FF,FF,SL,FF,SL,SL",477132,strikeout
6,"FF,SL,CH,FF,CH,SL,FF,CH,CH,FF",656578,field_error
7,"FC,CH,CH,CH",608379,double_play
8,"FC,CH,FC",608379,field_out
9,"FC,FC,CH",608379,double


In [8]:
# Preprocess data
preprocessor = PitchDataPreprocessor(df)
preprocessor.preprocess()

In [9]:
# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
train_df.shape

(148430, 3)

In [11]:
train_df.head()

Unnamed: 0,Pitch Sequence,Pitcher ID,At-Bat Outcome
101708,"FF,SL,SL,FF,SL",600917,strikeout
88766,"SL,FF,CB,CB",670102,field_out
177432,SL,623352,force_out
146963,"FF,FS,FS",592332,field_out
41087,"SL,FF,SL,FF,SL,CB,FF,SL",674072,field_out


In [12]:
# Create datasets and dataloaders
train_dataset = PitchSequenceDataset(train_df.head(300), preprocessor)
test_dataset = PitchSequenceDataset(test_df.head(300), preprocessor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [13]:
# Initialize model
vocab_size = len(preprocessor.pitch_to_idx)
model = PitchTransformer(vocab_size, d_model=64, nhead=4, num_layers=2)
model.init_weights()

In [14]:
# Train model
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = train_model(model, train_loader, test_loader, num_epochs=10, lr=0.001, device="cpu")

IndexError: The shape of the mask [512, 12] at index 0 does not match the shape of the indexed tensor [32, 12] at index 0

In [None]:
predictor = PitchPredictor(trained_model, preprocessor)

In [None]:
# Example prediction
sequence = "FF,CB,SI"
pitcher_id = 621107
next_pitch = predictor.predict_next_pitch(sequence, pitcher_id)
print(f"Predicted next pitch: {next_pitch}")