In [23]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Config
from sklearn.metrics import accuracy_score

In [24]:
# Load your dataset
lichess_username = "ruchitoshniwal1"
df = pd.read_csv(f"../data/processed/sequence_target_map_{lichess_username}.csv")
df.fillna("", inplace=True)
df

Unnamed: 0,input_sequence,target_move
0,,d4
1,d4 d6,Nf3
2,d4 d6 Nf3 Nf6,Bg5
3,d4 d6 Nf3 Nf6 Bg5 c6,c4
4,d4 d6 Nf3 Nf6 Bg5 c6 c4 Qa5+,Nc3
...,...,...
8431,e4 c5,Nf3
8432,e4 c5 Nf3 d6,Bc4
8433,e4 c5 Nf3 d6 Bc4 Nc6,d3
8434,e4 c5 Nf3 d6 Bc4 Nc6 d3 Nf6,O-O


In [25]:
MODEL_MAX_LENGTH = 512
BATCH_SIZE = 8  # Adjust batch size according to your GPU memory
ML_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
LEARNING_RATE = 0.001
NUM_EPOCHS = 25

In [26]:
def create_san_vocabulary():
    pieces = ['P', 'N', 'B', 'R', 'Q', 'K', '', 'p', 'n', 'b', 'r', 'q', 'k']
    columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    ranks = ['1', '2', '3', '4', '5', '6', '7', '8']
    special = ['O-O', 'O-O-O', '+', '#', 'x', '=', 'e.p.']

    vocabulary = set()
    for piece in pieces:
        for col1 in columns:
            for rank1 in ranks:
                for col2 in columns:
                    for rank2 in ranks:
                        # Add moves like "e4", "Nf3", "Bb5+", etc.
                        line1 = f"{piece}{col1}{rank1}{col2}{rank2}"
                        line2 = f"{piece}{col1}{rank1}x{col2}{rank2}"
                        line3 = f"{piece}{col2}{rank2}"
                        line4 = f"{col2}{rank2}"
                        
                        vocabulary.add(line1)
                        vocabulary.add(line2)
                        vocabulary.add(line3)
                        vocabulary.add(line4)

                        for sp in special:
                            # Add moves like "O-O", "O-O-O", "Nf3#", etc.
                            line5 = f"{line1}{sp}"
                            line6 = f"{line2}{sp}"
                            line7 = f"{line3}{sp}"
                            line8 = f"{line4}{sp}"
                            
                            vocabulary.add(line5)
                            vocabulary.add(line6)
                            vocabulary.add(line7)
                            vocabulary.add(line8)
                # Add simpler moves like "e4", "Nf3", etc.
                vocabulary.add(f"{piece}{col1}{rank1}")
    # Add special moves and annotations
    vocabulary.update(special)

    # Convert the set to a list to fix the order
    vocabulary = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'] + list(vocabulary)  # Add special tokens
    return vocabulary

# Create the vocabulary
custom_vocabulary = create_san_vocabulary()
print(f"Vocabulary size: {len(custom_vocabulary)}")
# Define custom tokenizer class
class SimpleChessTokenizer:
    def __init__(self, vocab):
        self.vocab = {v: k for k, v in enumerate(vocab)}
        self.reverse_vocab = {k: v for v, k in self.vocab.items()}

    def encode(self, moves):
        return [self.vocab.get(move, self.vocab['[UNK]']) for move in moves]

    def decode(self, token_ids):
        return [self.reverse_vocab.get(token_id, '[UNK]') for token_id in token_ids]

# Create the tokenizer
tokenizer = SimpleChessTokenizer(custom_vocabulary)

Vocabulary size: 858636


In [27]:
# Define the chess dataset
class ChessMovesDataset(Dataset):
    def __init__(self, sequences, targets, tokenizer):
        self.sequences = sequences
        self.targets = targets
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        encoded_seq = self.tokenizer.encode(self.sequences[idx])
        encoded_target = self.tokenizer.encode([self.targets[idx]])[0]  # Encode target move
        return torch.tensor(encoded_seq, dtype=torch.long), torch.tensor(encoded_target, dtype=torch.long)

# Assuming 'sequences' is a list of lists of SAN moves, 'targets' is a list of SAN moves.
# Here's how you might extract them from your DataFrame:
sequences = [row["input_sequence"].split(" ") for index, row in df.iterrows()]
targets = df.target_move.tolist()


TRAINING_SET_SIZE = 0.8

# Splitting data into training and testing sets for example
split = int(len(sequences) * TRAINING_SET_SIZE)
train_sequences, train_targets = sequences[:split], targets[:split]
test_sequences, test_targets = sequences[split:], targets[split:]

# Create datasets
train_dataset = ChessMovesDataset(train_sequences, train_targets, tokenizer)
test_dataset = ChessMovesDataset(test_sequences, test_targets, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# T5 configuration
config = T5Config(
    vocab_size=len(custom_vocabulary),
    d_model=MODEL_MAX_LENGTH,
    d_ff=2048,
    num_layers=6,
    num_heads=8,
    dropout_rate=0.1
)
model = T5ForConditionalGeneration(config=config).to(ML_DEVICE)

# Training settings
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Adjust if necessary for your specific task
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
# Train function (simplified for brevity)
def train(model, loader, optimizer):
    TRAIN_LOADER_LENGTH = len(loader)
    model.train()

    total_loss = 0
    for batch in train_loader:
        sequences, targets = batch
        # Pass sequences and targets through your model, compute loss, perform backpropagation
        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, targets)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / TRAIN_LOADER_LENGTH
    return avg_loss

def evaluate(model, loader):
    # Evaluation Loop
    model.eval()  # Set the model to evaluation mode
    total, correct = 0, 0
    total_loss = 0
    with torch.no_grad():  # No need to track gradients during evaluation
        for batch in test_loader:
            sequences, targets = batch
            outputs = model(sequences)
            _, predicted = torch.max(outputs, 1)  # Get the index of the max log-probability
            loss = outputs.loss
            total_loss += loss.item()
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
            
    total_loss = total_loss / len(loader)
    accuracy = correct / total
    return accuracy


# Assuming you have defined train and test functions
for epoch in range(NUM_EPOCHS):  # Assuming 4 epochs
    avg_training_loss = train(model, train_loader, optimizer)
    print(f'[+] Epoch {epoch + 1} average training loss: {avg_training_loss}')
    val_loss, val_accuracy = evaluate(model, test_loader)
    print(f'Validation Loss: {val_loss}')
    print(f'Validation Accuracy: {val_accuracy}')

RuntimeError: stack expects each tensor to be equal size, but got [1] at entry 0 and [2] at entry 1

In [None]:
# Save the model
model.save_pretrained(f"../models/t5/{lichess_username}_t5_model.pth")