In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split



In [2]:
class ChessDataset(Dataset):
    def __init__(self, tokenizer, sequences, targets, max_length):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        self.targets = []

        for sequence, target in zip(sequences, targets):
            # Tokenize and encode sequences and targets
            sequence_encoding = tokenizer.encode_plus(
                sequence,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                add_special_tokens=True,
                return_tensors='pt'
            )

            # For targets, we only need the input IDs, but they should be padded as well
            target_encoding = tokenizer.encode_plus(
                target,
                max_length=max_length,  # Ensure targets are also padded
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            self.input_ids.append(sequence_encoding['input_ids'].flatten())
            self.attn_masks.append(sequence_encoding['attention_mask'].flatten())
            self.targets.append(target_encoding['input_ids'].flatten())

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx], self.targets[idx]

In [3]:
def train(model, dataset, tokenizer, device, batch_size=8, epochs=25):
    model.train()
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=5e-5)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            inputs, masks, targets = batch
            inputs, masks, targets = inputs.to(device), masks.to(device), torch.tensor(targets).to(device)
            outputs = model(inputs, labels=inputs, attention_mask=masks)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

def predict(model, sequence, tokenizer, device, max_length):
    model.eval()
    inputs = tokenizer.encode(sequence, return_tensors="pt", max_length=max_length, padding='max_length', truncation=True).to(device)
    print(f"Seq length: {len(inputs)}")
    outputs = model.generate(inputs, max_length=max_length, temperature=0.7, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [4]:
# Load your dataset
lichess_username = "rootsec1"
file_path = f'../data/processed/sequence_target_map_{lichess_username}.csv' # Replace with your file path
data = pd.read_csv(file_path)
data['input_sequence'].fillna('[START]', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['input_sequence'].fillna('[START]', inplace=True)


In [5]:
# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# Preprocess the data
max_length = 1024 # or any other sequence length
sequences = data['input_sequence'].tolist()
targets = data['target_move'].tolist()
dataset = ChessDataset(tokenizer, sequences, targets, max_length)

In [7]:
# Split the dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [8]:
# Train the model
train(model, train_dataset, tokenizer, device, batch_size=4)

  inputs, masks, targets = inputs.to(device), masks.to(device), torch.tensor(targets).to(device)


KeyboardInterrupt: 

In [None]:
# Save the model
model.save_pretrained('models/gpt2') # Replace with your save path
tokenizer.save_pretrained('models/gpt2') # Replace with your save path

In [None]:
# Make a prediction
sequence = 'e4 d5 exd5 Qxd5 Nc3 Qe5+ Be2 Nc6' # Example chess moves
predicted_move = predict(model, sequence, tokenizer, device, max_length, max_new_tokens=256)
print(predicted_move)