In [1]:
# We are going to start with modelling chess so the first step is data processing.
import pandas as pd
import numpy as np

chess_df = pd.read_csv("./data/chess/games.csv")
chess_df.head()

n_games = len(chess_df)

game_ids = chess_df["id"].tolist()
rated = chess_df["rated"].tolist()
start_time = chess_df["created_at"].tolist()
end_time = chess_df["last_move_at"].tolist()
duration = list(np.array(end_time) - np.array(start_time)) #some of these are 0 for some reason
num_turns = chess_df["turns"].tolist()
win_condition = chess_df["victory_status"].tolist()
winner_colour = chess_df["winner"].tolist()
white_player_id = chess_df["white_id"].tolist()
white_player_rating = chess_df["white_rating"].tolist()
black_player_id = chess_df["black_id"].tolist()
black_player_rating = chess_df["black_rating"].tolist()
winning_player_id = [white_player_id[i] if winner_colour[i] == "white" else 
                    black_player_id[i] for i in range(n_games)]
move_lists = chess_df["moves"].tolist()
opening_code = chess_df["opening_eco"].tolist()
opening_name = chess_df["opening_name"]

In [2]:
# Now we will create sets of embeddings for the game state (move + player id + rating + colour they are playing)
import torch

data = []
game_data = []
data_grouped = []
all_data = None

for i in range(n_games):
    # w_id = white_player_id[i]
    w_rating = white_player_rating[i]
    
    # b_id = black_player_id[i]
    b_rating = black_player_rating[i]
    
    moves = move_lists[i]
        
    win_con = win_condition[i]
    winner = winner_colour[i]
    
    # FORMAT
    # W Rating: w_rating, B Rating: b_rating; moves; winner wins by win_con
    
    state = [f"[CHESS::] W Rating: {w_rating}, B Rating: {b_rating}; " + moves + f"; {winner} wins by {win_con} [::GAME]"]
        
    data.append(state)
    
    print(f"Processed game {i}")

Processed game 0
Processed game 1
Processed game 2
Processed game 3
Processed game 4
Processed game 5
Processed game 6
Processed game 7
Processed game 8
Processed game 9
Processed game 10
Processed game 11
Processed game 12
Processed game 13
Processed game 14
Processed game 15
Processed game 16
Processed game 17
Processed game 18
Processed game 19
Processed game 20
Processed game 21
Processed game 22
Processed game 23
Processed game 24
Processed game 25
Processed game 26
Processed game 27
Processed game 28
Processed game 29
Processed game 30
Processed game 31
Processed game 32
Processed game 33
Processed game 34
Processed game 35
Processed game 36
Processed game 37
Processed game 38
Processed game 39
Processed game 40
Processed game 41
Processed game 42
Processed game 43
Processed game 44
Processed game 45
Processed game 46
Processed game 47
Processed game 48
Processed game 49
Processed game 50
Processed game 51
Processed game 52
Processed game 53
Processed game 54
Processed game 55
Pr

In [3]:
bog_token = "[CHESS::]"
eog_token = "[::GAME]"

In [4]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False)
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

special_tokens = [bog_token, eog_token, "[UNK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

In [5]:
tokenizer.train_from_iterator(data, trainer=trainer)






In [6]:
print(data[0])
print(tokenizer.encode(data[0][0]).tokens)

['[CHESS::] W Rating: 1500, B Rating: 1191; d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4; white wins by outoftime [::GAME]']
['[CHESS::]', 'W', 'Rating:', '1500,', 'B', 'Rating:', '1191;', 'd4', 'd5', 'c4', 'c6', 'cxd5', 'e6', 'dxe6', 'fxe6', 'Nf3', 'Bb4+', 'Nc3', 'Ba5', 'Bf4;', 'white', 'wins', 'by', 'outoftime', '[::GAME]']


In [7]:
tokenizer.save("chess_tokenizer.json")

In [8]:
chess_tokenizer = Tokenizer.from_file("chess_tokenizer.json")
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    # tokenizer_object=tokenizer,
    tokenizer_file="chess_tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    eos_token=eog_token,
    bos_token=bog_token
)

In [9]:
context_length = 512

training_data = [d[0] for d in data]

In [10]:
outputs = tokenizer(
    training_data,
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(len(outputs['input_ids']))

20058


In [11]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [12]:
def tokenize(element):
    outputs = tokenizer(
        element,
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        padding=True,
    )
    input_batch = outputs["input_ids"][0]
    eog = input_batch[-1]
    while len(input_batch) < context_length:
        input_batch.append(eog)
    if len(outputs["input_ids"]) > 1:
        print("THERE ARE MULTIPLE SUBLISTS")
        raise ValueError
    # for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
    #     if length == context_length:
    #         input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = [tokenize(x) for x in training_data]
print(tokenized_datasets[0])

{'input_ids': [0, 33, 119, 654, 21, 119, 3310, 160, 167, 198, 224, 302, 200, 797, 523, 150, 527, 168, 660, 2105, 197, 147, 137, 460, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [13]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [14]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")
print(len(tokenizer))

GPT-2 size: 93.4M parameters
9812


In [15]:
out = data_collator([tokenized_datasets[i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 512])
attention_mask shape: torch.Size([5, 512])
labels shape: torch.Size([5, 512])


In [16]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="kasparov",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True if torch.cuda.is_available() else False,
    use_cpu=False if torch.cuda.is_available() else True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)



In [17]:
trainer.train()

Step,Training Loss,Validation Loss



KeyboardInterrupt

