In [None]:
# Switch the working directory to the parent folder; the relative paths used below
# (`src`, `config.json`, `models/...`) are presumably relative to it.
# NOTE: not idempotent -- every re-run climbs one more level, so run once per kernel.
%cd ..

In [None]:
from src.transformer import GPT
from src.utils import CfgNode as CN
import json
import torch
from safetensors.torch import load_model
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
import torch.nn.functional as F

# Path of the JSON file that stores the model hyper-parameters.
conf = 'config.json'

# Model configuration node, filled in below.
C = CN()

with open(conf) as f:
    config = json.load(f)

# Architecture and dropout settings are copied verbatim from the JSON file ...
for key in ("n_layer", "n_embd", "n_head", "embd_pdrop", "resid_pdrop", "attn_pdrop"):
    setattr(C, key, config[key])
# ... while the context length is fixed here rather than read from the file.
C.block_size = 768

# Token vocabulary; the integer id of a token is its position in this list.
chars = ['0', '5', '2', 'Q', 'e', 'a', 'R', 'd', '9', '8', '1', 'N', 'x', 'f', '6', '+', 'c', '=', 'h', 'O', 'B', '.', '7', '/', '4', '3', '*', 'b', '-', ' ', 'K', 'g', '#', ';', '[PAD]']

# Lookup tables in both directions: token -> id and id -> token.
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError for any character that is not in the vocabulary.
    """
    return list(map(stoi.__getitem__, s))


def decode(l):
    """Return the string obtained by concatenating the tokens for the ids in `l`."""
    return ''.join(itos[token_id] for token_id in l)

# Record the vocabulary on the model config: the number of token ids and the
# token list itself.
C.vocab_size = len(stoi)
C.vocabulary = chars

In [2]:
class GPT_probing(GPT):
    """GPT subclass whose forward pass returns intermediate activations for probing."""

    def __init__(self, config):
        """Build the parent model and remember how many blocks it has."""
        super().__init__(config)
        self.n_layer = config.n_layer

    def forward(self, idx, layer_num):
        """Run the first `layer_num + 1` transformer blocks and collect activations.

        Args:
            idx: 2-D tensor of token ids, unpacked as (batch, sequence length).
            layer_num: 0-based index of the deepest block to run.

        Returns:
            A list with one NumPy array per executed block, holding that block's
            output at the last sequence position (detached and copied to the CPU).
            Each array is presumably (batch, n_embd), assuming blocks keep the
            (b, t, n_embd) shape of their input.

        Raises:
            AssertionError: If the sequence exceeds the block size or `layer_num`
                is outside ``[0, n_layer)``.
        """
        _, seq_len = idx.size()
        assert seq_len <= self.block_size, "Cannot forward, model block size is exhausted."
        assert 0 <= layer_num < self.n_layer,  "Cannot extract this layer"

        # Position indices 0..t-1 with a leading broadcast axis: shape (1, t).
        positions = torch.arange(seq_len, dtype=torch.long, device=idx.device)[None, :]
        token_part = self.transformer.wte(idx)
        position_part = self.transformer.wpe(positions)
        hidden = self.transformer.drop(token_part + position_part)

        last_token_states = []
        for depth, block in enumerate(self.transformer.h):
            if depth > layer_num:
                break
            hidden = block(hidden)
            last_token_states.append(hidden[:, -1, :].detach().cpu().numpy())
        return last_token_states

In [None]:
# Instantiate the probing model, switch it to eval mode, then load the weights
# stored in the safetensors checkpoint into it.
model = GPT_probing(C)
model.eval()
load_model(model, "models/stockfish-16/model.safetensors")

number of parameters: 50.85M


In [4]:
# Load `stockfish_dataset.zip` from the `adamkarvonen/chess_games` dataset repository;
# the cells below read the games from dataset['train'].
dataset = load_dataset("adamkarvonen/chess_games", data_files="stockfish_dataset.zip")

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
# Shuffle the games with a fixed seed: the extraction cell below only reads the first
# N rows, so an unseeded shuffle would sample different games on every run.
dataset['train'] = dataset['train'].shuffle(seed=42)

In [6]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU, and move
# the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [7]:
N = 10000          # number of games to run through the model
PROBE_LAYER = 15   # 0-based index of the deepest transformer block to execute
X = []  # per game: the list of per-layer last-token activations returned by the model
y = []  # per game: the integer parsed from the end of the `player_two` field
with torch.no_grad():
    for i in tqdm(range(N)):
        game = dataset['train'][i]
        # Model input is ";<result>#<moves>", where <moves> is whatever follows the
        # last blank line of the stored transcript.
        transcript = ";" + game['result'] + "#" + game['transcript'].split('\n\n')[-1]
        stockfish_level = int(game['player_two'].split(' ')[-1])
        # Cut to the context window first, then build the (1, t) batch on the device.
        token_ids = encode(transcript)[:C.block_size]
        encoded_text = torch.tensor(token_ids, dtype=torch.int64, device=device).unsqueeze(0)
        out = model(encoded_text, PROBE_LAYER)
        X.append(out)
        y.append(stockfish_level)


100%|██████████| 10000/10000 [02:26<00:00, 68.15it/s]


In [8]:
# Stack the per-game activation lists into a single array. With one game per forward
# pass and 16 blocks executed this is presumably (N, 16, 1, n_embd) -- check X.shape.
X = np.array(X)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np

class ProbingDataset(Dataset):
    """Pairs of (activation vector, class index) for training a linear probe.

    Args:
        x: Array-like of activations. Either ``(N, D)`` features, or a stack with
            extra axes after the sample axis (for example ``(N, L, 1, D)`` per-layer
            activations); in that case `layer` selects one entry along axis 1 and
            the remaining axes are flattened into the feature axis.
        y: Sequence of ``N`` integer class labels in ``[0, max_class]``.
        layer: Index along axis 1 to keep when `x` has more than two axes.
            Defaults to ``-1`` (the last entry).
        max_class: Largest valid label; a probe needs ``max_class + 1`` outputs.

    Raises:
        ValueError: If `x` and `y` differ in length or a label is out of range.
    """

    def __init__(self, x, y, layer=-1, max_class=22):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        self.max_class = max_class

        # Stacked activations: keep one layer and collapse what is left to (N, D).
        if self.x.dim() > 2:
            self.x = self.x[:, layer].flatten(start_dim=1)

        if self.x.size(0) != self.y.size(0):
            raise ValueError(
                f"x has {self.x.size(0)} samples but y has {self.y.size(0)} labels"
            )
        # Fail early with a readable message instead of deep inside the loss.
        if self.y.numel() and (self.y.min().item() < 0 or self.y.max().item() > max_class):
            raise ValueError(f"labels must lie in [0, {max_class}]")

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # Return the class index itself: nn.CrossEntropyLoss accepts integer class
        # indices (or float probabilities), not integer one-hot vectors, and the
        # accuracy check compares these labels directly with argmax predictions.
        return self.x[idx], self.y[idx]
    
# NOTE: this rebinds `dataset`, which the earlier cells used for the games loaded with
# load_dataset; re-running the feature-extraction cell after this one will fail until
# the games are loaded again.
dataset = ProbingDataset(X, y)

In [127]:
# 80/20 train/test split. The generator is seeded so the same samples land in each
# split on every run, which keeps the reported accuracy comparable between runs.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42)
)

In [128]:
# Mini-batch size shared by both loaders.
batch_size = 64
# Training batches are reshuffled each epoch; evaluation keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [129]:
class LinearModel(nn.Module):
    """Linear probe: a single affine layer from `input_dim` features to `num_classes` logits."""

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """Return the unnormalised class scores for `x`."""
        logits = self.fc(x)
        return logits

In [130]:
# Probe dimensions: 512 input features (presumably the transformer's n_embd -- confirm
# against config.json) and 23 classes, i.e. ProbingDataset's max_class of 22 plus one.
input_dim, num_classes = 512, 23
# NOTE: `model` is rebound here from the GPT probing model to the linear probe.
model = LinearModel(input_dim=input_dim, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Place the linear probe on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [132]:
def train(model, train_loader, criterion, optimizer, epochs=10):
    """Fit `model` on `train_loader` and print the mean batch loss after every epoch.

    Args:
        model: Module to optimise; batches are moved to the device of its parameters.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimiser already bound to the parameters of `model`.
        epochs: Number of passes over `train_loader`.

    Returns:
        list[float]: The mean batch loss of each epoch, in order.
    """
    # Follow the model's own device instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # Integer one-hot targets shaped like the logits are rejected by
            # CrossEntropyLoss; reduce them to class indices. Float targets of that
            # shape are valid class probabilities and are passed through untouched.
            if (
                outputs.dim() > 1
                and labels.shape == outputs.shape
                and not labels.is_floating_point()
            ):
                labels = labels.argmax(dim=1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Count batches ourselves: avoids dividing by zero on an empty loader.
        mean_loss = total_loss / max(num_batches, 1)
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")
    return epoch_losses

In [134]:
def evaluate(model, test_loader):
    """Print and return the classification accuracy of `model` on `test_loader`.

    Args:
        model: Module producing ``(batch, num_classes)`` scores; batches are moved
            to the device of its parameters.
        test_loader: Iterable yielding ``(inputs, labels)`` batches. Labels may be
            class indices or one-hot rows shaped like the model output.

    Returns:
        float: Fraction of correctly classified samples, or NaN when the loader
        yields no samples.
    """
    # Follow the model's own device instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            # One-hot rows cannot be compared element-wise with the (batch,) vector
            # of predicted indices; reduce them to class indices first.
            if labels.dim() > 1 and labels.shape == outputs.shape:
                labels = labels.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    # An empty loader has no defined accuracy; report NaN rather than dividing by zero.
    accuracy = correct / total if total else float("nan")
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

In [None]:
# Fit the linear probe for 10 epochs, then report its accuracy on the held-out split.
train(model, train_loader, criterion, optimizer, epochs=10)
evaluate(model, test_loader)