In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Number of CSV rows to read; lower this for quicker experiments.
n = 5_000_000
# Read only the first `n` rows of the evaluations file.
df = pd.read_csv(
    "/kaggle/input/chess-evaluations/chessData.csv",
    sep=",",
    nrows=n,
)
# Keep the FEN strings and the raw evaluation strings as separate Series.
fens, evals = df["FEN"], df["Evaluation"]

# Preprocess FEN

We want to convert everything to numerical values. First, let's look at the fields that make up a FEN string. Each field is described here: https://www.wikiwand.com/en/Forsyth%E2%80%93Edwards_Notation

In [None]:
def simple_stats(inp_df, to_exclude):
    """Print the set of distinct values of each column of ``inp_df``.

    Args:
        inp_df: DataFrame whose columns are inspected.
        to_exclude: Collection of column names to skip.

    Returns:
        None. A header line and the value set are printed per reported column.
    """
    reported = (name for name in inp_df.columns if name not in to_exclude)
    for name in reported:
        print(f"Unique elements for column - {name} -:")
        print(set(inp_df[name]))

# Split every FEN string on spaces into its six fields and tabulate them.
fens_stratified = list(map(lambda fen: fen.split(" "), fens))
df_rep = pd.DataFrame.from_records(
    fens_stratified,
    columns=["fen", "turn", "castle", "enpassant", "halfmove", "fullmove"],
)
# Report the distinct values of every field except the board placement.
simple_stats(inp_df=df_rep, to_exclude=["fen"])

In [None]:
df_rep.head()

### 1. Convert board-section of a FEN string into 12x8x8 tensor

In [None]:
import torch
PIECE_TO_INT_2 = {key: i for i, key in enumerate(["r", "n", "b", "q", "k", "p", "R", "N", "B", "Q", "K", "P"])}
def transform_fen(inp_fen:str) -> torch.Tensor:
    """Encode the piece-placement field of a FEN string as a 12x8x8 one-hot tensor.

    Plane ``i`` corresponds to the piece symbol with index ``i`` in
    ``PIECE_TO_INT_2``; rows follow the order of the "/"-separated ranks in the
    FEN string and columns follow the order of squares within each rank.

    Args:
        inp_fen: Board part of a FEN string only (eight ranks separated by "/"),
            without the turn/castling/en-passant fields.

    Returns:
        A ``torch.uint8`` tensor of shape ``(12, 8, 8)`` with a 1 wherever the
        plane's piece stands and 0 elsewhere.

    Raises:
        ValueError: If a character is neither a digit nor a known piece symbol.
        AssertionError: If the string does not describe exactly 8 ranks of
            exactly 8 squares each.
    """
    ranks = inp_fen.split("/")
    assert len(ranks) == 8, f"expected 8 ranks, got {len(ranks)}"

    # Start from an all-empty board and only write the occupied squares, so the
    # FEN string is parsed once instead of once per piece type.
    planes = [[[0] * 8 for _ in range(8)] for _ in range(len(PIECE_TO_INT_2))]
    for rank_idx, row in enumerate(ranks):
        file_idx = 0
        placements = []  # (plane, column) pairs found on this rank
        for element in row:
            if element.isdigit():
                # A digit is a run of that many empty squares.
                file_idx += int(element)
            elif element in PIECE_TO_INT_2:
                placements.append((PIECE_TO_INT_2[element], file_idx))
                file_idx += 1
            else:
                # Unknown letters used to be silently treated as empty squares.
                raise ValueError(f"unexpected character {element!r} in FEN rank {row!r}")
        # Validate the rank width before writing, so a too-long rank cannot index out of range.
        assert file_idx == 8, f"rank {row!r} describes {file_idx} squares instead of 8"
        for plane_idx, col_idx in placements:
            planes[plane_idx][rank_idx][col_idx] = 1
    return torch.tensor(planes, dtype=torch.uint8)

In [None]:
# Install and import the `chess` package used below to build a Board object.
!pip install chess
import chess
# Use row 300 of the split-out board column as a running example.
example_fen = df_rep["fen"].values[300]
# Last expression of the cell: the notebook displays the resulting Board.
chess.Board(example_fen)

In [None]:
# Encode the example position and inspect both its size and its contents.
board_tensor = transform_fen(example_fen)
print(board_tensor.size())  # transform_fen asserts 12 planes of 8 rows x 8 columns
board_tensor

### 2. Convert castling to 4-element vector

In [None]:
CASTLE_TO_INT = {key: i for i, key in enumerate(["K", "Q", "k", "q"])}
def castle_to_vec(inp_castle_str):
    """Encode the castling field of a FEN string as four 0/1 flags.

    Flag positions follow ``CASTLE_TO_INT`` (K, Q, k, q). The string "-" yields
    all zeros.

    Args:
        inp_castle_str: Castling field, e.g. "KQkq", "Qq" or "-".

    Returns:
        A ``torch.uint8`` tensor with 4 elements.

    Raises:
        KeyError: If the field contains a symbol missing from ``CASTLE_TO_INT``.
    """
    flags = [0] * len(CASTLE_TO_INT)
    # "-" is treated as an empty set of symbols.
    symbols = "" if inp_castle_str == "-" else inp_castle_str
    for symbol in symbols:
        flags[CASTLE_TO_INT[symbol]] = 1
    return torch.tensor(flags, dtype=torch.uint8)

In [None]:
# Spot checks: every flag set, the "-" marker, a partial set of flags, and the output size.
print(castle_to_vec('KQkq'))
print(castle_to_vec('-'))
print(castle_to_vec('Qq'))
print(castle_to_vec('Kkq').size())

### 3. Convert all others

In [None]:
TURN_TO_INT = {"w": 0, "b": 1}
ENPASSANT_TO_INT = {key: str(i + 1) for i, key in enumerate(["a", "b", "c", "d", "e", "f", "g", "h"])}

def en_passant_to_vec(inp_en_passant):
    """Encode the en-passant field of a FEN string as an 8x8 one-hot tensor.

    Args:
        inp_en_passant: Either "-" or a two-character square such as "e3"
            (file letter a-h followed by rank digit 1-8).

    Returns:
        A ``torch.uint8`` tensor of shape ``(8, 8)``. It is all zeros for "-".
        Otherwise a single 1 is placed at row ``8 - rank`` and at the column
        given by the file letter (a -> 0, ..., h -> 7).

    Raises:
        AssertionError: If the field is not one letter followed by one digit.
        KeyError: If the file letter is not one of a-h.
        ValueError: If the rank digit is outside 1-8.
    """
    output_tensor = torch.zeros([8, 8], dtype=torch.uint8)
    if inp_en_passant != "-":
        assert len(inp_en_passant) == 2
        assert inp_en_passant[0].isalpha()
        assert inp_en_passant[1].isdigit()
        file_idx = int(ENPASSANT_TO_INT[inp_en_passant[0]]) - 1
        rank_idx = int(inp_en_passant[1]) - 1
        # Without this check rank "9" becomes row index -1, which wraps around
        # and silently marks a square on the opposite edge of the board.
        if not 0 <= rank_idx <= 7:
            raise ValueError(f"en passant rank must be between 1 and 8, got {inp_en_passant!r}")
        # Row 0 holds rank 8, so the rank index is flipped.
        output_tensor[7 - rank_idx, file_idx] = 1
    return output_tensor

In [None]:
# Spot checks: a few squares, the "-" marker (all zeros), and the corner squares a1 and h8.
print(en_passant_to_vec("e5"))
print(en_passant_to_vec("-"))
print(en_passant_to_vec("a1"))
print(en_passant_to_vec("h8"))
print(en_passant_to_vec("f5"))

### Let's stack the board (12x8x8), en passant (8x8), castling (4) and turn (1) representations into one flat vector (768 + 64 + 4 + 1 = 837 values)

For now we'll only be using these features and we'll include half&full moves later

In [None]:
TURN_TO_INT = {"w": 0, "b": 1}
def encode_fen_flat(inp_fen_string):
    """Turn a full six-field FEN string into a single flat float feature vector.

    The pieces are concatenated in this order: board planes, en-passant plane,
    castling flags, side to move. The half-move and full-move counters are
    parsed but not used.

    Args:
        inp_fen_string: Complete FEN string with exactly six space-separated fields.

    Returns:
        A 1-D float tensor holding the flattened, concatenated encodings.
    """
    board_str, turn_str, castling_str, enpassant_str, _, _ = inp_fen_string.split(" ")

    # Encode each field, cast it to float, and check the shape the encoder promises.
    board_tensor = transform_fen(board_str).float()
    assert board_tensor.shape == (12, 8, 8)

    enpassant_tensor = en_passant_to_vec(enpassant_str).float()
    assert enpassant_tensor.shape == (8, 8)

    castling_tensor = castle_to_vec(castling_str).float()
    assert castling_tensor.shape == (4,)

    turn_tensor = torch.tensor([TURN_TO_INT[turn_str]], dtype=torch.float32)
    assert turn_tensor.shape == (1,)

    # A feed-forward network takes a single dimension, so flatten and join everything.
    pieces = (board_tensor, enpassant_tensor, castling_tensor, turn_tensor)
    return torch.cat([piece.flatten() for piece in pieces], 0)
                                                                                  

In [None]:
example = df["FEN"].values[300]
print(example)

In [None]:
encode_fen_flat("8/p7/1prn2k1/3r1pp1/PR1Pp3/2P3P1/2KN1P2/4R3 b - - 5 42").size()

In [None]:
encode_fen_flat("8/p7/1prn2k1/3r1pp1/PR1Pp3/2P3P1/2KN1P2/4R3 b - - 5 42")

In [None]:
len(encode_fen_flat("8/p7/1prn2k1/3r1pp1/PR1Pp3/2P3P1/2KN1P2/4R3 b - - 5 42"))

We can see that this tensor could be directly used as a representation for the model

# Preprocess LABELS

In [None]:
evals.values[0:10]

Most values can be converted to an integer/float directly. They are engine evaluations in centipawns (1 pawn = 100 points), so they can be divided by 100 to get a more compact distribution. However, some of the strings contain a `#` character, which indicates a forced mate in x moves:

In [None]:
# Keep only the evaluation strings that contain the "#" mate marker.
mates = list(filter(lambda raw: "#" in raw, evals.values))
# Preview the first ten of them.
for x in mates[:10]:
    print(x)

So we need some logic to decide how to deal with these cases since they can't be directly converted. First, let's look at the range of mates that we see

In [None]:
sorted([int(mate.replace("#","").replace("+","").replace("-", "")) for mate in mates], reverse=True)[0:5]

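In the sample inspected when this was written (the first 10K positions), the deepest mate was mate in 23. The list above reflects whatever number of rows `n` is currently loaded, so it may differ.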

For this problem we'll probably use a regressor to predict the evaluation of a chess position. We would therefore like #1 (mate in one move) to map to a higher value than #10 (mate in 10), since failing to see mate in 1 should be penalized more than failing to see mate in 10. We also want every #x to have a very high value (higher than any non-mate position). Below is a simple approach: I set two extreme values, +20000 and -20000 centipawns (i.e. +200 and -200 pawns), and move towards zero by the mate distance, i.e. 20000 - x for a positive mate and -20000 + x for a negative one, before dividing by 100.

Note that this might not be an optimal approach depending on the application and it makes a big difference in the magnitude of the loss. Further thinking about this might be needed.

Any views about this?

In [None]:
import matplotlib.pyplot as plt
# Convert every non-mate evaluation to pawns (value / 100) after stripping "+" and the BOM character.
nonmates = []
for raw_eval in evals.values:
    if "#" in raw_eval:
        continue
    nonmates.append(int(raw_eval.replace("+", "").replace("\ufeff", "")) / 100)
# Histogram of the converted values with 50 bins.
plt.hist(nonmates, bins=50)
plt.show()

In [None]:
# Zoom in: keep only values strictly between -10 and 10, then plot again.
nonmates = [value for value in nonmates if abs(value) < 10]
plt.hist(nonmates, bins=50)
plt.show()

In [None]:
def convert_mates(inp_mate, base = 20000, spacing=1):
    """Map a mate string such as "#+3" or "#-7" to a large signed score.

    The score starts at ``+base`` or ``-base`` and moves towards zero by
    ``distance * spacing``, so shorter mates get larger magnitudes.

    The sign is read from the string itself rather than from the parsed
    integer. A distance of 0 (e.g. "#+0") therefore keeps its sign instead of
    always falling into the negative branch.

    Args:
        inp_mate: Mate string; "#", "+" and a byte-order mark are ignored.
        base: Magnitude assigned to a mate with distance 0.
        spacing: Amount subtracted from ``base`` per move of mate distance.

    Returns:
        ``base - distance * spacing`` when the string has no minus sign,
        otherwise the negation of that value.

    Raises:
        ValueError: If the remaining text is not an integer.
    """
    cleaned = inp_mate.replace("\ufeff", "").replace("#", "").strip()
    is_negative = cleaned.startswith("-")
    distance = abs(int(cleaned)) * spacing
    magnitude = base - distance
    return -magnitude if is_negative else magnitude

In [None]:
# Convert every evaluation string to pawns: mate strings go through
# convert_mates, the rest are parsed as integers; both are divided by 100.
all_labels_corrected = []
for raw_eval in evals.values:
    if "#" in raw_eval:
        centipawns = convert_mates(raw_eval)
    else:
        centipawns = int(raw_eval.replace("+", "").replace("\ufeff", ""))
    all_labels_corrected.append(centipawns / 100)
plt.hist(all_labels_corrected, bins=50)
plt.show()

It seems that we can now treat these labels as regression values. We can also see that many positions have an evaluation close to 0.0 (probably early stages of the game). This might limit the diversity of the dataset.

Arctan transformation:

In [None]:
USE_ARCTAN = True

In [None]:
# When enabled, replace the labels by their arctan and plot the new histogram.
# NOTE(review): this overwrites all_labels_corrected with its own transform, so
# running the cell a second time applies arctan to already-transformed values.
if USE_ARCTAN:
    import numpy as np
    all_labels_corrected =list(np.arctan(all_labels_corrected))
    plt.hist(all_labels_corrected, 50)
    plt.show()

# Create features and labels

In [None]:
from tqdm import tqdm
import numpy as np
# Build one feature tensor and one single-element label tensor per row.
features, labels = [], []
for fen_str, eval_str in tqdm(zip(df["FEN"].values, df["Evaluation"].values)):
    fen_tensor = encode_fen_flat(fen_str)

    # Mate strings go through convert_mates; both branches end up divided by 100.
    if "#" in eval_str:
        eval_value = convert_mates(eval_str) / 100
    else:
        eval_value = int(eval_str.replace("+", "").replace("\ufeff", "")) / 100

    # Optional arctan transform of the label.
    if USE_ARCTAN:
        eval_value = np.arctan(eval_value)

    features.append(fen_tensor)
    labels.append(torch.Tensor([eval_value]))
    

In [None]:
print(len(features))
print(len(labels))

In [None]:
!pip install scikit-learn==0.24.2

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=42, shuffle=True)

# Model development

## Dataloader

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class ChessDataset(Dataset):
    """Dataset pairing pre-computed feature encodings with their labels.

    Items are dictionaries with the keys "features" and "labels". The length
    is taken from ``labels``.
    """

    def __init__(self, encodings, labels):
        # Both are stored as given and indexed in parallel.
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {"features": self.encodings[idx], "labels": self.labels[idx]}

## Model definition

In [None]:
from torch.nn.init import xavier_uniform_
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def init_weights(m):
    """Xavier-initialise a module's weight and set its bias to 0.01.

    Intended for use with ``Module.apply``. Modules without a weight tensor of
    at least two dimensions (activations, dropout, containers, ...) are left
    untouched. The checks are explicit so that unrelated errors are no longer
    hidden behind a blanket ``except``.

    Args:
        m: Any ``torch.nn.Module``.

    Returns:
        None. The module is modified in place.
    """
    weight = getattr(m, "weight", None)
    # xavier_uniform_ needs fan-in and fan-out, i.e. at least a 2-D tensor.
    if not isinstance(weight, torch.Tensor) or weight.dim() < 2:
        return
    torch.nn.init.xavier_uniform_(weight)
    bias = getattr(m, "bias", None)
    # Layers built with bias=False have bias set to None.
    if isinstance(bias, torch.Tensor):
        bias.data.fill_(0.01)
    
class MLP(torch.nn.Module):
    """Feed-forward regressor: n_inputs -> 600 -> 300 -> 100 -> 1.

    Every hidden layer is combined with ReLU and Dropout(0.2). The whole stack
    lives on the module-level ``device`` and is initialised with
    ``init_weights``.
    """

    def __init__(self, n_inputs):
        super().__init__()
        nn = torch.nn
        # The layer order is kept exactly, so Sequential indices stay the same.
        layers = [
            nn.Linear(n_inputs, 600),
            nn.ReLU(),
            nn.Dropout(.2),
            nn.Linear(600, 300),
            nn.Dropout(.2),
            nn.ReLU(),
            nn.Linear(300, 100),
            nn.Dropout(.2),
            nn.ReLU(),
            nn.Linear(100, 1),
        ]
        self.base_model = nn.Sequential(*layers).to(device)
        self.base_model.apply(init_weights)

    def forward(self, X):
        """Move ``X`` to the model's device and run it through the layer stack."""
        return self.base_model(X.to(device))

## Train model

In [None]:
from tqdm import tqdm
from numpy import vstack
from sklearn.metrics import mean_squared_error
from numpy import sqrt

# evaluate the model
def evaluate_model(test_dl, model,criterion, use_arctan=None):
    """Compute the mean per-batch loss of ``model`` over ``test_dl``.

    When the arctan label transform is active, both the predictions and the
    targets are mapped back with ``tan`` before the loss is computed, so the
    two are compared on the same scale. Previously the transformed targets
    were assigned to a misspelled name and never used.

    Args:
        test_dl: Iterable of batches; each batch is a mapping with the keys
            "features" and "labels".
        model: Module to evaluate; it is switched to eval mode.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar tensor.
        use_arctan: Whether labels were arctan-transformed. ``None`` (default)
            falls back to the module-level ``USE_ARCTAN`` flag.

    Returns:
        The unweighted mean of the per-batch loss values as a float.

    Raises:
        ValueError: If ``test_dl`` yields no batches.
    """
    if use_arctan is None:
        use_arctan = USE_ARCTAN
    model.eval()
    batch_losses = []
    with torch.no_grad():
        print("Running evaluation")
        for inputs in test_dl:
            yhat = model(inputs["features"])
            # Put the targets wherever the model produced its output.
            targets = inputs["labels"].to(yhat.device)
            if use_arctan:
                # Undo the arctan on both sides so they share the same scale.
                yhat = torch.tan(yhat)
                targets = torch.tan(targets)
            batch_losses.append(criterion(yhat, targets).item())
    if not batch_losses:
        raise ValueError("test_dl yielded no batches; cannot compute a mean loss")
    return sum(batch_losses) / len(batch_losses)

def train_model(train_dl, test_dl, model, n_epochs, lr, early_stop_rmse=2):
    """Train ``model`` with AdamW and MSE loss, evaluating after every epoch.

    After each epoch the model is scored with ``evaluate_model`` and a summary
    line is printed. Training stops early once the square root of that score
    drops below ``early_stop_rmse``.

    Note that the training loss is computed on the raw model outputs, while
    ``evaluate_model`` may apply its own transform (it reads ``USE_ARCTAN``),
    so the two printed numbers are not necessarily on the same scale.

    Args:
        train_dl: Iterable of training batches (mappings with "features" and
            "labels").
        test_dl: Iterable of evaluation batches, passed to ``evaluate_model``.
        model: Module to optimise in place.
        n_epochs: Maximum number of epochs.
        lr: Learning rate for AdamW.
        early_stop_rmse: RMSE threshold for early stopping. Pass ``None`` to
            always run all epochs. Defaults to the previously hard-coded 2.

    Returns:
        None.
    """
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(),lr=lr)

    # Iterate over range() directly so tqdm knows the total number of epochs.
    for epoch in tqdm(range(n_epochs)):
        model.train()
        training_loss = []
        for inputs in train_dl:
            # Clear gradients once per step, right before the forward pass.
            optimizer.zero_grad()
            yhat = model(inputs["features"])
            targets = inputs["labels"].to(yhat.device)
            loss = criterion(yhat, targets)
            loss.backward()  # calculate gradient
            optimizer.step()  # update model weights
            training_loss.append(loss.item())

        mse = evaluate_model(test_dl, model,criterion)
        msesqrt = sqrt(mse)
        # An empty training loader reports NaN instead of dividing by zero.
        mean_training_loss = sum(training_loss) / len(training_loss) if training_loss else float("nan")

        print('Epoch: %.0f MSE: %.3f, RMSE: %.3f, Training loss: %.3f' % (epoch, mse, msesqrt, mean_training_loss))
        if early_stop_rmse is not None and msesqrt < early_stop_rmse:
            break

In [None]:
# Release cached GPU memory before building the loaders.
torch.cuda.empty_cache()

# Batch sizes used for training and for evaluation.
train_batch_size = 3 * 512
test_batch_size = 1000 * 512

# Wrap both splits in datasets.
train_dataset = ChessDataset(X_train, y_train)
test_dataset = ChessDataset(X_test, y_test)

# Only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

In [None]:
# Training hyper-parameters.
n_epochs = 1000
learning_rate = 0.001
# The input width is the length of one flat feature vector.
input_size = X_train[0].shape[0]

# Build the network and start training.
model = MLP(n_inputs=input_size)
train_model(
    train_dl=train_loader,
    test_dl=test_loader,
    model=model,
    n_epochs=n_epochs,
    lr=learning_rate,
)


# Qualitative evaluation

In [None]:
import chess

examples = ["r1bqk1nr/pp1pppbp/2n3p1/1Bp5/4P3/5N2/PPPP1PPP/RNBQ1RK1 w Qkq - 0 1",
            "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/R1BQKBNR w KQkq - 0 1",
           "r1bqkbnr/pppp1ppp/2n5/4p3/4P3/5N2/PPPP1PPP/RNBQKB1R w KQkq - 0 1",
            "8/8/1q1k4/8/8/3K4/8/8 w - - 0 1",
            "8/5r2/3k4/8/8/3K4/8/8 w - - 0 1",
           "r1bqkbnr/1ppp1ppp/p1n5/4p2Q/2B1P3/8/PPPP1PPP/RNB1K1NR w KQkq - 0 1",
           "4k3/8/8/8/8/8/PPPPPPPP/RNBQKBNR w KQ - 0 1",
           "8/8/8/4k3/8/8/4K3/2Q5 w - - 0 1"]



def predict(fen_reps, model):
    """Display each position together with the model's evaluation of it.

    Args:
        fen_reps: Iterable of complete FEN strings.
        model: Trained module that takes a batch of flat FEN encodings.

    Returns:
        None. A board diagram is displayed and one line printed per position.
        Nothing happens for an empty input.
    """
    # Import the submodule explicitly: `import chess` alone does not guarantee
    # that `chess.svg` is loaded, so this must not depend on an earlier cell.
    import chess.svg

    # Materialise the input so it can be traversed twice (encoding, then display).
    fen_reps = list(fen_reps)
    if not fen_reps:
        # torch.stack cannot handle an empty list.
        return

    features = torch.stack([encode_fen_flat(fen_rep) for fen_rep in fen_reps], dim=0)

    # make predictions
    model.eval()
    with torch.no_grad():
        yhats = model(features)
    # retrieve numpy array
    yhats = yhats.to('cpu').detach().numpy()
    if USE_ARCTAN:
        # Undo the arctan label transform.
        yhats = np.tan(yhats)
    for yh, fen in zip(yhats, fen_reps):
        display(chess.svg.board(chess.Board(fen), size=350))
        print(f"Model evaluation {yh}")

predict(fen_reps=examples,model=model)