## Dataprep

This notebook transforms the raw JSONL file from the [lichess elite database](https://database.lichess.org/#evals) into a number of PyTorch tensors.

Download the file and place it in the data/raw folder.

In [1]:
import json
import chess
import random
import csv

In [None]:


# Source JSONL dump and the common prefix of every CSV file produced from it.
input_file = "../data/raw/lichess_db_eval.jsonl"
output_prefix = "../data/raw/output"

# Row limit per CSV file; json_split starts a new file once it is reached.
lines_per_file = 5859 * 4096

# Index of the CSV file being written (used in the output file name).
file_cnter = 0

def json_split(input_file, output_prefix):
    """Convert the JSONL evaluation dump into shuffled CSV files.

    Every line of ``input_file`` is parsed as one JSON position. For each
    position the deepest evaluation is selected and the score of its first
    principal variation is turned into a single number. The position and its
    colour-mirrored counterpart (with negated side-to-move flag and score) are
    collected, shuffled with a fixed seed, and written as ``fen, turn, score``
    rows to ``{output_prefix}_{n}.csv``. A new file is started every
    ``lines_per_file`` rows.

    Args:
        input_file: Path of the JSONL file to read.
        output_prefix: Path prefix of the CSV files to write.

    Side effects:
        Advances the module-level ``file_cnter`` to the index of the last CSV
        file opened, and prints progress and the number of files created.
    """
    # The file index lives at module level so it is visible after the call;
    # without this declaration the assignment below would make it a local and
    # the first read would raise UnboundLocalError.
    global file_cnter

    all_lines = []

    with open(input_file, 'r') as json_file:
        for idd, line in enumerate(json_file):
            position = json.loads(line)
            fen = position["fen"]
            board = chess.Board(fen)
            # Second FEN field is the side to move: +1 for white, -1 for black.
            turn = 1 if fen.split()[1] == 'w' else -1

            # Use the deepest available evaluation and its first line.
            best_eval = max(position["evals"], key=lambda entry: entry["depth"])
            pv = best_eval["pvs"][0]

            # NOTE: despite its name, win_prob holds a raw score here, not a
            # probability. Reset it for every record so that an entry without
            # a usable score (including "mate": 0) is skipped instead of
            # reusing the value of the previous record.
            win_prob = None
            if "cp" in pv:
                win_prob = pv["cp"]
            elif "mate" in pv:
                mate = pv["mate"]
                # Mates map to huge scores; a shorter mate is further from 0.
                if mate > 0:
                    win_prob = 2000000 - mate
                elif mate < 0:
                    win_prob = -2000000 - mate

            if win_prob is not None:
                all_lines.append([fen, turn, win_prob])

                # Augment with the mirrored position. Only the first FEN field
                # (piece placement) of the mirrored board is stored.
                mirrored_fen = board.mirror().fen().split(" ", 2)[0]
                all_lines.append([mirrored_fen, -turn, -win_prob])

            if idd % 50_000_000 == 0:
                print(idd)
    print(len(all_lines))

    random.seed(42)
    random.shuffle(all_lines)

    line_count = 0
    csv_file = open(f"{output_prefix}_{file_cnter}.csv", 'w', newline='')
    try:
        writer = csv.writer(csv_file)

        for line in all_lines:
            writer.writerow(line)
            line_count += 1

            # Roll over to the next file as soon as the current one is full.
            if line_count >= lines_per_file:
                csv_file.close()
                file_cnter += 1
                line_count = 0
                csv_file = open(f"{output_prefix}_{file_cnter}.csv", 'w', newline='')
                writer = csv.writer(csv_file)
    finally:
        # Close the current file even if writing fails part-way through.
        csv_file.close()

    print(f"files num: {file_cnter+1}")

# Run the conversion: read input_file and write the CSV files under output_prefix.
json_split(input_file, output_prefix)


## Create tensors from csv

In [5]:
import torch
import math

In [6]:
def eval_to_win_prob(centipawns, a=0.002):
    """Map a centipawn score to a value in [0, 1] with a logistic curve.

    Computes ``1 / (1 + exp(-a * centipawns))`` in a numerically stable way,
    so very large magnitudes (such as the mate scores near +/-2,000,000
    written to the CSV files) saturate at 1.0 or 0.0 instead of raising
    ``OverflowError`` from ``math.exp``.

    Args:
        centipawns: Score to convert; 0 maps to 0.5.
        a: Steepness of the logistic curve.

    Returns:
        The logistic value as a float between 0.0 and 1.0.
    """
    z = a * centipawns
    if z >= 0:
        # exp(-z) <= 1 here, so it cannot overflow.
        return 1 / (1 + math.exp(-z))
    # For negative z use the algebraically equal form exp(z) / (1 + exp(z));
    # exp(z) < 1 and simply underflows to 0.0 for very negative z.
    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)


def create_tensor(board: chess.Board):
    """Encode the pieces on ``board`` as a 6x8x8 tensor.

    The first axis is indexed by ``piece_type - 1``. Within each 8x8 plane the
    row is ``7 - rank`` and the column is the file index. A white piece is
    stored as 1, a black piece as -1, and an empty square stays 0.
    """
    planes = torch.zeros((6, 8, 8))

    for file_idx in range(8):
        for rank_idx in range(8):
            piece = board.piece_at(chess.square(file_idx, rank_idx))
            if piece is None:
                continue

            sign = 1 if piece.color == chess.WHITE else -1
            # Flip the rank so that rank index 7 lands in row 0.
            planes[piece.piece_type - 1, 7 - rank_idx, file_idx] = sign

    return planes

In [None]:
data_path = "../data/raw/"

# Turn each CSV file into a feature tensor (X_{num}.pt) and a label tensor
# (y_{num}.pt).
# NOTE(review): range(file_cnter) stops before index file_cnter; if file_cnter
# holds the index of the last CSV file, that file is not converted - confirm
# this is intended.
for num in range(file_cnter):
    feature_rows = []
    targets = []

    with open(data_path+f"output_{num}.csv", "r") as f:
        csvFile = csv.reader(f)
        for idx, line in enumerate(csvFile):
            fen, turn, result = line

            # 6*8*8 board values followed by the side-to-move flag.
            tensor = create_tensor(chess.Board(fen)).view(6*8*8)
            tensor = torch.cat([tensor, torch.tensor([int(turn)])])
            feature_rows.append(tensor)

            label = eval_to_win_prob(float(result))
            targets.append(label)

            if idx % 10_000_000 == 0:
                print(f"{idx}")

    # Save and release the labels before stacking the features, so both are
    # not held as tensors at the same time.
    y = torch.tensor(targets)
    torch.save(y, f"../data/prep/y_{num}.pt")
    del y, targets

    x = torch.stack(feature_rows)
    torch.save(x, f"../data/prep/X_{num}.pt")
    del x, feature_rows