In [1]:
import json
import os
#%load_ext cython

# All dataset files live in this directory (relative to the notebook's working directory).
# Paths are built with os.path.join so they resolve on Windows and on POSIX systems alike;
# on Windows the resulting strings are identical to the previous hard-coded "data\\..." values.
DATA_DIR = "data"

PATH_DATASET_JSONL = os.path.join(DATA_DIR, "lichess_db_eval.jsonl") # this is huge
PATH_DATASET_JSONL_REDUCED = os.path.join(DATA_DIR, "lichess_db_eval_only_fen_and_deepest_best_line.jsonl") # only the FEN and the best line with its eval
PATH_DATASET_JSONL_REDUCED_WIN_PROB = os.path.join(DATA_DIR, "lichess_db_eval_fen_and_win_probability.jsonl") # only the FEN and the win probability
PATH_DATASET_JSONL_FEN_WIN_LINE = os.path.join(DATA_DIR, "lichess_db_eval_fen_win_line.jsonl") # only the FEN, win probability and the best line
PATH_DIR_DATASET_JSONL_REDUCED_WIN_PROB_SPLIT = os.path.join(DATA_DIR, "lichess_db_eval_fen_and_win_probability_split") # split the win probability dataset into separate files

In [2]:
"""
Each dataset element has a chess FEN position string (fen) and a list of evaluations (evals).
Each eval has 3 fields: pvs, knodes, depth.
Each pvs (principal variations) can have more than 1 variation. Each variation has 2 fields: an evaluation, cp (centipawns) or mate, and line (the moves as a single space-separated string).
"""

# Read the first JSON element of the raw dataset and print it as an example.
# The context manager closes the (huge) file as soon as the first line has been read.
with open(PATH_DATASET_JSONL, "r") as file:
    line = file.readline()
element = json.loads(line)

fen = element["fen"]
evals = element["evals"]

print(fen)
print("len(evals):", len(evals))
# The loop variable is deliberately not called `eval`: that name would shadow the
# builtin eval() for every cell executed afterwards in the same kernel.
for evaluation in evals:
    print(evaluation)


rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq -
len(evals): 4
{'pvs': [{'cp': 21, 'line': 'd2d4 g8f6 c2c4 e7e6 g1f3 d7d5 g2g3 f8b4 c1d2 b4e7'}, {'cp': 20, 'line': 'e2e4 e7e5 g1f3 b8c6 f1c4 g8f6 d2d3 f8c5 c2c3 d7d6'}], 'knodes': 68894779, 'depth': 63}
{'pvs': [{'cp': 22, 'line': 'e2e4 e7e5 g1f3 b8c6 f1b5 g8f6 e1h1 f6e4 f1e1 e4d6'}, {'cp': 17, 'line': 'g1f3 d7d5 d2d4 e7e6 c2c4 g8f6 b1c3 f8b4 c4d5 e6d5'}, {'cp': 16, 'line': 'c2c4 e7e5 g2g3 g8f6 f1g2 f8c5 d2d3 e8h8 b1c3 f8e8'}, {'cp': 14, 'line': 'd2d4 g8f6 c2c4 e7e6 g2g3 d7d5 f1g2 f8b4 c1d2 b4e7'}, {'cp': 13, 'line': 'g2g3 c7c5 c2c4 g7g6 b1c3 b8c6 f1g2 f8g7 d2d3 d7d6'}], 'knodes': 97823307, 'depth': 60}
{'pvs': [{'cp': 34, 'line': 'e2e4 e7e5 g1f3 b8c6 f1b5 g8f6 e1h1 f6e4 f1e1 e4d6'}, {'cp': 33, 'line': 'g1f3 g8f6 c2c4 e7e6 d2d4 d7d5 g2g3 f8b4 c1d2 b4e7'}, {'cp': 28, 'line': 'c2c4 g8f6 g1f3 e7e6 b1c3 d7d5 d2d4 f8b4 d1a4 b8c6'}, {'cp': 24, 'line': 'd2d4 g8f6 c2c4 e7e6 g1f3 d7d5 b1c3 b8d7 c1f4 f8b4'}, {'cp': 21, 'line': 'g2g3 d7d5 g1f3 

## Extracting only the best line and its evaluation to create a simplified dataset

In [None]:
def generate_dataset_only_best_depth(path_dataset:str, path_output:str, verbose=False) -> None:
    """
    Generate a reduced JSONL dataset keeping, for each position, only its deepest evaluation.

    Every input line must be a JSON object with a "fen" string and a non-empty "evals" list,
    where each eval has the fields "pvs", "knodes" and "depth". The eval with the greatest
    "depth" is selected explicitly (the first one wins on ties) instead of relying on the
    ordering of the list, and its first principal variation is used as the best line
    (assumed to be the best variation of that eval).

    Each output line (element) has 6 fields: fen, cp, mate, depth, knodes, line.
    "cp" or "mate" is null when the selected principal variation does not provide it.

    Lines that cannot be processed (invalid JSON, missing keys, empty lists, no cp/mate)
    are reported on stdout and skipped, so the output may have fewer lines than the input.

    Args:
        path_dataset: Path of the input JSONL file.
        path_output: Path of the JSONL file to create (overwritten if it exists).
        verbose: When True, print the number of positions written at the end.
    """
    dataset_positions_count = 0
    with open(path_dataset, 'r', encoding='utf-8') as infile, open(path_output, 'w', encoding='utf-8') as outfile:
        for idx, line in enumerate(infile):
            try:
                data = json.loads(line)
                fen = data['fen']

                # Pick the deepest evaluation explicitly; max() returns the first
                # maximal element, so an already depth-sorted list behaves as before.
                deepest_eval = max(data['evals'], key=lambda evaluation: evaluation['depth'])
                pv = deepest_eval['pvs'][0]

                # A variation carries either a mate distance or a centipawn score
                best_mate = pv.get('mate')
                best_cp = pv.get('cp')

                # If there is no mate nor centipawns something is wrong
                if best_mate is None and best_cp is None:
                    raise ValueError('No mate or centipawns keys in the best eval')

                # Prepare the output data
                output_data = {
                    "fen": fen,
                    "cp": best_cp,
                    "mate": best_mate,
                    "depth": deepest_eval['depth'],
                    "knodes": deepest_eval['knodes'],
                    "line": pv['line']
                }

            # Only malformed-data errors are skipped (json.JSONDecodeError is a ValueError);
            # I/O failures while writing are not swallowed.
            except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
                print(f"Error in line {idx}: {e}")
                continue

            # Write the output data as a JSON line
            outfile.write(json.dumps(output_data) + '\n')
            dataset_positions_count += 1

        if verbose:
            print("Successfully generated the output dataset with", dataset_positions_count, "lines / positions.")

In [None]:
%%time
# Build the reduced dataset (one best line per position) from the raw evaluation file
generate_dataset_only_best_depth(
    path_dataset=PATH_DATASET_JSONL,
    path_output=PATH_DATASET_JSONL_REDUCED,
    verbose=True,
)

## Plot the function used to map cp to win probability

In [None]:
from torre.utils import map_mate_or_cp_to_win_probability
import matplotlib.pyplot as plt

# Plot the mathematical function: evaluate it on every integer centipawn value in [-1000, 1000]
cp = range(-1000, 1001)
win = [map_mate_or_cp_to_win_probability(mate=None, cp=centipawns) for centipawns in cp]

# Explicit figure/axes interface instead of the implicit pyplot state machine
fig, ax = plt.subplots()
ax.plot(cp, win)
ax.set_xlabel('Centipawns')
ax.set_ylabel('Win probability')
ax.set_title('Centipawns to win probability')
ax.grid()
plt.show()

## Generate a JSONL dataset with FEN and win probability

In [None]:
def create_dataset_with_fen_and_win_probability(path_dataset:str, path_output:str, verbose=False) -> None:
    """
    Convert a reduced JSONL dataset into one that stores only the FEN and its win probability.

    Every input line is expected to be a JSON object with the keys "fen", "cp" and "mate".
    The mate / cp pair is passed to map_mate_or_cp_to_win_probability and the result is
    written as one JSON object per line with 2 fields: fen, win_probability.

    A line whose processing raises any exception is skipped (and reported on stdout only
    when verbose is True), so the output can contain fewer lines than the input.

    Args:
        path_dataset: Path of the input JSONL file.
        path_output: Path of the JSONL file to create (overwritten if it exists).
        verbose: When True, print per-line errors and a final summary.
    """
    written = 0
    with open(path_dataset, 'r') as source, open(path_output, 'w') as target:
        for line_number, raw_line in enumerate(source):
            try:
                record = json.loads(raw_line)
                position, centipawns, mate_in = record['fen'], record['cp'], record['mate']

                # Turn the mate or centipawns evaluation into a win probability
                row = {
                    "fen": position,
                    "win_probability": map_mate_or_cp_to_win_probability(mate_in, centipawns)
                }

                # One JSON object per output line
                target.write(f"{json.dumps(row)}\n")
                written += 1

            except Exception as error:
                # Best effort: a failing line is dropped and the loop moves on
                if verbose:
                    print(f"Error in line {line_number}: {error}")

        if verbose:
            print("Successfully generated the output dataset with", written, "lines / positions.")

In [None]:
%%time
# Derive the FEN / win-probability JSONL file from the reduced dataset
create_dataset_with_fen_and_win_probability(
    path_dataset=PATH_DATASET_JSONL_REDUCED,
    path_output=PATH_DATASET_JSONL_REDUCED_WIN_PROB,
    verbose=True,
)

## Split the new dataset into many files to reduce the single file size

In [None]:
def _write_jsonl_chunk(output_directory_path:str, file_index:int, lines:list, verbose:bool) -> None:
    """Write one chunk of lines to the file "<file_index>.jsonl" inside the output directory."""
    output_path = os.path.join(output_directory_path, f"{file_index}.jsonl")

    if verbose:
        print(f"Writing file {output_path} with {len(lines)} lines.")

    with open(output_path, 'w') as outfile:
        outfile.writelines(lines)


def split_jsonl_dataset(input_json_path:str, output_directory_path:str, max_lines_per_file:int=100_000, verbose=False) -> None:
    """
    Split a JSONL file into multiple JSONL files with a maximum number of lines per file.

    The resulting files are saved inside the output directory provided, which is created
    (including missing parents) when it does not exist. They are named by chunk index:
    "0.jsonl", "1.jsonl", ... in input order.

    The input is streamed, so at most max_lines_per_file lines are held in memory at once.
    Only non-empty chunks are written: an input whose line count is an exact multiple of
    max_lines_per_file produces no trailing empty file, and an empty input produces no files.

    Args:
        input_json_path: Path of the JSONL file to split.
        output_directory_path: Directory that receives the chunk files.
        max_lines_per_file: Maximum number of lines per chunk file; must be >= 1.
        verbose: When True, print one message per file written.

    Raises:
        ValueError: If max_lines_per_file is smaller than 1.
    """
    if max_lines_per_file < 1:
        raise ValueError("max_lines_per_file must be a positive integer")

    os.makedirs(output_directory_path, exist_ok=True)

    file_index = 0
    chunk = []
    with open(input_json_path, 'r') as infile:
        for line in infile:
            chunk.append(line)
            # Flush as soon as the chunk is full to keep memory usage bounded
            if len(chunk) == max_lines_per_file:
                _write_jsonl_chunk(output_directory_path, file_index, chunk, verbose)
                file_index += 1
                chunk = []

    # Remaining lines that did not fill a whole chunk
    if chunk:
        _write_jsonl_chunk(output_directory_path, file_index, chunk, verbose)

In [None]:
%%time
# Break the win-probability JSONL file into smaller chunk files inside the split directory
split_jsonl_dataset(
    input_json_path=PATH_DATASET_JSONL_REDUCED_WIN_PROB,
    output_directory_path=PATH_DIR_DATASET_JSONL_REDUCED_WIN_PROB_SPLIT,
)