In [16]:
import chess
import chess.pgn

In [17]:
# Promotion token appended to a move, keyed by the integer value of
# `move.promotion` (looked up as prom_types[int(move.promotion)]).
# NOTE(review): keys presumably match python-chess piece types
# (5=queen, 4=rook, 3=bishop, 2=knight) -- confirm against the library.
prom_types = dict(
    [
        (5, "=Q"),
        (4, "=R"),
        (3, "=B"),
        (2, "=N"),
    ]
)

def read_games_from_file(file_path, encoding="utf-8"):
    """Parse every game stored in a PGN file.

    Args:
        file_path: Path of the PGN file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list holding each object returned by ``chess.pgn.read_game``, in
        file order. The list is empty when the file contains no games.
    """
    with open(file_path, "r", encoding=encoding) as pgn_file:
        # read_game yields None once the stream is exhausted; that value is
        # the sentinel that ends the iteration.
        return list(iter(lambda: chess.pgn.read_game(pgn_file), None))

path_prep_pad = "../data/transformed/full_to_cut.txt"
path_prep = "../data/transformed/full_to_tokenize.txt"
path = "../data/raw/full.pgn"
games = read_games_from_file(path)

# Emit one training line per move: "<history>-<SOS> <move tokens>\n", where
# <history> is "<SOS>" followed by the tokens of every earlier move in the game.
# The output is opened once, in write mode, so re-running this cell regenerates
# the file instead of appending a second copy of every line to it.
with open(path_prep_pad, "w") as f:
    for cnt, game in enumerate(games):
        # Replay from the game's own starting position so games that carry a
        # FEN/SetUp header are not replayed from the standard initial board.
        board = game.board()
        lines = []
        line = "<SOS>"
        for move in game.mainline_moves():
            uci = move.uci()
            assert len(uci) >= 4
            # Source and destination squares become two separate tokens.
            temp_move = uci[:2] + " " + uci[2:4]
            if move.promotion is not None:
                temp_move += " " + prom_types[int(move.promotion)]
            elif board.is_castling(move):
                temp_move += " <CAS>"
            elif board.is_en_passant(move):
                temp_move += " <ENP>"
            temp_move += " <EOM>"
            # The castling / en-passant checks above are made against the
            # position before the move, so the move is pushed only afterwards.
            board.push(move)
            lines.append(line + "-<SOS> " + temp_move + "\n")
            line += " " + temp_move
        f.writelines(lines)
        if cnt % 1000 == 0:
            print(f"{cnt}/{len(games)}")


0/5000
1000/5000
2000/5000
3000/5000
4000/5000


In [19]:
lines = []
with open(path_prep_pad, "r") as f:
    lines = f.readlines()

# First pass: longest input (x) and target (y), measured in space-separated
# tokens. max_y is the length every kept target is padded to below.
max_x = 0
max_y = 0
for line in lines:
    x, y = line.rstrip("\n").split("-")
    max_x = max(len(x.split(" ")), max_x)
    max_y = max(len(y.split(" ")), max_y)

new_lines = []
# Inputs longer than this many tokens are dropped; shorter ones are padded to it.
seq_accepted_len = 200

# Second pass: drop over-long inputs and right-pad the rest with <PAD>.
for idx, line in enumerate(lines):
    # Report progress before filtering so a milestone is not silently skipped
    # when the line at that index happens to be dropped.
    if idx % 100000 == 0:
        print(f"{idx}/{len(lines)}")
    # Strip only the line terminator; slicing off the last character would
    # corrupt the final token of a line that has no trailing newline.
    x, y = line.rstrip("\n").split("-")
    xs = x.split(" ")
    ys = y.split(" ")
    to_add = seq_accepted_len - len(xs)
    if to_add < 0:
        continue
    xs.extend(["<PAD>"] * to_add)
    ys.extend(["<PAD>"] * (max_y - len(ys)))
    new_lines.append(" ".join(xs) + "-" + " ".join(ys) + "\n")

with open(path_prep, "w") as f:
    f.writelines(new_lines)

0/455060
100000/455060
300000/455060
400000/455060


In [22]:
prep_tokenized_path = "../data/prep/tokenized.csv"
to_tokenize_path = "../data/transformed/full_to_tokenize.txt"
vocab_path = "../src/nlp/vocab.json"

import json
import csv

# Token -> value mapping loaded from JSON; each token of every line is looked
# up in it, so an unknown token raises KeyError.
with open(vocab_path, "r") as f:
    vocab = json.load(f)

with open(to_tokenize_path, "r") as f:
    lines = f.readlines()

# newline="" lets the csv module control row terminators itself; without it,
# platforms that translate "\n" on write end up with "\r\r\n" row endings.
with open(prep_tokenized_path, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(['sequence', 'target'])
    for idx, line in enumerate(lines):
        # Strip only the line terminator so a final line without a trailing
        # newline keeps its last character.
        xs, ys = line.rstrip("\n").split('-')
        seq = [vocab[word] for word in xs.split(" ")]
        targets = [vocab[word] for word in ys.split(" ")]
        # Each column stores the Python list repr of the looked-up values.
        writer.writerow([str(seq), str(targets)])
        if idx % 30000 == 0:
            print(f"{idx}/{len(lines)}")

0/313825
30000/313825
60000/313825
90000/313825
120000/313825
150000/313825
180000/313825
210000/313825
240000/313825
270000/313825
300000/313825
