In [1]:
import chess
import chess.pgn

In [6]:
def read_games_from_file(file_path):
    games = []
    with open(file_path, 'r') as pgn_file:
        while True:
            try:
                game = chess.pgn.read_game(pgn_file)
                if game is None:
                    break
                games.append(game)
            except ValueError as e:
                print(f"Pominięto partię z powodu błędu: {e}")
    return games

def is_checkmate(game):
    board = game.board()
    for move in game.mainline_moves():
        board.push(move)
    return board.is_checkmate()

def filter_checkmate_games(games, pgn_output_path):
    checkmate_games = []
    for cnt, game in enumerate(games):
        try:
            if is_checkmate(game):
                checkmate_games.append(game)
        except Exception as e:
            print(f"Pominięto partię numer {cnt} z powodu błędu: {e}")

    with open(pgn_output_path, "a") as output_file:
        for game in checkmate_games:
            output_file.write(str(game))
            output_file.write("\n\n")

files = [
    "2200.pgn",
    "2400.pgn",
    "2600.pgn"
]
data_path = "../data/raw/"

for file in files:
    games = read_games_from_file(data_path + file)
    filter_checkmate_games(games, data_path + "mates.pgn")

unsupported variant: losers while parsing <Game at 0x1bd0362dcd0 ('joddle' vs. 'PowwerBauer', '2024.07.01' at 'FICS freechess.org')>
unsupported variant: losers while parsing <Game at 0x1bd767a78d0 ('PowwerBauer' vs. 'joddle', '2024.06.01' at 'FICS freechess.org')>
unsupported variant: losers while parsing <Game at 0x1bd03b74610 ('joddle' vs. 'PowwerBauer', '2024.06.01' at 'FICS freechess.org')>
unsupported variant: losers while parsing <Game at 0x1bd76703990 ('PowwerBauer' vs. 'joddle', '2024.06.01' at 'FICS freechess.org')>
unsupported variant: losers while parsing <Game at 0x1bd766dfed0 ('joddle' vs. 'PowwerBauer', '2024.06.01' at 'FICS freechess.org')>
unsupported variant: losers while parsing <Game at 0x1bd72889510 ('adcool' vs. 'PowwerBauer', '2024.04.25' at 'FICS freechess.org')>
unsupported variant: losers while parsing <Game at 0x1bd6eb3fe10 ('PowwerBauer' vs. 'adcool', '2024.04.25' at 'FICS freechess.org')>
unsupported variant: losers while parsing <Game at 0x1bd047a8c50 ('tr

Pominięto partię numer 303 z powodu błędu: unsupported variant: losers
Pominięto partię numer 613 z powodu błędu: unsupported variant: losers
Pominięto partię numer 620 z powodu błędu: unsupported variant: losers
Pominięto partię numer 621 z powodu błędu: unsupported variant: losers
Pominięto partię numer 622 z powodu błędu: unsupported variant: losers
Pominięto partię numer 1759 z powodu błędu: unsupported variant: losers
Pominięto partię numer 1760 z powodu błędu: unsupported variant: losers
Pominięto partię numer 2566 z powodu błędu: unsupported variant: losers
Pominięto partię numer 2567 z powodu błędu: unsupported variant: losers
Pominięto partię numer 3349 z powodu błędu: unsupported variant: losers
Pominięto partię numer 4321 z powodu błędu: unsupported variant: losers
Pominięto partię numer 4322 z powodu błędu: unsupported variant: losers
Pominięto partię numer 4633 z powodu błędu: unsupported variant: losers
Pominięto partię numer 4766 z powodu błędu: unsupported variant: lose

In [14]:
games = read_games_from_file(data_path+"mates.pgn")

for cnt, game in enumerate(games):
    data = []
    board = chess.Board()
    cor = True

    for move in game.mainline_moves():
        if move in list(board.legal_moves):
            line = board.fen()
            line+=","+move.uci()+"\n"
            data.append(line)
            board.push(move)
        else:
            cor = False
            break
    if cor:
        with open(data_path+"fen_data.txt", "a") as f:
            f.writelines(data)
    if cnt % 1000 == 0:
        print(f"{cnt}/{len(games)}")

0/3971
1000/3971
2000/3971
3000/3971


In [15]:
with open(data_path+"fen_data.txt", "r") as f:
    fens = f.readlines()

for cnt, fen in enumerate(fens):
    if "k" not in fen or "K" not in fen:
        print (fen)
        print (cnt)

In [25]:
import json
import csv
prep_tokenized_path = "../data/prep/tokenized.csv"
to_tokenize_path = "../data/raw/fen_data.txt"
vocab_src_path = "../src/nlp/fen_vocab.json"
vocab_tar_path = "../src/nlp/vocab.json"

with open(vocab_tar_path, "r") as f:
    vocab_tar = json.load(f)

with open(vocab_src_path, "r") as f:
    vocab_src = json.load(f)

with open(to_tokenize_path, "r") as f:
    lines = f.readlines()

with open(prep_tokenized_path, "w") as file:
    writer = csv.writer(file)
    writer.writerow(['sequence', 'target'])
    for idx, line in enumerate(lines):
        seq_token = [vocab_src["SOS"]]
        tar_token = [vocab_tar["SOS"]]
        seq, tar = line.split(",")
        seq_elems = seq.split(" ")
        word = ""
        seq_elems[1].replace("w", "wh")
        seq_elems[1].replace("b", "bl")
        seq_elems[2].replace("K", "Ki")
        seq_elems[2].replace("Q", "Qu")
        seq_elems[2].replace("k", "ki")
        seq_elems[2].replace("q", "qu")
        for elem in seq_elems[:4]:
            for char in elem:
                word += char
                if word in vocab_src.keys():
                    seq_token.append(vocab_src[word])
                    word = ""
        seq_token.append(vocab_src["EOS"])
        word = ""
        if len(tar) == 6 and tar[4] == "b":
            tar = tar[:4] + "bi\n"

        for char in tar[:-1]:
            word += char
            if word in vocab_tar.keys():
                tar_token.append(vocab_tar[word])
                word = ""
        if len(tar_token) == 2:
            print("dsad")
        tar_token.append(vocab_tar["EOS"])
        writer.writerow([seq_token, tar_token])

fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
dsad
fdsf
dsad
dsad
dsad
dsad
dsad
dsad
dsad
