In [12]:
import chess
import chess.pgn
import os

In [13]:
def read_games_from_file(file_path):
    """Parse every game found in a PGN file and return them as a list.

    Games are read one after another with ``chess.pgn.read_game`` until it
    returns ``None``. A ``ValueError`` raised while reading is reported on
    stdout and reading carries on with the next call.

    Args:
        file_path: Path of the PGN file to open in text mode.

    Returns:
        A list with the parsed game objects, in file order.
    """
    collected = []
    with open(file_path, 'r') as handle:
        reached_end = False
        while not reached_end:
            try:
                parsed = chess.pgn.read_game(handle)
            except ValueError as e:
                # Best effort: report the problem and try the next read.
                print(f"Pominięto partię z powodu błędu: {e}")
                continue
            if parsed is None:
                reached_end = True
            else:
                collected.append(parsed)
    return collected

def is_checkmate(game):
    """Tell whether the final mainline position of ``game`` is checkmate.

    The mainline moves are replayed on the board returned by ``game.board()``
    and the resulting position is queried.

    Args:
        game: Object exposing ``board()`` and ``mainline_moves()``.

    Returns:
        The value of ``is_checkmate()`` on the position after the last move.
    """
    position = game.board()
    for ply in game.mainline_moves():
        position.push(ply)
    final_is_mate = position.is_checkmate()
    return final_is_mate

def filter_checkmate_games(games, pgn_output_path):
    """Append every game that ends in checkmate to a PGN file.

    Each game is checked with ``is_checkmate``; a game whose check raises is
    reported on stdout (with its position in ``games``) and left out. The
    selected games are then appended to ``pgn_output_path`` as text, each one
    followed by a blank-line separator.

    Args:
        games: Iterable of game objects.
        pgn_output_path: File that receives the games; opened in append mode,
            so existing content is kept.
    """
    mates = []
    for position_in_input, candidate in enumerate(games):
        try:
            ended_in_mate = is_checkmate(candidate)
        except Exception as e:
            print(f"Pominięto partię numer {position_in_input} z powodu błędu: {e}")
        else:
            if ended_in_mate:
                mates.append(candidate)

    with open(pgn_output_path, "a") as sink:
        for mate in mates:
            sink.write(str(mate) + "\n\n")


# Directory with the raw PGN files; the files generated by this notebook
# (mates.pgn, fen_full_data.txt) are written into the same directory.
data_path = "../data/raw/"

# Show what is currently present in the raw-data directory.
files = os.listdir(data_path)
print(files)

# Disabled extraction step: reads each listed file and appends its
# checkmate games to mates.pgn.
# NOTE(review): `files` is the unfiltered directory listing, which (per the
# printed output) also contains 'mates.pgn' and 'fen_full_data.txt'. If this
# loop is re-enabled as is, it would read those generated files as inputs too;
# consider restricting it to the yearly *.pgn inputs first.
# for file in files:
#     games = read_games_from_file(data_path + file)
#     print(file)
#     filter_checkmate_games(games, data_path + "mates.pgn")

['2001-1.pgn', '2024-1.pgn', '2013-2.pgn', '2013-1.pgn', '2016-1.pgn', '2003-056.pgn', '2005-2.pgn', '2003-34.pgn', '2000-1.pgn', '2002-1.pgn', '2022-2.pgn', '2004-2.pgn', '2015-2.pgn', '2015-1.pgn', '2023-1.pgn', '2012-1.pgn', '2019-1.pgn', '2003-1.pgn', '2001-2.pgn', '2007-2.pgn', '2000-2.pgn', '2009-2.pgn', '2012-2.pgn', '2011-2.pgn', '2020-1.pgn', '2022-1.pgn', '2003-2.pgn', '2019-2.pgn', '2011-1.pgn', '2008-1.pgn', '2007-1.pgn', '2006-1.pgn', '2002-2.pgn', '2018-1.pgn', '2006-2.pgn', 'fen_full_data.txt', '2005-1.pgn', '2017-1.pgn', '2004-1.pgn', '2009-1.pgn', '2014-2.pgn', '2021-1.pgn', '2021-2.pgn', '2023-2.pgn', '2010-2.pgn', '2020-2.pgn', 'mates.pgn', '2016-2.pgn', '2014-1.pgn', '2017-2.pgn', '2018-2.pgn', '2010-1.pgn', '2008-2.pgn']


In [18]:
# Turn every game of mates.pgn into training rows of the form
# "<piece placement>,<side to move>,<castling rights>,<move in UCI>\n",
# one row per position/move pair, and append them to fen_full_data.txt.
data_path = "../data/raw/"
games = read_games_from_file(data_path+"mates.pgn")
invalid_cnt = 0  # games abandoned because a mainline move was illegal on the replayed board

# The output file is opened once for the whole run instead of once per game.
# NOTE: it is opened in append mode, so re-running this cell adds the same
# rows to the file again.
with open(data_path+"fen_full_data.txt", "a") as f:
    for cnt, game in enumerate(games):
        data = []
        # NOTE(review): the replay always starts from the standard initial
        # position rather than game.board(); a game with a custom starting
        # position will most likely be rejected as invalid below. Confirm
        # that this is intended.
        board = chess.Board()
        cor = True    # stays True while every replayed move was legal
        fin = False   # becomes True when an en passant capture is met

        for move in game.mainline_moves():
            # Ask the board directly instead of materialising the complete
            # list of legal moves on every ply.
            if not board.is_legal(move):
                invalid_cnt += 1
                cor = False
                break
            # A game containing an en passant capture is discarded as a whole
            # (presumably because the rows do not carry the en passant square).
            if board.is_en_passant(move):
                fin = True
                break
            # Keep only the placement and castling fields of the FEN.
            fen, _, cas_rights, *oth = board.fen().split(" ")
            turn = "True" if board.turn else "False"
            data.append(fen+","+turn+","+cas_rights+","+move.uci()+"\n")
            board.push(move)

        # Rows are written only for games replayed to the end without issues.
        if cor and not fin:
            f.writelines(data)
        if cnt % 1000 == 0:
            print(f"{cnt}/{len(games)}")
print(f"invalid games: {invalid_cnt}")


0/51366
1000/51366
2000/51366
3000/51366
4000/51366
5000/51366
6000/51366
7000/51366
8000/51366
9000/51366
10000/51366
11000/51366
12000/51366
13000/51366
14000/51366
15000/51366
16000/51366
17000/51366
18000/51366
19000/51366
20000/51366
21000/51366
22000/51366
23000/51366
24000/51366
25000/51366
26000/51366
27000/51366
28000/51366
29000/51366
30000/51366
31000/51366
32000/51366
33000/51366
34000/51366
35000/51366
36000/51366
37000/51366
38000/51366
39000/51366
40000/51366
41000/51366
42000/51366
43000/51366
44000/51366
45000/51366
46000/51366
47000/51366
48000/51366
49000/51366
50000/51366
51000/51366
invalid games: 242


In [15]:
# Sanity check of the generated rows: report every row whose position lacks
# a white ("K") or a black ("k") king, together with its row index.
with open(data_path+"fen_full_data.txt", "r") as f:
    fens = f.readlines()

for cnt, fen in enumerate(fens):
    # Each row is "<placement>,<turn>,<castling>,<move>". Only the placement
    # field may be inspected: the castling field can itself contain "K"/"k"
    # and would otherwise hide a position without a king.
    placement = fen.split(",", 1)[0]
    if "k" not in placement or "K" not in placement:
        print(fen)
        print(cnt)

In [19]:
import json
import csv
prep_tokenized_path = "../data/prep/tokenized_full_ones.csv"
to_tokenize_path = "../data/raw/fen_full_data.txt"
vocab_src_path = "../src/lstm_full_fen/src_vocab_full.json"
vocab_tar_path = "../src/lstm_full_fen/tar_vocab_full.json"

# Castling letters are looked up in the source vocabulary under two-character
# keys (presumably to keep them apart from the piece letters of the placement
# field); any other character, e.g. "-", is looked up unchanged.
_CASTLING_TOKENS = {"K": "Ki", "Q": "Qu", "k": "ki", "q": "qu"}


def _encode_source(fen, turn, cas_rights, vocab):
    """Return SOS + placement characters + turn + castling tokens + EOS as vocabulary ids."""
    tokens = [vocab["SOS"]]
    tokens.extend(vocab[ch] for ch in fen)
    tokens.append(vocab[turn])
    tokens.extend(vocab[_CASTLING_TOKENS.get(ch, ch)] for ch in cas_rights)
    tokens.append(vocab["EOS"])
    return tokens


def _encode_target(move, vocab):
    """Return SOS + the UCI move characters (+ promotion piece) + EOS as vocabulary ids."""
    # Unpacking fails loudly (ValueError) on a move shorter than four characters.
    from_file, from_rank, to_file, to_rank = move[:4]
    tokens = [vocab["SOS"], vocab[from_file], vocab[from_rank],
              vocab[to_file], vocab[to_rank]]
    if len(move) == 5:
        promotion = move[4]
        # A bishop promotion is stored under the key "bi" in the target vocabulary.
        tokens.append(vocab["bi" if promotion == "b" else promotion])
    tokens.append(vocab["EOS"])
    return tokens


with open(vocab_tar_path, "r") as f:
    vocab_tar = json.load(f)

with open(vocab_src_path, "r") as f:
    vocab_src = json.load(f)

with open(to_tokenize_path, "r") as f:
    lines = f.readlines()

# newline="" is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate "\n" end up with "\r\r\n" row endings.
with open(prep_tokenized_path, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(['sequence', 'target'])
    for line in lines:
        # Strip only the line terminator, so a final row without a trailing
        # newline keeps the last character of its move.
        line = line.rstrip("\n")
        if not line:
            continue  # ignore blank lines
        fen, turn, cas_rights, move = line.split(",")
        seq_token = _encode_source(fen, turn, cas_rights, vocab_src)
        tar_token = _encode_target(move, vocab_tar)
        writer.writerow([seq_token, tar_token])