In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the input directory, one per line.
for folder, _subfolders, files_here in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files_here):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/thesis-chess-dataset/fourthSorted.pgn


In [2]:
import os

# Splits the PGN file into parts, each containing up to 'split_size' games from a specific year, and saves them to the output folder
def split_and_filter_games_by_year(pgn_file_path, year, split_size, output_folder):
    """Copy the games of one year from a PGN file into size-limited part files.

    A game is every line from one ``[Event `` header up to (but excluding) the
    next one. Its year is the text before the first ``.`` inside the quoted
    value of its first ``[Date `` header. Games whose year equals ``year`` are
    appended, unchanged, to ``games_<year>_part_<n>.pgn`` inside
    ``output_folder``; a new part is started whenever the current one already
    holds ``split_size`` games. Games without a usable ``[Date `` header, and
    any text preceding the first ``[Event `` line, are skipped.

    Args:
        pgn_file_path: Path of the PGN file to read.
        year: Year to keep, e.g. ``"2020"``. Converted with ``str()`` so an
            integer is accepted as well.
        split_size: Maximum number of games per part file.
        output_folder: Directory for the part files; created if missing.

    Returns:
        None. Part 1 is always created, even when no game matches.
    """
    os.makedirs(output_folder, exist_ok=True)
    year = str(year)

    part_number = 1
    games_in_part = 0
    part_file = open(_part_path(output_folder, year, part_number), "w")
    try:
        with open(pgn_file_path, "r") as pgn:
            # The trailing game goes through the same path as every other
            # game, so the split_size limit applies to it too.
            for game_lines in _iter_pgn_games(pgn):
                if _extract_game_year(game_lines) != year:
                    continue
                if games_in_part >= split_size:
                    # Current part is full: roll over to the next file.
                    part_file.close()
                    part_number += 1
                    part_file = open(_part_path(output_folder, year, part_number), "w")
                    games_in_part = 0
                part_file.writelines(game_lines)
                games_in_part += 1
    finally:
        # Close the active part even if reading or writing failed.
        part_file.close()


def _part_path(output_folder, year, part_number):
    """Build the path of the numbered part file for the given year."""
    return os.path.join(output_folder, f"games_{year}_part_{part_number}.pgn")


def _iter_pgn_games(lines):
    """Yield lists of lines, starting a new list at each '[Event ' header."""
    game_lines = []
    for line in lines:
        if line.startswith('[Event ') and game_lines:
            yield game_lines
            game_lines = []
        game_lines.append(line)
    if game_lines:
        yield game_lines


def _extract_game_year(game_lines):
    """Return the year from the first '[Date ' header, or None if unavailable."""
    for header in game_lines:
        if header.startswith('[Date '):
            quoted = header.split('"')
            if len(quoted) < 2:
                # Header present but without a quoted value.
                return None
            return quoted[1].split('.')[0]
    return None

# Location of the source PGN file inside the Kaggle input dataset
pgn_file_path = "/kaggle/input/thesis-chess-dataset/fourthSorted.pgn"
# Only games whose Date header falls in this year are kept
year = "2020"
# Number of games after which a new output part file is started
split_size = 10000
# Folder (relative to the current working directory) that receives the part files
output_folder = "pgn_2020_data"

# Run the filter-and-split pass over the dataset
split_and_filter_games_by_year(
    pgn_file_path=pgn_file_path,
    year=year,
    split_size=split_size,
    output_folder=output_folder,
)

In [3]:
pip install converter pgn2data

Collecting converter
  Downloading converter-1.0.0.zip (1.3 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting pgn2data
  Downloading pgn2data-0.0.9-py3-none-any.whl.metadata (9.8 kB)
Collecting chess (from pgn2data)
  Downloading chess-1.10.0-py3-none-any.whl.metadata (19 kB)
Downloading pgn2data-0.0.9-py3-none-any.whl (31 kB)
Downloading chess-1.10.0-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: converter
  Building wheel for converter (setup.py) ... [?25l- \ done
[?25h  Created wheel for converter: filename=converter-1.0.0-py3-none-any.whl size=1751 sha256=157479e0904ef01c83efbd9e8d0b8b8cfa9678310d92e88bd316037b0822e83e
  Stored in directory: /root/.cache/pip/wheels/fd/74/44/8a22cacdcff0e3a951e569210219b458168eba1e34f03e24f3
Successfully built converter
Installing collected packages: converter, che

In [4]:
from converter.pgn_data import PGNData
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

# Convert one PGN file into CSV output (a game-info file and a moves file)
def process_pgn_file(pgn_file_path):
    """Export one PGN file through ``PGNData`` and report the outcome as text.

    Builds a ``PGNData`` for the path, calls its ``export()`` and prints the
    summary of the returned result object.

    Args:
        pgn_file_path: Path of the PGN file to convert.

    Returns:
        A status string: a success message, or an error message containing
        the exception text. Any ``Exception`` raised while exporting or
        printing the summary is caught here instead of propagating, so one
        bad file does not abort the caller's loop.
    """
    try:
        export_result = PGNData(pgn_file_path).export()
        export_result.print_summary()  # Report what the export produced
    except Exception as exc:
        return f"Error processing {pgn_file_path}: {exc}"
    return f"Processing {pgn_file_path} completed successfully."

# Process a list of PGN files in parallel
def process_pgn_files_in_parallel(files, max_workers=15):
    """Convert several PGN files concurrently and print each outcome.

    Every path in ``files`` is submitted to a thread pool as a call to
    ``process_pgn_file``; the string each call returns is printed as soon as
    that call finishes, so the print order need not match the order of
    ``files``.

    Args:
        files: Iterable of PGN file paths.
        max_workers: Upper bound on the number of pool threads. Defaults to
            15, the value that was previously hard-coded.

    Returns:
        None.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # A plain list is enough: nothing looks a future's path up again.
        pending = [executor.submit(process_pgn_file, path) for path in files]
        for finished in as_completed(pending):
            print(finished.result())

# Directory holding the part files to convert
pgn_directory = "/kaggle/working/pgn_2020_data"

# Collect the full path of every *.pgn entry in that directory
pgn_files = []
for entry in os.listdir(pgn_directory):
    if entry.endswith('.pgn'):
        pgn_files.append(os.path.join(pgn_directory, entry))

# Hand the files to the worker pool twelve at a time; each batch finishes
# before the next one is started
batch_size = 12
for batch_start in range(0, len(pgn_files), batch_size):
    batch_end = batch_start + batch_size
    batch_files = pgn_files[batch_start:batch_end]
    process_pgn_files_in_parallel(batch_files)

is complete: True
games file: games_2020_part_35_game_info.csv | size: 495690
moves file: games_2020_part_35_moves.csv | size: 74953501
Processing /kaggle/working/pgn_2020_data/games_2020_part_35.pgn completed successfully.
is complete: True
games file: games_2020_part_6_game_info.csv | size: 2543207
moves file: games_2020_part_6_moves.csv | size: 343963475
Processing /kaggle/working/pgn_2020_data/games_2020_part_6.pgn completed successfully.
is complete: True
games file: games_2020_part_5_game_info.csv | size: 2501835
moves file: games_2020_part_5_moves.csv | size: 352231903
Processing /kaggle/working/pgn_2020_data/games_2020_part_5.pgn completed successfully.
is complete: True
games file: games_2020_part_27_game_info.csv | size: 2583113
moves file: games_2020_part_27_moves.csv | size: 366574046
Processing /kaggle/working/pgn_2020_data/games_2020_part_27.pgn completed successfully.
is complete: True
games file: games_2020_part_7_game_info.csv | size: 2584421
moves file: games_2020_par