In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_29_game_info.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_10_game_info.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_20_game_info.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_33_moves.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_30_game_info.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_28_game_info.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_18_game_info.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_7_moves.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_21_game_info.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_4_moves.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_34_moves.csv
/kaggle/input/2020-chess-thesis-1-split-pgn-files/games_2020_part_35_game_info.csv
/kaggle/input/2020-che

In [2]:
# Define the piece values for middlegame for non-pawn pieces (N, B, R, Q)
piece_values_mg = [781, 825, 1276, 2538]

# Function to calculate non-pawn material score from a FEN string
def calculate_non_pawn_material_score(fen):
    """Return the combined non-pawn material of both sides for a position.

    Only the piece-placement field (the text before the first whitespace) is
    inspected. The other FEN fields reuse piece letters for unrelated
    purposes -- ``b`` marks black to move and appears in en-passant squares
    on the b-file, ``Q``/``q`` appear in the castling rights -- so counting
    over the whole record would inflate the bishop and queen totals.

    Parameters
    ----------
    fen : str
        A full FEN record, or just its piece-placement field.

    Returns
    -------
    int
        Sum over N, B, R and Q (upper- and lower-case) of
        ``count * piece_values_mg[i]``; 0 for an empty or whitespace-only
        string.
    """
    fields = fen.split()
    # An empty/blank string has no fields at all; treat it as an empty board.
    placement = fields[0] if fields else ""
    return sum(
        (placement.count(piece) + placement.count(piece.lower())) * value
        for piece, value in zip("NBRQ", piece_values_mg)
    )

In [3]:
# Define the path to the directory containing the files
source_dir = "/kaggle/input/2020-chess-thesis-1-split-pgn-files/"
target_dir = "/kaggle/working/moves/"

# Bounds on the non-pawn material score for a position to be kept:
# the lower bound is exclusive, the upper bound is inclusive.
# NOTE(review): these look like engine endgame/middlegame limits -- confirm the source.
MIN_NON_PAWN_MATERIAL_EXCLUSIVE = 3915
MAX_NON_PAWN_MATERIAL_INCLUSIVE = 15258

# Make sure the target directory exists
os.makedirs(target_dir, exist_ok=True)

for part in range(1, 36):  # Adjust the range based on the number of parts + 1
    file_path = f"{source_dir}games_2020_part_{part}_moves.csv"

    # A missing part is reported and skipped instead of aborting the whole
    # run, matching how the later game-info step treats absent files.
    if not os.path.exists(file_path):
        print(f"Moves file for part {part} does not exist. Skipping.")
        continue

    df = pd.read_csv(file_path)

    # Calculate non-pawn material scores and add as a new column
    df['non_pawn_material_score'] = df['fen'].apply(calculate_non_pawn_material_score)

    # Keep only the rows whose material score lies inside the configured window
    score = df['non_pawn_material_score']
    in_window = (score <= MAX_NON_PAWN_MATERIAL_INCLUSIVE) & (score > MIN_NON_PAWN_MATERIAL_EXCLUSIVE)
    filtered_df = df[in_window]

    # Save the filtered dataframe to a new CSV file
    filtered_df.to_csv(f"{target_dir}filtered_games_2020_part_{part}_moves.csv", index=False)

In [4]:
# Step 1: Identify the first ply of the middle game for each unique game_id.
def get_first_ply_middle_game(moves_df):
    """Return the smallest ``move_no`` recorded for every ``game_id``.

    The result is a Series indexed by ``game_id`` whose values are the
    per-game minimum of the ``move_no`` column of ``moves_df``.
    """
    move_numbers_by_game = moves_df.groupby('game_id')['move_no']
    return move_numbers_by_game.min()

# Step 2: Compile moves for the whole middle game in SAN format.
def get_middle_game_moves(moves_df, first_ply_middle_game):
    """Build the '|'-joined move string of each game from its first ply onward.

    For every game the ``move_sequence`` of its last row in ``moves_df`` is
    split on '|' and the moves from position ``first_ply - 1`` to the end are
    re-joined, so the move at the first ply itself is kept.

    Parameters
    ----------
    moves_df : pandas.DataFrame
        Must contain ``game_id`` and ``move_sequence`` columns. Rows of a game
        are assumed to be ordered by move number, so that the last row holds
        the longest sequence.
    first_ply_middle_game : pandas.Series
        Maps ``game_id`` to the first ply to keep.

    Returns
    -------
    dict
        ``game_id`` -> '|'-joined moves. Games that have no row in
        ``moves_df``, or whose last ``move_sequence`` is not a string (e.g.
        NaN), are left out.
    """
    # One pass over the frame: keep='last' selects each game's final row,
    # instead of re-filtering the whole frame once per game.
    last_rows = moves_df.drop_duplicates(subset='game_id', keep='last')
    last_sequence_by_game = dict(zip(last_rows['game_id'], last_rows['move_sequence']))

    middle_game_moves = {}
    for game_id, first_ply in first_ply_middle_game.items():
        last_ply_moves = last_sequence_by_game.get(game_id)
        if not isinstance(last_ply_moves, str):
            continue
        moves_list = last_ply_moves.split('|')
        # Clamp at 0: a first ply of 0 would otherwise produce index -1 and
        # silently return only the final move.
        start_index = max(int(first_ply) - 1, 0)
        middle_game_moves[game_id] = '|'.join(moves_list[start_index:])
    return middle_game_moves

# Directory holding the filtered moves files
source_moves_dir = "/kaggle/working/moves/"
# Directory holding the original game info files
source_games_dir = "/kaggle/input/2020-chess-thesis-1-split-pgn-files/"
# Directory that receives the updated game info files
target_dir = "/kaggle/working/games/"

# Create the output directory if it is not there yet
os.makedirs(target_dir, exist_ok=True)

# Handle one part per iteration; a part with a missing input is reported and skipped
for part in range(1, 36):  # Adjust the range based on the number of parts + 1
    moves_file = f"{source_moves_dir}filtered_games_2020_part_{part}_moves.csv"
    game_info_file = f"{source_games_dir}games_2020_part_{part}_game_info.csv"

    # No filtered moves for this part: nothing to do
    if not os.path.exists(moves_file):
        print(f"Moves file for part {part} does not exist. Skipping.")
        continue

    # Load the filtered moves
    moves_df = pd.read_csv(moves_file)

    # No game info for this part: nothing to annotate
    if not os.path.exists(game_info_file):
        print(f"Game info file for part {part} does not exist. Skipping.")
        continue

    # Load the game info
    game_info_df = pd.read_csv(game_info_file)

    # Attach the first middle-game ply to each game
    first_ply_middle_game = get_first_ply_middle_game(moves_df)
    game_info_df['start_middle_game_ply'] = game_info_df['game_id'].map(first_ply_middle_game)

    # Attach the SAN move string of the middle game to each game
    middle_game_moves = get_middle_game_moves(moves_df, first_ply_middle_game)
    game_info_df['middle_game_moves'] = game_info_df['game_id'].map(middle_game_moves)

    # Write the annotated game info under a new file name
    updated_game_info_file = f"{target_dir}updated_games_2020_part_{part}_game_info.csv"
    game_info_df.to_csv(updated_game_info_file, index=False)

In [5]:
# Remove games where no more moves are found. e.g. Games ending before middle game happens and convert plys to int
def clean_and_adjust_game_info_files(directory):
    """Clean every updated game info CSV in ``directory`` in place.

    Files are selected by name: they must start with
    ``updated_games_2020_part`` and end with ``_game_info.csv``. For each one:

    * rows whose ``middle_game_moves`` is NaN, empty or whitespace-only are
      dropped;
    * ``start_middle_game_ply`` has remaining NaNs replaced by 0 and is cast
      to int;
    * the result overwrites the original file and a progress line is printed.

    Parameters
    ----------
    directory : str
        Path of the directory to scan (not recursive).
    """
    # Sorted so the processing (and log) order does not depend on the
    # arbitrary order returned by os.listdir.
    files = sorted(
        f for f in os.listdir(directory)
        if f.startswith('updated_games_2020_part') and f.endswith('_game_info.csv')
    )
    for file in files:
        full_path = os.path.join(directory, file)
        df = pd.read_csv(full_path)

        # Remove rows where 'middle_game_moves' is NaN or empty
        df = df.dropna(subset=['middle_game_moves'])
        # astype(str) keeps the .str accessor usable when every value was NaN
        # and the column was therefore read as float.
        has_moves = df['middle_game_moves'].astype(str).str.strip().ne('')

        # Convert 'start_middle_game_ply' from float to int; assign() builds a
        # new frame rather than writing into the filtered selection.
        df = df[has_moves].assign(
            start_middle_game_ply=lambda kept: kept['start_middle_game_ply'].fillna(0).astype(int)
        )

        # Save the cleaned and adjusted DataFrame back to CSV
        df.to_csv(full_path, index=False)
        print(f"Processed and saved cleaned data for {file}")

# Clean every updated game info file found in the target directory
clean_and_adjust_game_info_files(directory=target_dir)

Processed and saved cleaned data for updated_games_2020_part_35_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_3_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_21_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_2_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_14_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_28_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_1_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_29_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_19_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_10_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_30_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_33_game_info.csv
Processed and saved cleaned data for updated_games_2020_part_31_gam