# In [3]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import pandas as pd
import re

# Source PDF that the extraction loop reads
pdf_path = '/Users/Sean/Library/CloudStorage/OneDrive-Personal/Datasets/chess-endgames.pdf'

# Destination file for the delimited output
output_file_path = '/Users/Sean/Library/CloudStorage/OneDrive-Personal/Datasets/endgames-py-output.txt'

# Column headers of the output table, listed in output order
column_names = [
    "Puzzle Number",
    "Game",
    "Location & Year",
    "Solution: Desc Notation",
    "Solution: Modern Notation",
    "Processing Notes",
]

# Accumulator for the extracted rows (starts empty)
data = list()

# In [4]:
# Zero-based, inclusive indexes of the PDF pages to process.
# NOTE(review): the previous comment described the range as pages 785-1153,
# but the bounds actually used by the code were 782-785. The code's bounds are
# kept here unchanged -- confirm which range is intended.
FIRST_PAGE_INDEX = 782
LAST_PAGE_INDEX = 785

# A progress line is printed whenever the count of processed pages is a
# multiple of this value.
PROGRESS_INTERVAL = 400

# Patterns are compiled once instead of being looked up for every text box.
PUZZLE_NUMBER_RE = re.compile(r'(\d+)')
GAME_RE = re.compile(r'([A-Za-z -]+)')
LOCATION_YEAR_RE = re.compile(r'([A-Za-z ,0-9]+)')
SOLUTION_RE = re.compile(r'((?:[♚♛♜♝♞♟♔♕♖♗♘♙].*? )+\d-\d)')


def _nth_match(pattern, text, index):
    """Return the match of *pattern* in *text* at position *index*.

    Falls back to the string "Unknown" when *text* contains fewer than
    ``index + 1`` matches.
    """
    matches = pattern.findall(text)
    return matches[index] if len(matches) > index else "Unknown"


# Ask pdfminer for the wanted pages only. Filtering with `continue` inside the
# loop would still make extract_pages() run layout analysis on every page of
# the document, including all pages before and after the window.
page_indexes = range(FIRST_PAGE_INDEX, LAST_PAGE_INDEX + 1)
selected_pages = extract_pages(pdf_path, page_numbers=list(page_indexes))

# Pages are yielded in document order, so zipping with the ascending index
# range restores the page index that enumerate() used to provide.
for i, page_layout in zip(page_indexes, selected_pages):
    # Reset per page so an error row never reports a number from another page
    puzzle_number = "Unknown"
    try:
        # Collect the text containers of the page, ordered by the x-position
        # of their left edge (bbox[0])
        text_boxes = sorted(
            (element for element in page_layout
             if isinstance(element, LTTextContainer)),
            key=lambda box: box.bbox[0],
        )

        # Each text box yields one output row
        for box in text_boxes:
            text = box.get_text()

            # First run of digits in the box
            puzzle_number = _nth_match(PUZZLE_NUMBER_RE, text, 0)
            # Second run of letters, spaces and hyphens
            game = _nth_match(GAME_RE, text, 1)
            # Third run of letters, digits, spaces and commas
            location_year = _nth_match(LOCATION_YEAR_RE, text, 2)
            # First span made of piece-symbol-led segments followed by a
            # digit-hyphen-digit token
            solution = _nth_match(SOLUTION_RE, text, 0)

            data.append({"Puzzle Number": puzzle_number, "Game": game,
                         "Location & Year": location_year,
                         "Solution: Desc Notation": solution,
                         "Solution: Modern Notation": "",
                         "Processing Notes": ""})

        # Progress report. The counter is the number of pages processed so
        # far, even though the message text says "puzzles".
        pages_done = i - FIRST_PAGE_INDEX + 1
        if pages_done % PROGRESS_INTERVAL == 0:
            print(f'Processed {pages_done} puzzles.')
    except Exception as e:
        # Best effort: record the failure as a row and continue with the next
        # page instead of aborting the whole run.
        data.append({"Puzzle Number": puzzle_number, "Game": "", "Location & Year": "",
                     "Solution: Desc Notation": "", "Solution: Modern Notation": "",
                     "Processing Notes": str(e)})

# Build the DataFrame from the list of row dicts in a single call.
# Concatenating one single-row DataFrame per record raises
# "ValueError: No objects to concatenate" when no rows were collected, and it
# allocates a throwaway DataFrame for every row. The constructor handles an
# empty list (yielding a header-only table) and keeps the column order given
# by column_names.
df = pd.DataFrame(data, columns=column_names)

# Write the table as pipe-delimited text, without the index column
df.to_csv(output_file_path, sep='|', index=False)