In [9]:
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import chess
import chess.pgn
import re
import os
from project_keys import tesseract_path #add key specific information here

# Print environment variables to check
print(f"TESSDATA_PREFIX: {os.getenv('TESSDATA_PREFIX')}")
print(f"PATH: {os.getenv('PATH')}")

# Set the TESSDATA_PREFIX environment variable
os.environ['TESSDATA_PREFIX'] = '/home/sean/anaconda3/envs/chess_projects/bin/tesseract'

# Set the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract' #tesseract_path

# Function to extract text from image using pytesseract
def extract_text_from_image(image_path):
    return pytesseract.image_to_string(Image.open(image_path))

# Function to convert PDF to images
def convert_pdf_to_images(pdf_path):
    return convert_from_path(pdf_path)

# Function to recognize chess pieces and board state
def recognize_chess_board(text):
    # Mapping Unicode characters to chess pieces
    unicode_pieces = {
        '♔': 'K', '♕': 'Q', '♖': 'R', '♗': 'B', '♘': 'N', '♙': 'P',
        '♚': 'k', '♛': 'q', '♜': 'r', '♝': 'b', '♞': 'n', '♟': 'p'
    }
    board = chess.Board()
    rows = text.split('\n')
    board_state = ""
    
    # Extract game information (players and date)
    game_info = re.search(r'(\d+)\.\s+(\w+)\s*-\s*(\w+)\n(\w+,\s*\d+)', text)
    if game_info:
        game_number, white_player, black_player, date = game_info.groups()
    else:
        white_player, black_player, date = "Unknown", "Unknown", "Unknown"

    # Read board state
    for row in rows:
        row_state = []
        for char in row:
            if char in unicode_pieces:
                row_state.append(unicode_pieces[char])
            elif char == ' ':
                row_state.append('1')  # Empty square as '1' in FEN notation
        if row_state:
            board_state += ''.join(row_state) + '/'

    # Convert board state to FEN format
    fen = board_state.strip('/').replace('11111111', '8').replace('1111111', '7').replace('111111', '6').replace('11111', '5').replace('1111', '4').replace('111', '3').replace('11', '2')
    board.set_fen(fen + ' w KQkq - 0 1')  # Assuming it's White to move and standard initial conditions
    
    return white_player, black_player, date, board

# Function to save game to PGN format
def save_to_pgn(white_player, black_player, date, board, output_file):
    game = chess.pgn.Game()
    game.headers["White"] = white_player
    game.headers["Black"] = black_player
    game.headers["Date"] = date
    game.setup(board)
    with open(output_file, "w") as pgn_file:
        pgn_file.write(str(game))

# Main function
def process_chess_image(file_path, output_file):
    if file_path.endswith('.pdf'):
        images = convert_pdf_to_images(file_path)
        text = "\n".join([extract_text_from_image(image) for image in images])
    else:
        text = extract_text_from_image(file_path)

    white_player, black_player, date, board = recognize_chess_board(text)
    save_to_pgn(white_player, black_player, date, board, output_file)

# Example usage
process_chess_image('/home/sean/Documents/chess_processing/board_pdf_image_files/Puzzle-One.jpg', '/home/sean/Documents/chess-processing/output_files/py_output_p1_20240730.pgn')


TESSDATA_PREFIX: /usr/share/tesseract-ocr/4.00/
PATH: /home/sean/anaconda3/envs/chess_projects/bin:/home/sean/anaconda3/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games


TesseractError: (1, 'Error opening data file /home/sean/anaconda3/envs/chess_projects/bin/tesseract/eng.traineddata Please make sure the TESSDATA_PREFIX environment variable is set to your "tessdata" directory. Failed loading language \'eng\' Tesseract couldn\'t load any languages! Could not initialize tesseract.')

In [13]:
import os
import pytesseract
from PIL import Image

# Verify environment variables
print(f"TESSDATA_PREFIX: {os.getenv('TESSDATA_PREFIX')}")
print(f"PATH: {os.getenv('PATH')}")

# Set the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = '/home/sean/anaconda3/envs/chess_projects/bin/tesseract'

# Load an image using PIL
image = Image.open('/home/sean/Documents/chess_processing/board_pdf_image_files/Puzzle-One.jpg')

# Perform OCR using Tesseract
text = pytesseract.image_to_string(image)

# Print the extracted text
print(text)


TESSDATA_PREFIX: /home/sean/anaconda3/envs/chess_projects/bin/tesseract
PATH: /home/sean/anaconda3/envs/chess_projects/bin:/home/sean/anaconda3/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games


TesseractNotFoundError: /home/sean/anaconda3/envs/chess_projects/bin/tesseract is not installed or it's not in your PATH. See README file for more information.