In [1]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io
import os


# Configure the Tesseract OCR executable path.
# Set the TESSERACT_CMD environment variable to point at your tesseract binary;
# when it is unset, the default Windows install location below is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

def extract_images_from_pdf(pdf_path):
    """Extract every embedded image from a PDF as PIL images.

    Pages are visited in order; for each page, every entry returned by
    ``page.get_images(full=True)`` is resolved through its xref with
    ``doc.extract_image`` and the resulting bytes are opened with PIL.

    :param pdf_path: Path of the PDF file to open with PyMuPDF.
    :return: List of PIL images, ordered by page and then by the order in
        which each page lists its images. Empty if no images are listed.
    """
    images = []

    # The context manager closes the document even if extraction raises,
    # so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc[page_num]
            for img in page.get_images(full=True):
                xref = img[0]  # first item of each image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                # PIL reads from an in-memory copy of the bytes, so the image
                # does not depend on the document staying open.
                images.append(Image.open(io.BytesIO(image_bytes)))

    return images


def perform_ocr_on_images(images):
    """Run Tesseract OCR on each image and concatenate the recognised text.

    Each image's text is preceded by a ``--- Page N ---`` header, where N is
    the image's 1-based position in ``images`` (not a PDF page number).

    :param images: Iterable of images accepted by ``pytesseract.image_to_string``.
    :return: One string containing a header and the OCR text for every image;
        the empty string when ``images`` is empty.
    """
    sections = [
        f"\n--- Page {position} ---\n{pytesseract.image_to_string(picture)}\n"
        for position, picture in enumerate(images, start=1)
    ]
    return "".join(sections)


def save_text_to_file(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, replacing any existing file.

    :param text: String to write.
    :param output_path: Destination file path.
    """
    with open(output_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)


def main(pdf_path, output_txt_path):
    """Extract images from a PDF, OCR them, and save the text to a file.

    Progress messages are printed at each stage. If the PDF contains no
    images, a message is printed and nothing is written.

    :param pdf_path: Path to the input PDF (string ending in ``.pdf``,
        compared case-insensitively).
    :param output_txt_path: Path of the text file to write the OCR result to.
    :raises ValueError: If ``pdf_path`` does not have a ``.pdf`` extension.
    :raises FileNotFoundError: If ``pdf_path`` is not an existing file.
    """
    # Compare the extension case-insensitively so names like "SCAN.PDF" pass.
    if not pdf_path.lower().endswith(".pdf"):
        raise ValueError("The input file must be a PDF.")

    # isfile rather than exists: a directory named "*.pdf" is not a usable input.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError("The specified PDF file does not exist.")

    print("Extracting images from PDF...")
    images = extract_images_from_pdf(pdf_path)

    if not images:
        print("No images found in the PDF.")
        return

    print("Performing OCR on extracted images...")
    extracted_text = perform_ocr_on_images(images)

    print("Saving extracted text to file...")
    save_text_to_file(extracted_text, output_txt_path)

    print(f"Text successfully saved to {output_txt_path}.")


# Example usage: run the full extract -> OCR -> save pipeline on one PDF.
pdf_input = "handwritten.pdf"  # Replace with your PDF file path
output_txt = "output_text.txt"  # Desired output file path

main(pdf_path=pdf_input, output_txt_path=output_txt)


Extracting images from PDF...
Performing OCR on extracted images...
Saving extracted text to file...
Text successfully saved to output_text.txt.


In [6]:
def read_file_content(file_path):
    """
    Load the full contents of a UTF-8 encoded text file.
    :param file_path: Path to the text file.
    :return: Everything in the file as a single string.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents


# Paths to the files
ground_truth_file = "ground_truth_text.txt"  # Replace with the manually prepared ground truth file
ocr_output_file = "output_text.txt"         # OCR output file generated by the script

# Read content from files (both are read as UTF-8 text)
ground_truth_text = read_file_content(ground_truth_file)
ocr_output_text = read_file_content(ocr_output_file)

# Evaluate OCR performance
# NOTE(review): calculate_metrics is not defined in the cells shown here; it
# must come from another cell or import, so this cell fails on a fresh kernel
# unless that definition runs first — confirm and keep it above this cell.
# NOTE(review): the OCR output file is written with "--- Page N ---" headers
# between images; unless calculate_metrics strips them, they count toward the
# error rates — verify against the ground-truth file's format.
metrics = calculate_metrics(ground_truth_text, ocr_output_text)

# Print the results
# Each value is formatted as a percentage with two decimals, so the metrics
# are presumably fractions in [0, 1] — TODO confirm against calculate_metrics.
print("Model Evaluation Report:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.2%}")


Model Evaluation Report:
Character Error Rate (CER): 44.16%
Word Error Rate (WER): 91.67%
Character-Level Accuracy: 55.84%
Word-Level Accuracy: 8.33%
