In [1]:
import pytesseract
from pdf2image import convert_from_path
import cv2
import numpy as np
import os
from PIL import Image


In [2]:
def preprocess_image(image_path):
    """Load an image as grayscale and binarize it for OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The binarized single-channel image returned by ``cv2.threshold``.

    Raises:
        FileNotFoundError: If ``image_path`` does not point to a file.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports failure by returning None rather than raising, so
    # check here instead of letting cv2.threshold fail with an opaque error.
    if gray is None:
        raise ValueError(f"Could not decode image: {image_path}")

    # THRESH_OTSU makes OpenCV choose the threshold automatically; the 150
    # passed as the threshold argument is ignored in that mode.
    _, binary = cv2.threshold(
        gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return binary

def extract_text_from_image(image, lang='eng', tesseract_cmd=None):
    """Extract text from a preprocessed image using Tesseract OCR.

    Args:
        image: Image to run OCR on; passed straight to
            ``pytesseract.image_to_string``.
        lang: Tesseract language code. Defaults to ``'eng'``.
        tesseract_cmd: Optional path to the Tesseract executable. When
            omitted, the legacy Windows install path is used if it exists;
            otherwise pytesseract's current configuration is left untouched.

    Returns:
        The recognized text as a string.
    """
    legacy_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if tesseract_cmd is None and os.path.isfile(legacy_cmd):
        tesseract_cmd = legacy_cmd

    # Only override the module-wide setting when there is a concrete path to
    # apply, so installs elsewhere (or on PATH) keep working.
    if tesseract_cmd is not None:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    return pytesseract.image_to_string(image, lang=lang)

def extract_text_from_pdf(pdf_path, output_txt_path, poppler_path=None):
    """Convert a scanned PDF into text by rendering each page and applying OCR.

    Each page is rendered to an image, saved to a temporary PNG in the
    current working directory, preprocessed, and passed through OCR. The
    per-page text is written to ``output_txt_path`` with a
    ``--- Page N ---`` header before each page.

    Args:
        pdf_path: Path of the PDF to read.
        output_txt_path: Path of the UTF-8 text file to write. Its parent
            directory is created if it does not exist.
        poppler_path: Directory containing the Poppler binaries, forwarded
            to ``convert_from_path``. Defaults to ``None``.

    Returns:
        ``output_txt_path``, unchanged.
    """
    pages = convert_from_path(pdf_path, poppler_path=poppler_path)
    sections = []

    for i, page in enumerate(pages):
        image_path = f"temp_page_{i}.png"
        try:
            page.save(image_path, "PNG")
            processed_image = preprocess_image(image_path)
            text = extract_text_from_image(processed_image)
        finally:
            # Remove the temporary PNG even when saving or OCR fails, so a
            # crash does not leave stray files behind.
            if os.path.exists(image_path):
                os.remove(image_path)
        sections.append(f"\n--- Page {i+1} ---\n{text}\n")

    # Create the destination directory up front; open() would otherwise
    # raise if it is missing.
    output_dir = os.path.dirname(output_txt_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write("".join(sections))

    return output_txt_path

# Example usage: OCR a sample PDF and write the recognized text to a file.
# All three paths below are machine-specific; adjust them before running.
poppler_path = r"E:\git\poppler-24.08.0\Library\bin"  # Poppler "bin" directory
pdf_path = r"E:\git\SmartOCR-Mail--Automated-Email-Parsing--Document-Digitization--and-Intelligent-Search\sampleInput_1.pdf"  # input PDF
output_txt_path = r"E:\git\SmartOCR-Mail--Automated-Email-Parsing--Document-Digitization--and-Intelligent-Search\outputs\extracted_text.txt"  # OCR result

# Left as a bare expression so a notebook cell displays the returned path.
extract_text_from_pdf(
    pdf_path=pdf_path,
    output_txt_path=output_txt_path,
    poppler_path=poppler_path,
)


'E:\\git\\SmartOCR-Mail--Automated-Email-Parsing--Document-Digitization--and-Intelligent-Search\\outputs\\extracted_text.txt'