In [None]:
!apt-get install -y poppler-utils  # Required for pdf2image
!pip install pytesseract easyocr pdf2image transformers torchvision
!pip install opencv-python-headless  # Avoids Colab's OpenCV conflicts
!pip install torch torchvision torchaudio  # Ensures latest PyTorch

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 1s (215 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126209 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.6_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.6) ...
Setting up poppler-utils (22.02.0-2ubuntu0.6) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting pdf2ima

In [None]:
import os
import cv2
import pytesseract
import easyocr
import torch
import torchvision.transforms as transforms
from pdf2image import convert_from_path
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

In [None]:
# =========================== STEP 1: Convert PDF to Images ===========================
def pdf_to_images(pdf_path, output_folder="pdf_images"):
    """Render each page of a PDF to a JPEG file.

    Args:
        pdf_path: Path of the PDF document to render.
        output_folder: Directory that receives the page images; created if
            it does not exist yet.

    Returns:
        List of the written image paths, in page order. Files are named
        ``page_<index>.jpg`` with a zero-based index.
    """
    os.makedirs(output_folder, exist_ok=True)

    def _write_page(index, page):
        # Save one rendered page and hand back where it went.
        target = os.path.join(output_folder, f"page_{index}.jpg")
        page.save(target, "JPEG")
        return target

    pages = convert_from_path(pdf_path)
    return [_write_page(index, page) for index, page in enumerate(pages)]


In [None]:
# ========================== STEP 2: Preprocess Image ===========================
def preprocess_image(image_path):
    """Binarise an image for OCR and save the result.

    The image is loaded as grayscale, smoothed with a 5x5 Gaussian blur and
    thresholded with Otsu's method. The result is written to the current
    working directory as ``processed_<original file name>``.

    Args:
        image_path: Path of the image file to preprocess.

    Returns:
        Path of the written, preprocessed image.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If OpenCV cannot decode the file as an image.
        OSError: If the processed image cannot be written.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside GaussianBlur.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise ValueError(f"Could not decode image file: {image_path}")

    image = cv2.GaussianBlur(image, (5, 5), 0)
    # With THRESH_OTSU the threshold is derived from the image histogram and
    # the explicit threshold argument is ignored, so 0 is passed.
    _, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    processed_path = "processed_" + os.path.basename(image_path)
    # cv2.imwrite returns False (rather than raising) when the write fails.
    if not cv2.imwrite(processed_path, image):
        raise OSError(f"Could not write processed image: {processed_path}")
    return processed_path

In [None]:
# ========================== STEP 3: Extract Text Using OCR ===========================
def extract_text_tesseract(image_path):
    """Run Tesseract (English language data) on an image.

    Args:
        image_path: Image to recognise, passed straight to
            ``pytesseract.image_to_string``.

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    raw_text = pytesseract.image_to_string(image_path, lang="eng")
    cleaned_text = raw_text.strip()
    return cleaned_text

# Lazily created, shared EasyOCR reader. Building a Reader loads the
# detection and recognition models, so it is done once and reused.
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        _EASYOCR_READER = easyocr.Reader(['en'])
    return _EASYOCR_READER


def extract_text_easyocr(image_path):
    """Use EasyOCR (English) for text extraction.

    Args:
        image_path: Image to recognise, passed to ``Reader.readtext``.

    Returns:
        All recognised text fragments joined by single spaces; an empty
        string when nothing is recognised.
    """
    # detail=0 makes readtext return plain strings rather than
    # (bounding box, text, confidence) tuples.
    fragments = _get_easyocr_reader().readtext(image_path, detail=0)
    return " ".join(fragments)

In [None]:
# ========================== STEP 4: Advanced OCR with TrOCR ===========================
_TROCR_MODEL_NAME = "microsoft/trocr-base-handwritten"
# model name -> (processor, model); filled on first use so the weights are
# loaded once instead of once per page.
_TROCR_CACHE = {}


def _load_trocr(model_name=_TROCR_MODEL_NAME):
    """Return the cached ``(processor, model)`` pair for ``model_name``."""
    if model_name not in _TROCR_CACHE:
        processor = TrOCRProcessor.from_pretrained(model_name)
        model = VisionEncoderDecoderModel.from_pretrained(model_name)
        _TROCR_CACHE[model_name] = (processor, model)
    return _TROCR_CACHE[model_name]


def extract_text_trocr(image_path):
    """Use TrOCR (Transformer-based OCR) for handwritten text recognition.

    Args:
        image_path: Path of the image to recognise.

    Returns:
        The decoded text of the first (and only) generated sequence.

    NOTE(review): the TrOCR checkpoints are documented as single text-line
    recognisers; feeding a full page is likely to give poor results.
    Confirm whether line segmentation is needed upstream.
    """
    processor, model = _load_trocr()

    # convert() returns an independent in-memory image, so the file handle
    # can be released immediately.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Let the processor build the model input: it applies the resizing and
    # normalisation the checkpoint expects, which a bare Resize + ToTensor
    # pipeline does not.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


In [None]:
# ========================== STEP 5: Process Documents (PDFs or Images) ===========================
def process_document(file_path, use_trocr=False, output_path="extracted_text.txt"):
    """Extract text from a PDF or image with OCR and save it to a text file.

    A path ending in ``.pdf`` (case-insensitive) is first rendered to one
    image per page; any other path is treated as a single image. Every image
    is preprocessed and then run through the selected OCR engine.

    Args:
        file_path: Path of the PDF or image to process.
        use_trocr: If true, recognise with TrOCR; otherwise use EasyOCR.
        output_path: Text file that receives the extracted text (UTF-8).
            Defaults to ``extracted_text.txt`` in the working directory.

    Returns:
        The extracted text: each page's text followed by a blank line.
    """
    if file_path.lower().endswith(".pdf"):
        images = pdf_to_images(file_path)
    else:
        images = [file_path]

    # Choose the OCR method once; it does not change between pages.
    extract_text = extract_text_trocr if use_trocr else extract_text_easyocr

    page_texts = []
    for img_path in images:
        processed_img = preprocess_image(img_path)
        page_texts.append(extract_text(processed_img))

    # Each page is terminated by a blank line, matching the saved file format.
    extracted_text = "".join(text + "\n\n" for text in page_texts)

    # Save extracted text
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(extracted_text)

    print(f"✅ Text extraction complete! Output saved to {output_path}")
    return extracted_text

In [None]:
# ========================== STEP 6: Run Script ===========================
if __name__ == "__main__":
    # Ask which document to process; surrounding whitespace is dropped.
    file_path = input("Enter the path of the PDF or Image: ").strip()

    # Only the exact answer "yes" (any letter case) switches to TrOCR.
    trocr_answer = input("Use TrOCR for better handwritten recognition? (yes/no): ")
    use_trocr = trocr_answer.strip().lower() == "yes"

    process_document(file_path, use_trocr)

Enter the path of the PDF or Image: /content/Screenshot 2025-03-01 194146.pdf
Use TrOCR for better handwritten recognition? (yes/no): no




Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet