<a href="https://colab.research.google.com/github/sssangeetha/OutamationAI_OCR_RAG_Automation/blob/main/Differenciating_Models_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---- Install (Colab) ----
# System package: the Tesseract OCR engine that the pytesseract wrapper drives.
!apt-get -qq update && apt-get -qq install -y tesseract-ocr
# Python packages: PDF rendering (pymupdf, imported as `fitz`), OpenCV, and the
# Tesseract wrapper.
!pip -q install pymupdf opencv-python pytesseract
# PaddleOCR with its PaddlePaddle backend pinned to 2.6.1 (paddleocr itself is unpinned).
!pip -q install "paddlepaddle==2.6.1" paddleocr
!pip -q install easyocr

# ---- Imports ----
# NOTE(review): `os` is not referenced in the cells visible here, and `files` is
# only referenced by the commented-out `files.upload()` call in the PDF cell.
import fitz, cv2, json, pytesseract, numpy as np, os
from paddleocr import PaddleOCR
import easyocr
from google.colab import files
from pathlib import Path

# On Windows, pytesseract.pytesseract.tesseract_cmd would have to be set to the
# location of the Tesseract executable; this is not needed on Colab.


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.9/125.9 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 kB[0m [31m5.3 MB/s[

In [3]:
# ---- Upload your PDF (if not already in the runtime) ----
# files.upload()

PDF_PATH = "LenderFeesWorksheetNew (2).pdf"  # rename if needed

# Rasterise every page once; the OCR cells consume `images`.
doc = fitz.open(PDF_PATH)
render_scale = 2.0
zoom = fitz.Matrix(render_scale, render_scale)  # same scale factor on both axes

images = []  # [(page_index, rgb_image)]
for page_no, page in enumerate(doc):
    pixmap = page.get_pixmap(matrix=zoom, alpha=False)
    # View the pixmap's sample bytes as a (height, width, 3) uint8 array.
    flat = np.frombuffer(pixmap.samples, dtype=np.uint8)
    images.append((page_no, flat.reshape(pixmap.height, pixmap.width, 3)))
len(images)


1

In [4]:
# ---- Tesseract: word boxes, raw text, annotated image and JSON per page ----
out_dir = Path("ocr_outputs")
out_dir.mkdir(exist_ok=True)


def _tess_conf(value):
    """Normalise one entry of Tesseract's ``conf`` column.

    pytesseract's DICT output converts numeric columns to numbers, so a
    comparison against the string '-1' never matches. Converting to float
    first handles both numeric and string inputs.

    Args:
        value: A single element of ``data["conf"]``.

    Returns:
        The confidence as a float, or None when the row carries no word
        confidence (-1) or the value is not numeric.
    """
    try:
        conf = float(value)
    except (TypeError, ValueError):
        return None
    return conf if conf >= 0 else None


tesseract_results = []
for i, rgb in images:
    # pytesseract interprets numpy arrays as RGB, so OCR the RGB render
    # directly; a BGR copy is only needed for OpenCV drawing / imwrite.
    # data with word-level boxes
    data = pytesseract.image_to_data(rgb, output_type=pytesseract.Output.DICT)
    # raw text (line-ish)
    raw = pytesseract.image_to_string(rgb)

    vis = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    page_items = []
    for left, top, width, height, conf, txt in zip(
        data["left"], data["top"], data["width"], data["height"], data["conf"], data["text"]
    ):
        txt = txt.strip()
        if not txt:
            continue  # structural rows (page/block/line) have no text

        x0, y0 = int(left), int(top)
        x1, y1 = x0 + int(width), y0 + int(height)
        conf = _tess_conf(conf)

        # Only rows with a real confidence are drawn; every non-empty word is
        # still recorded in the JSON, with conf=None when it is unavailable.
        if conf is not None:
            cv2.rectangle(vis, (x0, y0), (x1, y1), (0, 255, 0), 2)

        page_items.append({"text": txt, "bbox": [x0, y0, x1, y1], "conf": conf})

    # save
    cv2.imwrite(str(out_dir / f"tess_p{i}.png"), vis)
    # OCR output may contain non-ASCII characters; do not rely on the locale default.
    (out_dir / f"tess_p{i}.txt").write_text(raw, encoding="utf-8")

    tesseract_results.append({"page": i, "raw_text": raw, "items": page_items})

(out_dir / "tesseract.json").write_text(json.dumps(tesseract_results, indent=2), encoding="utf-8")
print("Tesseract done → images & JSON saved under", out_dir)


Tesseract done → images & JSON saved under ocr_outputs


In [7]:
# Initialize EasyOCR reader
reader = easyocr.Reader(['en']) # 'en' for English

easyocr_results = []
for i, rgb in images:
    # EasyOCR accepts the numpy image directly.
    result = reader.readtext(rgb) # list of (bbox, text, conf); bbox is 4 corner points

    # Annotated copy for saving with OpenCV (cvtColor already returns a new array).
    vis = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    page_items = []

    for bbox, text, conf in result:
        quad = np.array(bbox, dtype=np.int32)  # shape (4, 2): one (x, y) per corner
        cv2.polylines(vis, [quad], isClosed=True, color=(0, 0, 255), thickness=2)

        # Axis-aligned box enclosing all four corners. Picking only corners 0
        # and 2 is wrong for rotated / free-form quadrilaterals; for upright
        # rectangles the result is identical.
        x0, y0 = quad.min(axis=0)
        x1, y1 = quad.max(axis=0)

        page_items.append({"text": text, "bbox": [int(x0), int(y0), int(x1), int(y1)], "conf": float(conf)})

    # raw text (one detection per line)
    raw = "\n".join(it["text"] for it in page_items)

    cv2.imwrite(str(out_dir / f"easyocr_p{i}.png"), vis)
    # OCR output may contain non-ASCII characters; do not rely on the locale default.
    (out_dir / f"easyocr_p{i}.txt").write_text(raw, encoding="utf-8")
    easyocr_results.append({"page": i, "raw_text": raw, "items": page_items})

(out_dir / "easyocr.json").write_text(json.dumps(easyocr_results, indent=2), encoding="utf-8")
print("EasyOCR done → images & JSON saved under", out_dir)



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete



EasyOCR done → images & JSON saved under ocr_outputs
