<a href="https://colab.research.google.com/github/sssangeetha/OutamationAI_OCR_RAG_Automation/blob/main/AutomaticLoanProcessingSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ==== Setup ====
!pip -q install pymupdf opencv-python

import fitz  # PyMuPDF
import cv2, numpy as np, json
from google.colab import files
from pathlib import Path

# Upload once if needed
# files.upload()

pdf_path = "LenderFeesWorksheetNew.pdf"  # ensure this filename matches your upload

# ==== Helpers ====
def to_int_bbox(b):  # PyMuPDF rect -> [x0,y0,x1,y1] ints
    return [int(b[0]), int(b[1]), int(b[2]), int(b[3])]

def words_in_bbox(words, x0, y0, x1, y1):
    # words: list of [x0,y0,x1,y1,"text", block_no, line_no, word_no]
    out = []
    for w in words:
        wx0, wy0, wx1, wy1, txt = w[0], w[1], w[2], w[3], w[4]
        if (wx0 >= x0 and wy0 >= y0 and wx1 <= x1 and wy1 <= y1):
            out.append((txt, (wx0, wy0, wx1, wy1)))
    return out

def find_label_value(words, label_variants, search_width=250):
    """
    Find nearest value to the right of a label (same line neighborhood).
    """
    for w in words:
        txt = w[4].strip().lower()
        if any(lbl in txt for lbl in label_variants):
            x0,y0,x1,y1 = w[0],w[1],w[2],w[3]
            # neighborhood to the right of the label
            cand = [u for u in words if (u[1] >= y0-8 and u[3] <= y1+8 and u[0] >= x1 and u[0] <= x1+search_width)]
            if cand:
                # join nearest consecutive tokens
                cand_sorted = sorted(cand, key=lambda z: z[0])
                text = " ".join([c[4] for c in cand_sorted])
                bbox = [x1, y0, min(x1+search_width, max(c[2] for c in cand_sorted)), y1]
                return text.strip(), bbox
    return None, None

def find_section_rows(words, section_title, y_tol=12):
    """
    Locate a section by its title and collect line-like rows beneath it
    until next big title or large vertical gap.
    """
    # find the section header token
    hdr = None
    for w in words:
        if section_title.lower() in w[4].lower():
            hdr = w; break
    if hdr is None:
        return []

    # collect rows below header using line number grouping
    # PyMuPDF words = [x0,y0,x1,y1,"text", block_no, line_no, word_no]
    below = [w for w in words if w[1] > hdr[3]+4]  # strictly below the header baseline
    # group by (approx) line y using tolerance
    rows = []
    for w in sorted(below, key=lambda z: (z[1], z[0])):
        placed = False
        for r in rows:
            # if close in y to existing row
            if abs(r["y"] - w[1]) <= y_tol:
                r["words"].append(w); r["y"] = (r["y"] + w[1]) / 2; placed = True; break
        if not placed:
            rows.append({"y": w[1], "words": [w]})

    # stop at next uppercase-looking title or a big gap (heuristic kept simple here)
    # We'll just return all rows; filtering happens when we parse "name ... amount $"
    return rows

# ==== Parse ====
doc = fitz.open(pdf_path)
page = doc[0]
words = page.get_text("words")  # list of [x0,y0,x1,y1, "text", block, line, word]
# Sort words reading order
words = sorted(words, key=lambda w: (w[1], w[0]))

extracted = []

# Basic top-level fields (labels vary; we match loosely)
BASIC_FIELDS = {
    "Applicants": ["applicants"],
    "Application No": ["application no"],
    "Date Prepared": ["date prepared"],
    "Loan Program": ["loan program"],
    "Prepared By": ["prepared by"],
    "Total Loan Amount": ["total loan amount", "loan amount"],
    "Interest Rate": ["interest rate"],
    "Term / Due In": ["term/due in", "term / due in", "term", "due in"],
    "Total Funds to Close": ["total estimated funds needed to close"],
    "Total Monthly Payment": ["total estimated monthly payment"],
}

for label, variants in BASIC_FIELDS.items():
    val, bbox = find_label_value(words, variants, search_width=450)
    if val and bbox:
        extracted.append({"label": label, "text": f"{label}: {val}", "bbox": [int(x) for x in bbox]})

# Table sections
sections = ["ORIGINATION CHARGES", "OTHER CHARGES"]
for sec in sections:
    rows = find_section_rows(words, sec)
    for r in rows:
        tokens = sorted(r["words"], key=lambda z: z[0])
        line_text = " ".join(t[4] for t in tokens).strip()
        # Keep only plausible fee rows: must end with a money value like 525.00 or $ 525.00
        if any(sym in line_text for sym in ["$", ".00", ".50"]):
            x0 = int(min(t[0] for t in tokens)); y0 = int(min(t[1] for t in tokens))
            x1 = int(max(t[2] for t in tokens)); y1 = int(max(t[3] for t in tokens))
            extracted.append({"label": f"{sec} Row", "text": line_text, "bbox": [x0,y0,x1,y1]})

# Save JSON
Path("extracted.json").write_text(json.dumps(extracted, indent=2))
print(json.dumps(extracted[:12], indent=2))  # preview first dozen

# ==== Optional: Visual overlay ====
# Render page to image and draw rectangles
pix = page.get_pixmap(matrix=fitz.Matrix(2,2), alpha=False)  # 2x for clarity
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

for e in extracted:
    x0,y0,x1,y1 = [int(v*2) for v in e["bbox"]]  # scale bboxes to rendering matrix
    cv2.rectangle(img, (x0,y0), (x1,y1), (0,255,0), 2)

cv2.imwrite("annotated.png", img)
print("Saved: extracted.json, annotated.png")


[
  {
    "label": "Applicants",
    "text": "Applicants: John Q. Smith / Mary A. Smith Application No: samplesmith",
    "bbox": [
      77,
      78,
      520,
      87
    ]
  },
  {
    "label": "Term / Due In",
    "text": "Term / Due In: an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage",
    "bbox": [
      117,
      128,
      567,
      138
    ]
  }
]
Saved: extracted.json, annotated.png
