# **Routing PDF Queries Using Metadata (No Embeddings Needed)**
This notebook is an extension of the Project 7 Step 2 DEMO code. It routes user queries to the most relevant PDF pages using stored metadata, without relying exclusively on embeddings.



In [None]:
# -------- 1) Setup & Imports -----------------------------------
# Try to use either pypdf or PyPDF2 for text extraction.
# (We keep both to increase the chance of extraction working
#  across different environments like Colab vs. local.)

import os, re, json, uuid
from typing import List, Dict, Any
from collections import OrderedDict

# Optional PDF backends: each import is attempted independently and the
# outcome is recorded in a flag. extract_text_pages() checks these flags
# before calling the matching reader.
try:
    import pypdf
    HAVE_PYPDF = True
except Exception:  # any import-time failure simply disables this backend
    HAVE_PYPDF = False
try:
    import PyPDF2
    HAVE_PYPDF2 = True
except Exception:  # any import-time failure simply disables this backend
    HAVE_PYPDF2 = False

# Labels returned by classify_query_llm() and classify_doc_type_llm();
# "unknown" is what both return when no keyword matches.
DOC_TYPES = ["pay_stub", "loan_form", "resume", "contract", "w2", "unknown"]

In [None]:
# -------- 2) Load a Set of Documents ----------------------------
# Tip: In Colab, your files usually live under /content; in other sandboxes
# they might be under /mnt/data. Each file name is looked up in both
# directories and the first existing path wins. When a file is found in
# neither directory its /content path is kept, so the resulting list is the
# same as the previous hard-coded one and build_pdf_metadata_store() skips
# the missing file.

_PDF_SEARCH_DIRS = ("/content", "/mnt/data")

_PDF_NAMES = (
    "payslip-1752803610.pdf",
    "payslip-1752804713.pdf",
    "COE-Sample.pdf",
    "functionalsample.pdf",
    "SampleContract-Shuttle.pdf",
    "LenderFeesWorksheetNew.pdf",
)


def _locate_pdf(name: str) -> str:
    """Return the first existing path for *name*, else its /content path."""
    for directory in _PDF_SEARCH_DIRS:
        candidate = f"{directory}/{name}"
        if os.path.exists(candidate):
            return candidate
    return f"{_PDF_SEARCH_DIRS[0]}/{name}"


FILES = [_locate_pdf(name) for name in _PDF_NAMES]

In [None]:
def _extract_text_pypdf(path: str):
    """Read *path* with pypdf and return a list with one string per page.

    A page whose ``extract_text()`` result is falsy (``None`` or empty) is
    represented by an empty string.
    """
    pdf = pypdf.PdfReader(path)
    page_texts = []
    for pdf_page in pdf.pages:
        extracted = pdf_page.extract_text()
        page_texts.append(extracted if extracted else "")
    return page_texts

def _extract_text_pypdf2(path: str):
    """Read *path* with PyPDF2 and return a list with one string per page.

    A page whose ``extract_text()`` result is falsy (``None`` or empty) is
    represented by an empty string.
    """
    pdf = PyPDF2.PdfReader(path)
    page_texts = []
    for pdf_page in pdf.pages:
        extracted = pdf_page.extract_text()
        page_texts.append(extracted if extracted else "")
    return page_texts

def extract_text_pages(path: str):
    """Return the text of each page of the PDF at *path*.

    Backends are tried in order (pypdf, then PyPDF2), limited to those whose
    import succeeded. The first backend that does not raise provides the
    result.

    Args:
        path: Location of the PDF file.

    Returns:
        A list with one string per page. If no backend is available, or all
        of them fail, the list ``[""]`` is returned so that callers still
        get a single (empty) page.

    Extraction stays best-effort and never raises, but every failure is
    reported through ``warnings.warn`` instead of being silently discarded;
    otherwise an empty result is indistinguishable from a blank PDF.
    """
    import warnings

    backends = []
    if HAVE_PYPDF:
        backends.append(("pypdf", _extract_text_pypdf))
    if HAVE_PYPDF2:
        backends.append(("PyPDF2", _extract_text_pypdf2))

    if not backends:
        warnings.warn(
            f"No PDF backend (pypdf / PyPDF2) is installed; "
            f"returning empty text for {path!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return [""]

    for backend_name, extract in backends:
        try:
            return extract(path)
        except Exception as exc:  # PDF parsers raise many unrelated error types
            warnings.warn(
                f"{backend_name} could not extract text from {path!r}: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
    return [""]

def year_from_text(t: str) -> str:
    """Return the first standalone four-digit year (19xx or 20xx) in *t*.

    Args:
        t: Text to scan; ``None`` or an empty string is accepted.

    Returns:
        The year as a string, or ``""`` when no year is found.

    The year must not be adjacent to another digit. Without that guard,
    amounts and identifiers such as "12019.50" or "5551987654" would be
    reported as the years 2019 and 1987.
    """
    found = re.search(r"(?<!\d)((?:19|20)\d{2})(?!\d)", t or "")
    return found.group(1) if found else ""

In [None]:
def build_pdf_metadata_store(files: List[str], user_id: str = "xyz") -> List[Dict[str, Any]]:
    """Build one metadata record per page for every existing PDF in *files*.

    Args:
        files: Paths of the PDFs to index. Paths that do not exist are
            skipped.
        user_id: Owner recorded on every page record. Defaults to ``"xyz"``,
            the value that used to be hard-coded.

    Returns:
        A list of dicts with the keys ``file_id``, ``user_id``, ``filename``,
        ``page_number`` (1-based), ``year``, ``text`` and ``doc_type``.
        ``doc_type`` is left as ``None`` here.
    """
    store: List[Dict[str, Any]] = []
    for path in files:
        if not os.path.exists(path):
            continue
        # One random id per file, shared by all of its pages.
        file_id = str(uuid.uuid4())
        filename = os.path.basename(path)
        for page_number, text in enumerate(extract_text_pages(path), start=1):
            store.append({
                "file_id": file_id,
                "user_id": user_id,
                "filename": filename,
                "page_number": page_number,
                "year": year_from_text(text),
                "text": text or "",
                "doc_type": None,
            })
    return store

In [None]:
# -------- 3) Classify the User Query ----------------------------

def classify_query_llm(query: str) -> str:
    """Map a user query to a document-type label by keyword lookup.

    The query is lower-cased and checked against each rule in order; the
    first rule with any keyword occurring as a substring decides the label.
    Despite the name, no language model is called.

    Args:
        query: Free-text user question; ``None`` or empty is accepted.

    Returns:
        One of "pay_stub", "loan_form", "resume", "contract", "w2", or
        "unknown" when no keyword is present.
    """
    keyword_rules = (
        ("pay_stub", ("salary", "net pay", "paystub", "pay stub", "gross pay", "pay date", "monthly pay", "pay statement")),
        ("loan_form", ("loan", "mortgage", "escrow", "origination", "title insurance", "fees worksheet", "closing costs", "lender")),
        ("resume", ("resume", "cv", "work experience", "employment history")),
        ("contract", ("contract", "agreement", "professional services", "terms of employment", "probationary")),
        ("w2", ("w2", "w-2", "w2 form")),
    )
    lowered = query.lower() if query else ""
    for label, keywords in keyword_rules:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "unknown"

In [None]:
# -------- 4) Assign a doc_type to Each Page ---------------------

def classify_doc_type_llm(text: str) -> str:
    """Label a page of text with a document type by keyword lookup.

    The text is lower-cased and checked against each rule in order; the
    first rule with any keyword occurring as a substring decides the label.
    Despite the name, no language model is called.

    Args:
        text: Extracted page text; ``None`` or empty is accepted.

    Returns:
        One of "pay_stub", "loan_form", "resume", "contract", "w2", or
        "unknown" when no keyword is present.
    """
    keyword_rules = (
        ("pay_stub", ("payslip", "pay date", "net pay", "earnings", "this is system generated payslip", "working days")),
        ("loan_form", ("fees worksheet", "loan program", "origination", "escrow fee", "hazard insurance premium", "daily interest charges", "lender's title insurance")),
        ("resume", ("functional resume", "career summary", "employment history", "education", "gpa (4.0 scale)")),
        ("contract", ("professional services agreement", "this agreement", "sample contract", "contract no.", "probationary")),
        ("w2", ("w-2", "form w-2")),
    )
    lowered = text.lower() if text else ""
    for label, keywords in keyword_rules:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "unknown"

In [None]:
def assign_doc_types(store: List[Dict[str, Any]]) -> None:
    """Set ``doc_type`` on every record of *store*, in place.

    Each record's ``text`` (empty string when the key is absent) is passed
    to ``classify_doc_type_llm`` and the returned label is stored under
    ``"doc_type"``. Nothing is returned.
    """
    for record in store:
        page_text = record.get("text", "")
        record["doc_type"] = classify_doc_type_llm(page_text)

# -------- 5) Filter Pages by Predicted doc_type (+ fallback) ----
def _keyword_fallback_score(text: str, query: str) -> int:
    """Count how often the query's terms occur in *text* (case-insensitive).

    The term set is made of the query's alphanumeric tokens longer than two
    characters, plus a fixed list of pay-related phrases when the query
    mentions "salary" or "pay", plus a fixed list of loan-related phrases
    when it mentions "loan", "mortgage" or "fees". Occurrences are counted
    as non-overlapping substrings, and each distinct term is counted once.

    Args:
        text: Page text to score; ``None`` is treated as empty.
        query: User query; ``None`` is treated as empty.

    Returns:
        The summed occurrence count over all terms.
    """
    query_lc = (query or "").lower()
    text_lc = (text or "").lower()

    wanted = {tok for tok in re.findall(r"[a-zA-Z0-9']+", query_lc) if len(tok) > 2}
    if "salary" in query_lc or "pay" in query_lc:
        wanted.update(("net pay", "gross", "pay date", "earnings"))
    if "loan" in query_lc or "mortgage" in query_lc or "fees" in query_lc:
        wanted.update(("origination", "escrow", "title", "interest", "closing costs"))

    total = 0
    for term in wanted:
        total += text_lc.count(term)
    return total

In [None]:
def route_query(store: List[Dict[str, Any]], query: str) -> Dict[str, Any]:
    """Pick the single best-matching page for *query* and report it.

    The query is classified with ``classify_query_llm``. Pages whose
    ``doc_type`` equals the predicted type form the candidate pool; when
    there are none, every page in *store* is considered instead. This means
    the returned page's ``doc_type`` can differ from
    ``predicted_doc_type``. The pool is ranked with
    ``_keyword_fallback_score`` and the first page with the highest score
    wins.

    Args:
        store: Page records as produced by ``build_pdf_metadata_store``.
        query: Free-text user question.

    Returns:
        An ``OrderedDict`` with the keys ``query``, ``predicted_doc_type``
        and ``matched_documents``. The latter is a list holding at most one
        ``OrderedDict`` (``file_id``, ``user_id``, ``doc_type``, ``year``,
        ``filename``, ``page_number``, ``text``), and is empty when *store*
        is empty.
    """
    predicted = classify_query_llm(query)

    def relevance(rec: Dict[str, Any]) -> int:
        return _keyword_fallback_score(rec.get("text", ""), query)

    same_type = [rec for rec in store if rec.get("doc_type") == predicted]
    pool = same_type or store
    # max() returns the first of several equally scored pages, which is the
    # same page a stable descending sort would place first.
    best = max(pool, key=relevance, default=None)

    # -------- 6) Return the Final Output (EXACT schema) ----------
    # We use OrderedDict to keep key order matching the expected format.
    md_list = []
    if best is not None:
        item = OrderedDict()
        item["file_id"] = best["file_id"]
        # Report the owner stored on the record; fall back to the historical
        # constant for records that carry no user_id.
        item["user_id"] = best.get("user_id", "xyz")
        item["doc_type"] = best["doc_type"] or "unknown"
        item["year"] = str(best.get("year") or "")
        item["filename"] = best["filename"]
        item["page_number"] = int(best["page_number"])
        item["text"] = (best["text"] or "").strip()
        md_list.append(item)

    output = OrderedDict()
    output["query"] = query
    output["predicted_doc_type"] = predicted
    output["matched_documents"] = md_list
    return output

In [None]:
# === Build & run once (sample) ===
store = build_pdf_metadata_store(FILES)
assign_doc_types(store)
query = "What is my monthly salary?"
result = route_query(store, query)

# Save exact-schema JSON (optional).
# /content only exists on Colab; elsewhere fall back to the current working
# directory instead of failing with FileNotFoundError. On Colab the output
# path is unchanged: /content/routing_result_exact.json.
output_dir = "/content" if os.path.isdir("/content") else os.getcwd()
with open(os.path.join(output_dir, "routing_result_exact.json"), "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2)

# Bare expression so the notebook displays the result of the cell.
result

OrderedDict([('query', 'What is my monthly salary?'),
             ('predicted_doc_type', 'pay_stub'),
             ('matched_documents',
              [OrderedDict([('file_id',
                             '55052b7e-063d-4ea5-a4e0-646b65862452'),
                            ('user_id', 'xyz'),
                            ('doc_type', 'unknown'),
                            ('year', ''),
                            ('filename', 'payslip-1752803610.pdf'),
                            ('page_number', 1),
                            ('text', '')])])])