## This Python notebook builds a segmentation pipeline for document-blob PDFs (several documents concatenated into one file) using smart prompting, page-level classification, and structured metadata output.

## Set up Mistral (GGUF) + PyMuPDF / PyPDF2

In [None]:
# System & model setup
!pip -q install "llama-cpp-python==0.3.16" pymupdf PyPDF2 pandas

from llama_cpp import Llama
import os, json, pandas as pd
from PyPDF2 import PdfReader
import textwrap

# Download a lightweight quant of Mistral Instruct (GGUF)
MODEL_PATH = "/content/mistral.gguf"
if not os.path.exists(MODEL_PATH):
    !wget -q https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {MODEL_PATH}

# Load Mistral with llama.cpp (GPU-accelerated kernels auto-used if available)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,          # ample for our short prompts
    n_gpu_layers=20,     # tweak based on GPU; 0 for CPU-only
    verbose=False
)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone


llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


## Load the new blob-style PDF (page-by-page)

In [None]:
PDF_PATH = "/content/Test Blob File.pdf"  # adjust if needed
PREVIEW_CHARS = 400  # characters of each page shown in the cell output

reader = PdfReader(PDF_PATH)

# One (page_index, text) tuple per page. A falsy extract_text() result is
# normalised to "" so later slicing / .lower() calls never see None.
pages_text = [(i, page.extract_text() or "") for i, page in enumerate(reader.pages)]

# Display the page count and a bounded preview of every page. The full text
# stays available in `pages_text`; it is not dumped into the saved output
# because the pages may hold personal data. Unlike indexing pages_text[0],
# this also works for a PDF with zero pages.
len(pages_text), [(i, text[:PREVIEW_CHARS]) for i, text in pages_text]

[(0,
  'Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.\nFee Details and Summary\nApplicants: Application No:\nDate Prepared:\nLoan Program:Prepared By:\nTHIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees W orksheet" is provided for informational purposes ONLY, to assist\nyou in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage \npayment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed.\nTotal Loan Amount:  Interest Rate: Term/Due In:\nFee Paid To Paid By (Fee Split**) Amount PFC / F / POC\nTOTAL ESTIMATED FUNDS NEEDED TO CLOSE: TOTAL ESTIMATED MONTHLY PAYMENT:\nTotal Estimated Funds Total Monthly PaymentPurchase Price (+)\nAlterations (+)\nLand (+)\nRefi (incl. debts to be paid off) (+)\nEst. Prepaid Items/Reserves (+)\nEst. Closing Costs (+)Loan Amount (-) Principal & Interest\nOther Financing (P & I)\nHazard In

## Mistral chat helper function (JSON-based)

In [None]:
def mistral_json(prompt: str, max_tokens: int = 128, temperature: float = 0.0):
    """Run one completion on the module-level ``llm`` and return its text.

    Args:
        prompt: Full prompt text, passed to the model unchanged.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; the default of 0 keeps the JSON
            reply as deterministic as possible.

    Returns:
        The ``text`` field of the first returned choice, with surrounding
        whitespace stripped. The text is NOT parsed here.
    """
    # Stop sequences act as a light guard against the model rambling on after
    # its answer.
    # NOTE(review): if the model puts a blank line before the closing brace,
    # the second sequence would presumably cut the reply before "}" -- confirm.
    stop_sequences = ["\n\n\n", "\n\n}"]

    # NOTE(review): the prompt is sent as-is; confirm whether the instruct
    # chat template needs to be applied for this call style.
    completion = llm(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=stop_sequences,
    )
    first_choice = completion["choices"][0]
    return first_choice["text"].strip()

## Prompt for segmenting: is_new_doc + doc_type (fixed label set)

In [None]:
DOC_TYPES = ["Resume","Contract","Lender Fee Sheet","ID","Payslip","Other"]

def make_segmentation_prompt(prev_doc_type, prev_text, curr_text, max_chars: int = 1200):
    """Build the page-segmentation prompt for one page.

    Args:
        prev_doc_type: Label assigned to the previous page, or a falsy value
            (e.g. None on the first page), rendered as "unknown".
        prev_text: Text of the previous page; None or "" when there is none.
        curr_text: Text of the page being classified; None is treated as "".
        max_chars: Maximum number of characters of each page included in the
            prompt. The same value is used for the slices and for the
            "truncated to N chars" labels so the two cannot drift apart.

    Returns:
        The prompt string, stripped of leading/trailing whitespace. With the
        default ``max_chars`` and string inputs the prompt is unchanged from
        the fixed-1200 version.
    """
    # Tolerate None page text (e.g. a page with no extractable text).
    prev_excerpt = (prev_text or "")[:max_chars]
    curr_excerpt = (curr_text or "")[:max_chars]

    # keep things short; enforce schema & labels only
    return f"""
You segment multi-document PDFs page-by-page.
Return ONLY valid JSON with this schema (no extra text):
{{
  "is_new_doc": "Yes" or "No",
  "doc_type": one of {DOC_TYPES}
}}

Rules:
- If current page continues the previous document, use "is_new_doc":"No" and keep the same doc_type.
- If it starts a new document, use "is_new_doc":"Yes" and choose ONLY from {DOC_TYPES}.
- Be decisive. If ambiguous, choose "Other".

Previous page type: {prev_doc_type or "unknown"}

Previous Page (truncated to {max_chars} chars):
{prev_excerpt}

Current Page (truncated to {max_chars} chars):
{curr_excerpt}
""".strip()

def safe_parse_json(s: str):
    """Best-effort parse of a model reply that should contain JSON.

    Attempts, in order:
      1. parse the string as-is;
      2. parse it after stripping surrounding whitespace and backticks;
      3. decode the first parseable ``{...}`` object found anywhere in the
         string, which copes with a fenced block carrying a language tag or
         with prose before/after the JSON.

    Args:
        s: Raw text returned by the model.

    Returns:
        The decoded JSON value, or None if nothing could be parsed or ``s``
        is not a string.
    """
    if not isinstance(s, str):
        return None

    # Attempts 1 and 2: the whole string, then the backtick-stripped string.
    for candidate in (s, s.strip().strip("`").strip()):
        try:
            return json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue

    # Attempt 3: try each "{" as the start of a JSON object and let
    # raw_decode ignore whatever follows the object.
    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            value, _end = decoder.raw_decode(s, start)
            return value
        except ValueError:
            start = s.find("{", start + 1)
    return None

## Heuristic fallbacks (used only when the model's reply is not usable JSON)

In [None]:
def classify_heuristic(text: str) -> str:
    """Guess a document type from keywords in the page text.

    Rules are checked in a fixed order and the first match wins, so a page
    mentioning keywords of several types gets the earliest listed type.

    Multi-word phrases are also matched with all whitespace removed from both
    the phrase and the page, because PDF text extraction can insert stray
    spaces or line breaks inside words (e.g. "Fees W orksheet"). Single-word
    keywords are matched exactly, to avoid accidental matches across word
    boundaries.

    Args:
        text: Page text; None or "" is treated as empty.

    Returns:
        One of "Lender Fee Sheet", "Payslip", "Contract", "Resume", "ID",
        or "Other" when nothing matches.
    """
    t = (text or "").lower()
    squashed = "".join(t.split())  # page text with every whitespace run removed

    def mentions(*phrases: str) -> bool:
        """True if any phrase occurs in the page (whitespace-tolerant for multi-word phrases)."""
        for phrase in phrases:
            if phrase in t:
                return True
            if " " in phrase and phrase.replace(" ", "") in squashed:
                return True
        return False

    if mentions("fees worksheet", "calyx form", "lender's title"):
        return "Lender Fee Sheet"
    if mentions("payslip", "net pay", "employee id"):
        return "Payslip"
    if mentions("contract of employment", "this agreement", "termination of employment"):
        return "Contract"
    # A resume is recognised either by name or by having both typical sections.
    if mentions("resume", "curriculum vitae") or (mentions("experience") and mentions("education")):
        return "Resume"
    if mentions("passport", "driver license", "identity"):
        return "ID"
    return "Other"

## Main loop: build page-level metadata

In [None]:
def _normalize_decision(parsed, curr_text, current_doc_type, is_first_page):
    """Turn a parsed model reply into a validated ("Yes"/"No", doc_type) pair.

    The reply is used only when it is a dict that has an "is_new_doc" key and
    a "doc_type" matching one of DOC_TYPES (case-insensitively). Otherwise the
    keyword heuristic decides: same type as the running document means
    continuation, a different type means a new document.

    The first page always starts a document, whatever the model answered.
    """
    canonical_labels = {label.lower(): label for label in DOC_TYPES}

    doc_type = None
    if isinstance(parsed, dict) and "is_new_doc" in parsed:
        doc_type = canonical_labels.get(str(parsed.get("doc_type")).strip().lower())

    if doc_type is not None:
        flag = parsed["is_new_doc"]
        # Accept JSON booleans and case/whitespace variants of "Yes".
        if isinstance(flag, bool):
            starts_new = flag
        else:
            starts_new = str(flag).strip().lower() in ("yes", "true")
    else:
        # heuristic: same type => continuation; else new doc
        doc_type = classify_heuristic(curr_text)
        starts_new = doc_type != current_doc_type

    if is_first_page:
        starts_new = True
    return ("Yes" if starts_new else "No"), doc_type


results = []
current_doc_type = None   # label of the document currently being extended
doc_counter = -1          # becomes 0 when the first document starts
page_in_doc = -1          # 0-based position of the page inside its document
prev_text = ""            # the first page has no predecessor

for idx, curr_text in pages_text:
    prompt = make_segmentation_prompt(current_doc_type, prev_text, curr_text)
    parsed = safe_parse_json(mistral_json(prompt))
    is_new, cand_type = _normalize_decision(
        parsed, curr_text, current_doc_type, is_first_page=not results
    )

    if is_new == "Yes":
        doc_counter += 1
        current_doc_type = cand_type
        page_in_doc = 0
    else:
        # keep doc_type stable on continuation
        page_in_doc += 1

    results.append({
        "page": idx,
        "is_new_doc": is_new,
        "doc_type": current_doc_type,
        "page_in_doc": page_in_doc,
        "doc_id": doc_counter
    })
    prev_text = curr_text

df = pd.DataFrame(results, columns=["page","is_new_doc","doc_type","page_in_doc","doc_id"])
df

Unnamed: 0,page,is_new_doc,doc_type,page_in_doc,doc_id
0,0,Yes,Lender Fee Sheet,0,0
1,1,Yes,Payslip,0,1
2,2,Yes,Contract,0,2
3,3,No,Contract,1,2
4,4,Yes,Other,0,3
5,5,Yes,Contract,0,4
6,6,Yes,Other,0,5


## Save outputs (JSON + CSV)

In [None]:
# Output locations, named once so the writes and the confirmation message agree.
JSON_OUT = "/content/segmentation_results.json"
CSV_OUT = "/content/segmentation_results.csv"

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 instead of depending on the platform default.
with open(JSON_OUT, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

df.to_csv(CSV_OUT, index=False)

print("Saved:", JSON_OUT, CSV_OUT)

Saved: /content/segmentation_results.json /content/segmentation_results.csv
