In [None]:
# Colab cell 1 - setup
!pip install -q easyocr sentence-transformers faiss-cpu
# (If using PDFs you may also need pdf2image & poppler; for images alone this is enough.)

# Colab cell 2 - imports
import io, json
from google.colab import files
import easyocr
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Colab cell 3 - helper: upload file
print("Upload an invoice image (JPG/PNG).")
uploaded = files.upload()  # mapping keyed by uploaded file name
# If the upload dialog is dismissed without choosing a file there is nothing to
# index; fail with a clear message instead of an opaque IndexError below.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose an invoice file.")
fname = next(iter(uploaded))  # name of the first uploaded file
print("Uploaded:", fname)

# Colab cell 4 - run OCR (easyocr)
# Reads text straight from the uploaded file path (`fname`).
# NOTE: in the recorded run a PDF was uploaded and readtext() raised
# "OSError: Could not find a backend to open ..." - PDFs must be converted to an
# image first (see the pdf2image cells further down).
reader = easyocr.Reader(['en'], gpu=False)   # set gpu=True if Colab instance has GPU
ocr_result = reader.readtext(fname, detail=0)  # detail=0 returns plain text segments
# One OCR segment per line; `full_text` is what the later cells embed and parse.
full_text = "\n".join(ocr_result)
print("=== OCR TEXT ===")
print(full_text[:1000])  # print first 1000 chars

# Colab cell 5 - embedding & FAISS index
# Embeds the OCR text and stores it in an in-memory FAISS index.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')  # small, fast, accurate
# For a POC, we'll index the whole invoice text as one doc
docs = [full_text]
embs = embed_model.encode(docs, convert_to_numpy=True)

# Build FAISS index
d = embs.shape[1]  # embedding width: second axis of the (n_docs, dim) array
index = faiss.IndexFlatL2(d)  # flat index searched by L2 distance
index.add(embs)
print("FAISS index contains:", index.ntotal)  # number of vectors added so far

# Colab cell 6 - test query (semantic search)
# Smoke test only: `docs` holds a single document, so with k = 1 the search can
# only ever return that document; the distance is the informative part.
query = "Who is the vendor on this invoice?"
q_emb = embed_model.encode([query], convert_to_numpy=True)
k = 1  # number of nearest neighbours to retrieve
# D holds distances and I holds row indices into `docs`, one row per query.
D, I = index.search(q_emb, k)
print("Top match index:", I, "distances:", D)
print("Matched doc text snippet:", docs[I[0][0]][:500])

# Colab cell 7 - save results as JSON
# Bundle the file name, the OCR text and the embedding matrix shape, write them
# to invoice_extraction.json, then hand the file to the browser for download.
out = dict(
    file=fname,
    ocr_text=full_text,
    embeddings_shape=embs.shape,  # tuple; serialised by json as a list
)
with open("invoice_extraction.json", "w") as f:
    json.dump(out, f)
print("Saved invoice_extraction.json")
files.download("invoice_extraction.json")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m963.8/963.8 kB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.1/292.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hUpload an invoice image (JPG/PNG).




Saving IndiGo Itinerary.pdf to IndiGo Itinerary.pdf
Uploaded: IndiGo Itinerary.pdf
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

OSError: Could not find a backend to open `IndiGo Itinerary.pdf`` with iomode `r`.

First, install the necessary libraries and dependencies for PDF conversion.

In [None]:
# Install pdf2image and poppler-utils
!apt-get install -y poppler-utils
!pip install -q pdf2image

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]
Fetched 186 kB in 1s (188 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126675 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...
Setting up poppler-utils (22.02.0-2ubuntu0.10) ...
Processing triggers for man-db (2.10.2-1) ...


Now, convert the uploaded PDF file to images. Since a PDF can have multiple pages, this will create a list of images, one for each page.

In [None]:
from pdf2image import convert_from_path

# Convert PDF to a list of images
# `fname` is the uploaded file name from the upload cell; one image per page.
images = convert_from_path(fname)

# For simplicity, we'll process the first page
# If you need to process all pages, you'll need to loop through the 'images' list
# NOTE: any text on pages after the first is therefore ignored by the OCR step.
img_to_process = images[0]

# Convert the image to a format easyocr can read (e.g., numpy array)
import numpy as np
img_np = np.array(img_to_process)  # consumed by the OCR cell below

Now we can use EasyOCR to read the text from the image. We'll replace `fname` with the converted image data (`img_np`).

In [None]:
# Colab cell 4 - run OCR (easyocr) - MODIFIED TO USE IMAGE DATA
# Same as the original OCR cell, but readtext() receives the numpy array of the
# first PDF page (`img_np`) instead of a file path.
reader = easyocr.Reader(['en'], gpu=False)   # set gpu=True if Colab instance has GPU
ocr_result = reader.readtext(img_np, detail=0)  # detail=0 returns plain text segments
# Overwrites `full_text`; cells that embed or parse it must be re-run afterwards.
full_text = "\n".join(ocr_result)
print("=== OCR TEXT ===")
print(full_text[:1000])  # print first 1000 chars



=== OCR TEXT ===
9/30/25, 12:12 PM
Itinerary
IndiGo
Date of booking 30 Sep 2025 06.42
PNR/Booking Reference
L3Z9XV
Confirmed
Payment Status
Complete
Passenger Information
Mr Shravan Goud Parepally
Adult
Male
IndiGo Bluchip Membership No:: 007009343
Sector
Seat
6E Add-ons
HYD-PNQ
2C(Aisle)
Chicken
lee Sandwich + Beverage of
choice, Fast forward
PNQ-HYD
2D(Aisle)
Chicken Junglee Sandwich + Beverage of
choice, Fast forward
6E 6471 (A320)
13 Oct 2025
Check-in/Bag Drop Closes: 05.50 hrs
Hyderabad
Travel Time Ih ]0m
Pune
HYD
Rajiv Gandhi International Airport
PNQ
Pune International Airport
06:50
13 Oct 2025
08:00
13 Oct 2025
6E 103 (4321)
14 Oct 2025
Check-in/Bag Drop Closes: 21.35 hrs
Pune
Travel Time Ih 20m
Hyderabad
PNQ
Pune International Airport
HYD
Rajiv Gandhi International Airport
22:35 hrs, 14 Oct 2025
23.55
14 Oct 2025
#Booking date reflects in UTC (Coordinated Universal Time) , all other timings mentioned are aS per local TIME
https:IIww:goindigo in/booklitinerary html?order_id-JFT

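The remaining notebook cells (embedding, FAISS index, test query and JSON export) need no changes: they only depend on the `full_text` variable, which is now generated from the image data. Re-run them after this cell so they pick up the new text.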

In [None]:
# Install required packages (run this cell first)
!pip install -q sentence-transformers faiss-cpu

# Imports
import json, re, uuid, time
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from pprint import pprint


In [None]:
# Synthetic vendor master and PO dataset for the POC
# `vendors` maps vendor_id -> vendor record; each record repeats its own id and
# carries the fields that are later concatenated into the text that gets embedded
# (name, GSTIN, address, aliases). All values are made-up sample data.
vendors = {
    "V0001": {
        "vendor_id": "V0001",
        "name": "ABC Emulators Co.",
        "gstin": "29AAAAA1111A1Z5",
        "address": "12 Industrial Area, Hyderabad, India",
        "aliases": ["ABC Emulators", "ABC Emulators Company", "ABC Emu Co"]
    },
    "V0002": {
        "vendor_id": "V0002",
        "name": "IndiGo Airlines",
        "gstin": "07BBBBB2222B2Z2",
        "address": "Airport Road, Bangalore",
        "aliases": ["Indigo", "IndiGo"]
    },
    "V0003": {
        "vendor_id": "V0003",
        "name": "Global Supplies Ltd",
        "gstin": "27CCCCC3333C3Z3",
        "address": "Mumbai Industrial Park",
        "aliases": ["Global Supplies", "G Supplies"]
    }
}

# Simple PO list (each PO has vendor_id, po_number, total_amount, lines)
# `po_id` and `po_number` hold the same value in every sample row.
# NOTE(review): PO-12345 has total 59000.0 while qty * unit_price is 50000 -
# presumably the total includes tax; confirm before relying on line sums.
pos = [
    {"po_id":"PO-12345","vendor_id":"V0001","po_number":"PO-12345","total":59000.0,"lines":[{"line":1,"qty":10,"unit_price":5000}]},
    {"po_id":"PO-500","vendor_id":"V0002","po_number":"PO-500","total":8500.0,"lines":[{"line":1,"qty":1,"unit_price":8500}]},
    {"po_id":"PO-999","vendor_id":"V0003","po_number":"PO-999","total":120000.0,"lines":[{"line":1,"qty":10,"unit_price":12000}]}
]

print("Vendors and POs loaded for POC. Vendors:", list(vendors.keys()), "POs:", [p["po_id"] for p in pos])


Vendors and POs loaded for POC. Vendors: ['V0001', 'V0002', 'V0003'] POs: ['PO-12345', 'PO-500', 'PO-999']


In [None]:
# Load embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare vendor documents (text to index).
# The three lists are positionally aligned: row i of the FAISS index corresponds
# to vendor_ids[i], vendor_texts[i] and vendor_meta[i].
vendor_ids = list(vendors)
vendor_texts = [
    f"{rec['name']} | GSTIN: {rec['gstin']} | {rec['address']} | aliases: {', '.join(rec['aliases'])}"
    for rec in vendors.values()
]
vendor_meta = [
    {"vendor_id": vendor_id, "text": vendor_doc}
    for vendor_id, vendor_doc in zip(vendor_ids, vendor_texts)
]

# Create embeddings and index them by L2 distance
vendor_embs = embed_model.encode(vendor_texts, convert_to_numpy=True)
d = vendor_embs.shape[1]  # embedding width, reused for the PO index below
vendor_index = faiss.IndexFlatL2(d)
vendor_index.add(vendor_embs)
print("Vendor index built. Count:", vendor_index.ntotal)

# Prepare PO documents (index by PO text); po_ids[i] labels row i of po_index
po_ids = [po["po_id"] for po in pos]
po_texts = [
    f"PO {po['po_number']} vendor_id:{po['vendor_id']} total:{po['total']} lines:{len(po['lines'])}"
    for po in pos
]

po_embs = embed_model.encode(po_texts, convert_to_numpy=True)
po_index = faiss.IndexFlatL2(d)
po_index.add(po_embs)
print("PO index built. Count:", po_index.ntotal)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vendor index built. Count: 3
PO index built. Count: 3


In [None]:
# Extractor: simple regex-based extraction to get header fields

# "Invoice" / "Inv" / "Ticket", an optional "No" / "Number" / "#" qualifier, then
# the value. The value must contain at least one digit so that label words such
# as "No", "Number" or "Date" are never captured as the invoice number.
_INVOICE_NO_RE = re.compile(
    r"\b(?:Invoice|Inv|Ticket)\.?\s*(?:Number|Num\.?|No\.?|#)?\s*[:#\-]?\s*"
    r"(?=[A-Z0-9\-/]*\d)([A-Z0-9][A-Z0-9\-/]*)",
    re.IGNORECASE,
)
_DATE_RE = re.compile(r"(\d{1,2}\s*[A-Za-z]{3,9}\s*\d{4}|\d{4}-\d{2}-\d{2})")
_AMOUNT = r"([0-9][0-9,]*(?:\.[0-9]{1,2})?)"
# Tried in order: explicit grand-total labels, a plain "Total" that is not a
# sub-total, and finally any currency-prefixed number. A bare number with no
# label or currency marker is deliberately NOT accepted as an amount.
_AMOUNT_PATTERNS = (
    re.compile(
        r"\b(?:Grand\s+Total|Total\s+Amount|Amount\s+Due|Total\s+Due)\b[^\d]{0,20}?" + _AMOUNT,
        re.IGNORECASE,
    ),
    re.compile(r"(?<!sub\s)(?<!sub-)\bTotal\b[^\d]{0,20}?" + _AMOUNT, re.IGNORECASE),
    re.compile(r"(?:\bINR|\bRs\.?|₹|\$)\s*" + _AMOUNT, re.IGNORECASE),
)


def extract_fields_from_text(text):
    """Extract header fields from raw OCR text with simple regex heuristics.

    Args:
        text: OCR output as a single string (segments separated by newlines).
            ``None`` or an empty string yields an empty result.

    Returns:
        dict mapping a field name to ``{"value": ..., "confidence": float}``.
        Only fields that were found are present. Possible keys are
        ``invoice_number``, ``invoice_date``, ``grand_total`` (float) and
        ``vendor_raw``.
    """
    out = {}
    if not text:
        return out
    t = text

    # Invoice number / ticket number
    m = _INVOICE_NO_RE.search(t)
    if m:
        out["invoice_number"] = {"value": m.group(1).strip(), "confidence": 0.9}

    # Date detection (simple): "12 Oct 2025" or ISO "2025-10-12"
    m2 = _DATE_RE.search(t)
    if m2:
        out["invoice_date"] = {"value": m2.group(1).strip(), "confidence": 0.9}

    # Amount: first pattern that matches wins; thousands separators are removed
    # from the captured number only, never from the surrounding text.
    for pattern in _AMOUNT_PATTERNS:
        m3 = pattern.search(t)
        if not m3:
            continue
        try:
            amount = float(m3.group(1).replace(",", ""))
        except ValueError:
            continue
        out["grand_total"] = {"value": amount, "confidence": 0.9}
        break

    # Vendor raw name: naive heuristic (first capitalized line). Falls back to
    # the first non-empty line when no line starts with an uppercase letter.
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if lines:
        candidate = next((ln for ln in lines if ln[:1].isupper()), lines[0])
        out["vendor_raw"] = {"value": candidate, "confidence": 0.6}
    return out

# VendorRetriever: semantic search in vendor_index
def vendor_retriever(query_text, top_k=3):
    """Return the vendor-master entries closest to ``query_text``.

    The query is embedded with the notebook-level ``embed_model`` and searched
    against ``vendor_index``. Index rows that do not map to a known vendor
    (negative or out-of-range positions) are dropped.

    Args:
        query_text: free text to look up, e.g. the raw vendor name from OCR.
        top_k: number of neighbours requested from the index.

    Returns:
        list of dicts with ``vendor_id``, ``score`` (the raw value returned by
        the index search as a float - a distance, so presumably lower means
        closer), ``text`` (the indexed vendor text) and ``vendor_record``.
    """
    query_vec = embed_model.encode([query_text], convert_to_numpy=True)
    distances, positions = vendor_index.search(query_vec, top_k)
    return [
        {
            "vendor_id": vendor_ids[row],
            "score": float(dist),
            "text": vendor_meta[row]["text"],
            "vendor_record": vendors[vendor_ids[row]],
        }
        for dist, row in zip(distances[0], positions[0])
        if 0 <= row < len(vendor_ids)
    ]

# POMatcher: simple matching using PO number, falling back to semantic search
def po_matcher(invoice_header, top_k=3):
    """Find candidate purchase orders for an extracted invoice header.

    An exact ``po_number`` match short-circuits with a single result scored
    1.0. Otherwise the vendor name and amount are embedded and searched against
    ``po_index``; the FAISS distance is mapped to ``1 / (1 + distance)``.

    Args:
        invoice_header: dict of field name -> ``{"value", "confidence"}``. The
            fields read are ``po_number``, ``grand_total`` and ``vendor_raw``;
            missing or ``None`` fields are tolerated.
        top_k: number of neighbours requested in the semantic fallback.

    Returns:
        list of dicts, each with ``po_id``, ``po_number``, ``match_score`` and
        ``po`` (the PO record from ``pos``), on both code paths.
    """
    results = []
    # `or {}` guards against a field that is present but set to None.
    inv_amount = (invoice_header.get("grand_total") or {}).get("value")
    inv_po = (invoice_header.get("po_number") or {}).get("value")

    # If PO number provided, check direct match
    if inv_po:
        for p in pos:
            if p["po_number"] == inv_po:
                # exact match -> highest possible score
                results.append({"po_id": p["po_id"], "po_number": p["po_number"], "match_score": 1.0, "po": p})
                return results

    # else search by semantic on PO text
    vendor_raw = (invoice_header.get("vendor_raw") or {}).get("value", "")
    query = f"{vendor_raw} amount {inv_amount if inv_amount else ''}"
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    D, I = po_index.search(q_emb, top_k)
    for score, idx in zip(D[0], I[0]):
        # The index returns out-of-range rows when it cannot fill top_k slots.
        if idx < 0 or idx >= len(po_ids):
            continue
        pid = po_ids[idx]
        # find canonical PO record
        p = next((x for x in pos if x["po_id"] == pid), None)
        # compute a simple normalized match score from distance (0 -> 1.0)
        match_score = 1.0 / (1.0 + float(score))
        # Same keys as the exact-match branch so callers can read "po_number"
        # regardless of how the PO was found.
        results.append({
            "po_id": pid,
            "po_number": p["po_number"] if p else None,
            "match_score": match_score,
            "po": p,
        })
    return results

# Validator: run basic guardrails & create exceptions
def validate_invoice(canonical_json, conf_thresholds=None):
    """Run basic guardrail checks on a canonical invoice and collect exceptions.

    Checks performed:
      * ``invoice_number`` and ``grand_total`` are present with a confidence of
        at least ``mandatory``;
      * when ``lines`` are present, their summed ``line_total`` agrees with the
        grand total (flagged only if the gap exceeds BOTH the absolute and the
        percentage tolerance);
      * when ``poflags["is_po_based"]`` is set, at least one PO match exists and
        the best ``match_score`` is not below 0.7.

    Args:
        canonical_json: dict with optional ``header``, ``lines`` and ``poflags``.
        conf_thresholds: optional overrides for ``mandatory``,
            ``amount_tolerance_abs`` and ``amount_tolerance_pct``. Keys that
            are not supplied keep their default value.

    Returns:
        list of ``{"code", "severity", "message"}`` dicts; empty when every
        check passes.
    """
    # Merge overrides onto the defaults so a partial dict cannot raise KeyError.
    thresholds = {"mandatory": 0.8, "amount_tolerance_abs": 50.0, "amount_tolerance_pct": 0.005}
    thresholds.update(conf_thresholds or {})

    canonical_json = canonical_json or {}
    exceptions = []
    header = canonical_json.get("header") or {}

    def _low_confidence(field_name):
        """True when the header field is missing/empty or below the mandatory confidence."""
        field = header.get(field_name)
        return not field or field.get("confidence", 0) < thresholds["mandatory"]

    def _line_value(line):
        """Line total as a number; a missing or None value counts as 0."""
        value = (line.get("line_total") or {}).get("value")
        return 0 if value is None else value

    # Mandatory checks: invoice_number and grand_total
    if _low_confidence("invoice_number"):
        exceptions.append({"code":"E_MAND_001","severity":"E","message":"Missing or low-confidence invoice number"})
    if _low_confidence("grand_total"):
        exceptions.append({"code":"E_MAND_002","severity":"E","message":"Missing or low-confidence total amount"})

    # Amount tolerance: check sum(lines) if lines present
    lines = canonical_json.get("lines") or []
    if lines:
        sum_lines = sum(_line_value(l) for l in lines)
        # `or {}` covers a grand_total field that is present but None.
        grand = (header.get("grand_total") or {}).get("value")
        if grand is not None:
            diff = abs(grand - sum_lines)
            # abs() keeps the ratio meaningful for negative totals (credit notes).
            pctdiff = diff / (abs(grand) if grand else 1)
            if diff > thresholds["amount_tolerance_abs"] and pctdiff > thresholds["amount_tolerance_pct"]:
                exceptions.append({"code":"E_AMT_001","severity":"E","message":f"Amount mismatch: invoice total {grand} vs lines sum {sum_lines}"})

    # PO match check (if po_flags indicate PO-based)
    poflags = canonical_json.get("poflags") or {}
    if poflags.get("is_po_based"):
        matches = poflags.get("po_matches") or []
        if not matches:
            exceptions.append({"code":"E_PO_001","severity":"E","message":"PO-based invoice but no PO match found"})
        else:
            # if best match score low, raise warning; a match without a score counts as 0
            best_score = max(m.get("match_score", 0) for m in matches)
            if best_score < 0.7:
                exceptions.append({"code":"W_PO_002","severity":"W","message":"PO match score low"})
    return exceptions


In [None]:
# `result` must have been produced by an earlier pipeline cell. On a fresh
# kernel it does not exist (the recorded run failed with NameError), so check
# for it explicitly and print a hint instead of crashing.
if "result" not in globals():
    print("No `result` found - run the cell that builds the canonical invoice JSON first.")
elif result:
    print("\n--- Full canonical JSON (truncated) ---")
    print(json.dumps(result, indent=2)[:2000])
    # Save result. A dedicated name is used so the uploaded invoice's `fname`
    # is not overwritten and the PDF/OCR cells can still be re-run afterwards.
    result_fname = f"invoice_result_{result['invoice_id']}.json"
    with open(result_fname, "w") as f:
        json.dump(result, f, indent=2)
    from google.colab import files
    files.download(result_fname)
    print("Saved and downloaded:", result_fname)


NameError: name 'result' is not defined

In [None]:
# 1) Try different thresholds:
# NOTE: plan_and_execute is not defined in the cells shown here (the recorded run
# ended in NameError); the cell that defines it has to be executed first.
res2 = plan_and_execute(
    full_text,
    auto_approve_threshold=0.9,
    high_value_threshold=10000,
)
pprint(
    {
        "actions": res2["ml_metadata"].get("actions"),
        "decision": res2["decision"],
        "exceptions": res2["validation"]["exceptions"],
    }
)

# 2) Simulate a captured JSON (incoming payload) and re-run
incoming_payload = {
    "capture_id": "cap-xyz",
    "vendor_proposed": {
        "vendor_id": "V0001",
        "name": "ABC Emulators Co.",
        "confidence": 0.92,
    },
}
res3 = plan_and_execute(full_text, incoming_payload=incoming_payload)
pprint(
    {
        "actions": res3["ml_metadata"].get("actions"),
        "decision": res3["decision"],
    }
)


NameError: name 'plan_and_execute' is not defined