In [None]:

# ALTERNATIVE: Simplified client without olmocr toolkit (less accurate)
# This approach doesn't use document anchoring but still works for basic OCR

import base64
from openai import OpenAI
from PIL import Image
import pdf2image
import io
import json

def simple_pdf_to_olmocr(
    pdf_path,
    page_num=1,
    model="InternVL3_5",
    base_url="http://localhost:8000/v1",
    max_dim=1024,
    prompt=None,
):
    """
    Render one PDF page and send it, with a text prompt, to a vision model
    behind an OpenAI-compatible chat-completions endpoint.

    No document anchoring is used; the page is sent as a plain PNG image.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Page to render; passed as both first_page and last_page.
        model: Model name sent with the request.
        base_url: Base URL of the OpenAI-compatible server.
        max_dim: Longest allowed image side in pixels; larger pages are
            downscaled proportionally before upload.
        prompt: Instruction text. When None, the built-in prompt asking for
            account-info field labels and transaction column headers is used.

    Returns:
        The message content of the first completion choice, unparsed.

    Raises:
        ValueError: If no image was produced for the requested page.
    """

    # Convert PDF page to image
    pages = pdf2image.convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    if not pages:
        raise ValueError("Could not convert PDF page to image")

    # Resize image so its longest dimension is at most max_dim
    image = pages[0]
    longest_side = max(image.size)
    if longest_side > max_dim:
        scale = max_dim / longest_side
        # int() truncates toward zero; clamp so a very thin page never ends
        # up with a 0-pixel side.
        new_size = (
            max(1, int(image.size[0] * scale)),
            max(1, int(image.size[1] * scale)),
        )
        image = image.resize(new_size, Image.Resampling.LANCZOS)

    # Convert to base64
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

    # Default prompt: label/header discovery only (no data values)
    if prompt is None:
        prompt = """
You are an intelligent data extraction assistant. 
You are given the first page of a bank statement that contains both account information and transaction table headers.

Your goal:
1. Identify and extract all account metadata fields (top section).
2. Identify the column headers used for the transaction table on this page.

Return a JSON object in this format:

{
  "account_info_fields": [
    "Account Name",
    "Account Number",
    "Branch",
    "IFS Code",
    "MICR Code",
    "Account Type",
    "Balance as on",
    "Period From",
    "Period To",
    ...
  ],
  "transaction_columns": [
    "Date",
    "Details",
    "Ref No./Cheque No",
    "Debit",
    "Credit",
    "Balance"
  ]
}

Rules:
- Capture **only visible text**. 
- Preserve original capitalization and punctuation.
- If some labels are repeated or similar (e.g., “Ref No.” and “Ref No./Cheque No”), keep only the most complete one.
- Do not extract actual data values in this step — only the labels/headers.
- Always return valid JSON.

"""

    # Send to the OpenAI-compatible server
    client = OpenAI(api_key="EMPTY", base_url=base_url)

    response = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
            ]
        }],
        max_tokens=4096,
        temperature=0.1
    )

    return response.choices[0].message.content

# Usage: run the function on page 1 of sbi.pdf and print the returned model reply.
result = simple_pdf_to_olmocr("sbi.pdf", 1)
print(result)


In [None]:
def build_metadata_prompt(account_info_fields):
    """
    Build the prompt that asks the model to fill in values for the given
    account metadata field names.

    Args:
        account_info_fields: Iterable of field labels. It is materialised
            once, so a generator is accepted.

    Returns:
        The prompt text: the quoted field names, followed by a JSON template
        with one empty-string entry per field under "account_info".
    """
    import json  # local import so this cell does not depend on another cell's imports

    # JSON-encode every label so quotes or backslashes inside a label cannot
    # break the JSON template shown to the model. ensure_ascii=False keeps
    # non-ASCII labels readable instead of turning them into \uXXXX escapes.
    quoted = [json.dumps(str(f), ensure_ascii=False) for f in account_info_fields]
    field_list = ", ".join(quoted)
    json_fields = ", ".join(f'{q}: ""' for q in quoted)
    return f"""
You are a precise data extraction assistant.
Extract the following account metadata fields from the input text:

{field_list}

Return JSON strictly in this format:

{{
  "account_info": {{
    {json_fields}
  }}
}}

Rules:
- Use the exact field names provided above as JSON keys.
- If a field is missing, set its value to "-".
- Only return JSON. No comments or explanations.
"""


In [None]:
def build_transaction_prompt(transaction_columns):
    """
    Build the per-page transaction extraction prompt.

    Each column header is turned into a JSON key: lower-cased, spaces and
    slashes become underscores, and dots and parentheses are dropped. The
    prompt shows the original headers followed by a one-row JSON template
    that uses those keys.
    """
    # Single-pass character mapping equivalent to the chain of replacements.
    key_table = str.maketrans({" ": "_", "/": "_", ".": None, "(": None, ")": None})
    json_fields = ", ".join(
        f'"{header.lower().translate(key_table)}": ""' for header in transaction_columns
    )

    return f"""
You are a structured data extraction assistant.

You are given one page of a bank statement and must extract transaction data using the following columns:
{transaction_columns}

Return valid JSON strictly in this format:

{{
  "transactions": [
    {{
      {json_fields}
    }}
  ]
}}

Rules:
- Map each extracted value to the appropriate column from the list above.
- Keep the exact text for details and reference numbers.
- If a field is not found, use "-".
- Keep two decimal places for numeric values.
- Do not include extra commentary or text outside of JSON.
"""


In [None]:
import base64
import io
import json
from openai import OpenAI
from PIL import Image
import pdf2image

# ---- CONFIG ----
# Base URL handed to the OpenAI client below.
API_BASE = "http://localhost:8000/v1"
# Model name sent with every request made by call_ocr_llm.
MODEL_NAME = "olmOCR-7B-0225-preview"
# Longest image side (pixels) allowed before image_to_base64 downscales.
MAX_IMG_DIM = 1024

# Module-level synchronous client shared by call_ocr_llm.
# "EMPTY" is presumably a placeholder key for a local server — confirm.
client = OpenAI(api_key="EMPTY", base_url=API_BASE)

# ---- UTILITIES ----
def pdf_to_images(pdf_path, dpi=200):
    """Render the PDF at `pdf_path` with pdf2image and return the result.

    `dpi` is forwarded unchanged to `pdf2image.convert_from_path`.
    """
    rendered_pages = pdf2image.convert_from_path(pdf_path, dpi=dpi)
    return rendered_pages

def image_to_base64(image: Image.Image):
    """Encode a PIL image as a base64 PNG string.

    Images whose longest side exceeds MAX_IMG_DIM are first scaled down
    proportionally (LANCZOS resampling) so that side equals MAX_IMG_DIM.
    """
    width, height = image.size
    longest = max(width, height)
    if longest > MAX_IMG_DIM:
        ratio = MAX_IMG_DIM / longest
        target = (int(width * ratio), int(height * ratio))
        image = image.resize(target, Image.Resampling.LANCZOS)

    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode('utf-8')

def call_ocr_llm(prompt, image_base64):
    """Send one prompt plus one base64 PNG to the model and parse the reply.

    Uses the module-level `client` and `MODEL_NAME`, requests a JSON-object
    response format, and returns the first choice's content decoded with
    `json.loads`.
    """
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
    }
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": [text_part, image_part]}],
        max_tokens=4096,
        temperature=0.1,
        response_format={"type": "json_object"},
    )
    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)

# ---- PROMPTS ----
def schema_prompt():
    """Return the fixed schema-discovery prompt.

    The text asks for two label lists from a statement's first page:
    "account_info_fields" and "transaction_columns".
    """
    schema_instructions = """
You are an intelligent data extraction assistant.
You are given the first page of a bank statement that contains both account information and transaction table headers.

Your goal:
1. Identify and extract all account metadata fields (top section).
2. Identify the column headers used for the transaction table on this page.

Return JSON in this format:
{
  "account_info_fields": ["..."],
  "transaction_columns": ["..."]
}

Rules:
- Extract only visible labels (no values).
- Preserve original capitalization.
- Prefer complete labels (e.g., “Ref No./Cheque No” over “Ref No.”).
- Return valid JSON only.
"""
    return schema_instructions

def build_metadata_prompt(account_info_fields):
    """Build the metadata-extraction prompt for the given field labels.

    The prompt lists every label in double quotes and then shows a JSON
    template with one empty-string entry per label under "account_info".
    """
    # str.format applies the same default formatting as an f-string placeholder.
    field_list = ", ".join(map('"{}"'.format, account_info_fields))
    json_fields = ", ".join(map('"{}": ""'.format, account_info_fields))
    return f"""
You are a precise data extraction assistant.
Extract the following metadata fields from the image:

{field_list}

Return JSON:
{{
  "account_info": {{
    {json_fields}
  }}
}}

Rules:
- Use exact field names.
- If a field is missing, set its value to "-".
- Return only JSON.
"""

def build_transaction_prompt(transaction_columns):
    """Build the per-page transaction prompt from the discovered headers.

    Headers are used verbatim as JSON keys in a one-row template; the header
    collection itself is interpolated into the prompt as-is.
    """
    empty_cells = (f'"{header}": ""' for header in transaction_columns)
    json_fields = ", ".join(empty_cells)
    return f"""
You are a structured data extraction assistant.
Extract all transactions visible in this page using the following headers:
{transaction_columns}

Return JSON:
{{
  "transactions": [
    {{
      {json_fields}
    }}
  ]
}}

Rules:
- Map values based on column headers.
- Use '-' for missing values.
- Keep numeric precision to two decimals.
- Do not include commentary or metadata.
- Return valid JSON only.
"""

# ---- MAIN PIPELINE ----
def process_bank_statement(pdf_path):
    """Run the three-stage extraction over every page of a statement PDF.

    Stage 1 asks the model for the field labels and column headers on page 1,
    stage 2 extracts the account metadata from page 1, and stage 3 extracts
    transactions from every page using the discovered headers.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A dict with keys "account_info", "transactions" (a flat list across
        all pages) and "schema" (the stage-1 reply).

    Raises:
        ValueError: If the PDF yields no page images.
        KeyError: If the stage-1 reply lacks "account_info_fields" or
            "transaction_columns", or the stage-2 reply lacks "account_info".
    """
    pages = pdf_to_images(pdf_path)
    if not pages:
        # Fail with a clear message instead of an IndexError on pages[0].
        raise ValueError(f"No pages could be rendered from {pdf_path!r}")

    # Stage 1 → Schema Discovery (first page)
    first_page_b64 = image_to_base64(pages[0])
    schema = call_ocr_llm(schema_prompt(), first_page_b64)

    # Stage 2 → Metadata Extraction (first page)
    metadata_prompt = build_metadata_prompt(schema["account_info_fields"])
    account_info = call_ocr_llm(metadata_prompt, first_page_b64)["account_info"]

    # Stage 3 → Transaction Extraction (all pages)
    txn_prompt = build_transaction_prompt(schema["transaction_columns"])
    all_txns = []

    for idx, page in enumerate(pages, start=1):
        print(f"Extracting transactions from page {idx}...")
        # Page 1 was already resized and encoded above; reuse that payload.
        image_b64 = first_page_b64 if idx == 1 else image_to_base64(page)
        txns = call_ocr_llm(txn_prompt, image_b64)
        # A page without a transaction table may omit the key or send null;
        # treat both as "no rows" rather than aborting the whole statement.
        all_txns.extend(txns.get("transactions") or [])

    # Combine results
    return {
        "account_info": account_info,
        "transactions": all_txns,
        "schema": schema
    }

# ---- USAGE ----
# Runs the pipeline on sbi.pdf and prints the combined result as indented JSON.
if __name__ == "__main__":
    result = process_bank_statement("sbi.pdf")
    print(json.dumps(result, indent=2))


In [None]:
import base64
import io
import json
import asyncio
from PIL import Image
import pdf2image
from openai import AsyncOpenAI

# ---- CONFIG ----
# Base URL handed to the AsyncOpenAI client below.
API_BASE = "http://localhost:8000/v1"
# Model name sent with every request made by the async call_ocr_llm.
MODEL_NAME = "InternVL3_5"
# Longest image side (pixels) allowed before image_to_base64 downscales.
MAX_IMG_DIM = 1024

# Module-level asynchronous client; its request methods must be awaited.
# NOTE(review): this rebinds the global name `client`, which an earlier cell
# binds to a synchronous OpenAI client — re-run that cell before using its
# synchronous pipeline again.
client = AsyncOpenAI(api_key="EMPTY", base_url=API_BASE)

# ---- UTILITIES ----
def pdf_to_images(pdf_path, dpi=200):
    """Return what pdf2image renders for the PDF at `pdf_path`.

    The `dpi` argument is passed straight through to
    `pdf2image.convert_from_path`.
    """
    page_images = pdf2image.convert_from_path(pdf_path, dpi=dpi)
    return page_images

def image_to_base64(image: Image.Image):
    """Return the image as a base64-encoded PNG string.

    When the longest side is above MAX_IMG_DIM, the image is resized with
    LANCZOS resampling so that side becomes MAX_IMG_DIM, keeping proportions.
    """
    longest_side = max(image.size)
    needs_downscale = longest_side > MAX_IMG_DIM
    if needs_downscale:
        factor = MAX_IMG_DIM / longest_side
        scaled_w = int(image.size[0] * factor)
        scaled_h = int(image.size[1] * factor)
        image = image.resize((scaled_w, scaled_h), Image.Resampling.LANCZOS)

    out = io.BytesIO()
    image.save(out, format='PNG')
    encoded = base64.b64encode(out.getvalue())
    return encoded.decode('utf-8')

# ---- PROMPTS ----
def schema_prompt():
    """Return the constant prompt used for schema discovery on page 1.

    It requests a JSON object holding "account_info_fields" and
    "transaction_columns", labels only.
    """
    discovery_text = """
You are an intelligent data extraction assistant.
You are given the first page of a bank statement that contains both account information and transaction table headers.

Your goal:
1. Identify and extract all account metadata fields (top section).
2. Identify the column headers used for the transaction table on this page.

Return JSON in this format:
{
  "account_info_fields": ["..."],
  "transaction_columns": ["..."]
}

Rules:
- Extract only visible labels (no values).
- Preserve capitalization.
- Prefer complete labels (e.g., “Ref No./Cheque No” over “Ref No.”).
- Return valid JSON only.
"""
    return discovery_text

def build_metadata_prompt(account_info_fields):
    """Build the prompt that asks for values of the given metadata fields.

    The field collection is interpolated as-is, followed by a JSON template
    with one empty-string entry per field under "account_info".
    """
    template_entries = []
    for name in account_info_fields:
        template_entries.append(f'"{name}": ""')
    json_fields = ", ".join(template_entries)
    return f"""
Extract these account metadata fields from the image:
{account_info_fields}

Return JSON:
{{
  "account_info": {{
    {json_fields}
  }}
}}

Rules:
- Use the exact field names provided.
- Use "-" if a value is missing.
- Return valid JSON only.
"""

def build_transaction_prompt(transaction_columns):
    """Build the per-page transaction prompt for the given column headers.

    The headers become the keys of a one-row JSON template, unchanged; the
    header collection itself is interpolated as-is above the template.
    """
    row_template = [f'"{column}": ""' for column in transaction_columns]
    json_fields = ", ".join(row_template)
    return f"""
Extract all transactions visible in this page using these headers:
{transaction_columns}

Return JSON:
{{
  "transactions": [
    {{
      {json_fields}
    }}
  ]
}}

Rules:
- Map values based on column headers.
- Use "-" for missing values.
- Keep numeric precision to two decimals.
- Return valid JSON only.
"""

# ---- ASYNC FUNCTIONS ----
async def call_ocr_llm(prompt, image_base64):
    """Await one chat completion for a prompt plus a base64 PNG.

    Uses the module-level async `client` and `MODEL_NAME`, requests a
    JSON-object response format, and returns the first choice's content
    decoded with `json.loads`.
    """
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
        ],
    }
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[user_message],
        max_tokens=4096,
        temperature=0.1,
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

async def extract_transactions_for_page(page_idx, image, txn_prompt):
    """Extract the transactions of one page, best-effort.

    Args:
        page_idx: Page number used only in progress and warning messages.
        image: PIL image of the page; encoded via image_to_base64.
        txn_prompt: Prompt text sent alongside the page image.

    Returns:
        The list stored under "transactions" in the model reply, or an empty
        list when the call fails, the key is missing, or the value is not a
        list.
    """
    image_b64 = image_to_base64(image)
    print(f"Extracting transactions from page {page_idx}...")
    try:
        data = await call_ocr_llm(txn_prompt, image_b64)
        transactions = data.get("transactions", [])
    except Exception as e:
        # Deliberate best-effort: one failing page must not abort the other
        # pages being gathered concurrently.
        print(f"⚠️ Page {page_idx} extraction failed: {e}")
        return []

    # The caller flattens the per-page results by iterating them, so a null
    # or scalar value here would raise there or inject garbage rows.
    if not isinstance(transactions, list):
        print(f"⚠️ Page {page_idx} returned no transaction list; skipping")
        return []
    return transactions

# ---- MAIN PIPELINE ----
async def process_bank_statement_async(pdf_path):
    """Run the three-stage extraction with all pages' transactions fetched concurrently.

    Stage 1 (schema discovery) and stage 2 (metadata extraction) use page 1
    and are awaited one after the other, since stage 2's prompt is built from
    stage 1's reply. Stage 3 starts one extraction per page and gathers them.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A dict with keys "account_info", "transactions" (flattened in page
        order) and "schema" (the stage-1 reply).

    Raises:
        ValueError: If the PDF yields no page images.
        KeyError: If the stage-1 reply lacks "account_info_fields" or
            "transaction_columns", or the stage-2 reply lacks "account_info".
    """
    # Convert all PDF pages to images
    pages = pdf_to_images(pdf_path)
    if not pages:
        # Fail with a clear message instead of an IndexError on pages[0].
        raise ValueError(f"No pages could be rendered from {pdf_path!r}")

    # Stage 1: Schema discovery (first page only; awaited before stage 2)
    first_page_b64 = image_to_base64(pages[0])
    schema = await call_ocr_llm(schema_prompt(), first_page_b64)

    # Stage 2: Metadata extraction (first page)
    metadata_prompt = build_metadata_prompt(schema["account_info_fields"])
    account_info = (await call_ocr_llm(metadata_prompt, first_page_b64))["account_info"]

    # Stage 3: Parallel transaction extraction
    txn_prompt = build_transaction_prompt(schema["transaction_columns"])

    tasks = [
        extract_transactions_for_page(i + 1, page, txn_prompt)
        for i, page in enumerate(pages)
    ]
    # gather returns results in the order the awaitables were passed, so the
    # flattened list below follows page order.
    results = await asyncio.gather(*tasks)

    all_txns = [txn for page_txns in results for txn in page_txns]

    return {
        "account_info": account_info,
        "transactions": all_txns,
        "schema": schema
    }


In [None]:
import asyncio

# Top-level `await` is only valid where the interpreter supports it (e.g. an
# IPython/Jupyter cell); in a plain script, wrap the call in asyncio.run(...).
result = await process_bank_statement_async("icici.pdf")
print(result)


In [None]:

# ALTERNATIVE: Simplified client without olmocr toolkit (less accurate)
# This approach doesn't use document anchoring but still works for basic OCR

import base64
from openai import OpenAI
from PIL import Image
import pdf2image
import io
import json

def simple_pdf_to_olmocr(pdf_path, page_num=1):
    """
    Render a single PDF page and ask the "dots-ocr" model for its transactions.

    The page is downscaled so its longest side is at most 1024 px, encoded as
    a base64 PNG and sent with a short free-form prompt (no document
    anchoring). Returns the first choice's message content, unparsed.
    Raises ValueError when no image is produced for the page.
    """
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    if not rendered:
        raise ValueError("Could not convert PDF page to image")

    # Cap the longest side at 1024 px, keeping proportions
    page_image = rendered[0]
    width, height = page_image.size
    longest = max(width, height)
    if longest > 1024:
        ratio = 1024 / longest
        page_image = page_image.resize(
            (int(width * ratio), int(height * ratio)),
            Image.Resampling.LANCZOS,
        )

    # PNG bytes -> base64 text
    with io.BytesIO() as png_buffer:
        page_image.save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()
    image_base64 = base64.b64encode(png_bytes).decode('utf-8')

    instructions = """
Extract all transactions from this bankstatement page and return as json. 
transaction description field might be in multiple lines.
Do not fill empty fields and do not include any new field.
"""

    text_part = {"type": "text", "text": instructions}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
    }

    # Request goes to the local OpenAI-compatible endpoint
    llm = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
    completion = llm.chat.completions.create(
        model="dots-ocr",
        messages=[{"role": "user", "content": [text_part, image_part]}],
        max_tokens=4096,
        temperature=0.1,
    )
    return completion.choices[0].message.content

# Usage: run the function on page 1 of icici.pdf and print the returned model reply.
result = simple_pdf_to_olmocr("icici.pdf", 1)
print(result)


In [None]:
! pip install mineru

In [None]:
# NOTE(review): `paerse` looks like a typo (perhaps `parse`) — confirm which
# name mineru.api actually exports before relying on this cell.
from mineru.api import paerse

In [None]:
# Runs the mineru CLI on icici.pdf against the HTTP server at localhost:8000.
# NOTE(review): presumably -o names the output directory and the page flags
# select a page range — confirm flag names and page-index base with `mineru --help`.
!mineru -p icici.pdf -b vlm-http-client -u http://localhost:8000 --start-page 1 --end-page 2 -o output

In [None]:
import base64
import json
import requests

# OCR service endpoint — replace with your actual service URL
OCR_API_URL = "http://localhost:8080/ocr"

# Image file to submit
IMAGE_PATH = "test.jpg"

# Read the raw bytes and turn them into base64 text
with open(IMAGE_PATH, "rb") as img_file:
    image_b64 = base64.b64encode(img_file.read()).decode("utf-8")

# Request body: prompt plus the encoded image
payload = {
    "prompt": "<image>\n<|grounding|>Convert the document to markdown",
    "image_base64": image_b64
}

# POST the JSON-serialised payload (generous timeout for large images)
response = requests.post(
    OCR_API_URL,
    headers={"Content-Type": "application/json"},
    data=json.dumps(payload),
    timeout=300,
)

# Report the failure first; otherwise print the text returned by the service
if response.status_code != 200:
    print("\n❌ Error:", response.status_code, response.text)
else:
    data = response.json()
    print("\n✅ OCR Success!")
    print("Extracted Text:\n")
    print(data.get("text_output", ""))


In [20]:
import base64
import pdf2image
from PIL import Image
import io
import json
import requests

# === CONFIG ===
# Endpoint that send_to_ocr_api posts to.
OCR_API_URL = "http://localhost:8080/ocr"
# PDF processed when this cell runs as the main module.
PDF_PATH = "icici.pdf"
# Page sent to OCR; passed to pdf2image as both first_page and last_page.
PAGE_NUMBER = 1  # 1-based index


def pdf_page_to_base64(pdf_path: str, page_num: int = 1) -> str:
    """
    Render one page of a PDF with pdf2image and return it as a base64 PNG.

    The page is scaled down proportionally when its longest side exceeds
    1024 px, which keeps the request payload small.

    Raises:
        ValueError: If no image is produced for `page_num`.
    """
    rendered = pdf2image.convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    if not rendered:
        raise ValueError(f"Could not read page {page_num} from PDF")

    page_image = rendered[0]
    width, height = page_image.size
    longest = max(width, height)
    if longest > 1024:
        ratio = 1024 / longest
        page_image = page_image.resize(
            (int(width * ratio), int(height * ratio)),
            Image.Resampling.LANCZOS,
        )

    # Serialise to PNG in memory, then base64-encode the bytes
    with io.BytesIO() as png_buffer:
        page_image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")


def send_to_ocr_api(image_b64: str):
    """
    POST a base64-encoded image to OCR_API_URL with the markdown-conversion prompt.

    On HTTP 200 the decoded JSON body is printed (its "text_output" field)
    and returned; on any other status the error is printed and None is
    returned.
    """
    request_body = json.dumps({
        "prompt": "<image>\n<|grounding|>Convert the document to markdown",
        "image_base64": image_b64,
    })
    reply = requests.post(
        OCR_API_URL,
        headers={"Content-Type": "application/json"},
        data=request_body,
        timeout=300,
    )

    # Guard clause: anything but 200 is reported and ends the call
    if reply.status_code != 200:
        print("\n❌ Error:", reply.status_code, reply.text)
        return None

    parsed = reply.json()
    print("\n✅ OCR Success!")
    print("Extracted Text:\n")
    print(parsed.get("text_output", ""))
    return parsed


def process_pdf_with_ocr(pdf_path: str, page_number: int):
    """
    Convert one PDF page to a base64 PNG and submit it to the OCR API.

    Progress is printed before and after each step. Returns whatever
    send_to_ocr_api returns.
    """
    print(f"🔹 Extracting page {page_number} from {pdf_path}...")
    encoded_page = pdf_page_to_base64(pdf_path, page_number)
    print(f"✅ Page {page_number} converted to Base64.")

    print("🔹 Sending to OCR API...")
    return send_to_ocr_api(encoded_page)


# Entry point: OCR the configured page of the configured PDF.
if __name__ == "__main__":
    process_pdf_with_ocr(PDF_PATH, PAGE_NUMBER)


🔹 Extracting page 1 from icici.pdf...
✅ Page 1 converted to Base64.
🔹 Sending to OCR API...

✅ OCR Success!
Extracted Text:

 

<|ref|>text<|/ref|><|det|>[[44, 137, 166, 146]]<|/det|>
MHWI/181D/1-1/WBF-MO3-12 

<|ref|>image<|/ref|><|det|>[[44, 145, 420, 172]]<|/det|>
 

<|ref|>text<|/ref|><|det|>[[44, 196, 273, 209]]<|/det|>
MR.KASULABADA PRASHANTH 

<|ref|>text<|/ref|><|det|>[[44, 211, 477, 255]]<|/det|>
13-1-55/AT/4 PLOT NO 4 AVANTHI NGR THOTA NR
POCHAMMA TEMPLE MOTHI NGR BALANAGAR SANAT NGR
REDDY
TELANGANA - INDIA - 500018 

<|ref|>text<|/ref|><|det|>[[644, 196, 875, 216]]<|/det|>
Your Base Branch: 4TH NERELLA HOUSE,
PANJ AGUTTA, 500034 

<|ref|>text<|/ref|><|det|>[[644, 233, 840, 247]]<|/det|>
Visit www.icicibank.com 

<|ref|>text<|/ref|><|det|>[[650, 250, 847, 264]]<|/det|>
Dial your Bank 33667777 

<|ref|>text<|/ref|><|det|>[[44, 307, 940, 335]]<|/det|>
Did you know? It's mandatory to be KYC compliant as per RBI guidelines. If you have not submitted your KYC documents, please vis

In [None]:
!pip install frontend