# VLM demo (Colab, Qwen2-VL)
- Clone repo `https://github.com/smidolt/x.git`
- Install deps
- Run only the VLM reasoner on `input/google.jpg` (replace the path if needed)
- Try to force valid JSON output with a custom prompt

> Default model: `Qwen/Qwen2-VL-7B-Instruct` (more stable). You can switch to `Qwen/Qwen2-VL-2B-Instruct` if a lighter model is needed.


In [None]:
# Start from /content and delete any previous checkout so the clone below is fresh.
%cd /content
!rm -rf /content/x
!git clone https://github.com/smidolt/x.git /content/x
# Work from the repo root: later cells use the relative path input/google.jpg
# and import from the `src` package.
%cd /content/x

# Install dependencies (base requirements plus the VLM-specific ones)
!pip install -r requirements.txt -r requirements-vlm.txt

# Tesseract is usually preinstalled on Colab; uncomment the next line if it is missing
# !apt-get update && apt-get install -y tesseract-ocr

In [None]:
from pathlib import Path

# Image handed to the VLM reasoner. The path is relative, so it is resolved
# against the current working directory.
img_path = Path('input/google.jpg')  # change if needed
# is_file() rather than exists(): a directory at this path would pass an
# exists() check but is not a usable image.
if not img_path.is_file():
    raise FileNotFoundError(f"No image found at {img_path}, upload or set a valid path.")
print("Using image:", img_path)

In [None]:
import json
from src.vlm.services import run_reasoner

# Hugging Face model id; use 'Qwen/Qwen2-VL-2B-Instruct' for a lighter model
model_name = 'Qwen/Qwen2-VL-7B-Instruct'

# Instruction followed by a skeleton of the expected answer. The skeleton is
# serialised with json.dumps (default ", " / ": " separators, insertion-ordered
# keys), so the example embedded in the prompt is valid JSON by construction.
custom_prompt = "Return ONLY valid JSON with keys: meta, items, notes. Example: " + json.dumps({
    "meta": {
        "seller_name": "...",
        "buyer_name": "...",
        "invoice_number": "...",
        "currency": "...",
        "total_net": 0,
        "total_vat": 0,
        "total_gross": 0,
    },
    "items": [
        {
            "description": "...",
            "quantity": 1,
            "unit_price": 0,
            "amount": 0,
            "currency": "...",
        },
    ],
    "notes": [],
})

# Run the VLM reasoner on the selected image.
# NOTE(review): the decoding params are presumably forwarded to Hugging Face
# `generate()` by run_reasoner; confirm against src/vlm/services.
res = run_reasoner({
    "image_path": str(img_path),
    "params": {
        "model_name": model_name,
        "device": "auto",
        # The example object in the prompt is already on the order of 100 tokens;
        # a 128-token budget is likely to cut a real answer off mid-object and
        # leave invalid JSON, so allow more room.
        "max_new_tokens": 512,
        "temperature": 0,
        "prompt": custom_prompt,
        # Greedy decoding. Sampling with temperature 0 is contradictory (HF
        # sampling requires a strictly positive temperature), and deterministic
        # output is what we want when forcing a fixed JSON shape.
        "do_sample": False,
        # Sampling-only knobs; kept so the params dict exposes the same keys.
        "top_p": 0.9,
        "top_k": 50,
    }
})

# Summarise the run: timing, error (if any) and the start of the raw model output.
print("Elapsed:", res.get("elapsed_seconds"))
print("Error:", res.get("error"))
# `or ""` also covers a raw_response key that is present but None (possible
# when the run reported an error); .get()'s default would not apply in that
# case and slicing None raises TypeError.
print("Raw response (first 1000 chars):\n", (res.get("raw_response") or "")[:1000])

# Persist the full result. ensure_ascii=False writes non-ASCII text verbatim,
# so the file is written explicitly as UTF-8.
Path('output_vlm_single.json').write_text(json.dumps(res, indent=2, ensure_ascii=False), encoding='utf-8')
print("Saved output to output_vlm_single.json")