# VLM pipeline (no OCR) — Colab
- Clone repo
- Install deps
- Run the VLM orchestrator (no OCR/blocks) on `input/google.jpg`, with validation enabled
- Default model: Qwen/Qwen2-VL-2B-Instruct (the lighter option); swap to the 7B model if your GPU allows.


In [None]:
# Environment setup: fetch a fresh copy of the repo and install its dependencies.
# Work from /content so the clone lands at a fixed, known location.
%cd /content
# Remove any previous checkout first so `git clone` does not fail on an existing directory.
!rm -rf /content/x
!git clone https://github.com/smidolt/x.git /content/x
# All later cells use paths relative to the repo root (input/, output_vlm/, src/).
%cd /content/x
!pip install --upgrade pip
# Install both requirement files shipped with the repo (base + VLM).
!pip install -r requirements.txt -r requirements-vlm.txt


In [None]:
# Check that the input image is present before running the pipeline.
# Update the image path below if needed.
from pathlib import Path

img = Path('input/google.jpg')
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would silently skip this check.
if not img.exists():
    raise FileNotFoundError('Upload or point to an image in input/')
print('Using', img)


In [None]:
# Run VLM orchestrator (no OCR/blocks) with validation + artifacts.
#
# PROMPT and INPUT_IMAGE are Python variables; IPython substitutes $PROMPT and
# $INPUT_IMAGE into the `!` command line before handing it to the shell.
#
# INPUT_IMAGE follows the notebook-level `img` variable when it is defined, so
# changing the image path in one place is enough; if `img` has not been
# defined, it falls back to the default sample image.
#
# NOTE(review): 384 max tokens may be too small for the full JSON requested by
# PROMPT on invoices with many line items -- confirm against the raw responses.
PROMPT="Return ONLY valid JSON with keys: meta, items, notes. meta must include seller_name, seller_address, seller_tax_id (DDV if exists), buyer_name, buyer_address, buyer_tax_id (if exists), invoice_number, issue_date, supply_date, currency (ISO), total_net, total_vat, total_gross, vat_exemption_reason (if applicable). items must include description, quantity, unit_price, net_amount, vat_rate, vat_amount, gross_amount, currency. notes as array. No markdown, no prose, just JSON."
INPUT_IMAGE = str(globals().get('img', 'input/google.jpg'))
!python -m src.orchestrator_vlm \
  --input "$INPUT_IMAGE" \
  --output output_vlm \
  --vlm-model-reasoner Qwen/Qwen2-VL-2B-Instruct \
  --vlm-device auto \
  --vlm-max-tokens 384 \
  --vlm-temperature 0.05 \
  --vlm-prompt "$PROMPT" \
  --seller-name "Google LLC" \
  --seller-tax-id "US" \
  --currency-hint USD \
  --amount-tolerance 0.5


In [None]:
# Inspect result (summary, normalized, validation, raw)
import json
from pathlib import Path

SUMMARY_PATH = Path('output_vlm/summary_vlm_orchestrator.json')

# (section heading, key under the document's 'artifacts' mapping, max chars printed)
ARTIFACT_SECTIONS = (
    ('normalized', 'normalized_vlm_json', 1500),
    ('validation', 'validation_report', 1500),
    ('raw', 'raw_vlm_response', 800),
)


def artifact_file(doc, key):
    """Return the artifact path recorded under ``key`` if it is a readable file.

    Args:
        doc: One document entry from the summary JSON; expected to be a dict
            with an optional ``'artifacts'`` mapping of names to path strings.
        key: Artifact name to look up inside ``doc['artifacts']``.

    Returns:
        A ``Path`` to the artifact when the entry is a non-empty string naming
        an existing regular file, otherwise ``None``.
    """
    artifacts = doc.get('artifacts') if isinstance(doc, dict) else None
    location = artifacts.get(key) if isinstance(artifacts, dict) else None
    # A missing/empty entry must not be turned into Path(''): that equals
    # Path('.'), which exists (it is the current directory) and then fails
    # on read_text().
    if not isinstance(location, str) or not location:
        return None
    candidate = Path(location)
    return candidate if candidate.is_file() else None


def show_results(summary_path=SUMMARY_PATH):
    """Print a truncated view of the summary and the first document's artifacts.

    Args:
        summary_path: Location of the orchestrator summary JSON file.

    Only the first document in the summary is inspected. Artifact sections
    whose file is missing are skipped silently.
    """
    if not summary_path.is_file():
        print('No summary file; check run logs.')
        return

    # Read once and reuse the text for both the preview and JSON parsing.
    summary_text = summary_path.read_text()
    print('--- summary ---')
    print(summary_text[:1500])

    docs = json.loads(summary_text)
    if not isinstance(docs, list) or not docs:
        print('Summary contains no documents; check run logs.')
        return

    first_doc = docs[0]
    for heading, key, limit in ARTIFACT_SECTIONS:
        path = artifact_file(first_doc, key)
        if path is not None:
            print(f'\n--- {heading} ---')
            print(path.read_text()[:limit])


show_results()
