# VLM Demo (Google Colab)

This notebook clones the repository, installs its dependencies, and runs the VLM orchestrator (preprocess → OCR → VLM blocks → VLM reasoner) on a sample image.

**Repo:** https://github.com/smidolt/x.git


In [None]:
# Clone repo
!rm -rf /content/x && git clone https://github.com/smidolt/x.git /content/x
%cd /content/x

# Install deps (CPU-friendly, adjust if GPU available)
!pip install -r requirements.txt -r requirements-vlm.txt

# Optional: ensure tesseract is available (Colab usually has it)
!apt-get update && apt-get install -y tesseract-ocr

In [None]:
# Example input: the repository's bundled sample image (or upload your own).
import os
from pathlib import Path

# Path is relative to the current working directory.
sample = Path('input', 'google.jpg')

# Report whether the sample is available; nothing is loaded or modified here.
if sample.exists():
    print("Using sample:", sample)
else:
    print("Sample not found, please upload an image to /content/x/input/")

In [None]:
# Run VLM orchestrator (preprocess -> OCR -> VLM blocks -> VLM reasoner)
# WARNING: VLM model is heavy; ensure runtime has GPU and enough RAM.
import shlex
import subprocess, sys
from pathlib import Path

input_image = Path("input/google.jpg")

# Fail fast with a clear message instead of launching the heavy pipeline
# on an image that is not there.
if not input_image.exists():
    raise FileNotFoundError(
        f"Input image not found: {input_image}. Upload it or adjust the path before running."
    )

# The orchestrator is run as a module with the same interpreter as this kernel.
cmd = [
    sys.executable, "-m", "src.orchestrator_vlm",
    "--input", str(input_image),
    "--output", "output_vlm",
    "--vlm-model-reasoner", "Qwen/Qwen2-VL-2B-Instruct",
    "--vlm-backend-blocks", "heuristic",
    "--vlm-device", "auto",
    "--vlm-max-tokens", "128",
    "--vlm-temperature", "0.1",
]

# shlex.join quotes arguments so the echoed command can be copy-pasted into a shell.
print("Running:", shlex.join(cmd))
# check=True raises CalledProcessError if the orchestrator exits with a non-zero status.
subprocess.run(cmd, check=True)


In [None]:
# Inspect results: preview the summary file written under output_vlm/.
import json
from pathlib import Path

summary = Path('output_vlm') / 'summary_vlm_orchestrator.json'

if not summary.exists():
    print("No summary found; check if the orchestrator ran successfully.")
else:
    # Show only the first 1000 characters to keep the cell output short.
    print(summary.read_text()[:1000])
else:
    print("No summary found; check if the orchestrator ran successfully.")