In [1]:
# Device diagnostics (CPU/CUDA/MPS)
import torch

def available_devices():
    devices = []
    if torch.cuda.is_available():
        for idx in range(torch.cuda.device_count()):
            name = torch.cuda.get_device_name(idx)
            cap = torch.cuda.get_device_capability(idx)
            devices.append({"type": "cuda", "index": idx, "name": name, "capability": cap})
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        devices.append({"type": "mps", "index": 0, "name": "Apple MPS"})
    devices.append({"type": "cpu", "index": 0, "name": "CPU"})
    return devices

for d in available_devices():
    print(d)

x = torch.tensor([1.0])
print("Default tensor device:", x.device)
device = 'cuda' if torch.cuda.is_available() else ('mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() else 'cpu')
print("Chosen device:", device)


{'type': 'cuda', 'index': 0, 'name': 'NVIDIA A100-SXM4-40GB', 'capability': (8, 0)}
{'type': 'cpu', 'index': 0, 'name': 'CPU'}
Default tensor device: cpu
Chosen device: cuda


In [2]:
# Quick smoke test
import torch
device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() else 'cpu'))
a = torch.randn(1024, 1024, device=device)
b = torch.randn(1024, 1024, device=device)
with torch.inference_mode():
    c = a @ b
print("Matmul done on", device, "shape", c.shape)


Matmul done on cuda shape torch.Size([1024, 1024])


In [3]:
# Clone repo if missing and install deps (quiet).
import subprocess, time, os
from pathlib import Path

def run(cmd):
    t0 = time.time()
    print('>>>', ' '.join(cmd))
    proc = subprocess.run(cmd, capture_output=True, text=True)
    dt = time.time() - t0
    print(f'<<< done {dt:.1f}s rc={proc.returncode}')
    if proc.stdout:
        print('stdout:', proc.stdout[:400])
    if proc.stderr:
        print('stderr:', proc.stderr[:400])

repo_root = Path('/content/OCR')
print('Working dir', os.getcwd())
if not (repo_root / 'vlm_lab' / 'run_poc.py').exists():
    run(['git', 'clone', 'https://github.com/smidolt/x.git', str(repo_root)])
else:
    print('Repo already present at', repo_root)

run(['pip', 'install', '-q', '-r', str(repo_root / 'requirements.txt')])
run(['pip', 'install', '-q', '-r', str(repo_root / 'vlm_lab' / 'requirements.txt')])


Working dir /content
Repo already present at /content/OCR
>>> pip install -q -r /content/OCR/requirements.txt
<<< done 2.7s rc=0
>>> pip install -q -r /content/OCR/vlm_lab/requirements.txt
<<< done 2.7s rc=0


In [4]:
# Ensure documents exist (dummy if empty)
from pathlib import Path
from PIL import Image, ImageDraw
repo_root = Path('/content/OCR')
input_dir = repo_root / 'data' / 'input'
input_dir.mkdir(parents=True, exist_ok=True)
files = list(input_dir.glob('*'))
if not files:
    img = Image.new('RGB', (800, 1100), 'white')
    draw = ImageDraw.Draw(img)
    draw.text((50, 50), 'DUMMY INVOICE\nSeller: Example Corp\nTotal: 123.45', fill='black')
    img_path = input_dir / 'dummy_invoice.png'
    img.save(img_path)
    print('Created dummy invoice at', img_path)
else:
    print('Found documents:', [p.name for p in files])


Found documents: ['dummy_invoice.png']


In [5]:
# Run VLM PoC
import subprocess
from pathlib import Path
import torch

repo_root = Path('/content/OCR')
repo = repo_root / 'vlm_lab'
python_bin = repo / '.venv/bin/python'
python_exec = python_bin if python_bin.exists() else 'python'
models_file = repo / 'models.yaml'
documents = repo_root / 'data' / 'input'
output_dir = repo / 'results'
device = 'cuda' if torch.cuda.is_available() else 'auto'

cmd = [
    str(python_exec), str(repo / 'run_poc.py'),
    '--documents', str(documents),
    '--models-file', str(models_file),
    '--output-dir', str(output_dir),
    '--max-pages', '1',
    '--device', device,
]

print('Running:', ' '.join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True)
print('STDOUT:\n', result.stdout)
print('STDERR:\n', result.stderr)


Running: python /content/OCR/vlm_lab/run_poc.py --documents /content/OCR/data/input --models-file /content/OCR/vlm_lab/models.yaml --output-dir /content/OCR/vlm_lab/results --max-pages 1 --device cuda
STDOUT:
 === Model phi3-vision (microsoft/Phi-3-vision-128k-instruct) on device cuda ===
[ERROR] Failed to load phi3-vision: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
=== Model llava-mistral (llava-hf/llava-v1.6-mistral-7b-hf) on device cuda ===
[ERROR] Failed to load llava-mistral: Unrecognized configuration class <class 'transformers.models.llava_next.configuration_llava_next.LlavaNextConfig'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of ApertusConfig, ArceeConfig, AriaTextConfig, BambaConfig, BartConfig, BertConfig, Be

In [6]:
# Bundle latest VLM results for download
import shutil, subprocess
from pathlib import Path
repo_root = Path('/content/OCR')
results_root = repo_root / 'vlm_lab' / 'results'

def latest_run(root: Path):
    if not root.exists():
        return None
    runs = sorted([p for p in root.iterdir() if p.is_dir()], reverse=True)
    return runs[0] if runs else None

latest = latest_run(results_root)
print('Listing /content/OCR:')
subprocess.run(['ls', '-l', str(repo_root)])
if latest:
    print('Latest VLM run:', latest.name)
    subprocess.run(['ls', '-l', str(latest)])
    zip_path = repo_root / f'vlm_results_{latest.name}.zip'
    shutil.make_archive(zip_path.with_suffix(''), 'zip', latest)
    print('Archive ready at', zip_path)
    try:
        from google.colab import files
        files.download(str(zip_path))
    except Exception as e:
        print('Download manually:', zip_path, 'error:', e)
else:
    print('No VLM results found in', results_root)


Listing /content/OCR:
Latest VLM run: 20251126-130340
Archive ready at /content/OCR/vlm_results_20251126-130340.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>