In [2]:
import torch
from colpali_engine.models import ColModernVBert, ColModernVBertProcessor
from PIL import Image
from huggingface_hub import hf_hub_download

# Hub checkpoint used for both the processor and the model weights.
model_id = "ModernVBERT/colmodernvbert"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = ColModernVBertProcessor.from_pretrained(model_id)

# NOTE(review): trust_remote_code=True lets the loader run code shipped with
# the checkpoint -- confirm this is still required for this model.
model = ColModernVBert.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
model = model.to(device)

# Switch to evaluation mode. Kept as the cell's last expression so the
# notebook displays the returned module (the architecture printout).
model.eval()

ColModernVBert(
  (model): ModernVBertModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(1024, 768)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-11): 12 x SiglipEncoderLayer(
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (self_attn): SiglipAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activation_fn): PytorchGELUTanh()

In [3]:
from PIL import Image
from huggingface_hub import hf_hub_download

# Your inputs: one text query and two candidate images fetched from the Hub.
query = "ColModernVBERT matches the performance of models nearly 10x larger on visual document benchmarks."
images = [
    Image.open(hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/rococo.jpg", repo_type="space")),
    Image.open(hf_hub_download("ModernVBERT/colmodernvbert", "table.png", repo_type="model"))
]
# Display names for the candidates; must stay in the same order as `images`.
labels = ["Painting Image", "Result Table Image (TARGET)"]

# Prepare inputs and move them to whichever device the model lives on.
text_inputs = processor.process_texts([query]).to(model.device)
image_inputs = processor.process_images(images).to(model.device)

# Inference. Gradients are never used here, so disable autograd to avoid
# building and retaining a computation graph for both forward passes.
with torch.no_grad():
    q_embeddings = model(**text_inputs)
    corpus_embeddings = model(**image_inputs)

# Get the similarity scores. Only one query was passed, so index 0 is taken
# as that query's scores against each image (presumably one score per image,
# in `images` order -- verify against the processor's `score` documentation).
scores = processor.score(q_embeddings, corpus_embeddings)[0]

print(f"Query: {query}\n")
print("Similarities:")
for label, score in zip(labels, scores):
    print(f"  - {label}: {score}")



Query: ColModernVBERT matches the performance of models nearly 10x larger on visual document benchmarks.

Similarities:
  - Painting Image: 7.177845001220703
  - Result Table Image (TARGET): 11.67164421081543
