In [23]:
# --- Installation (Run in a Colab code cell) ---
# Run this cell once, then restart the kernel so the freshly installed packages are used.
%pip install -U transformers datasets pandas scipy sentence-transformers
# Note: ColModernVBERT requires custom code/processor from the colpali library
# (its `vbert` branch). Clone only when the checkout is missing so that
# re-running this cell does not fail on an already existing directory.
!test -d colpali || git clone https://github.com/illuin-tech/colpali.git
!git -C colpali checkout vbert
# Install by path with `%pip` (not `!pip`) so the package lands in the
# environment of the running kernel, without changing the working directory.
%pip install -e ./colpali

Note: you may need to restart the kernel to use updated packages.
Cloning into 'colpali'...
remote: Enumerating objects: 3822, done.[K
remote: Counting objects: 100% (1291/1291), done.[K
remote: Compressing objects: 100% (364/364), done.[K
remote: Total 3822 (delta 1125), reused 927 (delta 927), pack-reused 2531 (from 2)[K
Receiving objects: 100% (3822/3822), 867.78 KiB | 7.42 MiB/s, done.
Resolving deltas: 100% (2422/2422), done.
branch 'vbert' set up to track 'origin/vbert'.
Switched to a new branch 'vbert'
Obtaining file:///Users/sjoerdgunneweg/Documents/MSc_AI/IR2/modernvbert/colpali/colpali/colpali/colpali
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: colpali_engine
  Building editab

In [None]:
%%capture
# Clone only if the checkout is missing, and install by path instead of `%cd`:
# `%cd` changes the kernel's working directory permanently, so every re-run of
# the cell would descend one level deeper (colpali/colpali/...) and clone again.
!test -d colpali || git clone https://github.com/illuin-tech/colpali.git
%pip install -e ./colpali

In [None]:
import torch
import time
import numpy as np
from colpali_engine.models import ColModernVBert, ColModernVBertProcessor
from typing import List
import pandas as pd 

ModuleNotFoundError: No module named 'transformers'

In [None]:
def load_queries_from_csv(file_path: str, query_column: str) -> List[str]:
    """Read a CSV file and return the values of one column as a Python list.

    Args:
        file_path: Path of the CSV file, handed to ``pandas.read_csv``.
        query_column: Name of the column whose values are returned.

    Returns:
        The column's values in file order.
    """
    query_frame = pd.read_csv(file_path)
    query_series = query_frame[query_column]
    return query_series.tolist()

# Load the benchmark queries from the `query` column of the sampled CSV.
queries = load_queries_from_csv(
    file_path='/content/sampled_queries.csv',
    query_column='query',
)

In [None]:
# Device shared by all latency measurements below: every model and every
# processed input batch is moved to it with `.to(device)`. Fixed to the CPU.
device = torch.device("cpu")

def measure_latency_modernvbert(queries: List[str], model_id: str) -> float:
    """Measure the average per-query encoding latency of a ModernVBERT model.

    Loads the processor and model for ``model_id`` onto the module-level
    ``device``, runs one batched warm-up pass over the first (up to) 10
    queries, then encodes every query on its own (one query per forward pass)
    and times that loop. Input preparation via ``processor.process_texts`` is
    part of the timed region.

    Args:
        queries: Query strings to encode. Must not be empty.
        model_id: Model identifier passed to ``from_pretrained``.

    Returns:
        Average wall-clock time per query, in seconds.

    Raises:
        ValueError: If ``queries`` is empty.
    """
    # Fail before the expensive model load; an empty list would otherwise only
    # surface later as a division by zero (or an error inside the processor).
    if len(queries) == 0:
        raise ValueError("queries must contain at least one query to measure latency")

    print(f" Loading {model_id} onto {device}")

    processor = ColModernVBertProcessor.from_pretrained(model_id)
    # Use float32 for CPU compatibility, or torch_dtype=torch.float16 if supported for speed
    model = ColModernVBert.from_pretrained(model_id, dtype=torch.float32, trust_remote_code=True).to(device).eval()

    print("Warm-up Run (10 queries)")
    warmup_queries = queries[:10]

    # The warm-up pass is not timed; it keeps one-off start-up costs out of the measurement.
    with torch.no_grad():
        warmup_inputs = processor.process_texts(warmup_queries).to(device)
        _ = model(**warmup_inputs)

    print(f"--- Timing {len(queries)} Queries on CPU ---")

    # perf_counter() is a monotonic, high-resolution clock; time.time() can
    # jump when the system clock is adjusted and is coarser on some platforms.
    start_time = time.perf_counter()
    with torch.no_grad():
        for query in queries:
            inputs = processor.process_texts([query]).to(device)
            _ = model(**inputs)
    end_time = time.perf_counter()

    total_time = end_time - start_time
    avg_latency = total_time / len(queries)

    print("\n Latency Results")
    print(f"Total time for {len(queries)} queries: {total_time:.3f} seconds")
    print(f"Average Query Encoding Latency: {avg_latency:.3f} seconds/query")
    print("Paper's Reported Latency: 0.032 seconds/query")
    print("----------------------------")

    return avg_latency

NameError: name 'List' is not defined

In [None]:
from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor
def measure_latency_colqwen(queries: List[str], model_id: str) -> float:
    """Measure the average per-query encoding latency of a ColQwen2.5 model.

    Loads the processor and model for ``model_id`` onto the module-level
    ``device``, runs one batched warm-up pass over the first (up to) 10
    queries, then encodes every query on its own (one query per forward pass)
    and times that loop. This is the same protocol as
    ``measure_latency_modernvbert``, so the reported seconds/query are
    comparable; timing a single padded batch of all queries would measure
    batch throughput instead of per-query latency.

    Args:
        queries: Query strings to encode. Must not be empty.
        model_id: Model identifier passed to ``from_pretrained``.

    Returns:
        Average wall-clock time per query, in seconds.

    Raises:
        ValueError: If ``queries`` is empty.
    """
    # Fail before the expensive model load; an empty list would otherwise only
    # surface later as a division by zero (or an error inside the processor).
    if len(queries) == 0:
        raise ValueError("queries must contain at least one query to measure latency")

    print(f" Loading {model_id} onto {device}")

    processor = ColQwen2_5_Processor.from_pretrained(model_id, dtype=torch.float32, trust_remote_code=True)
    # NOTE: the model weights are loaded in bfloat16 here, whereas the
    # ModernVBERT measurement loads float32 — keep that in mind when comparing.
    model = ColQwen2_5.from_pretrained(model_id, dtype=torch.bfloat16).to(device).eval()

    print("Warm-up Run (10 queries)")
    warmup_queries = queries[:10]

    # The warm-up pass is not timed; it keeps one-off start-up costs out of the measurement.
    with torch.no_grad():
        warmup_inputs = processor.process_texts(warmup_queries).to(device)
        _ = model(**warmup_inputs)

    print(f"--- Timing {len(queries)} Queries on CPU ---")

    # perf_counter() is a monotonic, high-resolution clock; time.time() can
    # jump when the system clock is adjusted and is coarser on some platforms.
    start_time = time.perf_counter()
    with torch.no_grad():
        for query in queries:
            inputs = processor.process_texts([query]).to(device)
            _ = model(**inputs)
    end_time = time.perf_counter()

    total_time = end_time - start_time
    avg_latency = total_time / len(queries)

    print("\n Latency Results")
    print(f"Total time for {len(queries)} queries: {total_time:.3f} seconds")
    print(f"Average Query Encoding Latency: {avg_latency:.3f} seconds/query")
    print("Paper's Reported Latency: 0.032 seconds/query")
    print("----------------------------")

    return avg_latency

  

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoProcessor
from PIL import Image

from colpali_engine.models import ColPali, ColPaliProcessor

def measure_latency_colpali(queries: List[str], model_id: str) -> float:
    """Measure the average per-query encoding latency of a ColPali model.

    Loads the processor and model for ``model_id`` onto the module-level
    ``device``, runs one batched warm-up pass over the first (up to) 10
    queries, then encodes every query on its own (one query per forward pass)
    and times that loop. This is the same protocol as
    ``measure_latency_modernvbert``, so the reported seconds/query are
    comparable; timing a single padded batch of all queries would measure
    batch throughput instead of per-query latency.

    Args:
        queries: Query strings to encode. Must not be empty.
        model_id: Model identifier passed to ``from_pretrained``.

    Returns:
        Average wall-clock time per query, in seconds.

    Raises:
        ValueError: If ``queries`` is empty.
    """
    # Fail before the expensive model load; an empty list would otherwise only
    # surface later as a division by zero (or an error inside the processor).
    if len(queries) == 0:
        raise ValueError("queries must contain at least one query to measure latency")

    print(f" Loading {model_id} onto {device}")

    # Use the ColPali-specific processor: `process_queries` is called on it
    # below, so the processor class must be the colpali_engine one.
    processor = ColPaliProcessor.from_pretrained(model_id)
    # NOTE: the model weights are loaded in bfloat16 here, whereas the
    # ModernVBERT measurement loads float32 — keep that in mind when comparing.
    # Modules are moved with `.to(device)`; there is no `.to_device` method.
    model = ColPali.from_pretrained(model_id, dtype=torch.bfloat16).to(device).eval()

    print("Warm-up Run (10 queries)")
    warmup_queries = queries[:10]

    warmup_batch_queries = processor.process_queries(warmup_queries).to(device)

    # The warm-up pass is not timed; it keeps one-off start-up costs out of the measurement.
    with torch.no_grad():
        _ = model(**warmup_batch_queries)

    print(f"--- Timing {len(queries)} Queries on CPU ---")

    # perf_counter() is a monotonic, high-resolution clock; time.time() can
    # jump when the system clock is adjusted and is coarser on some platforms.
    start_time = time.perf_counter()
    with torch.no_grad():
        for query in queries:
            batch_queries = processor.process_queries([query]).to(device)
            _ = model(**batch_queries)
    end_time = time.perf_counter()

    total_time = end_time - start_time
    avg_latency = total_time / len(queries)

    print("\n Latency Results")
    print(f"Total time for {len(queries)} queries: {total_time:.3f} seconds")
    print(f"Average Query Encoding Latency: {avg_latency:.3f} seconds/query")
    print("Paper's Reported Latency: 0.032 seconds/query")
    print("----------------------------")

    return avg_latency

In [None]:
# Model identifiers of the models to benchmark.
colmodernvbert_id = "ModernVBERT/colmodernvbert"  # TODO: use flash attention?
bi_modernvbert_id = "ModernVBERT/bimodernvbert"
colqwen2_5_id = "vidore/colqwen2-5-v0.2"  # TODO: maybe set to bfloat16
colpali_id = "vidore/colpali-v1.1"


# --- Run Latency Measurement ---
# Each entry is (report label, measurement function, model id); the runs are
# executed in list order and each average is printed right after its run.
benchmark_runs = [
    ("ColModernVBERT Average Latency:", measure_latency_modernvbert, colmodernvbert_id),
    ("Bi-ModernVBERT Average Latency:", measure_latency_modernvbert, bi_modernvbert_id),
    ("ColQwen2-5 Average Latency:", measure_latency_colqwen, colqwen2_5_id),
    ("ColPali Average Latency:", measure_latency_colpali, colpali_id),
]
for report_label, measure_fn, benchmark_model_id in benchmark_runs:
    avg_latency = measure_fn(queries, benchmark_model_id)
    print(report_label, avg_latency)