# Module 3: Evaluating Search Pipelines

## Setup

Install required dependencies.

In [1]:
!pip install -q fastembed qdrant-client ranx


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Ground Truth (Qrels)

In [2]:
# Ground truth (qrels): query text -> {document path: relevance grade}.
# Queries are written against our 4 sample images.
# Grades: 3 = highly relevant, 2 = somewhat relevant.
qrels_dict = {
    "company quarterly financial results and revenue": {"images/financial-report.png": 3},
    "historic ship disaster at sea": {"images/titanic-newspaper.jpg": 3},
    "space exploration and astronauts": {"images/men-walk-on-moon-newspaper.jpg": 3},
    "physics theory and scientist": {"images/einstein-newspaper.jpg": 3},
    # The only query with more than one relevant document.
    "news headline from early 1900s": {
        "images/titanic-newspaper.jpg": 3,
        "images/einstein-newspaper.jpg": 2,
    },
    "business earnings report": {"images/financial-report.png": 3},
    "NASA moon landing mission": {"images/men-walk-on-moon-newspaper.jpg": 3},
    "ocean liner sinking": {"images/titanic-newspaper.jpg": 3},
}

## Collection Setup

In [3]:
from qdrant_client import QdrantClient, models
from qdrant_client.models import (
    VectorParams, Distance, MultiVectorConfig, MultiVectorComparator,
    ScalarQuantization, ScalarQuantizationConfig, ScalarType,
)

COLLECTION_NAME = "eval-multi-vector"

client = QdrantClient(url="http://localhost:6333")

# Drop any previous copy of the collection so the notebook can be rerun cleanly
client.delete_collection(COLLECTION_NAME, timeout=60)


def _multivector_params(**extra) -> VectorParams:
    """Build the 128-dim, dot-product, MaxSim multi-vector config shared by three named vectors.

    Keyword arguments are forwarded to VectorParams unchanged (used for quantization).
    """
    return VectorParams(
        size=128,
        distance=Distance.DOT,
        multivector_config=MultiVectorConfig(
            comparator=MultiVectorComparator.MAX_SIM
        ),
        hnsw_config=models.HnswConfigDiff(m=0),  # Disable HNSW for multi-vector
        **extra,
    )


int8_quantization = ScalarQuantization(
    scalar=ScalarQuantizationConfig(
        type=ScalarType.INT8,
        quantile=0.99,
        always_ram=True,
    )
)

vectors_config = {
    # Full ColModernVBERT multi-vector (no quantization)
    "colmodernvbert": _multivector_params(),
    # ColModernVBERT with scalar quantization enabled
    "colmodernvbert_sq": _multivector_params(quantization_config=int8_quantization),
    # MUVERA single-vector approximation for fast HNSW search
    "muvera": VectorParams(
        size=40960,  # muvera.embedding_size from k_sim=6, dim_proj=32, r_reps=20
        distance=Distance.COSINE,
    ),
    # Hierarchical pooled multi-vector (k=32 clusters)
    "hierarchical": _multivector_params(),
}

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=vectors_config,
)

print(f"Created collection '{COLLECTION_NAME}' with 4 named vector configurations")

Created collection 'eval-multi-vector' with 4 named vector configurations


## Load Models and Define Helper Functions

In [4]:
from fastembed import LateInteractionMultimodalEmbedding
from fastembed.postprocess import Muvera
from scipy.cluster.vq import kmeans2
import numpy as np

# MUVERA hyper-parameters. They determine the length of the single-vector encoding,
# which has to equal the size the "muvera" named vector was created with.
MUVERA_K_SIM = 6
MUVERA_DIM_PROJ = 32
MUVERA_R_REPS = 20

# Load the embedding model
print("Loading ColModernVBERT model...")
model = LateInteractionMultimodalEmbedding(
    model_name="Qdrant/colmodernvbert"
)

# Initialize MUVERA with same configuration as the collection
print("Initializing MUVERA...")
muvera = Muvera.from_multivector_model(
    model=model,
    k_sim=MUVERA_K_SIM,
    dim_proj=MUVERA_DIM_PROJ,
    r_reps=MUVERA_R_REPS,
)

print(f"MUVERA embedding size: {muvera.embedding_size}")

# Fail fast here instead of at upload time if the collection was created with a
# different size for the "muvera" vector than these parameters produce.
muvera_collection_size = (
    client.get_collection(COLLECTION_NAME).config.params.vectors["muvera"].size
)
assert muvera.embedding_size == muvera_collection_size, (
    f"MUVERA produces {muvera.embedding_size}-dim vectors but the collection "
    f"expects {muvera_collection_size}"
)

Loading ColModernVBERT model...


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Initializing MUVERA...
MUVERA embedding size: 40960


Define a helper to pool multi-vector embeddings into k centroids using k-means clustering.

In [5]:
def hierarchical_pool(embeddings: np.ndarray, k: int = 32) -> np.ndarray:
    """Pool multi-vector to k centroids using k-means clustering."""
    if len(embeddings) <= k:
        return embeddings  # No pooling needed
    centroids, labels = kmeans2(embeddings.astype(np.float64), k, minit="++")
    # Return mean of embeddings in each cluster
    pooled = np.array([
        embeddings[labels == i].mean(axis=0)
        for i in range(k)
        if (labels == i).any()
    ])
    return pooled.astype(np.float32)

Define the ingestion pipeline that generates all four vector representations (full multi-vector, scalar-quantized, MUVERA, and hierarchical-pooled) and uploads them as a single point.

In [6]:
def embed_and_upload_document(doc_path: str, doc_id: int) -> None:
    """Embed one image and store all four vector representations in a single point.

    Args:
        doc_path: Path of the image to embed; also stored as the ``filename`` payload.
        doc_id: Point id to upsert under.
    """
    # One path goes in, so the first (and only) embedding is the document's multi-vector
    image_embeddings = list(model.embed_image([doc_path]))
    multivec = np.array(image_embeddings[0])

    print(f"  Document {doc_id}: {doc_path}")
    print(f"    Full multi-vector shape: {multivec.shape}")

    # Single-vector MUVERA encoding of the same document
    fde = muvera.process_document(multivec)
    print(f"    MUVERA vector shape: {fde.shape}")

    # Pooled multi-vector (k=32)
    pooled = hierarchical_pool(multivec, k=32)
    print(f"    Hierarchical pooled shape: {pooled.shape}")

    named_vectors = {
        "colmodernvbert": multivec.tolist(),
        # Same data as above; quantization comes from the named vector's config
        "colmodernvbert_sq": multivec.tolist(),
        "muvera": fde.tolist(),
        "hierarchical": pooled.tolist(),
    }
    point = models.PointStruct(
        id=doc_id,
        payload={"filename": doc_path},
        vector=named_vectors,
    )
    client.upsert(collection_name=COLLECTION_NAME, points=[point])

## Ingest Sample Documents

We'll use the 4 sample images and generate all 4 vector representations for each.

In [7]:
# Document paths - our sample dataset
DOC_PATHS = [
    "images/financial-report.png",
    "images/titanic-newspaper.jpg",
    "images/men-walk-on-moon-newspaper.jpg",
    "images/einstein-newspaper.jpg",
]

# Upload all documents with all 4 vector representations.
# The position in DOC_PATHS doubles as the point id.
print("Embedding and uploading documents...\n")
for doc_id, doc_path in enumerate(DOC_PATHS):
    embed_and_upload_document(doc_path, doc_id)
    print()

# Verify ingestion with an exact count: the points_count reported by
# get_collection() may be approximate, so it is not suitable for an assertion.
points_in_collection = client.count(collection_name=COLLECTION_NAME, exact=True).count
print(f"\nCollection '{COLLECTION_NAME}' now has {points_in_collection} points")
assert points_in_collection == len(DOC_PATHS), (
    f"expected {len(DOC_PATHS)} points, found {points_in_collection}"
)

Embedding and uploading documents...

  Document 0: images/financial-report.png
    Full multi-vector shape: (884, 128)
    MUVERA vector shape: (40960,)
    Hierarchical pooled shape: (32, 128)

  Document 1: images/titanic-newspaper.jpg
    Full multi-vector shape: (1149, 128)
    MUVERA vector shape: (40960,)
    Hierarchical pooled shape: (32, 128)

  Document 2: images/men-walk-on-moon-newspaper.jpg
    Full multi-vector shape: (1149, 128)
    MUVERA vector shape: (40960,)
    Hierarchical pooled shape: (32, 128)

  Document 3: images/einstein-newspaper.jpg
    Full multi-vector shape: (1149, 128)
    MUVERA vector shape: (40960,)
    Hierarchical pooled shape: (32, 128)


Collection 'eval-multi-vector' now has 4 points


## Pipeline Configurations

In [8]:
def _single_stage(using: str) -> dict:
    """Config for a search that ranks directly on one named vector (no prefetch)."""
    return {"using": using, "prefetch_using": None}


def _muvera_two_stage(using: str) -> dict:
    """Config for a MUVERA prefetch of 50 candidates reranked on ``using``."""
    return {"using": using, "prefetch_using": "muvera", "prefetch_limit": 50}


PIPELINES = {
    # Baseline: full quality, no optimization
    "baseline": _single_stage("colmodernvbert"),
    # Scalar quantized: reduced memory, minimal quality loss
    "scalar_quantized": _single_stage("colmodernvbert_sq"),
    # Hierarchical pooling: fewer vectors per document
    "hierarchical": _single_stage("hierarchical"),
    # Two-stage: fast MUVERA prefetch + full quality rerank
    "muvera_rerank": _muvera_two_stage("colmodernvbert"),
    # Two-stage with quantized rerank
    "muvera_quantized": _muvera_two_stage("colmodernvbert_sq"),
    # Maximum compression: MUVERA prefetch + pooled rerank
    "muvera_hierarchical": _muvera_two_stage("hierarchical"),
}

print(f"Defined {len(PIPELINES)} pipeline configurations")

Defined 6 pipeline configurations


## Search Function

In [9]:
def search_pipeline(
    query_embedding: np.ndarray,
    using: str,
    prefetch_using: str | None = None,
    prefetch_limit: int = 50,
    limit: int = 10,
) -> list[tuple[str, float]]:
    """
    Execute a search pipeline with optional prefetch stage.

    Args:
        query_embedding: The query's multi-vector embedding
        using: Named vector for final ranking
        prefetch_using: Named vector for prefetch (None = single-stage)
        prefetch_limit: How many candidates to retrieve in prefetch
        limit: Final number of results

    Returns:
        List of (filename, score) tuples
    """
    if prefetch_using is None:
        # Single-stage search
        response = client.query_points(
            collection_name=COLLECTION_NAME,
            query=query_embedding.tolist(),
            using=using,
            limit=limit,
        )
    else:
        # Two-stage search: prefetch with one vector, rerank with another
        # For MUVERA prefetch, we need the MUVERA query embedding
        if prefetch_using == "muvera":
            prefetch_query = muvera.process_query(query_embedding).tolist()
        else:
            prefetch_query = query_embedding.tolist()

        response = client.query_points(
            collection_name=COLLECTION_NAME,
            prefetch=[
                models.Prefetch(
                    query=prefetch_query,
                    using=prefetch_using,
                    limit=prefetch_limit,
                )
            ],
            query=query_embedding.tolist(),
            using=using,
            limit=limit,
        )

    return [
        (point.payload["filename"], point.score)
        for point in response.points
    ]

## Embed Queries

Generate embeddings for all evaluation queries.

In [10]:
# Get all unique queries from qrels
QUERIES = list(qrels_dict.keys())

print(f"Embedding {len(QUERIES)} queries...")

# Longest query prefix shown in the progress output
PREVIEW_WIDTH = 40

# Generate embeddings for all queries, keyed by the query text
query_embeddings = {}
for query in QUERIES:
    embedding = np.array(list(model.embed_text([query]))[0])
    query_embeddings[query] = embedding
    # Only add an ellipsis when the query text was actually cut off
    preview = query if len(query) <= PREVIEW_WIDTH else f"{query[:PREVIEW_WIDTH]}..."
    print(f"  '{preview}' -> shape {embedding.shape}")

print(f"\nEmbedded {len(query_embeddings)} queries")

Embedding 8 queries...
  'company quarterly financial results and ...' -> shape (18, 128)
  'historic ship disaster at sea...' -> shape (17, 128)
  'space exploration and astronauts...' -> shape (17, 128)
  'physics theory and scientist...' -> shape (17, 128)
  'news headline from early 1900s...' -> shape (18, 128)
  'business earnings report...' -> shape (15, 128)
  'NASA moon landing mission...' -> shape (17, 128)
  'ocean liner sinking...' -> shape (16, 128)

Embedded 8 queries


## Test Search Pipelines

Let's test a single query with each pipeline to verify everything works.

In [11]:
test_query = "company quarterly financial results and revenue"
test_embedding = query_embeddings[test_query]

# Take the expected answer from the ground truth so it stays correct when
# test_query is changed.
expected = ", ".join(
    f"{filename} (relevance={grade})"
    for filename, grade in qrels_dict[test_query].items()
)

print(f"Query: '{test_query}'\n")
print(f"Expected: {expected}\n")
print("-" * 60)

# Ask for as many results as there are documents so every pipeline shows its full ranking
for pipeline_name, config in PIPELINES.items():
    results = search_pipeline(test_embedding, **config, limit=len(DOC_PATHS))
    print(f"\n{pipeline_name}:")
    for filename, score in results:
        print(f"  {score:.4f} | {filename}")

Query: 'company quarterly financial results and revenue'

Expected: images/financial-report.png (relevance=3)

------------------------------------------------------------

baseline:
  10.3651 | images/financial-report.png
  6.4249 | images/einstein-newspaper.jpg
  6.3433 | images/titanic-newspaper.jpg
  6.2661 | images/men-walk-on-moon-newspaper.jpg

scalar_quantized:
  10.3651 | images/financial-report.png
  6.4249 | images/einstein-newspaper.jpg
  6.3433 | images/titanic-newspaper.jpg
  6.2661 | images/men-walk-on-moon-newspaper.jpg

hierarchical:
  7.2940 | images/financial-report.png
  3.6461 | images/men-walk-on-moon-newspaper.jpg
  2.6607 | images/einstein-newspaper.jpg
  1.9426 | images/titanic-newspaper.jpg

muvera_rerank:
  10.3651 | images/financial-report.png
  6.4249 | images/einstein-newspaper.jpg
  6.3433 | images/titanic-newspaper.jpg
  6.2661 | images/men-walk-on-moon-newspaper.jpg

muvera_quantized:
  10.3651 | images/financial-report.png
  6.4249 | images/einstein-ne

## Evaluation

In [12]:
from ranx import Qrels, Run, compare
import time

# Wrap the ground truth in a ranx Qrels object
qrels = Qrels(qrels_dict)

# One ranx Run and one mean latency (in ms) per pipeline
runs = []
latency_results = {}

for pipeline_name, config in PIPELINES.items():
    print(f"Evaluating {pipeline_name}...")
    scores_by_query = {}
    elapsed_ms = []

    for query_text, query_embedding in query_embeddings.items():
        # Time the whole search_pipeline call
        started = time.perf_counter()
        hits = search_pipeline(query_embedding, **config, limit=10)
        finished = time.perf_counter()
        elapsed_ms.append((finished - started) * 1000)

        # ranx expects {doc_id: score} per query; hits are (filename, score) pairs
        scores_by_query[query_text] = dict(hits)

    runs.append(Run(scores_by_query, name=pipeline_name))
    latency_results[pipeline_name] = np.mean(elapsed_ms)
    print(f"  Avg latency: {latency_results[pipeline_name]:.2f} ms")

print("\nAll pipelines evaluated!")

Evaluating baseline...
  Avg latency: 12.37 ms
Evaluating scalar_quantized...
  Avg latency: 11.12 ms
Evaluating hierarchical...
  Avg latency: 10.58 ms
Evaluating muvera_rerank...
  Avg latency: 78.48 ms
Evaluating muvera_quantized...
  Avg latency: 83.82 ms
Evaluating muvera_hierarchical...
  Avg latency: 77.97 ms

All pipelines evaluated!


## Compare All Pipelines

ranx provides a convenient `compare` function that shows metrics side-by-side with statistical significance indicators.

In [13]:
# Score every run against the ground truth, side by side
comparison_kwargs = {
    "qrels": qrels,
    "runs": runs,
    "metrics": ["ndcg@10", "recall@10", "mrr"],
    "max_p": 0.05,  # Statistical significance threshold
}
report = compare(**comparison_kwargs)
print(report)

#    Model                  NDCG@10    Recall@10    MRR
---  -------------------  ---------  -----------  -----
a    baseline                 0.894            1  0.854
b    scalar_quantized         0.894            1  0.854
c    hierarchical             0.956            1  0.938
d    muvera_rerank            0.894            1  0.854
e    muvera_quantized         0.894            1  0.854
f    muvera_hierarchical      0.956            1  0.938


## Latency Results

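Let's also look at the latency measurements for each pipeline. Each timing covers the whole `search_pipeline` call — for the MUVERA pipelines that includes encoding the query with MUVERA on the client — and is averaged over only 8 queries against a 4-document collection, so treat these numbers as a smoke test rather than a benchmark.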

In [14]:
print("Average Query Latency (ms)")
print("-" * 40)
# Fastest pipeline first
for pipeline_name in sorted(latency_results, key=latency_results.get):
    print(f"{pipeline_name:25} {latency_results[pipeline_name]:8.2f} ms")

Average Query Latency (ms)
----------------------------------------
hierarchical                 10.58 ms
scalar_quantized             11.12 ms
baseline                     12.37 ms
muvera_hierarchical          77.97 ms
muvera_rerank                78.48 ms
muvera_quantized             83.82 ms
