# Module 3: Vector Quantization Techniques

This notebook demonstrates how to use scalar and binary quantization with multi-vector collections in Qdrant.

Install the Qdrant client and fastembed libraries.

In [1]:
!pip install -q fastembed qdrant-client


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Load the ColPali vision-language model for generating multi-vector embeddings from document images.

In [2]:
from fastembed import LateInteractionMultimodalEmbedding

# ColPali is a late-interaction model: each input is embedded as a set of
# vectors (a multi-vector embedding) instead of one pooled vector.
model = LateInteractionMultimodalEmbedding(model_name="Qdrant/colpali-v1.3-fp16")

Embed sample document images. Each image produces 1030 vectors of 128 dimensions — 1024 image patches plus 6 instruction tokens.

In [3]:
# Sample document images from the course materials, one path per document
image_paths = [
    "images/financial-report.png",
    "images/titanic-newspaper.jpg",
    "images/men-walk-on-moon-newspaper.jpg",
    "images/einstein-newspaper.jpg",
]

# Payload metadata for each document, listed in the same order as image_paths
documents = [
    {"title": "Financial Report", "type": "report", "topic": "finance"},
    {"title": "Titanic Sinking", "type": "newspaper", "topic": "history"},
    {"title": "Moon Landing", "type": "newspaper", "topic": "space"},
    {"title": "Einstein Theory", "type": "newspaper", "topic": "science"},
]

# The ingestion cells attach documents[i] to the i-th embedding, so the two
# lists must line up one-to-one. Fail here, before the expensive embedding
# step, instead of mislabeling points or hitting an IndexError during upsert.
assert len(image_paths) == len(documents), (
    f"{len(image_paths)} image paths but {len(documents)} metadata entries"
)

# Generate embeddings for all images (embed_image yields lazily; materialize it)
print("Generating embeddings for document images...")
image_embeddings = list(model.embed_image(image_paths))
assert len(image_embeddings) == len(image_paths), "expected one embedding per image"
print(f"Generated embeddings for {len(image_embeddings)} documents")
print(f"Each document has {image_embeddings[0].shape[0]} vectors of dimension {image_embeddings[0].shape[1]}")

Generating embeddings for document images...
Generated embeddings for 4 documents
Each document has 1030 vectors of dimension 128


## Scalar Quantization

Scalar quantization converts float32 values to 8-bit integers (uint8), reducing memory by **4x**.

In [4]:
from qdrant_client import QdrantClient, models

client = QdrantClient("http://localhost:6333")

# Start from a clean slate so the notebook can be re-run from the top
for stale_collection in ("colpali-scalar", "colpali-binary"):
    client.delete_collection(stale_collection, timeout=60)

# Named multi-vector: 128-dim ColPali vectors, dot product, MaxSim comparator
scalar_vector_params = models.VectorParams(
    size=128,
    distance=models.Distance.DOT,
    multivector_config=models.MultiVectorConfig(
        comparator=models.MultiVectorComparator.MAX_SIM,
    ),
    hnsw_config=models.HnswConfigDiff(m=0),  # m=0 disables HNSW for the multi-vector
)

# int8 scalar quantization (4x compression)
scalar_quantization = models.ScalarQuantization(
    scalar=models.ScalarQuantizationConfig(
        type=models.ScalarType.INT8,
        quantile=0.99,  # drop the most extreme 1% of values when fitting the scale
        always_ram=True,
    ),
)

# Kept as the last expression so the cell still displays the call's result
client.create_collection(
    collection_name="colpali-scalar",
    vectors_config={"colpali": scalar_vector_params},
    quantization_config=scalar_quantization,
)

True

In [5]:
# One point per document image: the id is the document's position in the list,
# the named vector "colpali" carries all of its vectors, the payload its metadata
scalar_points = [
    models.PointStruct(
        id=doc_index,
        vector={"colpali": page_vectors.tolist()},
        payload=documents[doc_index],
    )
    for doc_index, page_vectors in enumerate(image_embeddings)
]
client.upsert(collection_name="colpali-scalar", points=scalar_points)

# Read the collection back to confirm the point count and quantization settings
collection_info = client.get_collection("colpali-scalar")
print(f"Collection 'colpali-scalar' has {collection_info.points_count} points")
print(f"Quantization: {collection_info.config.quantization_config}")

Collection 'colpali-scalar' has 4 points
Quantization: scalar=ScalarQuantizationConfig(type=<ScalarType.INT8: 'int8'>, quantile=0.99, always_ram=True)


## Binary Quantization

Binary quantization represents each component as a single bit (positive/negative), achieving **32x compression**.

In [6]:
# Drop any previous copy first so this cell can be re-run on its own;
# creating a collection that already exists is rejected by the server.
client.delete_collection("colpali-binary", timeout=60)

# Create collection with binary quantization (32x compression)
client.create_collection(
    collection_name="colpali-binary",
    vectors_config={
        "colpali": models.VectorParams(
            size=128,  # ColPali embedding dimension
            distance=models.Distance.DOT,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM,
            ),
            hnsw_config=models.HnswConfigDiff(m=0),  # Disable HNSW for multi-vector
        ),
    },
    quantization_config=models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(
            always_ram=True,
        ),
    ),
)

True

In [7]:
# Reuse the same embeddings and metadata, this time for the binary-quantized collection
binary_points = [
    models.PointStruct(
        id=doc_index,
        vector={"colpali": page_vectors.tolist()},
        payload=documents[doc_index],
    )
    for doc_index, page_vectors in enumerate(image_embeddings)
]
client.upsert(collection_name="colpali-binary", points=binary_points)

# Read the collection back to confirm the point count and quantization settings
collection_info = client.get_collection("colpali-binary")
print(f"Collection 'colpali-binary' has {collection_info.points_count} points")
print(f"Quantization: {collection_info.config.quantization_config}")

Collection 'colpali-binary' has 4 points
Quantization: binary=BinaryQuantizationConfig(always_ram=True, encoding=None, query_encoding=None)


## Search with Rescoring

Qdrant provides automatic rescoring: the quantized index quickly finds candidates, then re-ranks them using the original float32 vectors for accuracy.

In [8]:
# Embed the text query; embed_text takes a batch, so wrap the single query in
# a list and take the first (only) result
query = "financial quarterly results revenue"
query_batch = list(model.embed_text([query]))
query_embeddings = query_batch[0]
print(f"Query has {query_embeddings.shape[0]} vectors of dimension {query_embeddings.shape[1]}")

# Two-stage search settings: quantized first pass, then float32 re-ranking
rescoring_params = models.SearchParams(
    quantization=models.QuantizationSearchParams(
        ignore=False,  # keep the quantized vectors in play for the first pass
        rescore=True,  # re-rank the candidates with the original float32 vectors
        oversampling=2.0,  # pull 2x the candidates before rescoring
    ),
)

# Query the scalar-quantized collection
results = client.query_points(
    collection_name="colpali-scalar",
    query=query_embeddings.tolist(),
    using="colpali",
    limit=10,
    search_params=rescoring_params,
)

Query has 18 vectors of dimension 128


In [9]:
# Print score, title and topic for every hit returned by the scalar-quantized search
print("Search results from scalar-quantized collection:")
print("-" * 50)
for hit in results.points:
    payload = hit.payload
    print(f"Score: {hit.score:.4f} | {payload['title']} ({payload['topic']})")

Search results from scalar-quantized collection:
--------------------------------------------------
Score: 12.5780 | Financial Report (finance)
Score: 6.7059 | Moon Landing (space)
Score: 6.4723 | Einstein Theory (science)
Score: 5.2878 | Titanic Sinking (history)


In [10]:
# Same query and search settings as the scalar search, now against the
# binary-quantized collection
binary_rescoring_params = models.SearchParams(
    quantization=models.QuantizationSearchParams(
        ignore=False,
        rescore=True,
        oversampling=2.0,
    ),
)
results_binary = client.query_points(
    collection_name="colpali-binary",
    query=query_embeddings.tolist(),
    using="colpali",
    limit=10,
    search_params=binary_rescoring_params,
)

# Print score, title and topic for every hit
print("Search results from binary-quantized collection:")
print("-" * 50)
for hit in results_binary.points:
    payload = hit.payload
    print(f"Score: {hit.score:.4f} | {payload['title']} ({payload['topic']})")

Search results from binary-quantized collection:
--------------------------------------------------
Score: 12.5780 | Financial Report (finance)
Score: 6.7059 | Moon Landing (space)
Score: 6.4723 | Einstein Theory (science)
Score: 5.2878 | Titanic Sinking (history)


## Impact of Rescoring

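Let's compare results with and without rescoring to see the impact on result quality. Note that on a collection as small as this one (four documents), both searches can return identical rankings and scores — Qdrant may not have built the quantized representation yet — so the effect of rescoring only becomes visible on larger collections.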

In [11]:
# Same binary-quantized search as before, but with the float32 re-ranking step turned off
quantized_only_params = models.SearchParams(
    quantization=models.QuantizationSearchParams(
        ignore=False,
        rescore=False,  # Disabled: results come from quantized vectors only
    ),
)
results_no_rescore = client.query_points(
    collection_name="colpali-binary",
    query=query_embeddings.tolist(),
    using="colpali",
    limit=10,
    search_params=quantized_only_params,
)

# Print both result sets one after the other, separated by blank lines
comparison = [
    ("Binary quantization WITHOUT rescoring:", results_no_rescore),
    ("Binary quantization WITH rescoring (from earlier):", results_binary),
]
for position, (heading, response) in enumerate(comparison):
    if position > 0:
        print("\n")
    print(heading)
    print("-" * 50)
    for hit in response.points:
        print(f"Score: {hit.score:.4f} | {hit.payload['title']} ({hit.payload['topic']})")

Binary quantization WITHOUT rescoring:
--------------------------------------------------
Score: 12.5780 | Financial Report (finance)
Score: 6.7059 | Moon Landing (space)
Score: 6.4723 | Einstein Theory (science)
Score: 5.2878 | Titanic Sinking (history)


Binary quantization WITH rescoring (from earlier):
--------------------------------------------------
Score: 12.5780 | Financial Report (finance)
Score: 6.7059 | Moon Landing (space)
Score: 6.4723 | Einstein Theory (science)
Score: 5.2878 | Titanic Sinking (history)
