# Module 3: Pooling Techniques

Load the ColPali model for generating multi-vector image embeddings.

In [3]:
from fastembed import LateInteractionMultimodalEmbedding

# Instantiate the ColPali late-interaction embedder from the
# "Qdrant/colpali-v1.3-fp16" checkpoint (fetched on first use)
model = LateInteractionMultimodalEmbedding(model_name="Qdrant/colpali-v1.3-fp16")

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

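**Spatial pooling**: the model returns 1030 vectors for this image — 1024 image-patch embeddings plus 6 instruction-token embeddings that do not represent image content. Drop the instruction tokens, reshape the 1024 patch embeddings into a 32×32 grid, and average along rows or columns. This reduces 1024 vectors to just 32, achieving a 32× memory reduction.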

In [4]:
import numpy as np

# Embed a document image. The model returns one 128-dimensional vector per
# token: the image patches first, followed by a few instruction tokens.
image_path = "images/financial-report.png"  # Your document image
embeddings = list(model.embed_image([image_path]))[0]
print(f"Original shape: {embeddings.shape}")  # (1030, 128) in the recorded run

# Patch grid used for spatial pooling: 32 rows × 32 columns = 1024 patches
GRID_ROWS, GRID_COLS = 32, 32
NUM_PATCHES = GRID_ROWS * GRID_COLS
embedding_dim = embeddings.shape[1]

# Fail early with a readable message instead of an opaque reshape error
if embeddings.shape[0] < NUM_PATCHES:
    raise ValueError(
        f"Expected at least {NUM_PATCHES} patch embeddings, "
        f"got {embeddings.shape[0]}"
    )

# Reshape to spatial grid: (rows, columns, embedding_dim)
# Keep only the first 1024 embeddings, as the trailing instruction tokens
# do not represent image content
grid = embeddings[:NUM_PATCHES].reshape(GRID_ROWS, GRID_COLS, embedding_dim)

# Row pooling: average across columns (axis=1)
row_pooled = grid.mean(axis=1)  # Shape: (32, 128)

# Column pooling: average across rows (axis=0)
col_pooled = grid.mean(axis=0)  # Shape: (32, 128)

# Combined approach (optional): concatenate row and column pooled
combined = np.vstack([row_pooled, col_pooled])  # Shape: (64, 128)

# Memory comparison. The sizes depend on the embedding dtype; the recorded
# run uses 2 bytes per value (float16). `original_memory` counts every
# returned vector, including the instruction tokens.
original_memory = embeddings.nbytes  # 1030 × 128 × 2 = 263,680 bytes
pooled_memory = row_pooled.nbytes    # 32 × 128 × 2 = 8,192 bytes

print(f"Original: {original_memory:,} bytes ({original_memory // 1024} KB)")
print(f"Row pooled: {pooled_memory:,} bytes ({pooled_memory // 1024} KB)")
# Floor division: the ratio is reported as a whole number
print(f"Reduction: {original_memory // pooled_memory}×")

Original shape: (1030, 128)
Original: 263,680 bytes (257 KB)
Row pooled: 8,192 bytes (8 KB)
Reduction: 32×


**Hierarchical pooling**: use k-means clustering to group similar patch embeddings, then average within each cluster. This approach is content-aware and lets you choose any compression ratio.

In [5]:
from scipy.cluster.vq import kmeans2

# Embed the document page again; one image in, so keep the single result.
# All returned vectors are kept here (no slicing to the image patches).
image_path = "images/financial-report.png"
embedded_pages = list(model.embed_image([image_path]))
embeddings = embedded_pages[0]

def hierarchical_pool(embeddings: np.ndarray, k: int, seed=None) -> np.ndarray:
    """Pool embeddings into ``k`` vectors using k-means clustering.

    The vectors are clustered with ``scipy.cluster.vq.kmeans2`` (k-means++
    initialisation) and every cluster is replaced by the mean of its members.

    Args:
        embeddings: Array of shape ``(n_vectors, dim)``.
        k: Number of pooled vectors to produce; must satisfy
            ``1 <= k <= n_vectors``.
        seed: Optional seed forwarded to ``kmeans2`` so that the clustering
            is reproducible. ``None`` (default) keeps the random behaviour.

    Returns:
        Array of shape ``(k, dim)`` with the same dtype that
        ``embeddings.mean(axis=0)`` produces. A cluster that ends up with no
        members is represented by its k-means centroid instead of a NaN row.

    Raises:
        ValueError: If ``embeddings`` is not a non-empty 2-D array or ``k``
            is outside ``1..n_vectors``.
    """
    embeddings = np.asarray(embeddings)
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n_vectors, dim)"
        )
    n_vectors, dim = embeddings.shape
    if not 1 <= k <= n_vectors:
        raise ValueError(f"k must be between 1 and {n_vectors}, got {k}")

    # kmeans2 cannot cluster float16 data, so run it on a float32 copy.
    # Only forward `seed` when given so the default call is unchanged.
    seed_kwargs = {} if seed is None else {"seed": seed}
    centroids, labels = kmeans2(
        embeddings.astype(np.float32), k, minit='++', **seed_kwargs
    )

    # Same result dtype that a per-cluster mean of the input would have
    out_dtype = embeddings[:1].mean(axis=0).dtype
    pooled = np.empty((k, dim), dtype=out_dtype)
    for i in range(k):
        members = embeddings[labels == i]
        # An empty cluster would average to NaN; use its centroid instead
        pooled[i] = members.mean(axis=0) if len(members) else centroids[i]
    return pooled

# Try several cluster counts and report the compression each one yields
num_vectors = len(embeddings)
for k in (16, 32, 64, 128):
    pooled = hierarchical_pool(embeddings, k)
    reduction = num_vectors / k
    print(f"k={k:3d}: {num_vectors} → {k} vectors ({reduction:.0f}× reduction)")

k= 16: 1030 → 16 vectors (64× reduction)
k= 32: 1030 → 32 vectors (32× reduction)
k= 64: 1030 → 64 vectors (16× reduction)
k=128: 1030 → 128 vectors (8× reduction)
