# Module 3: Pooling Techniques

In [1]:
!pip uninstall -y fastembed

Found existing installation: fastembed 0.7.4
Uninstalling fastembed-0.7.4:
  Successfully uninstalled fastembed-0.7.4


In [2]:
!pip install --upgrade git+https://github.com/qdrant/fastembed.git@main

Collecting git+https://github.com/qdrant/fastembed.git@main
  Cloning https://github.com/qdrant/fastembed.git (to revision main) to /tmp/pip-req-build-htpam8si
  Running command git clone --filter=blob:none --quiet https://github.com/qdrant/fastembed.git /tmp/pip-req-build-htpam8si
  Resolved https://github.com/qdrant/fastembed.git to commit 800f3887b725f1d16c93036567c8e12b3a182f6e
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: fastembed
  Building wheel for fastembed (pyproject.toml) ... [?25ldone
[?25h  Created wheel for fastembed: filename=fastembed-0.7.4-py3-none-any.whl size=116469 sha256=9589f88509d040e37ea6878c48cb03ee8e5c3ebf00ca1982c63f18e8160427e0
  Stored in directory: /tmp/pip-ephem-wheel-cache-zoi705lh/wheels/17/e7/0a/7cd97c194ecc0e6ad3b40b5d5a42e201e636781d5e3954a605
Successfully built fastembed
Installing collecte

In [3]:
from fastembed import LateInteractionMultimodalEmbedding

# ColPali checkpoint to load through fastembed
COLPALI_MODEL_NAME = "Qdrant/colpali-v1.3-fp16"

# Build the late-interaction multimodal embedder used by the cells below
model = LateInteractionMultimodalEmbedding(model_name=COLPALI_MODEL_NAME)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

In [4]:
import numpy as np

# Layout of the image-patch embeddings: a 32 × 32 grid, i.e. 1024 patches.
GRID_ROWS, GRID_COLS = 32, 32
NUM_PATCHES = GRID_ROWS * GRID_COLS

# Embed a document image. The result holds one 128-dim vector per token:
# the 1024 image patches come first, followed by a few instruction tokens
# (6 for this image, hence 1030 rows rather than 1024).
image_path = "images/financial-report.png"  # Your document image
embeddings = list(model.embed_image([image_path]))[0]
print(f"Original shape: {embeddings.shape}")  # (1030, 128)

# Fail with a readable message instead of an opaque reshape error
assert embeddings.shape[0] >= NUM_PATCHES, (
    f"expected at least {NUM_PATCHES} patch embeddings, got {embeddings.shape[0]}"
)

# Reshape to spatial grid: (rows, columns, embedding_dim)
# Keep only the first NUM_PATCHES embeddings, as the trailing instruction
# tokens do not represent image regions
grid = embeddings[:NUM_PATCHES].reshape(GRID_ROWS, GRID_COLS, -1)

# Row pooling: average across columns (axis=1)
row_pooled = grid.mean(axis=1)  # Shape: (32, 128)

# Column pooling: average across rows (axis=0)
col_pooled = grid.mean(axis=0)  # Shape: (32, 128)

# Combined approach (optional): stack row and column pooled vectors
combined = np.vstack([row_pooled, col_pooled])  # Shape: (64, 128)

# Memory comparison. The embeddings are float16 (2 bytes per value), and
# `original_memory` counts all 1030 rows, instruction tokens included.
original_memory = embeddings.nbytes  # 1030 × 128 × 2 = 263,680 bytes
pooled_memory = row_pooled.nbytes    # 32 × 128 × 2 = 8,192 bytes

print(f"Original: {original_memory:,} bytes ({original_memory // 1024} KB)")
print(f"Row pooled: {pooled_memory:,} bytes ({pooled_memory // 1024} KB)")
# Integer division: the ratio is reported rounded down to a whole factor
print(f"Reduction: {original_memory // pooled_memory}×")

Original shape: (1030, 128)
Original: 263,680 bytes (257 KB)
Row pooled: 8,192 bytes (8 KB)
Reduction: 32×


In [5]:
from scipy.cluster.vq import kmeans2

# Embed the document image: embed_image is consumed into a list and the
# entry for our single input image is kept
image_path = "images/financial-report.png"
page_embeddings = list(model.embed_image([image_path]))
embeddings = page_embeddings[0]

def hierarchical_pool(embeddings: np.ndarray, k: int, seed=None) -> np.ndarray:
    """Pool embeddings down to ``k`` vectors using k-means clustering.

    The vectors are clustered with ``scipy.cluster.vq.kmeans2`` (k-means++
    initialisation) and each cluster is replaced by the mean of its members.

    Args:
        embeddings: 2-D array of shape (n_vectors, dim).
        k: Number of clusters, i.e. number of pooled vectors to return.
        seed: Optional seed forwarded to ``kmeans2`` to make the clustering
            reproducible. ``None`` (default) leaves the call unseeded.

    Returns:
        Array of shape (k, dim). Row ``i`` is the mean of the embeddings
        assigned to cluster ``i``; if a cluster ends up with no members, the
        centroid reported by ``kmeans2`` is used instead of a NaN row.
    """
    # kmeans2 only handles float32/float64 data, so half-precision embeddings
    # are clustered on a float32 copy; the means below use the original values.
    seed_kwargs = {} if seed is None else {"seed": seed}
    centroids, labels = kmeans2(
        embeddings.astype(np.float32), k, minit='++', **seed_kwargs
    )

    # dtype that ``mean`` yields for this input (floats keep their dtype,
    # anything else is averaged in float64) so fallback rows match the rest
    if np.issubdtype(embeddings.dtype, np.floating):
        mean_dtype = embeddings.dtype
    else:
        mean_dtype = np.float64

    rows = []
    for i in range(k):
        members = embeddings[labels == i]
        if len(members):
            rows.append(members.mean(axis=0))
        else:
            # Empty cluster: the mean of zero vectors would be NaN
            rows.append(centroids[i].astype(mean_dtype))
    return np.array(rows)

# Pool at several cluster counts and report how many times fewer vectors remain
cluster_counts = (16, 32, 64, 128)
for k in cluster_counts:
    pooled = hierarchical_pool(embeddings, k)
    reduction = len(embeddings) / k  # ratio of original to pooled vector count
    print(f"k={k:3d}: {len(embeddings)} → {k} vectors ({reduction:.0f}× reduction)")

k= 16: 1030 → 16 vectors (64× reduction)
k= 32: 1030 → 32 vectors (32× reduction)
k= 64: 1030 → 64 vectors (16× reduction)
k=128: 1030 → 128 vectors (8× reduction)
