# Optimizing HNSW Search

In [1]:
import warnings
warnings.filterwarnings('ignore')

from qdrant_client import QdrantClient, models

# Connect to the Qdrant server on localhost; timeout=600 is handed to the
# client constructor unchanged.
client = QdrantClient("http://localhost:6333", timeout=600)

# Drop the collection and restore it from the snapshot URL so that the
# notebook always works on the same data.
target_collection = "wands-products"
snapshot_location = "https://storage.googleapis.com/deeplearning-course-c1/snapshots/wands-products.snapshot"
client.delete_collection(target_collection)
client.recover_snapshot(target_collection, snapshot_location)

# Fetch the collection description and show it as the cell output.
collection = client.get_collection(target_collection)
collection

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=85988, points_count=42994, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors={'product_description': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None), 'product_name': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=2, max_segment_size=None, memmap_thresh

## HNSW parameters

In [2]:
# Display the HNSW settings stored in the collection description.
hnsw_settings = collection.config.hnsw_config
hnsw_settings

HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None)

## Test queries

In [3]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model that the later cells use to encode the queries.
embedding_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name)

In [4]:
import pandas as pd

# Load the tab-separated query file, indexed by its query_id column.
queries_df = pd.read_csv("shared_data/WANDS/query.csv", sep="\t", index_col="query_id")

# Encode all query texts in one call and store each vector as a plain list.
query_texts = queries_df["query"].tolist()
query_vectors = model.encode(query_texts)
queries_df["query_embedding"] = query_vectors.tolist()

# Show a random handful of rows as the cell output.
queries_df.sample(n=5)

Unnamed: 0_level_0,query,query_class,query_embedding
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
71,infant girl crib bedding,Crib Bedding Sets,"[0.0211443230509758, -0.007067469879984856, -0..."
54,wayfair sleep zippered,Mattress Covers and Protectors,"[-0.033533722162246704, 0.0907270684838295, -0..."
21,living curtains pearl,Curtains & Drapes,"[-0.052172575145959854, 0.0335204042494297, 0...."
458,bubble guppies chair,Dining Chairs,"[0.0026144732255488634, -0.019996589049696922,..."
480,pictures to hang over fireplace,Wall Art,"[0.06838737428188324, 0.08617142587900162, -0...."


## ANN search

In [5]:
# Encode the query stored under index label 0 and search the "product_name"
# vectors with the default search parameters, returning only ids and scores.
ann_example_vector = models.NamedVector(
    name="product_name",
    vector=model.encode(queries_df.loc[0, "query"]),
)
client.search(
    collection_name="wands-products",
    query_vector=ann_example_vector,
    with_payload=False,
    with_vectors=False,
    limit=3,
)

[ScoredPoint(id=7465, version=116, score=0.9198917, payload=None, vector=None, shard_key=None),
 ScoredPoint(id=9234, version=144, score=0.8231317, payload=None, vector=None, shard_key=None),
 ScoredPoint(id=42329, version=661, score=0.8180744, payload=None, vector=None, shard_key=None)]

## kNN search

In [6]:
# Same query as the ANN example, but with the exact search mode switched on.
knn_example_vector = models.NamedVector(
    name="product_name",
    vector=model.encode(queries_df.loc[0, "query"]),
)
exact_search_params = models.SearchParams(exact=True)  # Turns on the exact search mode
client.search(
    collection_name="wands-products",
    query_vector=knn_example_vector,
    search_params=exact_search_params,
    with_payload=False,
    with_vectors=False,
    limit=3,
)

[ScoredPoint(id=7465, version=116, score=0.9198917, payload=None, vector=None, shard_key=None),
 ScoredPoint(id=9234, version=144, score=0.8231317, payload=None, vector=None, shard_key=None),
 ScoredPoint(id=42329, version=661, score=0.8180744, payload=None, vector=None, shard_key=None)]

## Ground truth

In [7]:
from collections import defaultdict
from ranx import Qrels

# Build the ground truth: for every test query, run an exact search over the
# "product_name" vectors and record the 100 returned points as relevance
# judgments keyed by "query_<id>" / "doc_<id>".
# NOTE(review): the judgments hold the exact top-100, while the evaluation
# cells below compute precision@25 - confirm this is the intended comparison.
knn_qrels_dict = defaultdict(dict)
for query_idx, row in queries_df.iterrows():  # avoid shadowing the builtin `id`
    query_id = f"query_{query_idx}"

    results = client.search(
        collection_name="wands-products",
        query_vector=models.NamedVector(
            name="product_name",
            vector=row["query_embedding"]
        ),
        with_vectors=False,
        with_payload=False,
        limit=100,
        search_params=models.SearchParams(
            exact=True,  # enable exact search
        ),
    )

    for point in results:
        document_id = f"doc_{point.id}"
        # The conversion to integer is required because ranx expects integers.
        # Clamp to at least 1: a score below 0.01 (or a negative one) would
        # otherwise become a judgment <= 0, which ranx counts as non-relevant
        # even though the point is a true exact neighbour.
        knn_qrels_dict[query_id][document_id] = max(1, int(point.score * 100))

qrels = Qrels(knn_qrels_dict)
qrels

DictType[unicode_type,DictType[[unichr x 9],int32]<iv=None>]<iv=None>({query_0: {doc_7465: 91, doc_9234: 82, doc_42329: 81, doc_24010: 81, doc_18273: 81, doc_18276: 80, doc_25431: 80, doc_18272: 78, doc_36910: 78, doc_18277: 78, doc_19456: 77, doc_24006: 76, doc_40996: 76, doc_18274: 75, doc_18275: 75, doc_24008: 75, doc_18270: 75, doc_24009: 75, doc_26069: 75, doc_42330: 75, doc_31556: 75, doc_4410: 75, doc_7506: 74, doc_6168: 74, doc_4034: 74, doc_26070: 74, doc_28058: 73, doc_18271: 73, doc_26068: 73, doc_15612: 73, doc_18158: 73, doc_6982: 73, doc_12409: 73, doc_28687: 73, doc_2187: 72, doc_251: 72, doc_33689: 72, doc_39461: 72, doc_33690: 71, doc_31557: 71, doc_26071: 71, doc_31555: 70, doc_6167: 70, doc_39429: 70, doc_39428: 69, doc_9207: 69, doc_8994: 69, doc_975: 69, doc_19004: 68, doc_24007: 68, doc_28059: 68, doc_27443: 67, doc_40997: 67, doc_20026: 67, doc_16301: 66, doc_5450: 66, doc_6888: 66, doc_10976: 66, doc_19365: 66, doc_29750: 66, doc_16118: 65, doc_4444: 65, doc_185

## ANN search over all test queries

In [8]:
from ranx import Run

# Run the non-exact search for every test query and collect the scores in the
# nested {"query_<id>": {"doc_<id>": score}} layout that is handed to Run.
ann_params = models.SearchParams(exact=False)  # disable exact search

run_dict = defaultdict(dict)
for query_idx, query_row in queries_df.iterrows():
    named_vector = models.NamedVector(
        name="product_name",
        vector=query_row["query_embedding"],
    )
    hits = client.search(
        collection_name="wands-products",
        query_vector=named_vector,
        search_params=ann_params,
        limit=100,
        with_payload=False,
        with_vectors=False,
    )
    for hit in hits:
        run_dict[f"query_{query_idx}"][f"doc_{hit.id}"] = hit.score

initial_run = Run(run_dict, name="initial")
initial_run

DictType[unicode_type,DictType[[unichr x 9],float64]<iv=None>]<iv=None>({query_0: {doc_7465: 0.91989166, doc_9234: 0.8231317, doc_42329: 0.81807435, doc_24010: 0.8144921, doc_18273: 0.81323653, doc_18276: 0.8011743, doc_25431: 0.8008761, doc_18272: 0.7891395, doc_36910: 0.788627, doc_18277: 0.78065306, doc_19456: 0.7738904, doc_40996: 0.7677349, doc_24006: 0.7663058, doc_18274: 0.7597258, doc_18275: 0.75781846, doc_24008: 0.75755453, doc_18270: 0.7573571, doc_24009: 0.7567274, doc_26069: 0.7553595, doc_42330: 0.7552161, doc_31556: 0.752135, doc_4410: 0.75121814, doc_26070: 0.74578416, doc_4034: 0.74417406, doc_6168: 0.7408837, doc_7506: 0.74034554, doc_28058: 0.73971134, doc_18271: 0.73955727, doc_26068: 0.7357248, doc_15612: 0.7324223, doc_18158: 0.7324223, doc_12409: 0.73136127, doc_6982: 0.73136127, doc_28687: 0.73136127, doc_33689: 0.7294609, doc_39461: 0.72925216, doc_251: 0.7269762, doc_2187: 0.72043693, doc_33690: 0.7174669, doc_31557: 0.7154035, doc_26071: 0.7141184, doc_31555:

In [9]:
from ranx import evaluate

# Score the initial ANN run against the exact-search judgments.
initial_metrics = ["precision@25"]
evaluate(qrels=qrels, run=initial_run, metrics=initial_metrics)

0.9979166666666667

## Tweaking the HNSW parameters

In [10]:
# Raise both HNSW parameters relative to the values shown earlier (m=16,
# ef_construct=100) and apply the change to the collection.
hnsw_diff = models.HnswConfigDiff(m=64, ef_construct=200)
client.update_collection(collection_name="wands-products", hnsw_config=hnsw_diff)

True

In [11]:
import time

# Poll the collection once per second until it reports GREEN again after the
# HNSW configuration update. The wait is bounded so that a collection which
# never turns green fails loudly instead of hanging the notebook forever.
wait_deadline = time.monotonic() + 600.0  # seconds

# presumably gives the status time to change before the first poll - TODO confirm
time.sleep(1.0)
collection = client.get_collection("wands-products")
while collection.status != models.CollectionStatus.GREEN:
    if time.monotonic() >= wait_deadline:
        raise TimeoutError(
            "Collection 'wands-products' did not reach GREEN status in time "
            f"(last status: {collection.status})"
        )
    time.sleep(1.0)
    collection = client.get_collection("wands-products")

# Show the refreshed collection description as the cell output.
collection

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=85988, points_count=42994, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors={'product_description': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None), 'product_name': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=64, ef_construct=200, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=2, max_segment_size=None, memmap_thresh

In [12]:
# Repeat the non-exact search for every test query after the HNSW update and
# collect the scores in the nested layout that is handed to Run.
tweaked_params = models.SearchParams(exact=False)  # disable exact search

tweaked_run_dict = defaultdict(dict)
for query_idx, query_row in queries_df.iterrows():
    run_key = f"query_{query_idx}"
    hits = client.search(
        collection_name="wands-products",
        query_vector=models.NamedVector(
            name="product_name",
            vector=query_row["query_embedding"],
        ),
        search_params=tweaked_params,
        limit=100,
        with_payload=False,
        with_vectors=False,
    )
    for hit in hits:
        tweaked_run_dict[run_key][f"doc_{hit.id}"] = hit.score

tweaked_run = Run(tweaked_run_dict, name="tweaked")
tweaked_run

DictType[unicode_type,DictType[[unichr x 9],float64]<iv=None>]<iv=None>({query_0: {doc_7465: 0.91989166, doc_9234: 0.8231317, doc_42329: 0.81807435, doc_24010: 0.8144921, doc_18273: 0.81323653, doc_18276: 0.8011743, doc_25431: 0.8008761, doc_18272: 0.7891395, doc_36910: 0.788627, doc_18277: 0.78065306, doc_19456: 0.7738904, doc_40996: 0.7677349, doc_24006: 0.7663058, doc_18274: 0.7597258, doc_18275: 0.75781846, doc_24008: 0.75755453, doc_18270: 0.7573571, doc_24009: 0.7567274, doc_26069: 0.7553595, doc_42330: 0.7552161, doc_31556: 0.752135, doc_4410: 0.75121814, doc_26070: 0.74578416, doc_4034: 0.74417406, doc_6168: 0.7408837, doc_7506: 0.74034554, doc_28058: 0.73971134, doc_18271: 0.73955727, doc_26068: 0.7357248, doc_15612: 0.7324223, doc_18158: 0.7324223, doc_6982: 0.73136127, doc_28687: 0.73136127, doc_12409: 0.73136127, doc_33689: 0.7294609, doc_39461: 0.72925216, doc_251: 0.7269762, doc_2187: 0.72043693, doc_33690: 0.7174669, doc_31557: 0.7154035, doc_26071: 0.7141184, doc_31555:

In [13]:
# Score the run produced after the HNSW update against the same judgments.
tweaked_metrics = ["precision@25"]
evaluate(qrels=qrels, run=tweaked_run, metrics=tweaked_metrics)

1.0

## Well Done!