# Measuring Search Relevance

The data used can be found [here - shared_data/WANDS/...](https://s172-29-41-184p8888.lab-aws-production.deeplearning.ai/tree) on DeepLearning.AI, within the materials of the course [Retrieval Optimization: Tokenization to Vector Quantization](https://www.deeplearning.ai/short-courses/retrieval-optimization-from-tokenization-to-vector-quantization/). That course is also the original source of this notebook.

In [2]:
import warnings
warnings.filterwarnings(action='ignore')  # keep cell output free of library warnings

import pandas as pd

# WANDS product catalogue: a tab-separated file keyed by product_id.
# keep_default_na=False leaves empty fields as "" rather than NaN,
# because some products do not have a description.
products_df = pd.read_table(
    "shared_data/WANDS/product.csv",
    index_col="product_id",
    keep_default_na=False,
)
products_df.head()

Unnamed: 0_level_0,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0
3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0


In [3]:
from sentence_transformers import SentenceTransformer

# Only the first `num_products` rows of the catalogue are embedded.
num_products = 5000

model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the product names of that leading slice; one embedding row per product.
product_name_embeddings = model.encode(
    products_df["product_name"].iloc[:num_products].to_list()
)
product_name_embeddings.shape

(5000, 384)

In [4]:
# Encode the descriptions of the same leading `num_products` rows with the same model,
# so both embedding arrays line up row by row.
product_description_embeddings = model.encode(
    products_df["product_description"].iloc[:num_products].to_list()
)
product_description_embeddings.shape

(5000, 384)

**Debug:** check that the Qdrant server is running, and start it if it is not.

In [7]:
import requests

# Probe the local Qdrant REST API and report whether it answers.
# A timeout is set so the cell cannot hang forever on an unresponsive host;
# a timed-out probe is reported the same way as a refused connection.
try:
    response = requests.get("http://127.0.0.1:6333/collections", timeout=5)
    if response.status_code == 200:
        print("Qdrant server is running!")
    else:
        print("Qdrant server is not responding as expected:", response.status_code)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
    print("Qdrant server is not running. Please start it.")


Qdrant server is not running. Please start it.


In [9]:
!docker run -d -p 6333:6333 qdrant/qdrant


2373c5ba5fadd34ce3d0779d9201192d751a5643e75da348842f91389fe00e71


Unable to find image 'qdrant/qdrant:latest' locally
latest: Pulling from qdrant/qdrant
4f4fb700ef54: Pulling fs layer
4f4fb700ef54: Pulling fs layer
d34e9b666e1e: Pulling fs layer
7a96f5fbe51c: Pulling fs layer
4f4fb700ef54: Pulling fs layer
4f4fb700ef54: Pulling fs layer
4f4fb700ef54: Pulling fs layer
4f4fb700ef54: Pulling fs layer
fdec4fd852b1: Pulling fs layer
0dc90c036ff7: Pulling fs layer
2d429b9e73a6: Pulling fs layer
cf45fff92489: Pulling fs layer
4133309d2fe5: Pulling fs layer
4f4fb700ef54: Download complete
4133309d2fe5: Download complete
cf45fff92489: Download complete
fdec4fd852b1: Download complete
0dc90c036ff7: Download complete
d34e9b666e1e: Download complete
2d429b9e73a6: Download complete
7a96f5fbe51c: Download complete
Digest: sha256:283ccc6ee271237b43bfd7b995752f4cbb7a63a11ab68add3ee3bb1137bdafa7
Status: Downloaded newer image for qdrant/qdrant:latest


## Building the collection

In [10]:
from qdrant_client import QdrantClient, models

client = QdrantClient("http://127.0.0.1:6333")

# Drop any previous copy of the collection before creating it again.
client.delete_collection(collection_name="wands-products")

# Two named vectors per point, both 384-dimensional and compared by cosine distance.
client.create_collection(
    collection_name="wands-products",
    vectors_config={
        vector_name: models.VectorParams(
            size=384,
            distance=models.Distance.COSINE,
        )
        for vector_name in ("product_name", "product_description")
    },
    optimizers_config=models.OptimizersConfigDiff(
        default_segment_number=2,
        indexing_threshold=1000,
    ),
)

True

In [11]:
# Upload only the products that were actually embedded. The embeddings cover a
# leading slice of products_df, so payloads and ids are cut to the same length
# instead of passing every catalogue row and relying on the client to drop the
# surplus.
num_embedded = len(product_name_embeddings)
assert len(product_description_embeddings) == num_embedded, (
    "name and description embeddings must cover the same products"
)
embedded_products_df = products_df.iloc[:num_embedded]

client.upload_collection(
    collection_name="wands-products",
    vectors={
        "product_name": product_name_embeddings,
        "product_description": product_description_embeddings,
    },
    payload=embedded_products_df.to_dict(orient="records"),
    ids=embedded_products_df.index.tolist(),
    batch_size=64,
)

In [12]:
# Check how many points the collection holds after the upload.
client.count(collection_name="wands-products")

CountResult(count=5000)

In [13]:
import time

# Poll the collection once per second until its status is GREEN.
# The wait is bounded so the notebook fails loudly instead of spinning forever
# if the collection never reaches that status.
max_wait_seconds = 300.0
wait_deadline = time.monotonic() + max_wait_seconds

time.sleep(1.0)  # initial pause before the first status check
collection = client.get_collection("wands-products")
while collection.status != models.CollectionStatus.GREEN:
    if time.monotonic() >= wait_deadline:
        raise TimeoutError(
            f"Collection 'wands-products' did not become green within "
            f"{max_wait_seconds:.0f}s (last status: {collection.status})"
        )
    time.sleep(1.0)
    collection = client.get_collection("wands-products")

collection

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=8704, points_count=5000, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors={'product_description': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None), 'product_name': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=2, max_segment_size=None, memmap_threshol

## Test queries

In [14]:
# WANDS test queries: a tab-separated file keyed by query_id.
queries_df = pd.read_table(
    "shared_data/WANDS/query.csv",
    index_col="query_id",
)
queries_df.head()

Unnamed: 0_level_0,query,query_class
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,salon chair,Massage Chairs
1,smart coffee table,Coffee & Cocktail Tables
2,dinosaur,Kids Wall Décor
3,turquoise pillows,Accent Pillows
4,chair and a half recliner,Recliners


## Ground truth

In [15]:
# Relevance judgements: one (query_id, product_id, label) row per judged pair.
labels_df = pd.read_table("shared_data/WANDS/label.csv")
labels_df.sample(5)

Unnamed: 0,id,query_id,product_id,label
44390,44496,422,18219,Exact
94835,94987,155,9317,Partial
47738,47844,447,21852,Partial
99030,99182,162,25711,Partial
71333,71484,51,26012,Partial


In [16]:
# Numeric relevance assigned to each textual label.
relevancy_scores = dict(Exact=10, Partial=5, Irrelevant=0)

labels_df["score"] = labels_df["label"].map(
    lambda label: relevancy_scores.get(label)
)
# Turn the numeric ids into prefixed strings ("query_<id>", "doc_<id>"),
# the same form used as keys when the search runs are built.
labels_df["query_id"] = labels_df["query_id"].map("query_{}".format)
labels_df["product_id"] = labels_df["product_id"].map("doc_{}".format)
labels_df.sample(5)

Unnamed: 0,id,query_id,product_id,label,score
191206,191773,query_446,doc_14138,Irrelevant,0
222185,222752,query_106,doc_26830,Irrelevant,0
156748,157315,query_397,doc_6503,Partial,5
8587,8587,query_68,doc_23109,Partial,5
31102,31181,query_247,doc_25945,Exact,10


## ranx

In [17]:
from ranx import Qrels

# Build the ground-truth relevance judgements from the labelled pairs;
# both id columns are cast to str before being handed to ranx.
qrels = Qrels.from_df(
    labels_df.astype(
        {id_column: "str" for id_column in ("query_id", "product_id")}
    ),
    q_id_col="query_id",
    doc_id_col="product_id",
    score_col="score",
)

## Running all the queries

In [18]:
# Embed every query text with the same model used for the products and keep
# each vector as a plain Python list in its own column.
queries_df["query_embedding"] = model.encode(queries_df["query"].to_list()).tolist()
queries_df.sample(5)

Unnamed: 0_level_0,query,query_class,query_embedding
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
443,patio design,Outdoor Conversation Sets,"[0.026969391852617264, 0.09537287056446075, -0..."
349,mud room sign,Wall Décor,"[-0.012671636417508125, 0.012819253839552402, ..."
367,wooden chair outdoor,Patio Lounge Chairs,"[0.03380975499749184, 0.03079218789935112, 0.0..."
73,certified international melamine,Plates & Saucers,"[-0.0040483237244188786, -0.05582291632890701,..."
348,one alium way,Sofas,"[0.03517076373100281, -0.003385811345651746, -..."


In [19]:
from collections import defaultdict

# Search the "product_name" vectors with every query and collect the scores as
# {query_id: {doc_id: score}}. Ids carry the "query_" / "doc_" prefixes so they
# match the ids in labels_df.
name_run_dict = defaultdict(dict)
# The loop variable is deliberately not called `id`: at notebook top level that
# would replace the builtin id() for the rest of the kernel session.
for raw_query_id, row in queries_df.iterrows():
    query_id = f"query_{raw_query_id}"

    results = client.search(
        collection_name="wands-products",
        query_vector=models.NamedVector(
            name="product_name",
            vector=row["query_embedding"],
        ),
        # Only ids and scores are needed, so skip vectors and payloads.
        with_vectors=False,
        with_payload=False,
        limit=100,
    )

    for point in results:
        document_id = f"doc_{point.id}"
        name_run_dict[query_id][document_id] = point.score

name_run_dict

defaultdict(dict,
            {'query_0': {'doc_4410': 0.75121814,
              'doc_4034': 0.74417394,
              'doc_251': 0.7269762,
              'doc_2187': 0.72043693,
              'doc_975': 0.6929847,
              'doc_1616': 0.6591317,
              'doc_4444': 0.65086097,
              'doc_746': 0.6489469,
              'doc_209': 0.6478478,
              'doc_2638': 0.64008915,
              'doc_1148': 0.6380282,
              'doc_1059': 0.63322645,
              'doc_1372': 0.63218987,
              'doc_308': 0.6319947,
              'doc_603': 0.6241783,
              'doc_1742': 0.61341786,
              'doc_4938': 0.6115167,
              'doc_4330': 0.6089172,
              'doc_1259': 0.60608137,
              'doc_1864': 0.6024618,
              'doc_4329': 0.5979623,
              'doc_1373': 0.5947802,
              'doc_1454': 0.5935928,
              'doc_3604': 0.5926492,
              'doc_187': 0.58951056,
              'doc_206': 0.58923256,
      

In [20]:
from ranx import Run

# Wrap the scores from the product-name search in a ranx Run labelled "product_name".
product_name_run = Run(
    name_run_dict,
    name="product_name",
)

In [21]:
# Search the "product_description" vectors with every query and collect the
# scores as {query_id: {doc_id: score}}. Ids carry the "query_" / "doc_"
# prefixes so they match the ids in labels_df.
description_run_dict = defaultdict(dict)
# The loop variable is deliberately not called `id`: at notebook top level that
# would replace the builtin id() for the rest of the kernel session.
for raw_query_id, row in queries_df.iterrows():
    query_id = f"query_{raw_query_id}"

    results = client.search(
        collection_name="wands-products",
        query_vector=models.NamedVector(
            name="product_description",
            vector=row["query_embedding"],
        ),
        # Only ids and scores are needed, so skip vectors and payloads.
        with_vectors=False,
        with_payload=False,
        limit=100,
    )

    for point in results:
        document_id = f"doc_{point.id}"
        description_run_dict[query_id][document_id] = point.score

product_description_run = Run(
    description_run_dict,
    name="product_description",
)

In [22]:
from ranx import compare

# Score both runs against the ground truth on the same metrics, all cut off at rank 10.
compare(
    qrels,
    runs=[product_name_run, product_description_run],
    metrics=["precision@10", "recall@10", "mrr@10", "dcg@10", "ndcg@10"],
)

#    Model                P@10    Recall@10    MRR@10    DCG@10    NDCG@10
---  -------------------  ------  -----------  --------  --------  ---------
a    product_name         0.616ᵇ  0.034ᵇ       0.807ᵇ    19.131ᵇ   0.517ᵇ
b    product_description  0.457   0.024        0.687     14.491    0.388

### Nicely Done!