In [2]:
from qdrant_client import QdrantClient,models
from pymongo import MongoClient
import polars as pl

qclient = QdrantClient(host="localhost",port=6333, prefer_grpc=False)

In [3]:
collection_name = "ondc-query"

In [9]:
offset = 0
all_points = [] 
while offset is not None:
    points,offset = qclient.scroll(
        collection_name=collection_name,
        limit=10000,
        offset=offset,
        with_vectors=False
    )
    all_points.extend(points)
    

In [10]:
points = [p.payload for p in all_points]

In [11]:
for id,p in zip([z.id for z in all_points],points):
    p['id'] = id


In [22]:
df = pl.DataFrame(points)
df_ = pl.DataFrame(points)

In [23]:
df = df.select(
    pl.col("id"),
    pl.col("query").str.to_lowercase().alias("query"),
    pl.col("product_id")
).group_by("query").agg(pl.col("product_id")).with_columns(
    pl.col("product_id").map_elements(lambda s: len(s)).alias("p_count")
)

df = df.join(df_, on="query",how="inner",).select(pl.col(["id","query","product_id","p_count"]))


In [25]:
del df_

In [35]:
# df['p_count'].value_counts().sort("count",descending=True)[:6]["count"].plot(kind="bar")
df_2 = df.filter(pl.col("p_count")==2)
df_3 = df.filter(pl.col("p_count")==3)
df_4 = df.filter(pl.col("p_count")==4)

In [36]:
# Queries unique to a product
df_1 = df.filter(pl.col("p_count")==1)
df_1 = df_1.explode("product_id").drop("p_count")


In [68]:
def get_metric(struct: dict[str,str]):
    qid = struct['id']
    product_id = struct['product_id']
    query_embedding =lambda x: qclient.retrieve(
        collection_name="ondc-query",
        ids=[x],
        with_vectors=True
    )[0].vector
    qid = 5
    product_id = '30141ae1-0d1d-4e83-a6d1-ccf207bca27c'
    vectors = query_embedding(qid)
    dense = models.NamedVector(name="dense",vector=vectors['dense'])
    sparse = models.NamedSparseVector(name="sparse",vector=vectors['sparse'])
    sparse_100_req = models.SearchRequest(
        vector=sparse,
        limit=100
    )
    dense_100_req = models.SearchRequest(
        vector=dense,
        limit=100
    )
    results = qclient.search_batch(
        collection_name="ondc-index",
        requests= [sparse_100_req,dense_100_req]
    )
    sparse_100 = results[0]
    dense_100 = results[1]
    def mrr(result, product_id):
        for i,r in enumerate(result):
            if r.id == product_id:
                return 1/(i+1), r.score
        return 0,0
    sparse_metric = mrr(result=sparse_100,product_id=product_id)
    dense_metric = mrr(result=dense_100,product_id=product_id)
    return {"sparse":sparse_metric,"dense":dense_metric}


In [69]:
search_mrr = df_1.select([
    pl.struct(["id", "product_id"]).map_elements(get_metric, strategy="thread_local").alias("search_score")
])

In [86]:
search_mrr["search_score"].struct.field("sparse").list.slice(1,1).explode().value_counts()

sparse,count
f64,u32
28.088713,12678


In [37]:
df_1.write_parquet("qrel_1.parquet",compression="zstd")

In [34]:
qid = df_1["query_id"][0]
# qclient.search(
#     collection_name="ondc-query-gen",
#     query=pl.col("query").filter(pl.col("query_id")==qid).first(),
#     limit=10
# )