In [1]:
import os
import numpy as np
from kannolo import DensePQHNSW

### Load Data

In [2]:
# Locations of the dense vectors on disk. Both paths point into the sift_1M
# directory, not an MS MARCO DRAGON dump.
# NOTE(review): the evaluation cells further down use MS MARCO ids — confirm
# which collection is actually intended.
data_path = '/data1/knn_datasets/dense_datasets/sift_1M/dataset.npy'    # vectors to index
query_path = '/data1/knn_datasets/dense_datasets/sift_1M/queries.npy'   # query vectors

In [3]:
# Read both .npy files into memory: the corpus vectors first, then the queries.
dataset, queries = (np.load(npy_path) for npy_path in (data_path, query_path))

### Search Queries

In [4]:
# The index builder is fed a flat float32 buffer (the original note names the
# binding type PyReadonlyArray1<f32>), so cast and flatten the corpus here.
# copy=False skips the cast when the data is already float32, and ravel()
# returns a contiguous 1-D array, copying only when it has to. Together this
# avoids up to two extra full-size copies of the corpus compared with an
# unconditional astype() followed by flatten().
# Note: dataset_array may share memory with `dataset`; neither is mutated here.
dataset = dataset.astype(np.float32, copy=False)
dataset_array = dataset.ravel()

In [5]:
# Build the product-quantised HNSW index from the flattened float32 buffer.
# All arguments are positional; the labels below are a reviewer's reading of
# them and should be checked against the kannolo documentation.
index = DensePQHNSW.build_from_array(
    dataset_array,  # flattened corpus vectors
    128,            # presumably the vector dimensionality — TODO confirm
    32,             # number of PQ subspaces (the K-Means log reports 32 subspaces)
    8,              # presumably bits per PQ code — TODO confirm
    16,             # presumably HNSW neighbours per node — TODO confirm
    150,            # presumably the construction-time ef — TODO confirm
    "l2",           # presumably the distance metric — TODO confirm
    100_000,        # presumably the K-Means training sample size — TODO confirm
)

Running K-Means for 32 subspaces
K-Means finished


In [6]:
# Save the built index under the given path.
# NOTE(review): absolute, user-specific location — consider a configurable output directory.
index.save("/data3/silvio/indice_prova")

In [None]:
# Search-time settings.
#   k        -> presumably the number of results requested per query
#   efSearch -> presumably the HNSW candidate-list size at query time (TODO confirm)
k, efSearch = 10, 1000

In [None]:
# Run both example queries through the index using the search parameters
# (k, efSearch) declared in the cell above, instead of repeating them as
# literals (the previous literal 200 disagreed with efSearch).
# NOTE(review): query_1 / query_2 are not defined in any visible cell — they
# must be assigned (presumably rows of `queries`) before this cell runs.
dists_1, ids_1 = index.search(query_1, k, efSearch)
dists_2, ids_2 = index.search(query_2, k, efSearch)

# Reshape the search output to one row of k hits per query.
dists_1 = dists_1.reshape(-1, k)
ids_1 = ids_1.reshape(-1, k)
dists_2 = dists_2.reshape(-1, k)
ids_2 = ids_2.reshape(-1, k)

### Collect Results

In [None]:
import ir_datasets

In [None]:
# ir_datasets identifier of the collection to evaluate on; change it here only.
ir_dataset_string = "msmarco-passage/dev/small"
# Load the dataset through the identifier so that editing it actually takes effect.
# NOTE(review): this rebinds `dataset`, which held the NumPy corpus earlier.
dataset = ir_datasets.load(ir_dataset_string)

In [None]:
# Text of the two example queries, looked up by position in the query iterator.
# The iterator is materialised once and shared, rather than once per lookup.
# NOTE(review): query_id_1 / query_id_2 are not defined in any visible cell.
all_queries = list(dataset.queries_iter())
query_passage_1 = all_queries[query_id_1].text
query_passage_2 = all_queries[query_id_2].text

In [None]:
# Sliceable view over every document in the collection.
documents_passages = dataset.docs_iter()[:]


def _passage_text(doc_index):
    """Return the text of the document at position ``doc_index``.

    Assumes the ids returned by the index are positions in docs_iter() order —
    TODO confirm.
    """
    return documents_passages[int(doc_index)].text


# Passage texts for the hits of the first row of each search result.
results_1 = [_passage_text(hit) for hit in ids_1[0]]
results_2 = [_passage_text(hit) for hit in ids_2[0]]

### Evaluation

In [None]:
import ir_measures
# Measure object used by the evaluation cells below, parsed from its string name.
ir_measure = ir_measures.parse_measure("MRR@10")

In [None]:
# Remapping the query ids for metric evaluation: translate the positional
# indices into the dataset's own query_id values, which the qrels are matched on.
# The query iterator is materialised once and shared, rather than once per lookup.
all_queries = list(dataset.queries_iter())
real_query_id_1 = all_queries[query_id_1].query_id
real_query_id_2 = all_queries[query_id_2].query_id

In [None]:
# Parsing the results for metric evaluation.
# ir_measures ranks a run by *descending* score, while the index above is built
# with the "l2" metric (presumably smaller-is-better distances). Feeding the raw
# distances in as scores would therefore rank the nearest passage last, so each
# hit is scored by its position in the returned list instead.
# Assumes the index returns hits best-first — TODO confirm.
def _to_scored_docs(query_id, doc_ids):
    """Turn one ranked list of document ids into ir_measures.ScoredDoc records.

    Args:
        query_id: query identifier attached to every record.
        doc_ids: document ids in ranked order, best hit first.

    Returns:
        A list of ScoredDoc whose scores strictly decrease with rank, so the
        first id gets the highest score.
    """
    n_hits = len(doc_ids)
    return [
        ir_measures.ScoredDoc(query_id, str(doc_id), float(n_hits - rank))
        for rank, doc_id in enumerate(doc_ids)
    ]


results_for_metric_1 = _to_scored_docs(real_query_id_1, ids_1[0])
results_for_metric_2 = _to_scored_docs(real_query_id_2, ids_2[0])

In [None]:
# Relevance judgments (qrels) of the loaded dataset.
qrels = dataset.qrels


def _judgments_for(query_id):
    """Return every judgment in ``qrels`` whose query_id equals ``query_id``."""
    return [judgment for judgment in qrels if judgment.query_id == query_id]


# Keep only the judgments belonging to each of the two example queries.
qrel_1 = _judgments_for(real_query_id_1)
qrel_2 = _judgments_for(real_query_id_2)

In [None]:
# Evaluate the parsed measure for each example query and report it.
measures = [ir_measure]

aggregate_1 = ir_measures.calc_aggregate(measures, qrel_1, results_for_metric_1)
print("Metric evaluation for query 1", aggregate_1)

aggregate_2 = ir_measures.calc_aggregate(measures, qrel_2, results_for_metric_2)
print("Metric evaluation for query 2", aggregate_2)

### Display Results

In [None]:
# Show the text of the first example query.
query_passage_1

In [None]:
# Show the passages retrieved for the first example query.
results_1

##### Dense representations fail to capture the relevance of every word. PER MONTH means that the query asks for information about each month of the year, but dense representations focus on the more general topic and return passages about average temperatures.

In [None]:
# Show the text of the second example query.
query_passage_2

In [None]:
# Show the passages retrieved for the second example query.
results_2

##### Dense representations capture the more generic meaning of "definition for kid", unlike sparse representations, which get fooled by the word "kid" and match it even when the context is different.