In [None]:
import os
import numpy as np
import ir_datasets
from kannolo import SparsePlainHNSWf16

### Load Data

In [None]:
# Input locations for the MS MARCO SPLADE (sparse) experiment.
# Directory holding the query arrays (components / values / offsets .npy files).
queries_path = os.path.expanduser(
    '~/base_path/datasets_numpy/queries/ms_marco_splade/'
)
# Location of the serialized index that is loaded for searching.
# NOTE(review): the file name suggests efConstruction=2000, M=32 and an
# inner-product metric - confirm against the index build script.
index_path = os.path.expanduser(
    '~/base_path/indexes/kannolo/kannolo_sparse_efc_2000_m_32_metric_ip'
)

In [None]:
# Load the sparse query collection and the prebuilt index.
# os.path.join is used instead of string concatenation so the file paths stay
# valid whether or not queries_path ends with a path separator.
queries_components = np.load(os.path.join(queries_path, 'components.npy'))  # Flat array of query components, sliced per query via the offsets
queries_values = np.load(os.path.join(queries_path, 'values.npy'))  # Flat array of query values, sliced with the same offsets
queries_offsets = np.load(os.path.join(queries_path, 'offsets.npy'))  # Start/end boundaries of each query in the two flat arrays
index = SparsePlainHNSWf16.load(index_path)  # Index loaded from index_path

In [None]:
# Pick two queries by their position in the offsets array and cut their
# sparse representation (components + matching values) out of the flat arrays.
query_id_1 = 1500
query_id_2 = 5000

# Entries i and i + 1 of queries_offsets delimit query i in both flat arrays.
span_1 = slice(queries_offsets[query_id_1], queries_offsets[query_id_1 + 1])
span_2 = slice(queries_offsets[query_id_2], queries_offsets[query_id_2 + 1])

query_components_1 = queries_components[span_1]
query_values_1 = queries_values[span_1]
query_components_2 = queries_components[span_2]
query_values_2 = queries_values[span_2]

### Search Queries

In [None]:
# Search-time parameters shared by both queries.
efSearch = 1000  # passed as the last argument of index.search
k = 10  # passed to index.search as the result count (presumably top-k per query)

In [None]:
# Run both queries against the index.
# index.search returns a (distances, ids) pair; each array is reshaped to one
# row per query with k columns. The row width uses k rather than a literal 10,
# so the reshape stays correct when k is changed in the parameters cell.
dists_1, ids_1 = index.search(query_components_1, query_values_1, k, efSearch)
dists_2, ids_2 = index.search(query_components_2, query_values_2, k, efSearch)
dists_1 = dists_1.reshape(-1, k)
ids_1 = ids_1.reshape(-1, k)
dists_2 = dists_2.reshape(-1, k)
ids_2 = ids_2.reshape(-1, k)

### Collect Results

In [None]:
# ir_datasets identifier of the collection used for query/document lookup and qrels.
ir_dataset_string = "msmarco-passage/dev/small"
# Load the dataset through the identifier above (previously the literal was
# repeated here and ir_dataset_string was never used), so changing the
# identifier in one place is enough.
dataset = ir_datasets.load(ir_dataset_string)

In [None]:
# Look up the text of both queries by position.
# The query iterator is materialized once and reused, instead of rebuilding
# the full list for every single lookup.
# NOTE(review): assumes queries_iter() yields queries in the same order as the
# rows of the SPLADE query arrays - confirm.
all_queries = list(dataset.queries_iter())
query_passage_1 = all_queries[query_id_1].text
query_passage_2 = all_queries[query_id_2].text

In [None]:
# Sliceable view over the document collection, indexed by integer position below.
documents_passages = dataset.docs_iter()[:]


def _passage_texts(doc_positions):
    """Return the .text of each document addressed by position in documents_passages.

    NOTE(review): the ids returned by the index are used directly as integer
    positions into the collection - confirm that they coincide.
    """
    return [documents_passages[int(position)].text for position in doc_positions]


# Only the first row of each id matrix is used (one query per search call).
results_1 = _passage_texts(ids_1[0])
results_2 = _passage_texts(ids_2[0])

### Evaluation

In [None]:
# Evaluation library; also provides ScoredDoc and calc_aggregate used in later cells.
import ir_measures
# Parse the textual measure spec "MRR@10" into a measure object; the same
# object is reused when evaluating both queries.
ir_measure = ir_measures.parse_measure("MRR@10")

In [None]:
# Remap the positional query indices to the dataset's own query ids, which are
# what the qrels and the metric evaluation are keyed on.
# The query iterator is materialized once and reused, instead of rebuilding
# the full list for every single lookup.
all_queries = list(dataset.queries_iter())
real_query_id_1 = all_queries[query_id_1].query_id
real_query_id_2 = all_queries[query_id_2].query_id

In [None]:
# Convert the first result row of each search into ScoredDoc records
# (query id, document id as string, score), the input format used for the
# metric computation.
results_for_metric_1 = [
    ir_measures.ScoredDoc(real_query_id_1, str(doc_id), score)
    for score, doc_id in zip(dists_1[0], ids_1[0])
]

results_for_metric_2 = [
    ir_measures.ScoredDoc(real_query_id_2, str(doc_id), score)
    for score, doc_id in zip(dists_2[0], ids_2[0])
]

In [None]:
# Relevance judgments (qrels) of the dataset, narrowed down to the two
# queries under study.
qrels = dataset.qrels


def _judgments_for(query_id):
    """Return every qrel entry whose query_id equals the given one."""
    return [judgment for judgment in qrels if judgment.query_id == query_id]


qrel_1 = _judgments_for(real_query_id_1)
qrel_2 = _judgments_for(real_query_id_2)

In [None]:
# Evaluate MRR@10 separately for each query: every call gets only that
# query's judgments and its own scored results.
aggregate_1 = ir_measures.calc_aggregate([ir_measure], qrel_1, results_for_metric_1)
aggregate_2 = ir_measures.calc_aggregate([ir_measure], qrel_2, results_for_metric_2)

print("Metric evaluation for query 1", aggregate_1)
print("Metric evaluation for query 2", aggregate_2)

### Display Results

In [None]:
# Display the text of the first query (cell output).
query_passage_1

In [None]:
# Display the passage texts retrieved for the first query.
results_1

##### Sparse representations better capture the specific question thanks to word matching.

In [None]:
# Display the text of the second query (cell output).
query_passage_2

In [None]:
# Display the passage texts retrieved for the second query.
results_2

##### In this case, the sparse representations are fooled by a matching word: "kids" is associated with "children", so the result is considered relevant. However, the relevant document, which contains an actual definition of dignity, is absent from the results.