In [1]:
# Name of the dataset; also the directory name under ../datasets/.
dataset = "msmarco_tiny"

# Directory holding every input file for this notebook (trailing slash is
# required because the paths below are built by plain concatenation).
dataset_path = f"../datasets/{dataset}/"

# Individual file names, all relative to `dataset_path`.
corpus_file = "tiny_collection.json"
queries_file = "topics.dl20.txt"
qrels_test_file = "qrels.dl20-passage.txt"
training_set = "msmarco_triples.train.tiny.tsv"

In [2]:
# Alternative (not used below): load a BEIR-formatted dataset with BEIR's own loader.
# from beir.datasets.data_loader import GenericDataLoader

# data_path = f"../datasets/{dataset}"
# corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")  # split may also be "train" or "dev"

In [3]:
import collections
import pytrec_eval
import json

def load_queries(path):
    """Load a tab-separated query file into a dictionary.

    Each non-blank line must have the form ``<query_id><TAB><query_text>``.
    Leading/trailing whitespace on a line is stripped and blank lines are
    skipped.

    Args:
        path: Path to the UTF-8 encoded query file.

    Returns:
        dict mapping query id (str) to query text (str).

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    queries = {}
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            # Split on the first tab only so tabs inside the text are kept.
            query_id, query_text = line.split('\t', 1)
            queries[query_id] = query_text
    return queries


def load_corpus_tsv(path):
    """Load a tab-separated passage collection into a dictionary.

    Each non-blank line must have the form ``<passage_id><TAB><passage_text>``.
    Leading/trailing whitespace on a line is stripped and blank lines are
    skipped.

    Args:
        path: Path to the UTF-8 encoded collection file.

    Returns:
        dict mapping passage id (str) to ``{'text': passage_text}``.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    corpus = {}
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            # Split on the first tab only so tabs inside the passage are kept.
            passage_id, passage_text = line.split('\t', 1)
            corpus[passage_id] = {'text': passage_text}
    return corpus

def load_corpus_json(path):
    """Load a corpus stored as a single JSON document.

    Args:
        path: Path to the UTF-8 encoded JSON file.

    Returns:
        The deserialized JSON content, returned as-is (no schema validation
        is performed here).
    """
    # JSON is UTF-8 by specification; do not rely on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as corpus_f:
        corpus_json = json.load(corpus_f)
    return corpus_json


def load_qrels(path):
    """Parse a qrels file with ``pytrec_eval.parse_qrel`` and return the result.

    The file at ``path`` is opened in text mode and handed to pytrec_eval
    unchanged; the returned structure is whatever ``parse_qrel`` produces.
    """
    with open(path, 'r') as qrel_handle:
        return pytrec_eval.parse_qrel(qrel_handle)


def load_triplets(path):
    """Load training triplets from a tab-separated file.

    Each non-blank line must contain exactly three tab-separated fields:
    query, positive passage, negative passage. Leading/trailing whitespace on
    a line is stripped and blank lines are skipped.

    Args:
        path: Path to the UTF-8 encoded triplets file.

    Returns:
        list of ``[query, positive_passage, negative_passage]`` lists, in
        file order.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    triplets = []
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            # Kept strict (no maxsplit): with three text fields an extra tab
            # is ambiguous, so a malformed line should fail loudly.
            query, positive_passage, negative_passage = line.split('\t')
            triplets.append([query, positive_passage, negative_passage])
    return triplets

# Triplets are training data; BM25 needs no training, so they are not loaded here.
# triplets = load_triplets(f"{dataset_path}{training_set}")

# Relevance judgments and queries for the test topics.
qrels = load_qrels(dataset_path + qrels_test_file)
queries = load_queries(dataset_path + queries_file)

# The corpus is read fully into memory, so announce it before starting.
print("Loading corpus into memory ...")
# For a TSV collection, use load_corpus_tsv(...) instead of the JSON loader.
corpus = load_corpus_json(dataset_path + corpus_file)

Loading corpus into memory ...


In [4]:
# Sanity check: number of entries loaded into the corpus.
len(corpus)

510585

In [5]:
%%bash

# Confirm the Elasticsearch node on localhost:9200 is reachable (prints node/version info as JSON).
curl -sX GET "localhost:9200/"

{
  "name" : "ab",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "JS2mz8c3RdOol8iLxCkFHA",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [6]:
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval

#### Elasticsearch connection settings
hostname = "localhost"
index_name = "msmarco_tiny"
# When True, an existing index with the same name is deleted and every
# document is re-indexed from scratch.
initialize = True

#### Build the BM25 (lexical) searcher and wrap it for retrieval + evaluation
model = BM25(
    hostname=hostname,
    index_name=index_name,
    initialize=initialize,
)
retriever = EvaluateRetrieval(model)

#### Retrieve BM25 results (format of results is identical to qrels)
results = retriever.retrieve(corpus, queries)

  0%|          | 0/510585 [00:00<?, ?docs/s]               
que: 100%|██████████| 1/1 [00:08<00:00,  8.74s/it]


In [7]:
#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
ndcg, _map, recall, precision

({'NDCG@1': 0.56481,
  'NDCG@3': 0.52567,
  'NDCG@5': 0.51208,
  'NDCG@10': 0.48734,
  'NDCG@100': 0.54852,
  'NDCG@1000': 0.67747},
 {'MAP@1': 0.02848,
  'MAP@3': 0.06355,
  'MAP@5': 0.09368,
  'MAP@10': 0.13795,
  'MAP@100': 0.34717,
  'MAP@1000': 0.4222},
 {'Recall@1': 0.02848,
  'Recall@3': 0.06741,
  'Recall@5': 0.10321,
  'Recall@10': 0.16663,
  'Recall@100': 0.57261,
  'Recall@1000': 0.89188},
 {'P@1': 0.7037,
  'P@3': 0.65432,
  'P@5': 0.62593,
  'P@10': 0.54074,
  'P@100': 0.30093,
  'P@1000': 0.0553})