Training is done in the `distilbert+bm25_msmarco.ipynb` notebook

# Test

In [1]:
max_seq_length = 512
model_name = "distilbert-base-uncased" 
dataset = "msmarco_tiny"

dataset_path = "../beir/datasets/msmarco_tiny/"
corpus_file = "tiny_collection.json"
queries_file = "topics.dl20.txt"
qrels_test_file = "qrels.dl20-passage.txt"
training_set = "msmarco_triples.train.tiny.tsv"

In [2]:

from sentence_transformers import losses, models, SentenceTransformer
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.train import TrainRetriever
import pathlib, os, tqdm
import logging
import pickle


#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

  from tqdm.autonotebook import tqdm, trange


In [18]:
# Loading test set
# corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

import collections
import pytrec_eval
import json

def load_queries(path):
    """Returns a dictionary whose keys are query ids and values are query texts."""
    queries = {}
    with open(path) as f:
        for line in f:
            query_id, query_text = line.strip().split('\t')
            queries[query_id] = query_text
    return queries


def load_qrels(path):
    with open(path, 'r') as f_qrel:
        qrels = pytrec_eval.parse_qrel(f_qrel)

    return qrels


def load_corpus_json(path):
    with open(path, 'r') as corpus_f:
        corpus_json = json.load(corpus_f)
    return corpus_json


qrels = load_qrels(f"{dataset_path}{qrels_test_file}")
queries = load_queries(f"{dataset_path}{queries_file}")
corpus = load_corpus_json(f"{dataset_path}{corpus_file}")

In [19]:
# Load BM25 scores

with open(f"{dataset_path}{dataset}_bm25_scores.pickle", 'rb') as f:
    results_bm25 = pickle.load(f)

In [20]:
# Load distilbert scores

with open(f"{dataset_path}{dataset}_distilBertR_scores.pickle", 'rb') as f: #_distilBertBM_scores
    results_dense = pickle.load(f)

In [21]:
def get_maxmin(results):
    max_score = -1
    min_score = 999999
    for q_id, q in results.items():
        for doc_id, score in q.items():
            max_score = max(score, max_score)
            min_score = min(score, min_score)

    return min_score, max_score

# Get range to normalize both
min_distilbert_score, max_distilbert_score = get_maxmin(results_dense)
min_bm25_score, max_bm25_score = get_maxmin(results_bm25)

min_distilbert_score, max_distilbert_score, min_bm25_score, max_bm25_score

(0.2672949731349945, 0.9520302414894104, 4.215731, 49.542587)

In [22]:
# Normalize
def normalize_results(results, min_score, max_score):
    for q_id, q in results.items():
        for doc_id, score in q.items():
            results[q_id][doc_id] = (score-min_score)/(max_score-min_score)

    return results

results_dense = normalize_results(results_dense, min_distilbert_score, max_distilbert_score)
results_bm25 = normalize_results(results_bm25, min_bm25_score, max_bm25_score)
# results

In [23]:
def ensemble_score(x,y):
    mu = 0.5
    return mu*x + (1-mu)*y

combined_result = {}

for q_id_1, q_1 in results_dense.items():
        combined_result[q_id_1] = {}
        for doc_id_1, score_1 in q_1.items():
            
            score_2 = 0
            if results_bm25[q_id_1].get(doc_id_1,None)!=None:
                score_2 = results_bm25[q_id_1][doc_id_1]
                del results_bm25[q_id_1][doc_id_1] # So that same query-doc pair is not added to combined result twice
            
            combined_score = ensemble_score(score_1, score_2)
            combined_result[q_id_1][doc_id_1] = combined_score


# Now add remaining bm25 results in combined dict
for q_id_2, q_2 in results_bm25.items():
    for doc_id_2, score_2 in q_2.items():
         score_1 = 0
         combined_score = ensemble_score(score_1, score_2)
         combined_result[q_id_1][doc_id_1] = combined_score

In [24]:
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval

## elasticsearch settings
hostname = "localhost" #localhost
index_name = dataset # scifact
initialize = True # True - Delete existing index and re-index all documents from scratch 

model_bm25 = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
retriever_bm25 = EvaluateRetrieval(model_bm25)


2024-06-02 20:56:49 - Activating Elasticsearch....
2024-06-02 20:56:49 - Elastic Search Credentials: {'hostname': 'localhost', 'index_name': 'msmarco_tiny', 'keys': {'title': 'title', 'body': 'txt'}, 'timeout': 100, 'retry_on_timeout': True, 'maxsize': 24, 'number_of_shards': 'default', 'language': 'english'}
2024-06-02 20:56:49 - Deleting previous Elasticsearch-Index named - msmarco_tiny
2024-06-02 20:56:49 - Unable to create Index in Elastic Search. Reason: ConnectionError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7f \x01dentity\r\n'))) caused by: ProtocolError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7f \x01dentity\r\n')))
2024-06-02 20:56:51 - Creating fresh Elasticsearch-Index named - msmarco_tiny
2024-06-02 20:56:51 - Unable to create Index in Elastic Search. Reason: ConnectionError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7fe\x00t-Length: 117\r\n'))) caused by: ProtocolError(('Conne

In [25]:
ndcg, _map, recall, precision = retriever_bm25.evaluate(qrels, combined_result, retriever_bm25.k_values)

2024-06-02 20:56:52 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-06-02 20:56:52 - 

2024-06-02 20:56:52 - NDCG@1: 0.6697
2024-06-02 20:56:52 - NDCG@3: 0.6383
2024-06-02 20:56:52 - NDCG@5: 0.6230
2024-06-02 20:56:52 - NDCG@10: 0.5963
2024-06-02 20:56:52 - NDCG@100: 0.6119
2024-06-02 20:56:52 - NDCG@1000: 0.6717
2024-06-02 20:56:52 - 

2024-06-02 20:56:52 - MAP@1: 0.0308
2024-06-02 20:56:52 - MAP@3: 0.0784
2024-06-02 20:56:52 - MAP@5: 0.1075
2024-06-02 20:56:52 - MAP@10: 0.1647
2024-06-02 20:56:52 - MAP@100: 0.4013
2024-06-02 20:56:52 - MAP@1000: 0.4484
2024-06-02 20:56:52 - 

2024-06-02 20:56:52 - Recall@1: 0.0308
2024-06-02 20:56:52 - Recall@3: 0.0840
2024-06-02 20:56:52 - Recall@5: 0.1170
2024-06-02 20:56:52 - Recall@10: 0.1900
2024-06-02 20:56:52 - Recall@100: 0.6144
2024-06-02 20:56:52 - Recall@1000: 0.7627
2024-06-02 20:56:52 - 

2024-06-02 20:56:52 - P@1: 0.7963
2024-06-02 20:56:52