# Train

In [7]:
max_seq_length = 350
model_name = "distilbert-base-uncased" 
dataset = "scifact"
data_path = f"../beir/datasets/{dataset}"


In [8]:

from sentence_transformers import losses, models, SentenceTransformer
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.train import TrainRetriever
import pathlib, os, tqdm
import logging

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Evaluate-distilBert

In [9]:
import os

model_save_path = os.path.join("../", "output", "{}-v1-{}".format(model_name, dataset)) 

# R: os.path.join("../", "output", "{}-v1-{}".format(model_name, dataset)) 
# BM: os.path.join(os.getcwd(), "../output", "{}-v2-{}-bm25-hard-negs".format(model_name, dataset)) 

In [10]:
# Loading test set
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

2024-06-02 21:00:48 - Loading Corpus...


100%|██████████| 5183/5183 [00:00<00:00, 129640.80it/s]

2024-06-02 21:00:48 - Loaded 5183 TEST Documents.
2024-06-02 21:00:48 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers




In [12]:
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

## Load retriever from saved model

model = DRES(models.SentenceBERT(model_save_path), batch_size=128)
retriever = EvaluateRetrieval(model, score_function="cos_sim")

#### Retrieve dense results (format of results is identical to qrels)
results = retriever.retrieve(corpus, queries)

2024-06-02 21:06:59 - Use pytorch device_name: cuda
2024-06-02 21:06:59 - Load pretrained SentenceTransformer: ../output/distilbert-base-uncased-v1-scifact
2024-06-02 21:07:01 - Encoding Queries...


Batches: 100%|██████████| 3/3 [00:00<00:00,  5.80it/s]


2024-06-02 21:07:01 - Sorting Corpus by document length (Longest first)...
2024-06-02 21:07:01 - Scoring Function: Cosine Similarity (cos_sim)
2024-06-02 21:07:01 - Encoding Batch 1/1...


Batches: 100%|██████████| 41/41 [00:10<00:00,  3.94it/s]


In [55]:
# import json

# output_path = "../output/"
# with open(f"{output_path}{dataset}_distilBert_results.json", 'w') as f:
#     json.dump(results, f)

In [12]:
# import json

# output_path = "../output/"
# with open(f"{output_path}{dataset}_distilBert_results.json", 'r') as f:
#     results = json.load(f)

In [13]:
#### Evaluate your retrieval using NDCG@k, MAP@K ...
logging.info("Retriever evaluation for k in: {}".format(retriever.k_values))
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
ndcg, _map, recall, precision

2024-06-02 21:07:22 - Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]
2024-06-02 21:07:22 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-06-02 21:07:22 - 

2024-06-02 21:07:22 - NDCG@1: 0.5000
2024-06-02 21:07:22 - NDCG@3: 0.5845
2024-06-02 21:07:22 - NDCG@5: 0.6121
2024-06-02 21:07:22 - NDCG@10: 0.6354
2024-06-02 21:07:22 - NDCG@100: 0.6641
2024-06-02 21:07:22 - NDCG@1000: 0.6725
2024-06-02 21:07:22 - 

2024-06-02 21:07:22 - MAP@1: 0.4772
2024-06-02 21:07:22 - MAP@3: 0.5551
2024-06-02 21:07:22 - MAP@5: 0.5741
2024-06-02 21:07:22 - MAP@10: 0.5847
2024-06-02 21:07:22 - MAP@100: 0.5912
2024-06-02 21:07:22 - MAP@1000: 0.5915
2024-06-02 21:07:22 - 

2024-06-02 21:07:22 - Recall@1: 0.4772
2024-06-02 21:07:22 - Recall@3: 0.6464
2024-06-02 21:07:22 - Recall@5: 0.7123
2024-06-02 21:07:22 - Recall@10: 0.7797
2024-06-02 21:07:22 - Recall@100: 0.9073
2024-06-02 21:07:22 - Recall@1000: 0.9767

({'NDCG@1': 0.5,
  'NDCG@3': 0.58448,
  'NDCG@5': 0.61214,
  'NDCG@10': 0.63538,
  'NDCG@100': 0.66406,
  'NDCG@1000': 0.67247},
 {'MAP@1': 0.47722,
  'MAP@3': 0.55514,
  'MAP@5': 0.57411,
  'MAP@10': 0.58474,
  'MAP@100': 0.59124,
  'MAP@1000': 0.59148},
 {'Recall@1': 0.47722,
  'Recall@3': 0.64639,
  'Recall@5': 0.71233,
  'Recall@10': 0.77967,
  'Recall@100': 0.90733,
  'Recall@1000': 0.97667},
 {'P@1': 0.5,
  'P@3': 0.23222,
  'P@5': 0.15733,
  'P@10': 0.08733,
  'P@100': 0.01023,
  'P@1000': 0.00111})

# Evaluate-bm25

In [16]:
data_path

'../beir/datasets/scifact'

In [17]:
# Load BM25 scores
import pickle 

with open(f"{data_path}/{dataset}_bm25_scores.pickle", 'rb') as f:
    results_bm25 = pickle.load(f)

In [20]:
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval

## elasticsearch settings
hostname = "localhost" #localhost
index_name = dataset+'_2' # scifact
initialize = True # True - Delete existing index and re-index all documents from scratch 

model_bm25 = BM25(index_name=index_name, hostname=hostname, initialize=initialize)
retriever_bm25 = EvaluateRetrieval(model_bm25)

2024-06-02 21:17:13 - Activating Elasticsearch....
2024-06-02 21:17:13 - Elastic Search Credentials: {'hostname': 'localhost', 'index_name': 'scifact_2', 'keys': {'title': 'title', 'body': 'txt'}, 'timeout': 100, 'retry_on_timeout': True, 'maxsize': 24, 'number_of_shards': 'default', 'language': 'english'}
2024-06-02 21:17:13 - Deleting previous Elasticsearch-Index named - scifact_2
2024-06-02 21:17:13 - Unable to create Index in Elastic Search. Reason: ConnectionError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7fe\x00tity\r\n'))) caused by: ProtocolError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7fe\x00tity\r\n')))
2024-06-02 21:17:15 - Creating fresh Elasticsearch-Index named - scifact_2
2024-06-02 21:17:15 - Unable to create Index in Elastic Search. Reason: ConnectionError(('Connection aborted.', BadStatusLine('ÿ\x00\x00\x00\x00\x00\x00\x00\x01\x7f-\x00ength: 117\r\n'))) caused by: ProtocolError(('Connection aborted.', B

In [21]:
#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg, _map, recall, precision = retriever_bm25.evaluate(qrels, results_bm25, retriever_bm25.k_values)
ndcg, _map, recall, precision

2024-06-02 21:17:18 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-06-02 21:17:18 - 

2024-06-02 21:17:18 - NDCG@1: 0.5767
2024-06-02 21:17:18 - NDCG@3: 0.6366
2024-06-02 21:17:18 - NDCG@5: 0.6652
2024-06-02 21:17:18 - NDCG@10: 0.6906
2024-06-02 21:17:18 - NDCG@100: 0.7134
2024-06-02 21:17:18 - NDCG@1000: 0.7212
2024-06-02 21:17:18 - 

2024-06-02 21:17:18 - MAP@1: 0.5559
2024-06-02 21:17:18 - MAP@3: 0.6143
2024-06-02 21:17:18 - MAP@5: 0.6312
2024-06-02 21:17:18 - MAP@10: 0.6437
2024-06-02 21:17:18 - MAP@100: 0.6492
2024-06-02 21:17:18 - MAP@1000: 0.6495
2024-06-02 21:17:18 - 

2024-06-02 21:17:18 - Recall@1: 0.5559
2024-06-02 21:17:18 - Recall@3: 0.6793
2024-06-02 21:17:18 - Recall@5: 0.7479
2024-06-02 21:17:18 - Recall@10: 0.8198
2024-06-02 21:17:18 - Recall@100: 0.9192
2024-06-02 21:17:18 - Recall@1000: 0.9800
2024-06-02 21:17:18 - 

2024-06-02 21:17:18 - P@1: 0.5767
2024-06-02 21:17:18

({'NDCG@1': 0.57667,
  'NDCG@3': 0.63658,
  'NDCG@5': 0.66524,
  'NDCG@10': 0.69064,
  'NDCG@100': 0.71337,
  'NDCG@1000': 0.7212},
 {'MAP@1': 0.55594,
  'MAP@3': 0.61432,
  'MAP@5': 0.63124,
  'MAP@10': 0.64374,
  'MAP@100': 0.64918,
  'MAP@1000': 0.6495},
 {'Recall@1': 0.55594,
  'Recall@3': 0.67928,
  'Recall@5': 0.74789,
  'Recall@10': 0.81978,
  'Recall@100': 0.91922,
  'Recall@1000': 0.98},
 {'P@1': 0.57667,
  'P@3': 0.24111,
  'P@5': 0.162,
  'P@10': 0.09067,
  'P@100': 0.0104,
  'P@1000': 0.00111})

# Ensemble

In [22]:
def get_maxmin(results):
    max_score = -1
    min_score = 999999
    for q_id, q in results.items():
        for doc_id, score in q.items():
            max_score = max(score, max_score)
            min_score = min(score, min_score)

    return min_score, max_score

# Get range to normalize both
min_distilbert_score, max_distilbert_score = get_maxmin(results)
min_bm25_score, max_bm25_score = get_maxmin(results_bm25)

min_distilbert_score, max_distilbert_score, min_bm25_score, max_bm25_score

(0.08832266181707382, 0.9469413757324219, 0.5297587, 120.60852)

In [23]:
# Normalize
def normalize_results(results, min_score, max_score):
    for q_id, q in results.items():
        for doc_id, score in q.items():
            results[q_id][doc_id] = (score-min_score)/(max_score-min_score)

    return results

results = normalize_results(results, min_distilbert_score, max_distilbert_score)
results_bm25 = normalize_results(results_bm25, min_bm25_score, max_bm25_score)
# results

In [24]:
# results_bm25

In [25]:
def ensemble_score(x,y):
    mu = 0.5
    return mu*x + (1-mu)*y

combined_result = {}

for q_id_1, q_1 in results.items():
        combined_result[q_id_1] = {}
        for doc_id_1, score_1 in q_1.items():
            
            score_2 = 0
            if results_bm25[q_id_1].get(doc_id_1,None)!=None:
                score_2 = results_bm25[q_id_1][doc_id_1]
                del results_bm25[q_id_1][doc_id_1] # So that same query-doc pair is not added to combined result twice
            
            combined_score = ensemble_score(score_1, score_2)
            combined_result[q_id_1][doc_id_1] = combined_score


# Now add remaining bm25 results in combined dict
for q_id_2, q_2 in results_bm25.items():
    for doc_id_2, score_2 in q_2.items():
         score_1 = 0
         combined_score = ensemble_score(score_1, score_2)
         combined_result[q_id_1][doc_id_1] = combined_score

In [26]:
ndcg, _map, recall, precision = retriever_bm25.evaluate(qrels, combined_result, retriever_bm25.k_values)

2024-06-02 21:18:07 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-06-02 21:18:07 - 

2024-06-02 21:18:07 - NDCG@1: 0.6167
2024-06-02 21:18:07 - NDCG@3: 0.6937
2024-06-02 21:18:07 - NDCG@5: 0.7175
2024-06-02 21:18:07 - NDCG@10: 0.7391
2024-06-02 21:18:07 - NDCG@100: 0.7560
2024-06-02 21:18:07 - NDCG@1000: 0.7605
2024-06-02 21:18:07 - 

2024-06-02 21:18:07 - MAP@1: 0.5958
2024-06-02 21:18:07 - MAP@3: 0.6672
2024-06-02 21:18:07 - MAP@5: 0.6823
2024-06-02 21:18:07 - MAP@10: 0.6932
2024-06-02 21:18:07 - MAP@100: 0.6973
2024-06-02 21:18:07 - MAP@1000: 0.6975
2024-06-02 21:18:07 - 

2024-06-02 21:18:07 - Recall@1: 0.5958
2024-06-02 21:18:07 - Recall@3: 0.7508
2024-06-02 21:18:07 - Recall@5: 0.8084
2024-06-02 21:18:07 - Recall@10: 0.8681
2024-06-02 21:18:07 - Recall@100: 0.9430
2024-06-02 21:18:07 - Recall@1000: 0.9767
2024-06-02 21:18:07 - 

2024-06-02 21:18:07 - P@1: 0.6167
2024-06-02 21:18:07