In [1]:
import pandas as pd
from beir.datasets.data_loader import GenericDataLoader
from datetime import datetime
import os
import pickle

  from tqdm.autonotebook import tqdm


In [2]:

# Name of the dataset to evaluate on; the model-loading helpers below also
# read this global to build their checkpoint directory names.
dataset = "scifact"
# Dataset directory, given relative to the notebook's working directory.
dataset_path = f"../datasets/{dataset}"

# Load the "test" split: the corpus, the queries and the relevance judgements.
corpus, queries, qrels = GenericDataLoader(dataset_path).load(split="test")

100%|██████████| 5183/5183 [00:00<00:00, 7510.37it/s] 


In [3]:

def load_distilbert(model_path):
    """Build a dense retriever from a saved SentenceBERT checkpoint.

    Args:
        model_path: Path handed to ``models.SentenceBERT``.

    Returns:
        A ``(model, retriever)`` tuple: the ``DenseRetrievalExactSearch``
        wrapper (batch size 256) and the ``EvaluateRetrieval`` object built on
        top of it with ``score_function="cos_sim"``.
    """
    print("Loading model ...")

    # The BEIR retrieval pieces are imported lazily, i.e. only once a model is
    # actually requested.
    from beir.retrieval.evaluation import EvaluateRetrieval
    from beir.retrieval import models
    from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

    # Wrap the saved encoder in an exact-search dense retriever ...
    encoder = models.SentenceBERT(model_path)
    dense_search = DRES(encoder, batch_size=256)
    # ... and attach the evaluation/retrieval front end to it.
    evaluator = EvaluateRetrieval(dense_search, score_function="cos_sim")

    print("Model loaded ...")
    return dense_search, evaluator


def load_distilbert_random():
    """Load the ``v1`` DistilBERT checkpoint for the current ``dataset``.

    The checkpoint directory is ``../output/distilbert-base-uncased-v1-<dataset>``,
    where ``<dataset>`` is the module-level ``dataset`` name.

    Returns:
        The ``(model, retriever)`` pair produced by ``load_distilbert``.
    """
    base_model = "distilbert-base-uncased"
    checkpoint_name = "{}-v1-{}".format(base_model, dataset)
    checkpoint_dir = os.path.join("../", "output", checkpoint_name)
    return load_distilbert(checkpoint_dir)


def load_distilbert_bm25():
    """Load the ``v2`` BM25-hard-negatives DistilBERT checkpoint for ``dataset``.

    The checkpoint directory is
    ``<cwd>/../output/distilbert-base-uncased-v2-<dataset>-bm25-hard-negs``,
    where ``<dataset>`` is the module-level ``dataset`` name.

    Returns:
        The ``(model, retriever)`` pair produced by ``load_distilbert``.
    """
    base_model = "distilbert-base-uncased"
    checkpoint_name = "{}-v2-{}-bm25-hard-negs".format(base_model, dataset)
    checkpoint_dir = os.path.join(os.getcwd(), "../output", checkpoint_name)
    return load_distilbert(checkpoint_dir)


def bm25_get_result(qid, score_path="../datasets/scifact/scifact_bm25_scores.pickle"):
    """Return the precomputed BM25 scores for a single query.

    Args:
        qid: Query id to look up in the pickled score table.
        score_path: Pickle file holding the score table for all queries.
            Defaults to the SciFact score file, so existing one-argument
            calls keep working; pass another path to use a different table.

    Returns:
        ``{qid: scores}``, where ``scores`` is the object the pickle stores
        under ``qid``.

    Raises:
        KeyError: If ``qid`` is not present in the score table.
        OSError: If ``score_path`` cannot be opened.
    """
    # SECURITY: unpickling can execute arbitrary code. Only point score_path
    # at files that were produced by a trusted process.
    #
    # The file is re-read on every call, so each call hands back freshly
    # created objects that the caller may modify freely.
    with open(score_path, 'rb') as f:
        results_bm25 = pickle.load(f)

    return {qid: results_bm25[qid]}


def get_distilbert_r_result(qid, retriever):
    """Run dense retrieval over ``corpus`` for one query, echoing the query dict.

    Args:
        qid: Id of the query; its text is read from the global ``queries``.
        retriever: Object providing ``retrieve(corpus, queries)``.

    Returns:
        Whatever ``retriever.retrieve`` returns for the submitted queries.
    """
    # NOTE(review): an extra empty query with id '0' is always submitted with
    # the real one; the reason is not visible here - presumably a workaround
    # for single-query input. Confirm before removing.
    query_batch = {qid: queries[qid]}
    query_batch['0'] = ""
    print(query_batch)
    return retriever.retrieve(corpus, query_batch)


def get_distilbert_bm25_result(qid, retriever):
    """Run dense retrieval over ``corpus`` for one query.

    Args:
        qid: Id of the query; its text is read from the global ``queries``.
        retriever: Object providing ``retrieve(corpus, queries)``.

    Returns:
        Whatever ``retriever.retrieve`` returns for the submitted queries.
    """
    query_text = queries[qid]
    # NOTE(review): the empty query '0' rides along with the real one; its
    # purpose is not evident from this code - confirm before removing.
    query_batch = {qid: query_text, '0': ""}
    return retriever.retrieve(corpus, query_batch)


In [4]:

def get_maxmin(results):
    """Return the smallest and largest score found anywhere in ``results``.

    Args:
        results: Mapping ``{query_id: {doc_id: score}}``.

    Returns:
        ``(min_score, max_score)`` over every score of every query. When
        ``results`` holds no scores at all, ``(inf, -inf)`` is returned.
    """
    all_scores = [
        score
        for doc_scores in results.values()
        for score in doc_scores.values()
    ]
    if not all_scores:
        # Nothing to compare: return the identity elements of min and max.
        return float("inf"), float("-inf")

    # Use the real extrema instead of fixed sentinels (-1 / 999999), which gave
    # wrong bounds for scores below -1 or above 999999.
    return min(all_scores), max(all_scores)

def ensemble_score(x, y, mu=0.5):
    """Linearly interpolate between two scores.

    Args:
        x: First score, weighted by ``mu``.
        y: Second score, weighted by ``1 - mu``.
        mu: Interpolation weight. The default of 0.5 gives both scores equal
            weight, matching the previously hard-coded value.

    Returns:
        ``mu * x + (1 - mu) * y``.
    """
    return mu * x + (1 - mu) * y

def normalize_results(results, min_score, max_score):
    """Min-max normalise every score in ``results`` in place.

    Each score ``s`` is replaced by ``(s - min_score) / (max_score - min_score)``.

    Args:
        results: Mapping ``{query_id: {doc_id: score}}``; modified in place.
        min_score: Score that should map to 0.
        max_score: Score that should map to 1.

    Returns:
        The same ``results`` object, now holding the normalised scores. When
        ``max_score == min_score`` every score becomes ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    score_range = max_score - min_score

    for doc_scores in results.values():
        # Only values of existing keys are reassigned, so the dict size never
        # changes while it is being iterated.
        for doc_id, score in doc_scores.items():
            if score_range:
                doc_scores[doc_id] = (score - min_score) / score_range
            else:
                # Degenerate range (e.g. a single document or all-equal
                # scores): there is no spread to normalise against.
                doc_scores[doc_id] = 0.0

    return results

def ensemble_distilbert_bm25(qid, retriever):
    """Fuse dense-retriever scores and BM25 scores for one query.

    Both result sets are min-max normalised independently and then combined
    per document with ``ensemble_score``. A document that appears in only one
    of the two result sets gets a score of 0 for the side it is missing from.

    Args:
        qid: Id of the query; its text is read from the global ``queries``.
        retriever: Object whose ``retrieve(corpus, queries)`` is expected to
            return ``{query_id: {doc_id: score}}``.

    Returns:
        ``{query_id: {doc_id: combined_score}}`` covering every document that
        occurs in the dense result, the BM25 result, or both.
    """
    #### Retrieve dense results (format of results is identical to qrels)
    # NOTE(review): an extra empty query '0' is submitted with the real one;
    # the reason is not visible here - presumably a workaround for
    # single-query input. Its dense results are part of the returned dict and
    # of the dense min/max range. Confirm before removing.
    new_query = {qid: queries[qid], '0': ""}
    dense_result = retriever.retrieve(corpus, new_query)
    bm25_result = bm25_get_result(qid)

    # Bring both score sets onto a common scale before mixing them.
    min_distilbert_score, max_distilbert_score = get_maxmin(dense_result)
    min_bm25_score, max_bm25_score = get_maxmin(bm25_result)

    dense_result = normalize_results(dense_result, min_distilbert_score, max_distilbert_score)
    bm25_result = normalize_results(bm25_result, min_bm25_score, max_bm25_score)

    combined_result = {}

    # Pass 1: every document the dense retriever returned, paired with its
    # BM25 score when one exists (0 otherwise).
    for q_id, dense_docs in dense_result.items():
        bm25_docs = bm25_result.get(q_id, {})
        combined_result[q_id] = {
            doc_id: ensemble_score(dense_score, bm25_docs.get(doc_id, 0))
            for doc_id, dense_score in dense_docs.items()
        }

    # Pass 2: documents only BM25 returned. These must be stored under their
    # own query/doc ids; writing them to the leftover loop variables of pass 1
    # would drop them and overwrite the last dense entry instead.
    for q_id, bm25_docs in bm25_result.items():
        merged_docs = combined_result.setdefault(q_id, {})
        for doc_id, bm25_score in bm25_docs.items():
            if doc_id not in merged_docs:
                merged_docs[doc_id] = ensemble_score(0, bm25_score)

    return combined_result




In [5]:
# Use the first query of the loaded split as the running example. Only the
# first item is pulled from the dict instead of copying every query into a
# tuple just to index element 0.
qid, query_text = next(iter(queries.items()))
qid, query_text

('1', '0-dimensional biomaterials show inductive properties.')

In [6]:
# Fetch the precomputed BM25 scores for the example query.
results = bm25_get_result(qid)

In [11]:
# Show the first ten document ids of the example query's result, in the order
# the result dict stores them.
list(results[qid])[:10]

['10608397',
 '18953920',
 '43385013',
 '34386619',
 '14827874',
 '121581019',
 '42421723',
 '21257564',
 '40212412',
 '31543713']

In [7]:
# model, retriever = load_distilbert_random()
# results = get_distilbert_r_result(qid, retriever)

In [8]:
# model, retriever = load_distilbert_bm25()
# results = get_distilbert_bm25_result(qid, retriever)
# ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
# ndcg, _map, recall, precision

In [43]:
# model, retriever = load_distilbert_random()
# results = ensemble_distilbert_bm25(qid, retriever)
# ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
# ndcg, _map, recall, precision

Batches: 100%|██████████| 1/1 [00:00<00:00, 107.28it/s]
Batches: 100%|██████████| 21/21 [00:34<00:00,  1.66s/it]


({'NDCG@1': 0.0,
  'NDCG@3': 0.0,
  'NDCG@5': 0.0,
  'NDCG@10': 0.0,
  'NDCG@100': 0.25,
  'NDCG@1000': 0.25},
 {'MAP@1': 0.0,
  'MAP@3': 0.0,
  'MAP@5': 0.0,
  'MAP@10': 0.0,
  'MAP@100': 0.06667,
  'MAP@1000': 0.06667},
 {'Recall@1': 0.0,
  'Recall@3': 0.0,
  'Recall@5': 0.0,
  'Recall@10': 0.0,
  'Recall@100': 1.0,
  'Recall@1000': 1.0},
 {'P@1': 0.0,
  'P@3': 0.0,
  'P@5': 0.0,
  'P@10': 0.0,
  'P@100': 0.01,
  'P@1000': 0.001})

In [9]:
# Load the v2 (BM25 hard-negatives) checkpoint, build the dense + BM25
# ensemble for the example query, and score it against the relevance
# judgements at the retriever's configured cut-offs (retriever.k_values).
model, retriever = load_distilbert_bm25()
results = ensemble_distilbert_bm25(qid, retriever)
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
ndcg, _map, recall, precision

Loading model ...
Model loaded ...


Batches: 100%|██████████| 1/1 [00:05<00:00,  5.16s/it]
Batches: 100%|██████████| 21/21 [00:36<00:00,  1.72s/it]


({'NDCG@1': 0.0,
  'NDCG@3': 0.0,
  'NDCG@5': 0.0,
  'NDCG@10': 0.31546,
  'NDCG@100': 0.31546,
  'NDCG@1000': 0.31546},
 {'MAP@1': 0.0,
  'MAP@3': 0.0,
  'MAP@5': 0.0,
  'MAP@10': 0.125,
  'MAP@100': 0.125,
  'MAP@1000': 0.125},
 {'Recall@1': 0.0,
  'Recall@3': 0.0,
  'Recall@5': 0.0,
  'Recall@10': 1.0,
  'Recall@100': 1.0,
  'Recall@1000': 1.0},
 {'P@1': 0.0,
  'P@3': 0.0,
  'P@5': 0.0,
  'P@10': 0.1,
  'P@100': 0.01,
  'P@1000': 0.001})

In [11]:
# Display the text of query '56'.
queries['56']

'APOE4 expression in iPSC-derived neurons increases AlphaBeta production and tau phosphorylation causing GABA neuron degeneration.'

In [12]:
# Display the corpus entry stored under document id '4709641'.
corpus['4709641']

{'text': "Efforts to develop drugs for Alzheimer's disease (AD) have shown promise in animal studies, only to fail in human trials, suggesting a pressing need to study AD in human model systems. Using human neurons derived from induced pluripotent stem cells that expressed apolipoprotein E4 (ApoE4), a variant of the APOE gene product and the major genetic risk factor for AD, we demonstrated that ApoE4-expressing neurons had higher levels of tau phosphorylation, unrelated to their increased production of amyloid-β (Aβ) peptides, and that they displayed GABAergic neuron degeneration. ApoE4 increased Aβ production in human, but not in mouse, neurons. Converting ApoE4 to ApoE3 by gene editing rescued these phenotypes, indicating the specific effects of ApoE4. Neurons that lacked APOE behaved similarly to those expressing ApoE3, and the introduction of ApoE4 expression recapitulated the pathological phenotypes, suggesting a gain of toxic effects from ApoE4. Treatment of ApoE4-expressing neu