<a href="https://colab.research.google.com/github/shuvanyu/Document-Retrieval-and-Ranking/blob/main/condenser_nfcorpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

In [1]:
!pip install sentence-transformers > /dev/null
!pip install beir > /dev/null
!pip install tensorflow_text > /dev/null

# Testing BEIR

In [2]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Logging setup: pass INFO-level (and above) records to the imported LoggingHandler
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
#### /logging setup

#### Dataset name, the URL of its zip archive, and the folder the archive is downloaded to
dataset = "nfcorpus"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
out_dir = os.path.join("drive", "MyDrive", "nlp_datashare")

  from tqdm.autonotebook import tqdm


In [3]:

# Fetch the archive at `url` and unzip it, using `out_dir` as the target folder;
# the returned path is handed to the loader below.
data_path = util.download_and_unzip(url, out_dir)
# Load the corpus, the queries and the relevance judgments (qrels) for one split.
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test") # or split = "train" or "dev"


  0%|          | 0/3633 [00:00<?, ?it/s]

In [4]:
# Peek at one document id: the fourth key of the corpus in sorted order.
sorted(corpus)[3]

'MED-1003'

# Co-condenser

In [5]:
# Number of documents in the loaded corpus.
len(corpus)

3633

In [6]:
# Identifier of the pretrained checkpoint handed to BEIR's SentenceBERT wrapper.
model_path = "sentence-transformers/msmarco-bert-co-condensor"
model = models.SentenceBERT(model_path=model_path)
# NOTE: `normalize` is only referenced by the commented-out "Random Query"
# cells at the end of this notebook; the retrieval cells below do not read it.
normalize = True


In [7]:
# Wrap the SentenceBERT encoder in BEIR's exact-search dense retriever.
# The wrapper gets its own name instead of rebinding `model`: re-running this
# cell would otherwise wrap an already-wrapped object, and `model` would stop
# referring to the encoder created in the previous cell.
dense_search = DRES(model, batch_size=16)
retriever = EvaluateRetrieval(dense_search, score_function="dot") # or "cos_sim" for cosine similarity
# Run every query against the corpus; `results` is keyed by query id
# (see the sampling cell further down).
results = retriever.retrieve(corpus, queries)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/228 [00:00<?, ?it/s]

In [8]:
# Score the retrieved results against the relevance judgments (qrels) at each
# cutoff in retriever.k_values; the call returns four metric dicts.
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

In [9]:
# NDCG@k values returned by retriever.evaluate, one entry per cutoff k.
ndcg

{'NDCG@1': 0.43498,
 'NDCG@3': 0.37308,
 'NDCG@5': 0.34707,
 'NDCG@10': 0.31806,
 'NDCG@100': 0.28839,
 'NDCG@1000': 0.37973}

In [10]:
# MAP@k values returned by retriever.evaluate, one entry per cutoff k.
_map

{'MAP@1': 0.05594,
 'MAP@3': 0.0861,
 'MAP@5': 0.10042,
 'MAP@10': 0.11702,
 'MAP@100': 0.14415,
 'MAP@1000': 0.15778}

In [11]:
# Recall@k values returned by retriever.evaluate, one entry per cutoff k.
recall

{'Recall@1': 0.05594,
 'Recall@3': 0.09794,
 'Recall@5': 0.12224,
 'Recall@10': 0.15288,
 'Recall@100': 0.28486,
 'Recall@1000': 0.6152}

In [12]:
# P@k (precision) values returned by retriever.evaluate, one entry per cutoff k.
precision

{'P@1': 0.44892,
 'P@3': 0.34675,
 'P@5': 0.29474,
 'P@10': 0.23158,
 'P@100': 0.07241,
 'P@1000': 0.02027}

In [13]:
import random

#### Sample one query reproducibly; its best-scoring documents are printed further down ####
random.seed(250)
top_k = 10  # how many ranked documents to print

# Draw the query id first, then look up its {doc_id: score} mapping.
query_id = random.choice(list(results))
ranking_scores = results[query_id]


In [14]:
# ID of the randomly sampled query.
query_id

'PLAIN-2177'

In [15]:
# Number of documents that received a score for the sampled query.
len(ranking_scores)

1001

In [16]:
# Order the sampled query's (doc_id, score) pairs from highest to lowest score.
scores_sorted = sorted(
    ranking_scores.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print("Query : %s\n" % queries[query_id])

# Never print more entries than there are scored documents.
n_shown = min(top_k, len(ranking_scores))
for rank in range(n_shown):
    doc_id = scores_sorted[rank][0]
    doc = corpus[doc_id]
    # Format: Rank x: ID [Title] Body
    print("Rank %d: %s [%s] - %s\n" % (rank+1, doc_id, doc.get("title"), doc.get("text")))

Query : sulfur

Rank 1: MED-3282 [Generation of gaseous sulfur-containing compounds in tumour tissue and suppression of gas diffusion as an antitumour treatment.] - BACKGROUND AND AIMS: The mechanisms of cancer cell growth and metastasis are still not entirely understood, especially from the viewpoint of chemical reactions in tumours. Glycolytic metabolism is markedly accelerated in cancer cells, causing the accumulation of glucose (a reducing sugar) and methionine (an amino acid), which can non-enzymatically react and form carcinogenic substances. There is speculation that this reaction produces gaseous sulfur-containing compounds in tumour tissue. The aims of this study were to clarify the products in tumour and to investigate their effect on tumour proliferation. METHODS: Products formed in the reaction between glucose and methionine or its metabolites were analysed in vitro using gas chromatography. Flatus samples from patients with colon cancer and exhaled air samples from patient

# Random Query

In [17]:
# import torch

In [18]:
# if normalize:
#     corpus_embs = model.encode_corpus(reduced_corpus, batch_size=128, convert_to_tensor=True, normalize_embeddings=True)
# else:
#     corpus_embs = model.encode_corpus(reduced_corpus, batch_size=128, convert_to_tensor=True)

In [19]:
# query = ''

In [20]:
# start = datetime.datetime.now()
# if normalize:
#     query_emb = model.encode_queries([query], batch_size=1, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
# else:
#     query_emb = model.encode_queries([query], batch_size=1, convert_to_tensor=True, show_progress_bar=False)

# #### Dot product for normalized embeddings is equal to cosine similarity
# sim_scores = utils.dot_score(query_emb, corpus_embs)
# sim_scores_top_k_values, sim_scores_top_k_idx = torch.topk(sim_scores, 10, dim=1, largest=True, sorted=True)
# end = datetime.datetime.now()

# #### Measuring time taken in ms (milliseconds)
# time_taken = (end - start)
# time_taken = time_taken.total_seconds() * 1000
# time_taken_all[query_id] = time_taken
# print("{}: {} {:.2f}ms".format(query_id, query, time_taken))