<a href="https://colab.research.google.com/github/shuvanyu/Document-Retrieval-and-Ranking/blob/main/condenser_nfcorpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

In [1]:
!pip install sentence-transformers > /dev/null
!pip install beir > /dev/null
!pip install tensorflow_text > /dev/null

# Testing BEIR

In [2]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Just some code to print debug information to stdout
# Configures the root logger: INFO level, timestamped "<time> - <message>" lines,
# emitted through the LoggingHandler imported from beir.
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers (unless force=True is passed) — confirm this takes effect in the notebook runtime.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#### Dataset selection: name of the BEIR dataset, its download URL and the local target folder
dataset = "nfcorpus"
# Archive location: <BEIR datasets base URL>/<dataset>.zip
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
# NOTE(review): this relative path presumably points into a mounted Google Drive;
# no mount step is visible in this notebook — confirm the folder exists before running.
out_dir = os.path.join('drive', 'MyDrive', 'nlp_datashare')

  from tqdm.autonotebook import tqdm


In [3]:

# Fetch the dataset archive into out_dir and unpack it; data_path is what
# download_and_unzip returns and is handed straight to the loader.
data_path = util.download_and_unzip(url, out_dir)

# Load the "test" split ("train" and "dev" are the other options).
loader = GenericDataLoader(data_path)
corpus, queries, qrels = loader.load(split="test")


  0%|          | 0/3633 [00:00<?, ?it/s]

In [4]:
# Peek at one document id: the fourth key of `corpus` in sorted order.
sorted(corpus)[3]

'MED-1003'

# coCondenser (`msmarco-bert-co-condensor`)

In [5]:
# Number of documents loaded into `corpus`.
len(corpus)

3633

In [6]:
# Load the msmarco-bert-co-condensor checkpoint through BEIR's SentenceBERT wrapper.
model_path = "sentence-transformers/msmarco-bert-co-condensor"
model = models.SentenceBERT(model_path=model_path)
# NOTE(review): `normalize` is not referenced by any cell visible here — confirm it is still needed.
normalize = True


In [7]:
# Wrap the sentence encoder in BEIR's DenseRetrievalExactSearch (DRES).
# The isinstance guard keeps this cell idempotent: re-running it must not wrap an
# already-wrapped model in a second DRES layer.
if not isinstance(model, DRES):
    model = DRES(model, batch_size=16)

retriever = EvaluateRetrieval(model, score_function="cos_sim")  # or "dot" for dot-product similarity

# `results` is keyed by query id; each value maps document ids to similarity scores.
results = retriever.retrieve(corpus, queries)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/228 [00:00<?, ?it/s]

In [8]:
from collections import defaultdict
import pandas as pd

# Number of best-scoring documents whose scores are tabulated per query.
TOP_N = 5

# Column name -> list of per-query values; insertion order fixes the column order.
table = defaultdict(list)

for key, doc_scores in results.items():
    # Best scores first. Only the scores are kept; document ids are not tabulated.
    top_scores = sorted(doc_scores.values(), reverse=True)[:TOP_N]
    # Pad with NaN so a query with fewer than TOP_N hits still produces a full row
    # instead of raising IndexError.
    top_scores += [float('nan')] * (TOP_N - len(top_scores))

    table['query'].append(key)
    table['word-count'].append(len(queries[key].split()))
    table['length-of-string'].append(len(queries[key]))
    for rank, score in enumerate(top_scores, start=1):
        table['top-%d' % rank].append(score)

df = pd.DataFrame(table)
# The default integer index is written as well, as an unnamed first column.
df.to_csv('scores.csv')
df

Unnamed: 0,query,word-count,length-of-string,top-1,top-2,top-3,top-4,top-5
0,PLAIN-2,7,48,0.927542,0.916851,0.915573,0.915447,0.907110
1,PLAIN-12,5,35,0.892237,0.889848,0.876666,0.876179,0.875974
2,PLAIN-23,9,56,0.907913,0.903687,0.903596,0.901673,0.901383
3,PLAIN-33,5,41,0.937557,0.926978,0.916476,0.905681,0.900663
4,PLAIN-44,6,37,0.895952,0.894301,0.891559,0.887764,0.886433
...,...,...,...,...,...,...,...,...
318,PLAIN-3432,3,28,0.889595,0.884487,0.883014,0.882512,0.881440
319,PLAIN-3442,3,25,0.899829,0.898578,0.895365,0.894648,0.889610
320,PLAIN-3452,3,24,0.911192,0.910800,0.901904,0.898044,0.895320
321,PLAIN-3462,5,29,0.917739,0.912018,0.901222,0.899708,0.899583


In [9]:
# Score the retrieved rankings in `results` against `qrels` at every cutoff in
# retriever.k_values; returns one dict per metric family.
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

In [10]:
# NDCG at each evaluated cutoff k.
ndcg

{'NDCG@1': 0.31424,
 'NDCG@3': 0.26734,
 'NDCG@5': 0.24832,
 'NDCG@10': 0.22291,
 'NDCG@100': 0.21449,
 'NDCG@1000': 0.30686}

In [11]:
# MAP at each evaluated cutoff k.
_map

{'MAP@1': 0.0326,
 'MAP@3': 0.05217,
 'MAP@5': 0.06111,
 'MAP@10': 0.07113,
 'MAP@100': 0.08904,
 'MAP@1000': 0.09989}

In [12]:
# Recall at each evaluated cutoff k.
recall

{'Recall@1': 0.0326,
 'Recall@3': 0.06202,
 'Recall@5': 0.08132,
 'Recall@10': 0.10277,
 'Recall@100': 0.24231,
 'Recall@1000': 0.56055}

In [13]:
# Precision (P@k) at each evaluated cutoff k.
precision

{'P@1': 0.33127,
 'P@3': 0.2549,
 'P@5': 0.21734,
 'P@10': 0.16656,
 'P@100': 0.05375,
 'P@1000': 0.01802}

In [14]:
import random

# Fixed seed so the same query is sampled on every run.
random.seed(250)

# How many of the best-ranked documents to print for the sampled query.
top_k = 5

# Draw one query id at random, then look up its {doc_id: score} mapping.
query_id = random.choice(list(results))
ranking_scores = results[query_id]


In [15]:
# Id of the randomly sampled query.
query_id

'PLAIN-2177'

In [16]:
# Number of documents that received a score for the sampled query.
len(ranking_scores)

1001

In [17]:
# Order the sampled query's (doc_id, score) pairs from highest to lowest score.
scores_sorted = sorted(ranking_scores.items(), key=lambda item: item[1], reverse=True)
print("Query : %s\n" % queries[query_id])

# Print at most top_k documents; the slice simply stops early if fewer were scored.
for rank, (doc_id, _score) in enumerate(scores_sorted[:top_k], start=1):
    doc = corpus[doc_id]
    # Format: Rank x: ID [Title] - Body
    print("Rank %d: %s [%s] - %s\n" % (rank, doc_id, doc.get("title"), doc.get("text")))

Query : sulfur

Rank 1: MED-1185 [Sulfite sensitivity: significance in human health.] - Endogenous sulfite is generated as a consequence of the body's normal processing of sulfur-containing amino acids. Sulfites occur as a consequence of fermentation and also occur naturally in a number of foods and beverages. As food additives, sulfiting agents were first used in 1664 and approved in the United States as long ago as the 1800s. With such long experience with their use, it is easy to understand why these substances have been regarded as safe. They are currently used for a variety of preservative properties, including controlling microbial growth, preventing browning and spoilage, and bleaching some foods. It is estimated that up to 500,000 (< .05% of the population) sulfite-sensitive individuals live in the United States. Sulfite sensitivity occurs most often in asthmatic adults--predominantly women; it is uncommonly reported in preschool children. Adverse reactions to sulfites in nonas