In [1]:
import ir_datasets
import pickle
import string

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Load the NFCorpus dataset through ir_datasets, using the "beir/nfcorpus/test"
# dataset ID. The returned object is the source of the documents, queries and
# relevance judgements iterated in the cells below.
dataset = ir_datasets.load("beir/nfcorpus/test")

In [3]:
# Common setup for data pre-processing: a WordNet lemmatizer and the English
# stop-word list, both read as module-level globals by clean_text().
# The stop words are stored in a set so membership tests are cheap.
# NOTE(review): these presumably require the NLTK "wordnet" and "stopwords"
# corpora (plus the tokenizer models used by word_tokenize) to be installed;
# no nltk.download(...) call is visible in this notebook - confirm.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [4]:
# Data pre-processing (Used only for LSI)
def clean_text(sentence):
    """Normalise a piece of text into a space-separated string of lemmas.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. lower-case the result and split it into tokens with ``word_tokenize``;
      3. discard tokens found in ``stop_words`` and tokens of length <= 2;
      4. lemmatize each surviving token (no POS tag is passed, so the
         lemmatizer's default applies);
      5. join the lemmas with single spaces.

    Relies on the module-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        sentence: the raw text (a ``str``) to clean.

    Returns:
        The cleaned text as a single string; empty if no token survives.
    """
    # NOTE(review): punctuation is removed *before* tokenizing, so hyphenated
    # words are fused ("follow-up" -> "followup") and contractions lose their
    # apostrophe ("don't" -> "dont"), which presumably stops them matching the
    # stop-word list - confirm this is intended.
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    stripped = sentence.translate(punctuation_stripper)

    tokens = word_tokenize(stripped.lower())

    # Filter and lemmatize in a single pass over the tokens.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if len(token) > 2 and token not in stop_words
    ]

    return " ".join(lemmas)

### Documents

In [5]:
# Document index: doc_id -> {"text": raw text, "clean_text": pre-processed text}.
# Populated from dataset.docs_iter() in the following cell.
docs = {}

In [12]:
# Build the document index: doc_id -> {"text": raw text, "clean_text": text
# after clean_text() pre-processing}.
# The dict is (re)created in this cell so that re-running it always yields an
# index that reflects only the currently loaded `dataset`, instead of adding to
# whatever a previous run left in `docs`.
docs = {}
for doc in dataset.docs_iter():
    docs[doc.doc_id] = {"text": doc.text, "clean_text": clean_text(doc.text)}

In [13]:
# Sanity check: display the raw (un-processed) text stored for document 'MED-10'
docs['MED-10']['text']

'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants died, of which 3,619 (60.2%) was due to breast cancer. After adjustment for age, tumor characteri

In [14]:
# Sanity check: number of documents held in the index
len(docs)

3633

In [15]:
# Save documents to file: pickle the whole `docs` dict to ../dataset/documents.pkl.
# The path is relative to the current working directory; "wb" mode overwrites
# any existing file. open() does not create missing directories, so
# ../dataset must already exist.
with open("../dataset/documents.pkl", "wb") as f:
    pickle.dump(docs, f)

### Queries

In [16]:
# Query index: query_id -> {"text": raw text, "clean_text": pre-processed text}.
# Populated from dataset.queries_iter() in the following cell.
queries = {}

In [17]:
# Build the query index: query_id -> {"text": raw text, "clean_text": text
# after clean_text() pre-processing}.
# The dict is (re)created in this cell so that re-running it (for example after
# loading a different dataset/split) cannot leave queries from an earlier run
# mixed into the result.
queries = {}
for query in dataset.queries_iter():
    queries[query.query_id] = {"text": query.text, "clean_text": clean_text(query.text)}

In [19]:
# Sanity check: display the raw text stored for query 'PLAIN-2'
queries['PLAIN-2']['text']

'Do Cholesterol Statin Drugs Cause Breast Cancer?'

In [21]:
# Sanity check: number of queries held in the index
len(queries)

323

In [22]:
# Save queries to file: pickle the whole `queries` dict to ../dataset/queries.pkl.
# The path is relative to the current working directory; "wb" mode overwrites
# any existing file. open() does not create missing directories, so
# ../dataset must already exist.
with open("../dataset/queries.pkl", "wb") as f:
    pickle.dump(queries, f)

### Relevance Set (Ground Truth)

In [23]:
# Relevance set: query_id -> list of doc_ids with a positive relevance judgement.
# Populated from dataset.qrels_iter() in the following cell.
ground_truth = {}

In [24]:
# Build the relevance set: query_id -> list of doc_ids judged relevant
# (relevance > 0), in the order the judgements are iterated.
# The dict is (re)created in this cell: the loop appends to lists, so running it
# a second time on an already-filled dict would add every doc_id again and
# silently duplicate the relevant documents of each query.
ground_truth = {}
for qrel in dataset.qrels_iter():
    # Register every judged query, even when this judgement is not positive,
    # so a query with no relevant documents still maps to an empty list.
    relevant_docs = ground_truth.setdefault(qrel.query_id, [])
    if qrel.relevance > 0:
        relevant_docs.append(qrel.doc_id)

In [25]:
# Sanity check: the relevant doc_ids recorded for query 'PLAIN-2', and the
# number of distinct query IDs that appear in the relevance judgements
ground_truth['PLAIN-2'], len(ground_truth)

(['MED-2427',
  'MED-10',
  'MED-2429',
  'MED-2430',
  'MED-2431',
  'MED-14',
  'MED-2432',
  'MED-2428',
  'MED-2440',
  'MED-2434',
  'MED-2435',
  'MED-2436',
  'MED-2437',
  'MED-2438',
  'MED-2439',
  'MED-3597',
  'MED-3598',
  'MED-3599',
  'MED-4556',
  'MED-4559',
  'MED-4560',
  'MED-4828',
  'MED-4829',
  'MED-4830'],
 323)

In [26]:
# Save relevance set to file: pickle the whole `ground_truth` dict to
# ../dataset/rel_set.pkl. The path is relative to the current working directory;
# "wb" mode overwrites any existing file. open() does not create missing
# directories, so ../dataset must already exist.
with open("../dataset/rel_set.pkl", "wb") as f:
    pickle.dump(ground_truth, f)