In [1]:
import numpy as np
import pandas as pd
import os
import string
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import precision_score, recall_score, f1_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
import re
from collections import defaultdict

In [2]:
with open("../data/cisi/CISI.ALL") as f:
    articles = f.readlines()

In [3]:
def extract_docs(lines):
    title_mode = False
    body_mode = False
    edge_mode = False
    title = ""
    body = ""
    idx = None
    edge_str = ""
    edges = []
    docs = []
    for line in lines:
        for c in line:
            if line.startswith("."):
                if line.startswith(".I"):
                    for e in edge_str.split("\n"):
                        if "\t" in e:
                            edges.append((idx, int(e.split("\t")[0])))
                    idx = int(line.split()[1])
                    edge_str = ""
                    edge_mode = False
                if line.startswith(".T"):
                    title_mode = True
                    body_mode = False
                elif line.startswith(".W"):
                    title_mode = False
                    body_mode = True
                elif line.startswith(".X"):
                    docs.append({"id": idx, "title": title, "body": body})
                    title = ""
                    body = ""
                    title_mode = False
                    body_mode = False
                    edge_mode = True
                else:
                    title_mode = False
                    body_mode = False 
                    edge_mode = False
            if title_mode:
                title += c
            elif body_mode:
                body += c
            elif edge_mode:
                edge_str += c
    for e in edge_str.split("\n"):
        if "\t" in e:
            edges.append((idx, int(e.split("\t")[0])))
    return [x for x in docs if x["title"]], sorted(list(set(edges)))

In [4]:
docs, edges = extract_docs(articles)

In [5]:
def tokenizer(text):
    return word_tokenize(text.lower())

In [6]:
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            elif i == 0:
                idx = int(line)
                queries[idx] = ""
            elif not line.startswith("."):
                queries[idx] += " "+line
        if idx:
            queries[idx] = tokenizer(queries[idx])

In [7]:
def clean_text(docs):
    sentences_clean = []
    for sentence in docs:
        lookup_table = sentence.maketrans('', '', string.punctuation)
        clean_text = sentence.translate(lookup_table)
        word_list = word_tokenize(clean_text)
        word_list = [w for w in word_list if not w in stop_words and len(w) > 2]
        word_list = [lemmatizer.lemmatize(word) for word in word_list]
        clean_text = ' '.join(word_list)
        sentences_clean.append(clean_text)
    return sentences_clean


In [8]:
docs_body = [doc['body'] for doc in docs]
clean_documents = clean_text(docs_body)

In [36]:
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            elif i == 0:
                idx = int(line)
                queries[idx] = ""
            elif not line.startswith("."):
                queries[idx] += " "+line
queries = [queries[idx] for idx in range(1,len(queries)+1)]

In [41]:
clean_queries = clean_text(queries)

In [44]:
vectorizer = TfidfVectorizer()
vectorizer.fit(clean_documents+clean_queries)
documents_vectors = vectorizer.transform(clean_documents)
queries_vectors = vectorizer.transform(clean_queries)
num_topics = 100
svd = TruncatedSVD(n_components=num_topics)
documents_reduced = svd.fit_transform(documents_vectors)
queries_reduced = svd.fit_transform(queries_vectors)


In [48]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Calculate cosine similarity for each query-document pair
similarity_scores = {}
for query_id, query in tqdm(enumerate(queries_reduced), desc = 'Computing similarity scores'):
    scores = []
    for doc_id, doc in enumerate(documents_reduced):
        sim_score = cosine_similarity(query, doc)
        scores.append((doc_id, sim_score))
    similarity_scores[query_id] = sorted(scores, key=lambda x: x[1], reverse=True)

Computing similarity scores: 112it [00:01, 77.41it/s]


In [107]:
def precision_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = 0
    for doc_id in ranked_docs[:k]:
        if doc_id in relevant_docs:
            retrieved_relevant += 1
    return retrieved_relevant / k

def recall_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = sum(1 for doc_id in ranked_docs[:k] if doc_id in relevant_docs)
    return retrieved_relevant / len(relevant_docs) if relevant_docs else 0

def dcg_at_k(scores, k=10):
    return sum(score / np.log2(idx + 2) for idx, score in enumerate(scores[:k]))

def ndcg_at_k(ranked_docs, relevant_docs, k=5):
    ideal_scores = [1 if doc_id in relevant_docs else 0 for doc_id in ranked_docs]
    actual_scores = [1 if doc_id in relevant_docs else 0 for doc_id in ranked_docs[:k]]
    idcg = dcg_at_k(ideal_scores, k)
    dcg = dcg_at_k(actual_scores, k)
    return dcg / idcg if idcg > 0 else 0

In [99]:
with open("../data/cisi/CISI.REL") as f:
    lines = f.read().split('\n')[:-1]
    ground_truth = [[]]*len(lines)
    for line in lines:
        clean_line = line.strip().replace('\t',' ').split()
        query, doc = [int(num.replace(' ','')) for num in clean_line[:2]]
        ground_truth[query].append(doc)

In [100]:
predictions = [0]*len(similarity_scores)
for idx, scores in similarity_scores.items():
    scores_flattened = [doc for doc,score in scores]
    predictions[idx] = scores_flattened

In [109]:
mean_precision_at_k = np.mean([precision_at_k(preds,label) for preds,label in zip(predictions,ground_truth)])
mean_recall_at_k = np.mean([recall_at_k(preds,label) for preds,label in zip(predictions,ground_truth)])
mean_ndcg_at_k = np.mean([ndcg_at_k(preds,label) for preds,label in zip(predictions,ground_truth)])

In [111]:
mean_precision_at_k, mean_recall_at_k, mean_ndcg_at_k

(0.8142857142857144, 0.002614918799889898, 1.0)

In [1]:
import ranx

In [4]:
import ir_datasets
dataset = ir_datasets.load('cord19/trec-covid')

In [6]:
for doc in dataset.docs_iter()[:10]:
    print(doc)

[INFO] [starting] building docstore
[INFO] If you have a local copy of https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv, you can symlink it here to avoid downloading it again: C:\Users\srini\.ir_datasets\downloads\80d664e496b8b7e50a39c6f6bb92e0ef
[INFO] [starting] https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv
docs_iter:   0%|                                    | 0/192509 [00:00<?, ?doc/s]
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 0.0%| 0.00/269M [00:00<?, ?B/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 0.1%| 156k/269M [00:00<03:07, 1.43MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 0.2%| 524k/269M [00:00<01:52, 2.39MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 0.4%| 1.01M/269M [00:00<01:25, 3.15MB/s][A
https://ai2-

https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 13.2%| 35.5M/269M [00:11<01:17, 3.00MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 13.4%| 36.2M/269M [00:12<01:17, 3.01MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 13.7%| 36.8M/269M [00:12<01:16, 3.02MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 13.9%| 37.5M/269M [00:12<01:15, 3.05MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 14.2%| 38.2M/269M [00:12<01:15, 3.08MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 14.4%| 38.8M/269M [00:12<01:14, 3.09MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 14.7%| 39.5M/269M [00:12<01:13, 3.12MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-0

https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 33.1%| 89.1M/269M [00:24<00:49, 3.62MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 33.5%| 90.2M/269M [00:24<00:49, 3.62MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 33.9%| 91.4M/269M [00:25<00:49, 3.61MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 34.4%| 92.5M/269M [00:25<00:48, 3.62MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 34.8%| 93.7M/269M [00:25<00:48, 3.63MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 35.2%| 94.8M/269M [00:26<00:48, 3.63MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 35.6%| 95.9M/269M [00:26<00:47, 3.63MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-0

https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 59.4%| 160M/269M [00:38<00:26, 4.15MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 59.9%| 161M/269M [00:38<00:25, 4.16MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 60.3%| 162M/269M [00:38<00:25, 4.17MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 60.8%| 164M/269M [00:39<00:25, 4.18MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 61.2%| 165M/269M [00:39<00:24, 4.19MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 61.7%| 166M/269M [00:39<00:24, 4.19MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 62.1%| 167M/269M [00:39<00:24, 4.19MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/me

https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 87.4%| 235M/269M [00:57<00:08, 4.10MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 87.8%| 236M/269M [00:57<00:08, 4.09MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 88.3%| 238M/269M [00:58<00:07, 4.08MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 88.7%| 239M/269M [00:58<00:07, 4.07MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 89.2%| 240M/269M [00:59<00:07, 4.05MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 89.6%| 241M/269M [00:59<00:06, 4.05MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv: 90.1%| 242M/269M [00:59<00:06, 4.05MB/s][A
https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/me

Cord19Doc(doc_id='ug7v899j', title='Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia', doi='10.1186/1471-2334-1-6', date='2001-07-04', abstract='OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patie

In [9]:
import pandas as pd
pd.DataFrame(dataset.qrels_iter())

[INFO] [starting] https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt
[INFO] [finished] https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt: [00:00] [1.14MB] [2.63MB/s]
                                                                                           

Unnamed: 0,query_id,doc_id,relevance,iteration
0,1,005b2j4b,2,4.5
1,1,00fmeepz,1,4
2,1,010vptx3,2,0.5
3,1,0194oljo,1,2.5
4,1,021q9884,1,4
...,...,...,...,...
69313,50,zvop8bxh,2,5
69314,50,zwf26o63,1,5
69315,50,zwsvlnwe,0,5
69316,50,zxr01yln,1,5


In [8]:
!pip install pandas

