In [2]:
import numpy as np
import pandas as pd
import os
import string
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import precision_score, recall_score, f1_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
# Shared WordNet lemmatizer instance; clean_text() calls lemmatizer.lemmatize() on each kept token.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set, used by clean_text() for membership tests.
stop_words = set(stopwords.words('english'))
import re
from collections import defaultdict

In [None]:
# Read the raw CISI collection file into a list with one entry per line
# (line endings are kept, exactly as readlines() would return them).
with open("../data/cisi/CISI.ALL") as f:
    articles = list(f)

In [None]:
def extract_docs(lines):
    """Parse marker-delimited records into documents and cross-reference edges.

    The input is scanned line by line. A line starting with "." is a field
    marker and carries no content itself:

    * ``.I <id>`` opens a new record and sets the current record id,
    * ``.T`` starts the title section,
    * ``.W`` starts the body section,
    * ``.X`` emits the document collected so far and starts the
      cross-reference section,
    * any other marker switches collection off until the next known marker.

    Args:
        lines: iterable of text lines, e.g. the result of ``file.readlines()``.

    Returns:
        A ``(docs, edges)`` tuple. ``docs`` is a list of
        ``{"id", "title", "body"}`` dicts, one per ``.X`` marker, excluding
        entries whose title is empty. ``edges`` is a sorted, de-duplicated
        list of ``(record id, int(first tab-separated field))`` pairs built
        from the cross-reference lines that contain a tab.
    """
    docs = []
    edges = set()
    idx = None
    mode = None  # one of None, "title", "body", "edge"
    title_lines, body_lines, edge_lines = [], [], []

    def flush_edges():
        # Attach the buffered cross-reference lines to the record they belong
        # to; must run before `idx` moves on to the next record.
        for entry in edge_lines:
            if "\t" in entry:
                edges.add((idx, int(entry.split("\t")[0])))
        edge_lines.clear()

    for line in lines:
        if line.startswith("."):
            if line.startswith(".I"):
                flush_edges()
                idx = int(line.split()[1])
                mode = None
            elif line.startswith(".T"):
                mode = "title"
            elif line.startswith(".W"):
                mode = "body"
            elif line.startswith(".X"):
                docs.append({
                    "id": idx,
                    "title": "".join(title_lines),
                    "body": "".join(body_lines),
                })
                title_lines, body_lines = [], []
                mode = "edge"
            else:
                mode = None
            # Marker lines are structure, not text: never copy them into a field.
            continue

        if mode == "title":
            title_lines.append(line)
        elif mode == "body":
            body_lines.append(line)
        elif mode == "edge":
            edge_lines.append(line)

    # The last record has no following ".I" line to trigger the flush.
    flush_edges()
    return [d for d in docs if d["title"]], sorted(edges)

In [None]:
# Parse the raw CISI lines into document dicts and cross-reference edge pairs.
docs, edges = extract_docs(articles)

In [None]:
def tokenizer(text):
    """Lowercase ``text`` and split it into tokens with NLTK's ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)

In [None]:
# Parse the CISI query file into {query id: token list}.
# The file is split on ".I"; the first non-empty line of each chunk is the
# query id, lines starting with "." are skipped, and every other non-empty
# line is appended to the query text.
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            if i == 0:
                idx = int(line)
                queries[idx] = ""
                continue
            if line.startswith("."):
                continue
            queries[idx] += " " + line
        # The chunk before the first ".I" has no id yet, so there is nothing to tokenise.
        if idx:
            queries[idx] = tokenizer(queries[idx])

In [None]:
def clean_text(docs):
    """Normalise raw texts: strip punctuation, drop stop words and short tokens, lemmatize.

    Args:
        docs: iterable of strings.

    Returns:
        A list with one cleaned string per input, tokens joined by a single space.
    """
    # The translation table does not depend on the text, so build it once
    # instead of once per document.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    sentences_clean = []
    for sentence in docs:
        tokens = word_tokenize(sentence.translate(strip_punctuation))
        # Compare case-insensitively: NLTK's English stop-word list is lowercase,
        # so a capitalised "The" or "And" would otherwise slip through the filter.
        kept = [w for w in tokens if w.lower() not in stop_words and len(w) > 2]
        sentences_clean.append(' '.join(lemmatizer.lemmatize(w) for w in kept))
    return sentences_clean


In [None]:
# Only the document bodies are cleaned and used as the text to index.
docs_body = [entry['body'] for entry in docs]
clean_documents = clean_text(docs_body)

In [3]:
# Load the pickled NFCorpus document backup. Later cells read documents.keys()
# and documents[<id>]["text"], so this is used as a mapping of id -> record.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("./backups/openai_embeddings/doc_embeddings_nfcorpus.pkl", "rb") as f:
    documents = pickle.load(f)

In [4]:
# Document ids in the mapping's iteration order (the same order used to build clean_documents).
document_ids = [doc_id for doc_id in documents.keys()]

In [5]:
# Raw "text" field of every document, in the mapping's iteration order.
clean_documents = [documents[doc_id]["text"] for doc_id in documents]

In [6]:
# Load the pickled NFCorpus query backup. Later cells read queries.keys()
# and queries[<id>]["text"], so this is used as a mapping of id -> record.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("./backups/openai_embeddings/query_embeddings_nfcorpus.pkl", "rb") as f:
    queries = pickle.load(f)

In [7]:
# Query ids in the mapping's iteration order (the same order used to build clean_queries).
query_ids = [query_id for query_id in queries.keys()]

In [8]:
# Raw "text" field of every query, in the mapping's iteration order.
clean_queries = [queries[query_id]["text"] for query_id in queries]

In [9]:
# Sanity check: number of document texts and number of query texts.
len(clean_documents), len(clean_queries)

(3633, 323)

In [10]:
# Peek at the first document and the first query (id and text of each).
document_ids[0], clean_documents[0], query_ids[0], clean_queries[0]

('MED-10',
 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants died, of which 3,619 (60.2%) was due to breast cancer. After adjustment for age, tumo

In [None]:
# Parse the CISI query file into raw (untokenised) query strings.
# The file is split on ".I"; the first non-empty line of each chunk is the
# query id, lines starting with "." are skipped, and every other non-empty
# line is appended to the query text.
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            if i == 0:
                idx = int(line)
                queries[idx] = ""
                continue
            if line.startswith("."):
                continue
            queries[idx] += " " + line
# Rebind `queries` from a dict to a list ordered by id; this lookup assumes
# the ids are exactly 1..len(queries).
queries = [queries[query_id] for query_id in range(1, len(queries) + 1)]

In [None]:
# Apply the same cleaning to the query strings as to the documents.
clean_queries = clean_text(queries)

In [11]:
# Fit one TF-IDF vocabulary over documents and queries so both are vectorised
# against the same terms.
vectorizer = TfidfVectorizer()
vectorizer.fit(clean_documents+clean_queries)
documents_vectors = vectorizer.transform(clean_documents)
queries_vectors = vectorizer.transform(clean_queries)

# Latent semantic indexing: learn the reduced space from the document matrix.
# random_state pins the randomized SVD solver so reruns give the same vectors.
num_topics = 100
svd = TruncatedSVD(n_components=num_topics, random_state=42)
documents_reduced = svd.fit_transform(documents_vectors)
# Project the queries with the already-fitted model. Calling fit_transform here
# would refit the SVD on the queries alone and place them in a different latent
# space, making query-document similarities meaningless.
queries_reduced = svd.transform(queries_vectors)


In [12]:
# Pair each document id with its reduced vector (row i belongs to document_ids[i]).
documents_reduced_dict = dict(zip(document_ids, documents_reduced))

In [13]:
# Pair each query id with its reduced vector (row i belongs to query_ids[i]).
queries_reduced_dict = dict(zip(query_ids, queries_reduced))

In [14]:
# Persist the reduced document and query vectors to disk.
# Create the target directory first: open(..., "wb") raises FileNotFoundError
# when the parent directory does not exist.
os.makedirs("./backups/lsi/nf_corpus", exist_ok=True)

with open("./backups/lsi/nf_corpus/documents.pkl", "wb") as f:
    pickle.dump(documents_reduced_dict, f)

with open("./backups/lsi/nf_corpus/queries.pkl", "wb") as f:
    pickle.dump(queries_reduced_dict, f)

In [14]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Returns 0.0 when either vector has zero norm. Dividing by a zero norm
    would yield NaN, and NaN scores make a later sort by score unreliable.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Calculate cosine similarity for each query-document pair.
# similarity_scores maps a query's row index in queries_reduced to a list of
# (document row index, score) pairs sorted by descending score.
similarity_scores = {}
# Wrap the array itself rather than the enumerate object: tqdm can then read
# its length and show a total and ETA instead of a bare iteration counter.
for query_id, query in enumerate(tqdm(queries_reduced, desc = 'Computing similarity scores')):
    scores = []
    for doc_id, doc in enumerate(documents_reduced):
        sim_score = cosine_similarity(query, doc)
        scores.append((doc_id, sim_score))
    similarity_scores[query_id] = sorted(scores, key=lambda x: x[1], reverse=True)

Computing similarity scores: 1406it [00:09, 154.94it/s]


In [None]:
def precision_at_k(ranked_docs, relevant_docs, k=10):
    """Fraction of the top ``k`` ranked documents that are relevant.

    The hit count is always divided by ``k``, even when fewer than ``k``
    documents were ranked.
    """
    top_k = ranked_docs[:k]
    hits = sum(1 for doc_id in top_k if doc_id in relevant_docs)
    return hits / k

def recall_at_k(ranked_docs, relevant_docs, k=10):
    """Fraction of the relevant documents found in the top ``k`` ranked documents.

    Returns 0 when ``relevant_docs`` is empty.
    """
    hits = 0
    for doc_id in ranked_docs[:k]:
        if doc_id in relevant_docs:
            hits += 1
    if not relevant_docs:
        return 0
    return hits / len(relevant_docs)

def dcg_at_k(scores, k=10):
    """Discounted cumulative gain over the first ``k`` entries of ``scores``.

    The gain at 0-based position ``idx`` is divided by ``log2(idx + 2)``.
    """
    return sum(score / np.log2(idx + 2) for idx, score in enumerate(scores[:k]))

def ndcg_at_k(ranked_docs, relevant_docs, k=5):
    """Normalised DCG at ``k`` with binary relevance.

    Args:
        ranked_docs: document ids in ranked order, best first.
        relevant_docs: collection of relevant document ids.
        k: cut-off rank.

    Returns:
        DCG of the top ``k`` ranked documents divided by the DCG of an ideal
        ranking, or 0 when there are no relevant documents.
    """
    relevant = set(relevant_docs)
    actual_scores = [1 if doc_id in relevant else 0 for doc_id in ranked_docs[:k]]
    # The ideal ranking puts every relevant document first. Deriving the ideal
    # gains from ranked_docs in its given order would make IDCG equal DCG, so
    # the metric could only ever be 1 or 0.
    ideal_scores = [1] * min(len(relevant), k)
    idcg = dcg_at_k(ideal_scores, k)
    dcg = dcg_at_k(actual_scores, k)
    return dcg / idcg if idcg > 0 else 0

In [None]:
# Load the CISI relevance judgements: the first two whitespace-separated
# integers on each line are read as (query id, document id).
with open("../data/cisi/CISI.REL") as f:
    # Skip blank lines so a missing or extra trailing newline cannot drop or break a record.
    lines = [line for line in f.read().split('\n') if line.strip()]

# One independent list per slot. `[[]] * n` repeats a single shared list, so
# every query would end up with every relevant document of every other query.
ground_truth = [[] for _ in range(len(lines))]
for line in lines:
    clean_line = line.split()
    query, doc = [int(num) for num in clean_line[:2]]
    ground_truth[query].append(doc)
# NOTE(review): ground_truth is indexed by the query id read from the file and
# holds document ids from the file, while similarity_scores/predictions use row
# positions for both; confirm the two are aligned before zipping them together.

In [None]:
# Turn each query's sorted (document index, score) pairs into a plain ranking
# of document indices, stored at the query's own index.
predictions = [0] * len(similarity_scores)
for idx, scores in similarity_scores.items():
    scores_flattened = [pair[0] for pair in scores]
    predictions[idx] = scores_flattened

In [None]:
# Average each metric over all (ranking, relevant set) pairs, using every
# metric's default cut-off k.
mean_precision_at_k = np.mean([
    precision_at_k(ranking, relevant)
    for ranking, relevant in zip(predictions, ground_truth)
])
mean_recall_at_k = np.mean([
    recall_at_k(ranking, relevant)
    for ranking, relevant in zip(predictions, ground_truth)
])
mean_ndcg_at_k = np.mean([
    ndcg_at_k(ranking, relevant)
    for ranking, relevant in zip(predictions, ground_truth)
])

In [None]:
# Display the averaged precision, recall and NDCG values.
mean_precision_at_k, mean_recall_at_k, mean_ndcg_at_k

In [None]:
# Mean precision at rank 1: the share of queries whose top-ranked document is relevant.
sps = np.mean([
    precision_at_k(ranking, relevant, k=1)
    for ranking, relevant in zip(predictions, ground_truth)
])
sps