In [26]:
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from openai import OpenAI
from tqdm import tqdm
import pickle

In [2]:
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [3]:
with open("../data/cisi/CISI.ALL") as f:
    articles = f.readlines()

In [23]:
articles[:20]

['.I 1\n',
 '.T\n',
 '18 Editions of the Dewey Decimal Classifications\n',
 '.A\n',
 'Comaromi, J.P.\n',
 '.W\n',
 '   The present study is a history of the DEWEY Decimal\n',
 'Classification.  The first edition of the DDC was published\n',
 'in 1876, the eighteenth edition in 1971, and future editions\n',
 "will continue to appear as needed.  In spite of the DDC's\n",
 'long and healthy life, however, its full story has never\n',
 'been told.  There have been biographies of Dewey\n',
 'that briefly describe his system, but this is the first\n',
 'attempt to provide a detailed history of the work that\n',
 'more than any other has spurred the growth of\n',
 'librarianship in this country and abroad.\n',
 '.X\n',
 '1\t5\t1\n',
 '92\t1\t1\n',
 '262\t1\t1\n']

In [5]:
def extract_docs(lines):
    title_mode = False
    body_mode = False
    edge_mode = False
    title = ""
    body = ""
    idx = None
    edge_str = ""
    edges = []
    docs = []
    for line in lines:
        for c in line:
            if line.startswith("."):
                if line.startswith(".I"):
                    for e in edge_str.split("\n"):
                        if "\t" in e:
                            edges.append((idx, int(e.split("\t")[0])))
                    idx = int(line.split()[1])
                    edge_str = ""
                    edge_mode = False
                if line.startswith(".T"):
                    title_mode = True
                    body_mode = False
                elif line.startswith(".W"):
                    title_mode = False
                    body_mode = True
                elif line.startswith(".X"):
                    docs.append({"id": idx, "title": title, "body": body})
                    title = ""
                    body = ""
                    title_mode = False
                    body_mode = False
                    edge_mode = True
                else:
                    title_mode = False
                    body_mode = False 
                    edge_mode = False
            if title_mode:
                title += c
            elif body_mode:
                body += c
            elif edge_mode:
                edge_str += c
    for e in edge_str.split("\n"):
        if "\t" in e:
            edges.append((idx, int(e.split("\t")[0])))
    return [x for x in docs if x["title"]], sorted(list(set(edges)))

In [6]:
docs, edges = extract_docs(articles)

In [7]:
docs[0]

{'id': 1,
 'title': '.T\n18 Editions of the Dewey Decimal Classifications\n',
 'body': ".W\n   The present study is a history of the DEWEY Decimal\nClassification.  The first edition of the DDC was published\nin 1876, the eighteenth edition in 1971, and future editions\nwill continue to appear as needed.  In spite of the DDC's\nlong and healthy life, however, its full story has never\nbeen told.  There have been biographies of Dewey\nthat briefly describe his system, but this is the first\nattempt to provide a detailed history of the work that\nmore than any other has spurred the growth of\nlibrarianship in this country and abroad.\n"}

In [8]:
edges[:5]

[(1, 1), (1, 92), (1, 262), (1, 556), (1, 1004)]

In [9]:
def tokenizer(text):
    return word_tokenize(text.lower())

In [10]:
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            elif i == 0:
                idx = int(line)
                queries[idx] = ""
            elif not line.startswith("."):
                queries[idx] += " "+line
        if idx:
            queries[idx] = tokenizer(queries[idx])

In [30]:
len(queries)

112

In [14]:
client = OpenAI(api_key = OPENAI_API_KEY)

def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    if response and hasattr(response, 'data') and response.data:
        embedding = response.data[0].embedding
        return embedding
    else:
        print("Invalid response or no embedding data received.")
        return None
        
for doc in tqdm(docs, desc = 'Generating Embeddings'):
    combined_text = doc['title'] + " " + doc['body']
    doc['embedding'] = get_embedding(combined_text)

Generating Embeddings...: 100%|█████████████| 1460/1460 [04:11<00:00,  5.82it/s]


In [15]:
# Generate embeddings for queries
for idx, query in tqdm(queries.items(), desc = 'Generating Embeddings'):
    query_text = " ".join(query)
    queries[idx] = {'text': query_text, 'embedding': get_embedding(query_text)}

Generating Embeddings: 100%|██████████████████| 112/112 [00:18<00:00,  6.10it/s]


In [25]:
docs_file_path = './backups/doc_embeddings.pkl'
query_file_path = './backups/query_embeddings.pkl'

with open(docs_file_path, 'wb') as file:
    pickle.dump(docs, file)

print(f"Embeddings saved to {docs_file_path}")

with open(query_file_path, 'wb') as file:
    pickle.dump(docs, file)

print(f"Embeddings saved to {query_file_path}")

Embeddings saved to ./backups/doc_embeddings.pkl
Embeddings saved to ./backups/query_embeddings.pkl


In [None]:
#### In case of API limit exceeded error, use embeddings from pickle files ###

docs_file_path = './backups/doc_embeddings.pkl'

with open(docs_file_path, 'rb') as file:
    loaded_docs = pickle.load(file)

print("Document embeddings loaded successfully.")

query_file_path = './backups/query_embeddings.pkl'

# Load the query embeddings from the file
with open(query_file_path, 'rb') as file:
    loaded_queries = pickle.load(file)

print("Query embeddings loaded successfully.")

In [18]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Calculate cosine similarity for each query-document pair
similarity_scores = {}
for query_id, query in tqdm(queries.items(), desc = 'Computing similarity scores')
    query_embedding = query['embedding']
    scores = []
    for doc in docs:
        doc_embedding = doc['embedding']
        sim_score = cosine_similarity(query_embedding, doc_embedding)
        scores.append((doc['id'], sim_score))
    similarity_scores[query_id] = sorted(scores, key=lambda x: x[1], reverse=True)

In [33]:
similarity_scores[2]

[(307, 0.8622070604460544),
 (1377, 0.8532600772483417),
 (483, 0.8525422061067497),
 (492, 0.8505962110037494),
 (487, 0.8458896480899807),
 (526, 0.8429878863521187),
 (805, 0.842949763121365),
 (1356, 0.8422791984831532),
 (503, 0.8418165228062782),
 (1078, 0.8416231765734883),
 (488, 0.8412967218128226),
 (421, 0.8397424546139038),
 (650, 0.8387414613457432),
 (612, 0.8385174284209581),
 (1136, 0.8384168703319161),
 (611, 0.838084002854477),
 (1422, 0.8377995115819561),
 (664, 0.837364699157437),
 (309, 0.8366233256378005),
 (68, 0.8366165895334814),
 (879, 0.8364018544098337),
 (733, 0.8360868871806285),
 (175, 0.8360449633024435),
 (420, 0.8355648141452376),
 (451, 0.834459062949873),
 (486, 0.834456806108527),
 (562, 0.8343412539033217),
 (662, 0.833583114747267),
 (633, 0.8332608026281798),
 (495, 0.8329512522150602),
 (655, 0.8326522991446257),
 (510, 0.8324940355639456),
 (565, 0.8322884036916737),
 (1197, 0.8322598126608585),
 (660, 0.8322383322002146),
 (125, 0.832214194103

In [40]:
rel_set = {}
with open(os.path.join("../data/cisi/", 'CISI.REL')) as f:
    for l in f.readlines():
        qry_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0] 
        doc_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1]

        if qry_id in rel_set:
            rel_set[qry_id].append(doc_id)
        else:
            rel_set[qry_id] = []
            rel_set[qry_id].append(doc_id)

print(f"\n\nNumber of mappings = {len(rel_set)}")
print(rel_set.keys()) 



Number of mappings = 76
dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '37', '39', '41', '42', '43', '44', '45', '46', '49', '50', '52', '54', '55', '56', '57', '58', '61', '62', '65', '66', '67', '69', '71', '76', '79', '81', '82', '84', '90', '92', '95', '96', '97', '98', '99', '100', '101', '102', '104', '109', '111'])


In [50]:
def precision_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = 0
    for doc_id, _ in ranked_docs[:k]:
        if relevant_docs == []:
            retrieved_relevant +=1
        elif doc_id in relevant_docs:
            retrieved_relevant += 1
    return retrieved_relevant / k

query_id = 1
relevant_docs_for_query = list(map(int, rel_set[str(query_id)]))
print(relevant_docs_for_query)
# print(sorted([x[0] for x in similarity_scores[query_id]]))
precision = precision_at_k(similarity_scores[query_id], relevant_docs_for_query, k=10)
print(f"Precision@5: {precision}")

[28, 35, 38, 42, 43, 52, 65, 76, 86, 150, 189, 192, 193, 195, 215, 269, 291, 320, 429, 465, 466, 482, 483, 510, 524, 541, 576, 582, 589, 603, 650, 680, 711, 722, 726, 783, 813, 820, 868, 869, 894, 1162, 1164, 1195, 1196, 1281]
Precision@5: 0.9


In [21]:
def recall_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = sum(1 for doc_id, _ in ranked_docs[:k] if doc_id in relevant_docs)
    return retrieved_relevant / len(relevant_docs) if relevant_docs else 0

recall = recall_at_k(similarity_scores[query_id], relevant_docs_for_query, k=10)
print(f"Recall@10: {recall}")

Recall@10: 0.6666666666666666


In [22]:
def dcg_at_k(scores, k=10):
    return sum(score / np.log2(idx + 2) for idx, score in enumerate(scores[:k]))

def ndcg_at_k(ranked_docs, relevant_docs, k=5):
    ideal_scores = [1 if doc_id in relevant_docs else 0 for doc_id, _ in ranked_docs]
    actual_scores = [1 if doc_id in relevant_docs else 0 for doc_id, _ in ranked_docs[:k]]
    idcg = dcg_at_k(ideal_scores, k)
    dcg = dcg_at_k(actual_scores, k)
    return dcg / idcg if idcg > 0 else 0

ndcg = ndcg_at_k(similarity_scores[query_id], relevant_docs_for_query, k=5)
print(f"NDCG@5: {ndcg}")


NDCG@5: 1.0
