In [1]:
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from openai import OpenAI
from tqdm import tqdm
import pickle
import time

In [2]:
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [3]:
with open("../data/cisi/CISI.ALL") as f:
    articles = f.readlines()

In [4]:
articles[:20]

['.I 1\n',
 '.T\n',
 '18 Editions of the Dewey Decimal Classifications\n',
 '.A\n',
 'Comaromi, J.P.\n',
 '.W\n',
 '   The present study is a history of the DEWEY Decimal\n',
 'Classification.  The first edition of the DDC was published\n',
 'in 1876, the eighteenth edition in 1971, and future editions\n',
 "will continue to appear as needed.  In spite of the DDC's\n",
 'long and healthy life, however, its full story has never\n',
 'been told.  There have been biographies of Dewey\n',
 'that briefly describe his system, but this is the first\n',
 'attempt to provide a detailed history of the work that\n',
 'more than any other has spurred the growth of\n',
 'librarianship in this country and abroad.\n',
 '.X\n',
 '1\t5\t1\n',
 '92\t1\t1\n',
 '262\t1\t1\n']

In [5]:
def extract_docs(lines):
    title_mode = False
    body_mode = False
    edge_mode = False
    title = ""
    body = ""
    idx = None
    edge_str = ""
    edges = []
    docs = []
    for line in lines:
        for c in line:
            if line.startswith("."):
                if line.startswith(".I"):
                    for e in edge_str.split("\n"):
                        if "\t" in e:
                            edges.append((idx, int(e.split("\t")[0])))
                    idx = int(line.split()[1])
                    edge_str = ""
                    edge_mode = False
                if line.startswith(".T"):
                    title_mode = True
                    body_mode = False
                elif line.startswith(".W"):
                    title_mode = False
                    body_mode = True
                elif line.startswith(".X"):
                    docs.append({"id": idx, "title": title, "body": body})
                    title = ""
                    body = ""
                    title_mode = False
                    body_mode = False
                    edge_mode = True
                else:
                    title_mode = False
                    body_mode = False 
                    edge_mode = False
            if title_mode:
                title += c
            elif body_mode:
                body += c
            elif edge_mode:
                edge_str += c
    for e in edge_str.split("\n"):
        if "\t" in e:
            edges.append((idx, int(e.split("\t")[0])))
    return [x for x in docs if x["title"]], sorted(list(set(edges)))

In [6]:
docs, edges = extract_docs(articles)

In [7]:
docs[0]

{'id': 1,
 'title': '.T\n18 Editions of the Dewey Decimal Classifications\n',
 'body': ".W\n   The present study is a history of the DEWEY Decimal\nClassification.  The first edition of the DDC was published\nin 1876, the eighteenth edition in 1971, and future editions\nwill continue to appear as needed.  In spite of the DDC's\nlong and healthy life, however, its full story has never\nbeen told.  There have been biographies of Dewey\nthat briefly describe his system, but this is the first\nattempt to provide a detailed history of the work that\nmore than any other has spurred the growth of\nlibrarianship in this country and abroad.\n"}

In [8]:
edges[:5]

[(1, 1), (1, 92), (1, 262), (1, 556), (1, 1004)]

In [9]:
def tokenizer(text):
    return word_tokenize(text.lower())

In [10]:
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            elif i == 0:
                idx = int(line)
                queries[idx] = ""
            elif not line.startswith("."):
                queries[idx] += " "+line
        if idx:
            queries[idx] = tokenizer(queries[idx])

In [11]:
len(queries)

112

In [12]:
client = OpenAI(api_key = OPENAI_API_KEY)

def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    if response and hasattr(response, 'data') and response.data:
        embedding = response.data[0].embedding
        return embedding
    else:
        print("Invalid response or no embedding data received.")
        return None

In [14]:
# Generate embeddings for docs
for doc in tqdm(docs, desc = 'Generating Embeddings'):
    combined_text = doc['title'] + " " + doc['body']
    doc['embedding'] = get_embedding(combined_text)

Generating Embeddings...: 100%|█████████████| 1460/1460 [04:11<00:00,  5.82it/s]


In [19]:
# Generate embeddings for queries
for idx, query in tqdm(queries.items(), desc = 'Generating Embeddings'):
    query_text = " ".join(query)
    queries[idx] = {'text': query_text, 'embedding': get_embedding(query_text)}

Generating Embeddings: 100%|█████████████████████████████████████████| 112/112 [00:43<00:00,  2.58it/s]


In [20]:
docs_file_path = './backups/openai_embeddings/doc_embeddings.pkl'
query_file_path = './backups/openai_embeddings/query_embeddings.pkl'

with open(docs_file_path, 'wb') as file:
    pickle.dump(docs, file)

print(f"Embeddings saved to {docs_file_path}")

with open(query_file_path, 'wb') as file:
    pickle.dump(queries, file)

print(f"Embeddings saved to {query_file_path}")

Embeddings saved to ./backups/query_embeddings.pkl


In [15]:
#### In case of API limit exceeded error, use embeddings from pickle files ###

docs_file_path = './backups/openai_embeddings/doc_embeddings.pkl'

with open(docs_file_path, 'rb') as file:
    loaded_docs = pickle.load(file)

print("Document embeddings loaded successfully.")

query_file_path = './backups/openai_embeddings/query_embeddings.pkl'

# Load the query embeddings from the file
with open(query_file_path, 'rb') as file:
    loaded_queries = pickle.load(file)
    
queries = loaded_queries
docs = loaded_docs
print("Query embeddings loaded successfully.")

Document embeddings loaded successfully.
Query embeddings loaded successfully.


In [16]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Calculate cosine similarity for each query-document pair
similarity_scores = {}
execution_times = [] 
for query_id, query in tqdm(queries.items(), desc = 'Computing similarity scores'):
    query_embedding = query['embedding']
    scores = []
    start_time = time.time()
    for doc in docs:
        doc_embedding = doc['embedding']
        sim_score = cosine_similarity(query_embedding, doc_embedding)
        scores.append((doc['id'], sim_score))
    
    end_time = time.time()  # Record end time
    execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
    execution_times.append(execution_time)
    similarity_scores[query_id] = sorted(scores, key=lambda x: x[1], reverse=True)

mean_execution_time = sum(execution_times) / len(execution_times)
print(f"Mean execution time for all queries: {mean_execution_time:.2f} ms")

Computing similarity scores: 100%|███████████████████████████████████| 112/112 [00:29<00:00,  3.81it/s]

Mean execution time for all queries: 261.24 ms





In [17]:
len(similarity_scores)

112

In [2]:
with open("../data/cisi/CISI.REL") as f:
    lines = f.read().split('\n')[:-1]
    ground_truth = [[]]*len(lines)
    for line in lines:
        clean_line = line.strip().replace('\t',' ').split()
        query, doc = [int(num.replace(' ','')) for num in clean_line[:2]]
        ground_truth[query].append(doc)

In [3]:
with open("./backups/ground_truth.pkl", "wb") as f:
    pickle.dump(ground_truth, f)

In [None]:
# Load the ground truth from the file
with open("./backups/ground_truth.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Grouth truth loaded succesfully.")

In [19]:
def precision_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = 0
    for doc_id in ranked_docs[:k]:
        if doc_id in relevant_docs:
            retrieved_relevant += 1
    return retrieved_relevant / k

def recall_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = sum(1 for doc_id in ranked_docs[:k] if doc_id in relevant_docs)
    return retrieved_relevant / len(relevant_docs) if relevant_docs else 0

def dcg_at_k(scores, k=10):
    return sum(score / np.log2(idx + 2) for idx, score in enumerate(scores[:k]))

def ndcg_at_k(ranked_docs, relevant_docs, k=5):
    ideal_scores = [1 if doc_id in relevant_docs else 0 for doc_id in ranked_docs]
    actual_scores = [1 if doc_id in relevant_docs else 0 for doc_id in ranked_docs[:k]]
    idcg = dcg_at_k(ideal_scores, k)
    dcg = dcg_at_k(actual_scores, k)
    return dcg / idcg if idcg > 0 else 0

In [20]:
predictions = [0]*(len(similarity_scores)+1)
for idx, scores in similarity_scores.items():
    scores_flattened = [doc for doc, score in scores]
    predictions[idx] = scores_flattened

In [21]:
mean_precision_at_k = np.mean([precision_at_k(preds,label) for preds,label in zip(predictions[1:],ground_truth)])
mean_recall_at_k = np.mean([recall_at_k(preds,label) for preds,label in zip(predictions[1:],ground_truth)])
mean_ndcg_at_k = np.mean([ndcg_at_k(preds,label) for preds,label in zip(predictions[1:],ground_truth)])

In [22]:
mean_precision_at_k, mean_recall_at_k, mean_ndcg_at_k

(0.9705357142857144, 0.003116685016974034, 1.0)