In [13]:
import numpy as np
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi


In [14]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\preet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
def parse_cisi(file_path):
    documents = []
    doc_ids = []

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    docs = content.split('.I ')[1:]

    for doc in docs:
        lines = doc.split('\n')
        doc_id = lines[0].strip()

        text = ""
        for line in lines:
            if line.startswith('.T') or line.startswith('.W'):
                continue
            text += line + " "

        documents.append(text.strip())
        doc_ids.append(doc_id)

    return documents, doc_ids


In [16]:
documents, doc_ids = parse_cisi("cisi.all")


In [17]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def tokenizer(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return tokens


In [18]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenizer)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)




In [19]:
tokenized_corpus = [tokenizer(doc) for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)


In [21]:
def retrieve_tfidf(query, top_k=10):
    query_vec = tfidf_vectorizer.transform([query])
    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    top_indices = scores.argsort()[::-1][:top_k]

    data = []
    for idx in top_indices:
        data.append({
            "doc_id": doc_ids[idx],
            "score": scores[idx],
            "snippet": documents[idx][:150]
        })

    return pd.DataFrame(data)

def retrieve_bm25(query, top_k=10):
    tokenized_query = tokenizer(query)
    scores = bm25.get_scores(tokenized_query)

    top_indices = np.argsort(scores)[::-1][:top_k]

    data = []
    for idx in top_indices:
        data.append({
            "doc_id": doc_ids[idx],
            "score": scores[idx],
            "snippet": documents[idx][:150]
        })

    return pd.DataFrame(data)



In [22]:
query = "automatic document indexing and retrieval systems"

tfidf_output = retrieve_tfidf(query)
bm25_output = retrieve_bm25(query)

tfidf_output


Unnamed: 0,doc_id,score,snippet
0,565,0.588029,565 Computer Evaluation of Indexing and Text P...
1,790,0.455504,790 Computer Indexing of Medical Articles - Pr...
2,315,0.440325,315 Automatic Abstracting and Indexing - Surve...
3,72,0.438267,72 A Comparison Between Manual and Automatic I...
4,1144,0.437281,"1144 Automatic Indexing .A Stevens, M.E. A..."
5,51,0.425831,51 An Experiment in Automatic Indexing .A Dame...
6,1327,0.422275,1327 The SMART Retrieval System Experiments in...
7,662,0.40935,662 Automatic Indexing: An Experimental Inquir...
8,446,0.389985,"446 Computer Assisted Indexing .A Gray, W. A. ..."
9,663,0.387364,663 Automatic Document Classification Part I...


In [23]:
bm25_output


Unnamed: 0,doc_id,score,snippet
0,565,13.838548,565 Computer Evaluation of Indexing and Text P...
1,790,12.072155,790 Computer Indexing of Medical Articles - Pr...
2,1419,11.795622,1419 Utility of Automatic Classification Syste...
3,51,11.140641,51 An Experiment in Automatic Indexing .A Dame...
4,1327,10.996559,1327 The SMART Retrieval System Experiments in...
5,448,10.864333,448 An Evaluation of Query Expansion by the Ad...
6,830,10.831249,"830 Progress in Documentation .A Jones, K.s. ..."
7,608,10.763713,608 A new comparison Between Conventional Inde...
8,72,10.624096,72 A Comparison Between Manual and Automatic I...
9,662,10.413535,662 Automatic Indexing: An Experimental Inquir...


In [29]:
def precision_at_k(retrieved, relevant, k=10):
    retrieved_k = retrieved[:k]
    return len(set(retrieved_k) & set(relevant)) / k
def recall_at_k(retrieved, relevant, k=10):
    retrieved_k = retrieved[:k]
    return len(set(retrieved_k) & set(relevant)) / len(relevant)
def average_precision(retrieved, relevant):
    score = 0.0
    hits = 0
    for i, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            hits += 1
            score += hits / i
    return score / len(relevant)
def ndcg_at_k(retrieved, relevant, k=10):
    dcg = 0.0
    for i, doc in enumerate(retrieved[:k], start=1):
        if doc in relevant:
            dcg += 1 / np.log2(i + 1)

    ideal_dcg = sum(1 / np.log2(i + 1) for i in range(1, min(len(relevant), k) + 1))
    return dcg / ideal_dcg if ideal_dcg > 0 else 0
tfidf_docs = tfidf_output["doc_id"].tolist()
bm25_docs = bm25_output["doc_id"].tolist()

relevant_docs = tfidf_docs[:3]  # assumed relevant
print("TF-IDF Precision:", precision_at_k(tfidf_docs, relevant_docs))
print("TF-IDF Recall:", recall_at_k(tfidf_docs, relevant_docs))
print("TF-IDF MAP:", average_precision(tfidf_docs, relevant_docs))
print("TF-IDF nDCG:", ndcg_at_k(tfidf_docs, relevant_docs))

print("\nBM25 Precision:", precision_at_k(bm25_docs, relevant_docs))
print("BM25 Recall:", recall_at_k(bm25_docs, relevant_docs))
print("BM25 MAP:", average_precision(bm25_docs, relevant_docs))
print("BM25 nDCG:", ndcg_at_k(bm25_docs, relevant_docs))







TF-IDF Precision: 0.3
TF-IDF Recall: 1.0
TF-IDF MAP: 1.0
TF-IDF nDCG: 1.0

BM25 Precision: 0.2
BM25 Recall: 0.6666666666666666
BM25 MAP: 0.6666666666666666
BM25 nDCG: 0.7653606369886217
