In [1]:
# Preprocessing (preprocess.py)
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Download the NLTK 'stopwords' corpus that stopwords.words() reads below.
nltk.download('stopwords')

# English stop words kept in a set; preprocess_text filters tokens against it.
stop = set(stopwords.words('english'))
# Single shared Porter stemmer instance, used by preprocess_text.
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is lower-cased, every character other than ``a-z`` or whitespace
    is replaced by a space, tokens found in the module-level ``stop`` set are
    dropped, and the remaining tokens are stemmed with the module-level
    ``stemmer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving stems joined by single spaces (empty string if none).
    """
    cleaned = re.sub(r"[^a-z\s]", ' ', text.lower())
    stems = []
    for word in cleaned.split():
        if word in stop:
            continue
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Indexing (indexer.py)
from sklearn.feature_extraction.text import TfidfVectorizer

# Tiny placeholder corpus: 'docs' is a plain list of raw strings.
docs = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Clean every document with preprocess_text, then fit the TF-IDF vectorizer on
# the cleaned text; X holds the resulting document-term matrix.
docs_processed = list(map(preprocess_text, docs))
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs_processed)

In [3]:
# BM25 (retrieve.py)
!pip install rank_bm25
from rank_bm25 import BM25Okapi
import numpy as np

# Whitespace-split each preprocessed document into tokens and build the BM25
# index over the resulting token lists.
tokenized_docs = list(map(str.split, docs_processed))
bm25 = BM25Okapi(tokenized_docs)

def bm25_query(q):
    """Score every indexed document against query ``q`` with the BM25 index.

    The query is passed through ``preprocess_text`` (the same pipeline used
    for the documents) and split on whitespace before scoring.

    Args:
        q: Raw query string.

    Returns:
        ``(ranked, scores)`` where ``ranked`` holds document indices ordered
        from highest to lowest score and ``scores`` holds the per-document
        scores in original document order.
    """
    query_terms = preprocess_text(q).split()
    scores = bm25.get_scores(query_terms)
    ascending = np.argsort(scores)
    return ascending[::-1], scores

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [4]:
# Retrieval (retrieve.py)
from sklearn.metrics.pairwise import linear_kernel

def tfidf_query(q):
    """Rank the indexed documents against query ``q`` using the TF-IDF matrix.

    The query is cleaned with ``preprocess_text``, projected with the fitted
    module-level ``vectorizer`` and compared with every row of ``X`` via
    ``linear_kernel``.

    Args:
        q: Raw query string.

    Returns:
        ``(ranked, sims)`` where ``ranked`` holds document indices ordered
        from highest to lowest similarity and ``sims`` holds the similarities
        in original document order.
    """
    cleaned_query = preprocess_text(q)
    query_matrix = vectorizer.transform([cleaned_query])
    sims = linear_kernel(query_matrix, X).flatten()
    descending = np.argsort(sims)[::-1]
    return descending, sims

In [5]:
class CISIParser:
    """Parsers for the tagged CISI collection files.

    A record starts with a line of the form ``.I <id>``. The lines that follow
    are grouped under section-marker lines that begin with a dot (for example
    ``.T``, ``.A``, ``.W``).
    """

    @staticmethod
    def _parse_cisi_content(file_path, target_tags):
        """Collect the text of selected sections for every record in a file.

        Args:
            file_path: Path of the CISI-formatted file to read.
            target_tags: Two-character section markers (e.g. ``['.T', '.W']``)
                whose lines should be kept.

        Returns:
            dict mapping each integer record ID to the kept lines joined with
            single spaces and stripped. Records whose ID is not an integer are
            skipped; a repeated ID overwrites the earlier record.
        """
        parsed_data = {}
        obj_id = None  # ID of the record being read; None means "no valid record"
        current_tag = None
        collected_text = []

        with open(file_path, 'r') as f:
            for raw_line in f:
                line = raw_line.rstrip('\n')

                # A record boundary is only recognised at the start of a line.
                # Splitting the whole file on '.I ' would also cut records at
                # any ".I " that happens to occur inside body text.
                if line.startswith('.I '):
                    if obj_id is not None:
                        parsed_data[obj_id] = " ".join(collected_text).strip()
                    collected_text = []
                    current_tag = None
                    try:
                        obj_id = int(line[3:].strip())
                    except ValueError:
                        # Malformed ID: ignore everything up to the next record.
                        obj_id = None
                    continue

                if obj_id is None:
                    # Preamble before the first record, or a skipped record.
                    continue

                if line.startswith('.'):
                    # Section marker (e.g. .T, .W, .A): remember which one we are in.
                    current_tag = line[:2]
                    continue

                # Keep the line only while inside one of the requested sections.
                if current_tag in target_tags:
                    collected_text.append(line)

        # Flush the final record, which has no following '.I ' line.
        if obj_id is not None:
            parsed_data[obj_id] = " ".join(collected_text).strip()

        return parsed_data

    @staticmethod
    def parse_docs(file_path):
        """Parse documents, keeping the Title (``.T``) and Abstract (``.W``) text."""
        return CISIParser._parse_cisi_content(file_path, target_tags=['.T', '.W'])

    @staticmethod
    def parse_titles(file_path):
        """Parse documents, keeping only the Title (``.T``) text (used for reporting)."""
        return CISIParser._parse_cisi_content(file_path, target_tags=['.T'])


In [6]:
def parse_queries(file_path):
    """Parse a CISI query file, keeping only the body (``.W``) section.

    Args:
        file_path: Path of the CISI query file.

    Returns:
        The result of ``CISIParser._parse_cisi_content`` for ``['.W']``:
        a dict of integer query ID -> query text.
    """
    # Note: Some CISI queries have .T too, but usually .W is the core question
    return CISIParser._parse_cisi_content(file_path, target_tags=['.W'])


# Decorating a module-level function with @staticmethod does not make it a
# member of CISIParser, so CISIParser.parse_queries(...) would raise
# AttributeError. Bind the function onto the class explicitly.
CISIParser.parse_queries = staticmethod(parse_queries)

def parse_rels(file_path):
    """Parse a relevance-judgment file into ``{query_id: {doc_id, ...}}``.

    Each line is whitespace-split; the first two fields are read as the
    integer query ID and document ID (any further fields are ignored). Lines
    with fewer than two fields or with non-integer IDs are skipped.

    Args:
        file_path: Path of the relevance file.

    Returns:
        ``defaultdict(set)`` mapping query ID to the set of relevant doc IDs.
    """
    # Imported locally so this cell does not depend on an import elsewhere.
    from collections import defaultdict

    rels = defaultdict(set)
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue
            try:
                qid, doc_id = int(parts[0]), int(parts[1])
            except ValueError:
                continue  # non-numeric IDs: skip the line
            rels[qid].add(doc_id)
    return rels


# Decorating a module-level function with @staticmethod does not make it a
# member of CISIParser, so CISIParser.parse_rels(...) would raise
# AttributeError. Bind the function onto the class explicitly.
CISIParser.parse_rels = staticmethod(parse_rels)


In [7]:
class VectorSpaceModel:
    """TF-IDF retrieval model built from a ``{doc_id: text}`` mapping."""

    def __init__(self, docs):
        """Fit a TF-IDF vectorizer on the document texts.

        Args:
            docs: Mapping of document ID to document text. The order of its
                keys fixes the row order of ``self.doc_vectors``.
        """
        self.doc_ids = list(docs.keys())
        self.corpus = [docs[doc_id] for doc_id in self.doc_ids]
        # Tokenisation is delegated entirely to `tokenize`; the built-in
        # stop-word list and token pattern are switched off.
        self.vectorizer = TfidfVectorizer(
            tokenizer=tokenize,
            token_pattern=None,
            stop_words=None,
        )
        self.doc_vectors = self.vectorizer.fit_transform(self.corpus)

    def retrieve(self, query_text):
        """Return ``(doc_id, score)`` pairs for documents with a positive score.

        Args:
            query_text: Query string, vectorized with the fitted vectorizer.

        Returns:
            List of ``(doc_id, score)`` tuples ordered from highest to lowest
            cosine similarity; documents scoring 0 or below are omitted.
        """
        query_vector = self.vectorizer.transform([query_text])
        scores = cosine_similarity(query_vector, self.doc_vectors).flatten()
        best_first = scores.argsort()[::-1]
        return [
            (self.doc_ids[pos], scores[pos])
            for pos in best_first
            if scores[pos] > 0
        ]


In [10]:
# Execution
if __name__ == "__main__":
  print("Parsing CISI Dataset...")
  # Load the collection: document text, titles only, queries and the
  # relevance judgments.
  docs = CISIParser.parse_docs(FILES['docs'])

  titles = CISIParser.parse_titles(FILES['docs'])
  queries = CISIParser.parse_queries(FILES['queries'])
  rels = CISIParser.parse_rels(FILES['rels'])

  # Uses len() for all three counts (the undefined name `lenn` raised NameError).
  print(f"Loaded {len(docs)} documents, {len(queries)} queries, {len(rels)} relevance sets.")

  print("\nInitializing Models...")
  # Initializing our two contenders
  vsm = VectorSpaceModel(docs)
  lm = DirichletLM(docs, mu=4000)

  print("\nRunning Retrieval...")
  # Score lists for each model
  vsm_map_scores, vsm_p10_scores = [], []
  lm_map_scores, lm_p10_scores = [], []

  # Only queries that also appear in the relevance judgments are evaluated.
  # (`qid in queries in rels` evaluated `queries in rels` and raised TypeError.)
  active_queries = sorted(qid for qid in queries if qid in rels)
  results_header = f"{'Model': <10} | {'Query ID':<10} | {'Doc ID':<10} | {'Title' :<60} | {'Query':<50}\n"
  seperator = "-" * 150 + "\n"

  with open(RESULT_FILE, 'w', encoding='utf-8') as f, open(SAMPLE_RESULT_FILE, 'w', encoding='utf-8') as sample_f:
    # Write the header under the name it was defined with (results_header).
    f.write(results_header)
    f.write(seperator)