# Information Retrieval on CISI Dataset

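This notebook implements a TF-IDF Vector Space Model (VSM) with cosine-similarity ranking to retrieve documents from the CISI dataset. It evaluates retrieval quality using precision at 10 (P@10), mean average precision (MAP), and recall, where recall is measured over every document returned with a non-zero similarity score.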

In [1]:
import os
import re
import numpy as np
import nltk
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Downloads
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

In [2]:
# Configuration
# The CISI files are looked up in a 'cisi_data' folder under the current
# working directory (normally the folder the notebook is launched from).
DATA_DIR = os.path.join(os.getcwd(), 'cisi_data')

# Logical name -> absolute path of each collection file.
FILES = {
    key: os.path.join(DATA_DIR, filename)
    for key, filename in (
        ('docs', 'CISI.ALL'),
        ('queries', 'CISI.QRY'),
        ('rels', 'CISI.REL'),
    )
}

# English stopword list from NLTK, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize raw text for indexing.

    The text is lower-cased, every character other than a-z, 0-9 or
    whitespace is replaced by a space, and tokens found in STOPWORDS are
    dropped.

    Args:
        text: The raw string to normalize.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
    kept = filter(lambda token: token not in STOPWORDS, cleaned.split())
    return ' '.join(kept)

In [3]:
class CISIParser:
    """Parsers for the CISI collection files (documents, queries, relevance).

    Document and query files consist of records that start with a line
    ``.I <id>`` followed by fields, each introduced by a marker line such as
    ``.T``, ``.A``, ``.W`` or ``.X``.
    """

    # A record starts with ".I <id>" at the beginning of a line. Anchoring to
    # the line start keeps text such as "A.I systems" from splitting a record.
    _RECORD_START = re.compile(r'^\.I[ \t]+', re.MULTILINE)

    # A field marker is a dot plus one capital letter, either alone on the
    # line or followed by whitespace. Body lines like ".5 percent" or "..."
    # are therefore kept as text instead of being mistaken for a marker.
    _FIELD_TAG = re.compile(r'\.[A-Z](?:\s|$)')

    @staticmethod
    def _check_exists(file_path):
        """Raise FileNotFoundError with a readable message if the path is missing."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

    @staticmethod
    def _parse_cisi_content(file_path, target_tags):
        """Extract the text of selected fields from every record in a file.

        Args:
            file_path: Path to a CISI document or query file.
            target_tags: Field markers (e.g. ``['.T', '.W']``) whose lines
                should be collected.

        Returns:
            dict mapping the integer record id to the collected lines joined
            by single spaces and stripped. Records whose id is not an integer
            are skipped; if an id repeats, the last record wins.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        CISIParser._check_exists(file_path)

        # Explicit encoding so parsing does not depend on the platform
        # default; undecodable bytes are replaced instead of aborting the load.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        wanted = set(target_tags)
        parsed_data = {}

        # The first piece is whatever precedes the first record marker.
        for record in CISIParser._RECORD_START.split(content)[1:]:
            header, _, body = record.partition('\n')
            try:
                obj_id = int(header.strip())
            except ValueError:
                continue  # Skip malformed IDs

            collected_text = []
            current_tag = None

            for line in body.split('\n'):
                if CISIParser._FIELD_TAG.match(line):
                    # Marker line: switch field and drop the marker itself.
                    current_tag = line[:2]
                    continue

                if current_tag in wanted:
                    collected_text.append(line)

            parsed_data[obj_id] = " ".join(collected_text).strip()

        return parsed_data

    @staticmethod
    def parse_docs(file_path):
        """Return {doc_id: title + body text} using the ``.T`` and ``.W`` fields."""
        return CISIParser._parse_cisi_content(file_path, target_tags=['.T', '.W'])

    @staticmethod
    def parse_titles(file_path):
        """Return {doc_id: title} using only the ``.T`` field."""
        return CISIParser._parse_cisi_content(file_path, target_tags=['.T'])

    @staticmethod
    def parse_queries(file_path):
        """Return {query_id: query text} using only the ``.W`` field."""
        return CISIParser._parse_cisi_content(file_path, target_tags=['.W'])

    @staticmethod
    def parse_rels(file_path):
        """Read relevance judgments.

        Each usable line holds at least two whitespace-separated columns: the
        query id and a relevant document id. Any further columns are ignored,
        and lines whose first two columns are not integers are skipped.

        Args:
            file_path: Path to the relevance file.

        Returns:
            defaultdict(set) mapping query id to the set of relevant doc ids.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        CISIParser._check_exists(file_path)

        rels = defaultdict(set)
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 2:
                    continue
                try:
                    qid, doc_id = int(parts[0]), int(parts[1])
                except ValueError:
                    continue
                rels[qid].add(doc_id)
        return rels

In [4]:
class VectorSpaceModel:
    """TF-IDF index over a document collection with cosine-similarity search."""

    def __init__(self, docs):
        """Fit a TF-IDF vectorizer on the collection.

        Args:
            docs: Mapping of document id to document text.
        """
        # Keep ids and texts in the same order so matrix rows map back to ids.
        self.doc_ids = [doc_id for doc_id in docs]
        self.corpus = [docs[doc_id] for doc_id in self.doc_ids]
        # Stopwords are removed by preprocess_text, so the vectorizer's own
        # stopword filtering stays disabled.
        self.vectorizer = TfidfVectorizer(
            preprocessor=preprocess_text,
            stop_words=None,
        )
        self.doc_vectors = self.vectorizer.fit_transform(self.corpus)

    def retrieve(self, query_text):
        """Rank the indexed documents against a query.

        Args:
            query_text: Raw query string; it is transformed by the fitted
                vectorizer.

        Returns:
            List of (doc_id, score) tuples in descending score order,
            restricted to documents whose similarity is greater than zero.
        """
        query_vector = self.vectorizer.transform([query_text])
        similarities = cosine_similarity(query_vector, self.doc_vectors).flatten()
        descending = similarities.argsort()[::-1]
        return [
            (self.doc_ids[pos], similarities[pos])
            for pos in descending
            if similarities[pos] > 0
        ]

In [5]:
# Metrics
def calculate_map(retrieved, relevant):
    """Average precision of one ranked result list.

    Averaging this value over all queries gives MAP.

    Args:
        retrieved: Ranked list of (doc_id, score) tuples.
        relevant: Collection of relevant doc ids for the query.

    Returns:
        Sum of the precision values at each rank where a relevant document
        appears, divided by the number of relevant documents; 0.0 when
        ``relevant`` is empty.
    """
    if not relevant:
        return 0.0

    precision_sum = 0.0
    found = 0.0
    for rank, (doc_id, _) in enumerate(retrieved, start=1):
        if doc_id not in relevant:
            continue
        found += 1.0
        precision_sum += found / rank
    return precision_sum / len(relevant)

def calculate_p10(retrieved, relevant):
    """Precision at rank 10 for one ranked result list.

    Args:
        retrieved: Ranked list of (doc_id, score) tuples.
        relevant: Collection of relevant doc ids for the query.

    Returns:
        Number of relevant documents among the first ten results divided by
        10 (even when fewer than ten were retrieved); 0.0 when ``relevant``
        is empty.
    """
    if not relevant:
        return 0.0

    relevant_in_top = [doc_id for doc_id, _ in retrieved[:10] if doc_id in relevant]
    return len(relevant_in_top) / 10.0

def calculate_recall(retrieved, relevant):
    """Recall of one result list, measured over everything retrieved.

    Args:
        retrieved: List of (doc_id, score) tuples.
        relevant: Collection of relevant doc ids for the query.

    Returns:
        Count of retrieved entries whose id is relevant divided by the number
        of relevant documents; 0.0 when ``relevant`` is empty.
    """
    if not relevant:
        return 0.0

    found = 0
    for doc_id, _score in retrieved:
        if doc_id in relevant:
            found += 1
    return found / len(relevant)

In [6]:
# End-to-end run: load the three CISI files, build the TF-IDF model, score
# every judged query and print the averaged metrics.
if __name__ == "__main__":
    print("Loading CISI Dataset...")
    try:
        docs = CISIParser.parse_docs(FILES['docs'])
        queries = CISIParser.parse_queries(FILES['queries'])
        rels = CISIParser.parse_rels(FILES['rels'])
        print(f"Loaded {len(docs)} docs, {len(queries)} queries, {len(rels)} rel sets.")
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Please check your 'cisi_data' directory path.")
        # An empty collection makes the `if docs:` check below skip evaluation.
        docs = {}

    if docs:
        print("\nInitializing Vector Space Model...")
        vsm = VectorSpaceModel(docs)

        print("\nRunning Retrieval & Evaluation...")
        # One entry per evaluated query in each list.
        map_scores = []
        p10_scores = []
        recall_scores = []

        # Only queries with at least one relevance judgment can be scored;
        # the membership test does not add keys to the `rels` defaultdict.
        active_queries = [q for q in queries if q in rels]

        for qid in active_queries:
            query_text = queries[qid]
            relevant_docs = rels[qid]
            
            # Ranked (doc_id, score) list limited to non-zero similarities
            results = vsm.retrieve(query_text)
            
            # Per-query metrics; calculate_map returns this query's average precision
            map_s = calculate_map(results, relevant_docs)
            p10_s = calculate_p10(results, relevant_docs)
            rec_s = calculate_recall(results, relevant_docs)
            
            map_scores.append(map_s)
            p10_scores.append(p10_s)
            recall_scores.append(rec_s)

        # Macro-average over the evaluated queries; fall back to 0 if none ran.
        mean_map = np.mean(map_scores) if map_scores else 0
        mean_p10 = np.mean(p10_scores) if p10_scores else 0
        mean_recall = np.mean(recall_scores) if recall_scores else 0

        print("="*40)
        print("EVALUATION RESULTS (VSM)")
        print("="*40)
        print(f"MAP       : {mean_map:.4f}")
        print(f"P@10      : {mean_p10:.4f}")
        print(f"Recall    : {mean_recall:.4f}")
        print("="*40)

Loading CISI Dataset...
Loaded 1460 docs, 112 queries, 76 rel sets.

Initializing Vector Space Model...

Running Retrieval & Evaluation...
EVALUATION RESULTS (VSM)
MAP       : 0.1981
P@10      : 0.3197
Recall    : 0.9043
