## CSD358 - Information Retrieval
### Assignment 2
#### Tejaswi Manavala Narayanan - 2110110556
#### Pratham Goel - 2110110388

GitHub link: https://github.com/prathampg2003/IR_assignment

In [43]:
import os
import math
import re
from collections import defaultdict

In [44]:
def read_file(fname):
    """
    Return the entire contents of the text file at ``fname``.

    The file is opened in text mode and decoded as UTF-8; it is closed
    again before the function returns.
    """
    with open(fname, mode='r', encoding='utf8') as source:
        return source.read()

In [45]:
# Install NLTK into the environment used by the running kernel.
# NOTE(review): no version is pinned, so a fresh run may install a different NLTK release.
%pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [46]:
# NLTK pieces used for text preprocessing: tokenizer, lemmatizer, stemmer and stop-word corpus.
# NOTE(review): `string` and `word_tokenize` are not referenced in the visible cells.
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# NOTE(review): `nltk` is already imported a few lines above; this repeat is redundant.
import nltk
# Fetch the stop-word corpus, then keep the English stop words as a set.
nltk.download('stopwords')
# NOTE(review): VectorSpaceModel builds its own stop-word set in __init__, so this
# module-level `stop_words` is not used by the class shown in this notebook.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mdtej\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:

# Download the WordNet data and the 'omw-1.4' package; WordNetLemmatizer is
# imported and instantiated elsewhere in this notebook.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mdtej\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mdtej\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [48]:
# Module-level stemmer instance.
# NOTE(review): PorterStemmer was already imported in an earlier cell (from nltk.stem.porter).
from nltk.stem import PorterStemmer
# NOTE(review): 'punkt' is downloaded here, but the visible code tokenizes with a
# regular expression rather than word_tokenize - confirm this download is still needed.
nltk.download("punkt")
ps = PorterStemmer()

# Module-level lemmatizer instance.
# NOTE(review): WordNetLemmatizer is already imported and 'wordnet' already downloaded
# in earlier cells; VectorSpaceModel creates its own stemmer/lemmatizer, so `ps` and
# `lemmatizer` are not used by the class shown in this notebook.
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mdtej\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mdtej\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Reading files

## Vector Space Model

In [53]:
class VectorSpaceModel:
    """
    In-memory vector space retrieval model ranked by cosine similarity.

    Documents are weighted with logarithmic term frequency only (no idf) and
    queries with logarithmic term frequency times idf; both vectors are
    length-normalised before the dot product (the "lnc.ltc" weighting scheme).
    """

    def __init__(self):
        self.dictionary = {}  # term -> (df, [(doc_id, term_freq), ...])
        self.doc_lengths = {}  # doc_id -> Euclidean length of the log-tf document vector
        self.doc_count = 0     # Number of documents added to the index so far
        self.doc_id_to_file = {}  # doc_id -> file name (filled by process_corpus)
        self.stop_words = set(stopwords.words('english'))  # English stop words
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def preprocess(self, text):
        """
        Turn raw text into a list of index terms.

        The text is lower-cased and split into runs of word characters, stop
        words are dropped, and each remaining token is lemmatized and then
        stemmed. Token order and duplicates are preserved.
        """
        tokens = re.findall(r'\b\w+\b', text.lower())
        filtered_tokens = [word for word in tokens if word not in self.stop_words]
        lemmatized_tokens = [self.lemmatizer.lemmatize(word) for word in filtered_tokens]
        return [self.stemmer.stem(word) for word in lemmatized_tokens]

    def process_corpus(self, directory):
        """
        Index every regular file found directly inside ``directory``.

        Files are processed in sorted name order so that doc ids (and therefore
        tie-breaking in ``rank_documents``) are reproducible across runs and
        platforms; ``os.listdir`` alone returns entries in arbitrary order.
        Sub-directories (for example ``.ipynb_checkpoints``) are skipped
        instead of being opened as files.

        Doc ids continue from the number of documents already indexed, so
        calling this method again does not overwrite earlier id -> file
        mappings.

        Returns:
            dict mapping each newly assigned doc_id to its preprocessed terms.
        """
        file_names = sorted(
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        )

        allfiles = {}
        # The start value is evaluated once, before add_document bumps doc_count.
        for doc_id, file_name in enumerate(file_names, self.doc_count + 1):
            with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as f:
                content = f.read()
            preprocessed_content = self.preprocess(content)
            allfiles[doc_id] = preprocessed_content
            self.doc_id_to_file[doc_id] = file_name
            self.add_document(doc_id, preprocessed_content)
        return allfiles

    def add_document(self, doc_id, terms):
        """
        Add one document's terms to the inverted index.

        For every distinct term the posting ``(doc_id, term_freq)`` is appended
        and the term's document frequency is incremented. The Euclidean length
        of the document's log-tf vector is stored for cosine normalisation.
        """
        term_freqs = defaultdict(int)
        for term in terms:
            term_freqs[term] += 1

        for term, tf in term_freqs.items():
            df, postings = self.dictionary.get(term, (0, []))
            postings.append((doc_id, tf))
            self.dictionary[term] = (df + 1, postings)

        # Length of the (1 + log10 tf) vector; 0.0 for a document with no terms.
        self.doc_lengths[doc_id] = math.sqrt(
            sum((1 + math.log10(tf)) ** 2 for tf in term_freqs.values())
        )
        self.doc_count += 1

    def tf_idf(self, term, doc_id, tf, for_query=False):
        """
        Compute the weight of ``term`` for a document or for a query.

        Args:
            term: the (preprocessed) term.
            doc_id: accepted for interface symmetry; not used in the computation.
            tf: raw term frequency in the document or query.
            for_query: when True return ``log_tf * idf``; otherwise return
                ``log_tf`` only (documents carry no idf component).

        Returns:
            The weight. A term that is not in the index has idf 0, so its
            query weight is 0 rather than raising ``KeyError``.
        """
        log_tf = 1 + math.log10(tf) if tf > 0 else 0
        if not for_query:
            return log_tf

        # .get() makes the df == 0 guard below reachable for unseen terms.
        df, _ = self.dictionary.get(term, (0, []))
        idf = math.log10(self.doc_count / df) if df > 0 else 0
        return log_tf * idf

    def rank_documents(self, query_terms, top_k=10):
        """
        Rank indexed documents by cosine similarity to the query.

        Args:
            query_terms: preprocessed query terms (see ``preprocess``); terms
                that are not in the index are ignored.
            top_k: maximum number of results to return (default 10, matching
                the previous fixed cut-off). ``None`` returns every document
                that shares at least one term with the query.

        Returns:
            List of ``(file_name, score)`` tuples sorted by descending score,
            ties broken by ascending doc_id. May contain fewer than ``top_k``
            entries, and is empty when no query term is indexed.
        """
        # Count term frequencies in the query, ignoring unseen terms.
        query_term_freqs = defaultdict(int)
        for term in query_terms:
            if term in self.dictionary:
                query_term_freqs[term] += 1

        # Query vector: log tf * idf per term.
        query_vector = {
            term: self.tf_idf(term, None, tf, for_query=True)
            for term, tf in query_term_freqs.items()
        }

        # Cosine-normalise the query vector (left as-is when all weights are 0).
        query_length = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
        if query_length > 0:
            query_vector = {term: weight / query_length for term, weight in query_vector.items()}

        # Accumulate the dot product between the query and each document.
        scores = defaultdict(float)
        for term, query_weight in query_vector.items():
            _, postings = self.dictionary.get(term, (0, []))
            for doc_id, tf in postings:
                # Document weight is log tf only (no idf on the document side).
                doc_weight = self.tf_idf(term, doc_id, tf)
                scores[doc_id] += query_weight * doc_weight

        # Divide by the stored document vector length to finish the cosine.
        for doc_id in scores:
            doc_vector_length = self.doc_lengths[doc_id]
            if doc_vector_length > 0:
                scores[doc_id] /= doc_vector_length

        ranked_docs = sorted(scores.items(), key=lambda x: (-x[1], x[0]))
        return [(self.doc_id_to_file[doc_id], score) for doc_id, score in ranked_docs[:top_k]]

#### Testing

In [54]:
# Build the index over every document in the corpus directory.
vsm = VectorSpaceModel()
corpus_directory = "Corpus"
vsm.process_corpus(corpus_directory)

# Search with a free-text query
query = "Developing your Zomato business account and profile is a great way to boost your restaurant's online reputation"
query_terms = vsm.preprocess(query)
result = vsm.rank_documents(query_terms)

# Print the top relevant documents with scores.
# Iterate over the returned list itself: rank_documents may return fewer than
# 10 entries, in which case indexing result[0..9] would raise IndexError.
print(f"Top relevant documents for query '{query}':")
for file_name, score in result:
    print(f"{file_name}: {score:.8f}")


Top relevant documents for query 'Developing your Zomato business account and profile is a great way to boost your restaurant's online reputation':
zomato.txt: 0.21460251
swiggy.txt: 0.13100810
instagram.txt: 0.06052477
messenger.txt: 0.05916808
youtube.txt: 0.05845097
Discord.txt: 0.05331835
bing.txt: 0.05177956
paypal.txt: 0.04708566
reddit.txt: 0.04409441
flipkart.txt: 0.04072831


In [55]:
# Second free-text query against the index built in the previous cell (`vsm`).
query = "Warwickshire, came from an ancient family and was the heiress to some land"
query_terms = vsm.preprocess(query)
result = vsm.rank_documents(query_terms)

# Print the top relevant documents with scores.
# Iterate over the returned list itself: rank_documents may return fewer than
# 10 entries, in which case indexing result[0..9] would raise IndexError.
print(f"Top relevant documents for query '{query}':")
for file_name, score in result:
    print(f"{file_name}: {score:.8f}")

Top relevant documents for query 'Warwickshire, came from an ancient family and was the heiress to some land':
shakespeare.txt: 0.11997620
levis.txt: 0.02414239
Adobe.txt: 0.02265058
google.txt: 0.02072642
nike.txt: 0.01921104
zomato.txt: 0.01770312
huawei.txt: 0.01372434
skype.txt: 0.01170110
blackberry.txt: 0.01094421
Dell.txt: 0.01076635
