In [None]:
import os
import math
from collections import defaultdict

In [None]:
# Function to preprocess text (tokenization, lowercasing)
def preprocess(text):
    """Lowercase ``text`` and split it into alphanumeric tokens.

    Every character that is not a letter or digit (as judged by
    ``str.isalnum``) is treated as a separator. Punctuation attached to a
    word is therefore stripped rather than causing the whole word to be
    discarded: ``"Hello, world!"`` yields ``["hello", "world"]``.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance. The list is empty
        when ``text`` contains no alphanumeric characters.
    """
    lowered = text.lower()
    # Map every separator to a space so a single split() copes with runs of
    # punctuation and whitespace alike and never yields empty tokens.
    spaced = "".join(ch if ch.isalnum() else " " for ch in lowered)
    return spaced.split()

In [None]:
# Step 1: Building the index (dictionary and postings list)
def build_index(corpus_dir):
    """Build an inverted index over the ``.txt`` files directly in ``corpus_dir``.

    Files are visited in sorted filename order and numbered from 1, so the
    doc id assigned to each file is reproducible across runs and platforms.
    Directory entries that end in ``.txt`` but are not regular files are
    skipped.

    Args:
        corpus_dir: Path of the directory holding the corpus.

    Returns:
        A 5-tuple ``(index, doc_ids, doc_lengths, num_docs, doc_term_counts)``:

        * ``index`` -- ``defaultdict(list)`` mapping each term to a postings
          list of ``(doc_id, term_frequency)`` tuples.
        * ``doc_ids`` -- dict mapping doc id to the file name.
        * ``doc_lengths`` -- dict mapping doc id to the Euclidean norm of the
          document's ``1 + log10(tf)`` weights.
        * ``num_docs`` -- number of documents indexed.
        * ``doc_term_counts`` -- ``defaultdict(int)`` mapping each term to the
          number of documents containing it (document frequency).

    Raises:
        OSError: If ``corpus_dir`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    index = defaultdict(list)
    doc_term_counts = defaultdict(int)
    doc_lengths = {}
    doc_ids = {}
    num_docs = 0

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # doc id <-> file assignment stable.
    for doc in sorted(os.listdir(corpus_dir)):
        if not doc.endswith(".txt"):
            continue
        doc_path = os.path.join(corpus_dir, doc)
        if not os.path.isfile(doc_path):
            # e.g. a directory that happens to be named "something.txt"
            continue

        with open(doc_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Doc ids are 1-based and double as the running document count.
        num_docs += 1
        doc_id = num_docs
        doc_ids[doc_id] = doc

        term_freq = defaultdict(int)
        for token in preprocess(content):
            term_freq[token] += 1

        # One posting per distinct term, and one document-frequency tick.
        for term, freq in term_freq.items():
            index[term].append((doc_id, freq))
            doc_term_counts[term] += 1

        # Vector length used for cosine normalization (the "c" in lnc).
        # A document with no tokens gets length 0.0 and has no postings.
        doc_lengths[doc_id] = math.sqrt(
            sum((1 + math.log10(freq)) ** 2 for freq in term_freq.values())
        )

    return index, doc_ids, doc_lengths, num_docs, doc_term_counts

In [None]:
# Step 2: Calculate query vector using ltc scheme
def calculate_query_vector(query, index, num_docs, doc_term_counts):
    """Turn ``query`` into an ltc-weighted, unit-length term vector.

    Each query term present in ``index`` is weighted by
    ``(1 + log10(tf)) * log10(num_docs / df)`` and the resulting vector is
    then divided by its Euclidean norm (the "c" of ltc). Terms absent from
    ``index`` are ignored.

    Args:
        query: Raw query string; tokenized with ``preprocess``.
        index: Mapping of term to postings list; only membership is tested.
        num_docs: Number of documents in the collection.
        doc_term_counts: Mapping of term to document frequency.

    Returns:
        A dict mapping each known query term to its normalized weight. When
        every weight is zero (e.g. every term occurs in all documents) the
        zero weights are returned as-is, since they cannot be normalized.
    """
    query_freq = defaultdict(int)
    for token in preprocess(query):
        query_freq[token] += 1

    query_vector = {}
    for token, freq in query_freq.items():
        if token not in index:
            continue
        # .get() avoids inserting a 0 entry into a defaultdict as a side effect.
        df = doc_term_counts.get(token, 0)
        idf = math.log10(num_docs / df) if df > 0 else 0
        query_vector[token] = (1 + math.log10(freq)) * idf

    # Cosine normalization: scale to unit length so the dot product with a
    # length-normalized document vector is a true cosine similarity.
    norm = math.sqrt(sum(weight * weight for weight in query_vector.values()))
    if norm > 0:
        query_vector = {term: weight / norm for term, weight in query_vector.items()}

    return query_vector

In [None]:
# Step 3: Rank documents using cosine similarity (lnc.ltc)
def rank_documents(query_vector, index, doc_lengths, doc_ids, top_k=10):
    """Score documents against a query vector and return the best matches.

    For every query term found in ``index``, each posting contributes
    ``q_weight * (1 + log10(tf))`` to its document's score. The accumulated
    score is then divided by the document's precomputed vector length.

    Args:
        query_vector: Mapping of term to query weight.
        index: Mapping of term to a postings list of
            ``(doc_id, term_frequency)`` tuples.
        doc_lengths: Mapping of doc id to the document vector length.
        doc_ids: Mapping of doc id to the document name.
        top_k: Maximum number of results to return. Defaults to 10;
            ``None`` returns every matching document.

    Returns:
        A list of ``(document_name, score)`` tuples sorted by descending
        score, with ties broken by ascending document name. Only documents
        containing at least one query term appear.

    Raises:
        ValueError: If ``top_k`` is negative.
        KeyError: If a posting references a doc id missing from
            ``doc_lengths`` or ``doc_ids``.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None")

    scores = defaultdict(float)
    for term, q_weight in query_vector.items():
        # .get() rather than [] so a defaultdict index is never grown by
        # looking up a term it does not contain.
        for doc_id, term_freq in index.get(term, ()):
            scores[doc_id] += q_weight * (1 + math.log10(term_freq))

    # Length-normalize and attach names in one pass.
    named_scores = [
        (doc_ids[doc_id], total / doc_lengths[doc_id])
        for doc_id, total in scores.items()
    ]
    # Highest score first; the name makes the order deterministic on ties.
    named_scores.sort(key=lambda pair: (-pair[1], pair[0]))

    return named_scores[:top_k]