In [None]:
import math
import os
import re
from collections import defaultdict

In [None]:
# Function to preprocess text (tokenization, lowercasing)
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize by splitting on non-alphanumeric characters
    tokens = [word for word in text.split() if word.isalpha()]
    return tokens

In [None]:
# Step 1: Building the index (dictionary and postings list)
def build_index(corpus_dir):
    index = defaultdict(list)
    doc_lengths = {}
    doc_ids = {}
    doc_id_counter = 0
    num_docs = 0
    doc_term_counts = defaultdict(int)

    for doc in os.listdir(corpus_dir):
        if doc.endswith(".txt"):
            doc_id_counter += 1
            doc_path = os.path.join(corpus_dir, doc)
            with open(doc_path, 'r', encoding='utf-8') as f:
                content = f.read()
            tokens = preprocess(content)
            doc_ids[doc_id_counter] = doc

            term_freq = defaultdict(int)
            for token in tokens:
                term_freq[token] += 1

            # Store term frequencies in the index
            for term, freq in term_freq.items():
                index[term].append((doc_id_counter, freq))
                doc_term_counts[term] += 1

            # Calculate document length for normalization (lnc)
            length = math.sqrt(sum((1 + math.log10(freq)) ** 2 for freq in term_freq.values()))
            doc_lengths[doc_id_counter] = length
            num_docs += 1

    return index, doc_ids, doc_lengths, num_docs, doc_term_counts