In [10]:
import os
import re
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


#### Inverted Index

In [11]:
# Toy corpus: integer document ID -> raw text. IDs are assigned in listing
# order, starting at 1.
documents = dict(enumerate(
    [
        "The quick brown fox quick jumps over the lazy dog.",
        "Never jump over the lazy dog quickly.",
        "Foxes are quick and they love jumping.",
    ],
    start=1,
))


In [12]:
# Shared text-normalisation resources, created once and reused by preprocess().
stemmer = PorterStemmer()
# Held as a set so the per-token "is this a stop word?" check is a hash lookup.
stop_words = {*stopwords.words('english')}

# Preprocessing pipeline: lowercase -> tokenize -> drop stop words -> stem
def preprocess(text):
    """Turn raw text into a list of normalised index terms.

    The text is lowercased and split into runs of word characters; tokens
    found in the module-level ``stop_words`` collection are discarded, and
    each surviving token is passed through ``stemmer.stem``.

    Args:
        text: The raw string to normalise.

    Returns:
        A list of stems in the order their tokens appeared in ``text``.
        Duplicates are kept; an input with no word characters yields ``[]``.
    """
    lowered = text.lower()
    return [
        stemmer.stem(token)
        for token in re.findall(r'\b\w+\b', lowered)
        if token not in stop_words
    ]


In [13]:
# Inverted index: stemmed term -> set of IDs of the documents containing it.
# Using a set means a term repeated within one document is recorded once.
inverted_index = defaultdict(set)

for doc_id, text in documents.items():
    for term in preprocess(text):
        inverted_index[term].add(doc_id)

# Show the index alphabetically by term, with each posting set in ascending order.
for term, doc_ids in sorted(inverted_index.items()):
    print(f"{term}: {sorted(doc_ids)}")


brown: [1]
dog: [1, 2]
fox: [1, 3]
jump: [1, 2, 3]
lazi: [1, 2]
love: [3]
never: [2]
quick: [1, 3]
quickli: [2]


#### Positional Inverted Index

In [15]:
# Positional index: stemmed term -> {doc ID -> list of positions}.
# Positions are 0-based offsets into the list returned by preprocess(), i.e.
# they are counted after stop words have been removed, not in the raw text.
positional_index = defaultdict(lambda: defaultdict(list))

for doc_id, text in documents.items():
    for pos, term in enumerate(preprocess(text)):
        positional_index[term][doc_id].append(pos)

# Terms are listed alphabetically; documents under a term appear in the order
# they were first indexed.
for term, postings in sorted(positional_index.items()):
    print(f"{term}:")
    for doc_id, positions in postings.items():
        print(f"  Doc {doc_id}: Positions {positions}")

brown:
  Doc 1: Positions [1]
dog:
  Doc 1: Positions [6]
  Doc 2: Positions [3]
fox:
  Doc 1: Positions [2]
  Doc 3: Positions [0]
jump:
  Doc 1: Positions [4]
  Doc 2: Positions [1]
  Doc 3: Positions [3]
lazi:
  Doc 1: Positions [5]
  Doc 2: Positions [2]
love:
  Doc 3: Positions [2]
never:
  Doc 2: Positions [0]
quick:
  Doc 1: Positions [0, 3]
  Doc 3: Positions [1]
quickli:
  Doc 2: Positions [4]
