In [1]:
from collections import defaultdict

In [3]:
def preprocess(text):
 tokens = text.lower().split() # Simple tokenization and lowercasing
 return tokens

In [7]:
# Build an inverted index
def build_inverted_index(documents):
 inverted_index = defaultdict(list)
 for doc_id, text in enumerate(documents):
     words = preprocess(text)
     for word in words:
         if doc_id not in inverted_index[word]:
             inverted_index[word].append(doc_id)
             
 return inverted_index

In [11]:
# Retrieve documents based on a query
def retrieve_documents(query, inverted_index):
    query_words = preprocess(query)
    relevant_docs = set()
    for word in query_words:
        if word in inverted_index:
           relevant_docs.update(inverted_index[word])
    
    return relevant_docs


In [13]:
# Example usage
documents = [
 "The quick brown fox jumps over the lazy dog",
 "The dog is lazy but the fox is quick",
 "Brown fox, brown dog, who is quick?",
]


In [15]:
# Build the inverted index
inverted_index = build_inverted_index(documents)

In [17]:
# Print the inverted index for reference
print("Inverted Index:", dict(inverted_index))

Inverted Index: {'the': [0, 1], 'quick': [0, 1], 'brown': [0, 2], 'fox': [0, 1], 'jumps': [0], 'over': [0], 'lazy': [0, 1], 'dog': [0, 1], 'is': [1, 2], 'but': [1], 'fox,': [2], 'dog,': [2], 'who': [2], 'quick?': [2]}


In [19]:
# Query for document retrieval
query = "quick brown"
relevant_docs = retrieve_documents(query, inverted_index)

In [21]:
# Output the relevant document IDs
print(f"Documents relevant to the query '{query}':", relevant_docs)

Documents relevant to the query 'quick brown': {0, 1, 2}
