In [8]:
# Import libraries
from collections import defaultdict

In [9]:
# Sample documents
documents = {
    "doc1": "Data science and machine learning are crucial in AI development.",
    "doc2": "Machine learning and AI are advancing rapidly with new technologies.",
    "doc3": "Data analysis and machine learning are key components of data science."
}

In [10]:
# Function to build an inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)

    for doc_id, text in documents.items():
        # Tokenize and normalize the text
        words = text.lower().split()

        # Populate the inverted index
        for word in words:
            if doc_id not in inverted_index[word]:
                inverted_index[word].append(doc_id)

    return inverted_index

In [11]:
# Function to retrieve documents for a given query
def retrieve_documents(query, inverted_index):
    query_words = query.lower().split()
    retrieved_docs = set()

    for word in query_words:
        if word in inverted_index:
            retrieved_docs.update(inverted_index[word])

    return retrieved_docs

In [12]:
# Build the inverted index
inverted_index = build_inverted_index(documents)

In [13]:
# Display the inverted index
print("Inverted Index:", dict(inverted_index))

Inverted Index: {'data': ['doc1', 'doc3'], 'science': ['doc1'], 'and': ['doc1', 'doc2', 'doc3'], 'machine': ['doc1', 'doc2', 'doc3'], 'learning': ['doc1', 'doc2', 'doc3'], 'are': ['doc1', 'doc2', 'doc3'], 'crucial': ['doc1'], 'in': ['doc1'], 'ai': ['doc1', 'doc2'], 'development.': ['doc1'], 'advancing': ['doc2'], 'rapidly': ['doc2'], 'with': ['doc2'], 'new': ['doc2'], 'technologies.': ['doc2'], 'analysis': ['doc3'], 'key': ['doc3'], 'components': ['doc3'], 'of': ['doc3'], 'science.': ['doc3']}


In [14]:
# Query and retrieve documents
query = "machine learning"
retrieved_docs = retrieve_documents(query, inverted_index)
print("\nDocuments retrieved for query:", query)
print(retrieved_docs)


Documents retrieved for query: machine learning
{'doc2', 'doc1', 'doc3'}
