In [33]:
# Define the corpus manually
corpus = [
    "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do.",
    "Once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it.",
    "So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy) whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies.",
    "Suddenly a White Rabbit with pink eyes ran close by her."
]

# Step 1: Remove punctuation
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
processed_corpus = []
for sentence in corpus:
    for ele in sentence:
        if ele in punc:
            sentence = sentence.replace(ele, " ")
    processed_corpus.append(sentence.lower())
print("1.Processed Corpus:\n  ", processed_corpus)

# Step 2: Tokenize the data into individual words
def tokenize_words(corpus):
    result = []
    for line in corpus:
        result.append(line.split())
    return result

tokenized_corpus = tokenize_words(processed_corpus)
print("\n2.Tokenized Corpus:\n  ", tokenized_corpus)

# Step 3: Remove stop words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

tokens_without_sw = []
for sentence in processed_corpus:
    text_tokens = word_tokenize(sentence)
    tokens_without_sw.extend([word for word in text_tokens if word not in stopwords.words()])

print("\n3.Tokens without Stop Words:\n  ", tokens_without_sw)

# Step 4: Create an inverted index
inverted_index = {}
for i, sentence in enumerate(processed_corpus):
    for word in tokens_without_sw:
        if word in sentence:
            if word not in inverted_index:
                inverted_index[word] = []
            inverted_index[word].append(i + 1)

print("\n4.Inverted Index:\n  ", inverted_index)

'''
    Explanation:
Corpus Processing: Removes punctuation and converts text to lowercase for consistency.
Tokenization: Splits sentences into words.
Stopword Removal: Filters out common stop words.
Inverted Index Creation: Maps each unique word to the list of sentences it appears in.
The output will be an inverted index where each word is associated with the sentence numbers containing it. Let me know if any additional tweaks are needed!

'''

1.Processed Corpus:
   ['alice was beginning to get very tired of sitting by her sister on the bank  and of having nothing to do ', 'once or twice she had peeped into the book her sister was reading  but it had no pictures or conversations in it ', 'so she was considering in her own mind  as well as she could  for the hot day made her feel very sleepy  whether the pleasure of making a daisy chain would be worth the trouble of getting up and picking the daisies ', 'suddenly a white rabbit with pink eyes ran close by her ']

2.Tokenized Corpus:
   [['alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', 'and', 'of', 'having', 'nothing', 'to', 'do'], ['once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it'], ['so', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', 'as', 'well', 'as', 'she', 'could',

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



3.Tokens without Stop Words:
   ['alice', 'beginning', 'tired', 'sitting', 'sister', 'bank', 'peeped', 'book', 'sister', 'reading', 'pictures', 'conversations', 'considering', 'mind', 'hot', 'day', 'made', 'feel', 'sleepy', 'pleasure', 'making', 'daisy', 'chain', 'worth', 'trouble', 'picking', 'daisies', 'suddenly', 'white', 'rabbit', 'pink', 'eyes', 'ran', 'close']

4.Inverted Index:
   {'alice': [1], 'beginning': [1], 'tired': [1], 'sitting': [1], 'sister': [1, 1, 2, 2], 'bank': [1], 'peeped': [2], 'book': [2], 'reading': [2], 'pictures': [2], 'conversations': [2], 'considering': [3], 'mind': [3], 'hot': [3], 'day': [3], 'made': [3], 'feel': [3], 'sleepy': [3], 'pleasure': [3], 'making': [3], 'daisy': [3], 'chain': [3], 'worth': [3], 'trouble': [3], 'picking': [3], 'daisies': [3], 'suddenly': [4], 'white': [4], 'rabbit': [4], 'pink': [4], 'eyes': [4], 'ran': [4], 'close': [4]}


'\n    Explanation:\nCorpus Processing: Removes punctuation and converts text to lowercase for consistency.\nTokenization: Splits sentences into words.\nStopword Removal: Filters out common stop words.\nInverted Index Creation: Maps each unique word to the list of sentences it appears in.\nThe output will be an inverted index where each word is associated with the sentence numbers containing it. Let me know if any additional tweaks are needed!\n\n'