Develop a text preprocessing and analysis application using NLTK for tokenization, POS
tagging, and basic NLP tasks.

In [21]:
!pip install nltk



In [22]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [23]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag

In [24]:
text = """
Alia Bhatt won the National Film Award for Best Actress for Gangubai Kathiawadi (2022). She received the Filmfare Award for Best Actress a record-setting six times for her roles in Udta Punjab (2016), Raazi (2018), Gully Boy (2019), Gangubai Kathiawadi (2022), Rocky Aur Rani Kii Prem Kahaani (2023) and Jigra (2024), in addition to the Filmfare Critics Award for Best Actress for Highway (2014). She also won the Filmfare OTT Award for Best Actress in a Web Original Film for Darlings (2022).
"""


# Sentence Tokenizer

In [25]:
sentences = sent_tokenize(text)
print("Sentences:")
for s in sentences:
    print("-", s)

Sentences:
- 
Alia Bhatt won the National Film Award for Best Actress for Gangubai Kathiawadi (2022).
- She received the Filmfare Award for Best Actress a record-setting six times for her roles in Udta Punjab (2016), Raazi (2018), Gully Boy (2019), Gangubai Kathiawadi (2022), Rocky Aur Rani Kii Prem Kahaani (2023) and Jigra (2024), in addition to the Filmfare Critics Award for Best Actress for Highway (2014).
- She also won the Filmfare OTT Award for Best Actress in a Web Original Film for Darlings (2022).


# Word Tokenizer

In [26]:
words = word_tokenize(text)
print("Words:")
print(words)

Words:
['Alia', 'Bhatt', 'won', 'the', 'National', 'Film', 'Award', 'for', 'Best', 'Actress', 'for', 'Gangubai', 'Kathiawadi', '(', '2022', ')', '.', 'She', 'received', 'the', 'Filmfare', 'Award', 'for', 'Best', 'Actress', 'a', 'record-setting', 'six', 'times', 'for', 'her', 'roles', 'in', 'Udta', 'Punjab', '(', '2016', ')', ',', 'Raazi', '(', '2018', ')', ',', 'Gully', 'Boy', '(', '2019', ')', ',', 'Gangubai', 'Kathiawadi', '(', '2022', ')', ',', 'Rocky', 'Aur', 'Rani', 'Kii', 'Prem', 'Kahaani', '(', '2023', ')', 'and', 'Jigra', '(', '2024', ')', ',', 'in', 'addition', 'to', 'the', 'Filmfare', 'Critics', 'Award', 'for', 'Best', 'Actress', 'for', 'Highway', '(', '2014', ')', '.', 'She', 'also', 'won', 'the', 'Filmfare', 'OTT', 'Award', 'for', 'Best', 'Actress', 'in', 'a', 'Web', 'Original', 'Film', 'for', 'Darlings', '(', '2022', ')', '.']


# Stopword Removal

In [27]:
stop_words = set(stopwords.words('english'))

filtered_words = [w for w in words if w.isalpha() and w.lower() not in stop_words]
print("After Stopword Removal:")
print(filtered_words)

After Stopword Removal:
['Alia', 'Bhatt', 'National', 'Film', 'Award', 'Best', 'Actress', 'Gangubai', 'Kathiawadi', 'received', 'Filmfare', 'Award', 'Best', 'Actress', 'six', 'times', 'roles', 'Udta', 'Punjab', 'Raazi', 'Gully', 'Boy', 'Gangubai', 'Kathiawadi', 'Rocky', 'Aur', 'Rani', 'Kii', 'Prem', 'Kahaani', 'Jigra', 'addition', 'Filmfare', 'Critics', 'Award', 'Best', 'Actress', 'Highway', 'also', 'Filmfare', 'OTT', 'Award', 'Best', 'Actress', 'Web', 'Original', 'Film', 'Darlings']


# Stemming

In [28]:
stemmer = PorterStemmer()

stemmed_words = [stemmer.stem(w) for w in filtered_words]
print("Stemmed Words:")
print(stemmed_words)

Stemmed Words:
['alia', 'bhatt', 'nation', 'film', 'award', 'best', 'actress', 'gangubai', 'kathiawadi', 'receiv', 'filmfar', 'award', 'best', 'actress', 'six', 'time', 'role', 'udta', 'punjab', 'raazi', 'gulli', 'boy', 'gangubai', 'kathiawadi', 'rocki', 'aur', 'rani', 'kii', 'prem', 'kahaani', 'jigra', 'addit', 'filmfar', 'critic', 'award', 'best', 'actress', 'highway', 'also', 'filmfar', 'ott', 'award', 'best', 'actress', 'web', 'origin', 'film', 'darl']


# Lemmatization

In [29]:
lemmatizer = WordNetLemmatizer()

lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_words]
print("Lemmatized Words:")
print(lemmatized_words)

Lemmatized Words:
['Alia', 'Bhatt', 'National', 'Film', 'Award', 'Best', 'Actress', 'Gangubai', 'Kathiawadi', 'received', 'Filmfare', 'Award', 'Best', 'Actress', 'six', 'time', 'role', 'Udta', 'Punjab', 'Raazi', 'Gully', 'Boy', 'Gangubai', 'Kathiawadi', 'Rocky', 'Aur', 'Rani', 'Kii', 'Prem', 'Kahaani', 'Jigra', 'addition', 'Filmfare', 'Critics', 'Award', 'Best', 'Actress', 'Highway', 'also', 'Filmfare', 'OTT', 'Award', 'Best', 'Actress', 'Web', 'Original', 'Film', 'Darlings']


# POS Tagging

In [30]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [31]:
pos_tags = pos_tag(filtered_words)

print("POS Tagged Words:")
for word, tag in pos_tags:
    print(f"{word} → {tag}")

POS Tagged Words:
Alia → NNP
Bhatt → NNP
National → NNP
Film → NNP
Award → NNP
Best → NNP
Actress → NNP
Gangubai → NNP
Kathiawadi → NNP
received → VBD
Filmfare → NNP
Award → NNP
Best → NNP
Actress → NNP
six → CD
times → NNS
roles → VBZ
Udta → NNP
Punjab → NNP
Raazi → NNP
Gully → NNP
Boy → NNP
Gangubai → NNP
Kathiawadi → NNP
Rocky → NNP
Aur → NNP
Rani → NNP
Kii → NNP
Prem → NNP
Kahaani → NNP
Jigra → NNP
addition → NN
Filmfare → NNP
Critics → NNP
Award → NNP
Best → NNP
Actress → NNP
Highway → NNP
also → RB
Filmfare → NNP
OTT → NNP
Award → NNP
Best → NNP
Actress → NNP
Web → NNP
Original → NNP
Film → NNP
Darlings → NNP


# Word Frequency

In [32]:
from collections import Counter

word_freq = Counter(lemmatized_words)
print("Word Frequency:")
print(word_freq)

Word Frequency:
Counter({'Award': 4, 'Best': 4, 'Actress': 4, 'Filmfare': 3, 'Film': 2, 'Gangubai': 2, 'Kathiawadi': 2, 'Alia': 1, 'Bhatt': 1, 'National': 1, 'received': 1, 'six': 1, 'time': 1, 'role': 1, 'Udta': 1, 'Punjab': 1, 'Raazi': 1, 'Gully': 1, 'Boy': 1, 'Rocky': 1, 'Aur': 1, 'Rani': 1, 'Kii': 1, 'Prem': 1, 'Kahaani': 1, 'Jigra': 1, 'addition': 1, 'Critics': 1, 'Highway': 1, 'also': 1, 'OTT': 1, 'Web': 1, 'Original': 1, 'Darlings': 1})


# Named NLP Pipeline Function

In [33]:
def nlp_pipeline(text):
    words = word_tokenize(text.lower())
    words = [w for w in words if w.isalpha()]
    
    words = [w for w in words if w not in stop_words]
    lemmas = [lemmatizer.lemmatize(w) for w in words]
    pos = pos_tag(lemmas)
    
    return {
        "tokens": words,
        "lemmatized": lemmas,
        "pos_tags": pos,
        "frequency": Counter(lemmas)
    }

In [34]:
result = nlp_pipeline(text)

print("Tokens:", result["tokens"])
print("\nPOS Tags:", result["pos_tags"])
print("\nWord Frequency:", result["frequency"])

Tokens: ['alia', 'bhatt', 'national', 'film', 'award', 'best', 'actress', 'gangubai', 'kathiawadi', 'received', 'filmfare', 'award', 'best', 'actress', 'six', 'times', 'roles', 'udta', 'punjab', 'raazi', 'gully', 'boy', 'gangubai', 'kathiawadi', 'rocky', 'aur', 'rani', 'kii', 'prem', 'kahaani', 'jigra', 'addition', 'filmfare', 'critics', 'award', 'best', 'actress', 'highway', 'also', 'filmfare', 'ott', 'award', 'best', 'actress', 'web', 'original', 'film', 'darlings']

POS Tags: [('alia', 'NNS'), ('bhatt', 'VBP'), ('national', 'JJ'), ('film', 'NN'), ('award', 'NN'), ('best', 'JJS'), ('actress', 'NN'), ('gangubai', 'NN'), ('kathiawadi', 'NN'), ('received', 'VBD'), ('filmfare', 'JJ'), ('award', 'RB'), ('best', 'JJS'), ('actress', 'RB'), ('six', 'CD'), ('time', 'NN'), ('role', 'NN'), ('udta', 'JJ'), ('punjab', 'NN'), ('raazi', 'NN'), ('gully', 'RB'), ('boy', 'JJ'), ('gangubai', 'NN'), ('kathiawadi', 'NN'), ('rocky', 'JJ'), ('aur', 'NN'), ('rani', 'NN'), ('kii', 'NN'), ('prem', 'NN'), ('ka