In [12]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# Step 1: Download every NLTK resource the later steps depend on.
# Each of these is loaded lazily by NLTK and raises LookupError when it is
# missing, so all of them are fetched up front. nltk.download() reports
# "already up-to-date" for packages that are present, so re-running is cheap.
REQUIRED_NLTK_RESOURCES = (
    "punkt_tab",  # models behind sent_tokenize / word_tokenize
    "stopwords",  # stopwords.words("english")
    "wordnet",  # WordNetLemmatizer
    # English POS tagger used by nltk.pos_tag. NLTK releases that ship
    # punkt_tab load the tagger under this "_eng" package name.
    "averaged_perceptron_tagger_eng",
)
for resource in REQUIRED_NLTK_RESOURCES:
    nltk.download(resource)

# Step 2: Initialize text
text = "Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."

# Step 3: Perform Tokenization
print("\n--- Tokenization ---")
tokenized_sentences = sent_tokenize(text)  # Sentence Tokenization
tokenized_words = word_tokenize(text)  # Word Tokenization (punctuation kept as tokens)

print("Sentences:", tokenized_sentences)
print("Words:", tokenized_words)

# Step 4: Removing Punctuation & Stop Words
# A set gives O(1) membership tests in the filter below.
stop_words = set(stopwords.words("english"))

# Lowercase first so capitalised words such as "The" match the lowercase
# stop-word list, then drop every character that is neither a word
# character nor whitespace (i.e. punctuation).
clean_text = re.sub(r'[^\w\s]', '', text.lower())

# Tokenize and remove stopwords
filtered_words = [word for word in word_tokenize(clean_text) if word not in stop_words]

print("\n--- Stopword Removal ---")
print("Filtered Words:", filtered_words)

# Step 5: Perform Stemming
# The Porter stemmer strips suffixes by rule, so all inflections of "wait"
# collapse to the same stem.
ps = PorterStemmer()
sample_words = ["wait", "waiting", "waited", "waits"]

print("\n--- Stemming ---")
print([ps.stem(word) for word in sample_words])

# Step 6: Perform Lemmatization
# lemmatize() is called without a POS argument, so every word is looked up
# as a noun (the library default). Verb forms such as "studying" therefore
# come back unchanged; pass pos="v" to lemmatize them as verbs.
lemmatizer = WordNetLemmatizer()
lem_words = ["studies", "studying", "cries", "cry"]

print("\n--- Lemmatization ---")
print([lemmatizer.lemmatize(word) for word in lem_words])

# Step 7: Apply POS Tagging
data = "The pink sweater fit her perfectly"
words = word_tokenize(data)

print("\n--- POS Tagging ---")
# pos_tag returns a list of (token, tag) pairs.
print(nltk.pos_tag(words))


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pawar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



--- Tokenization ---
Sentences: ['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']
Words: ['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']

--- Stopword Removal ---
Filtered Words: ['tokenization', 'first', 'step', 'text', 'analytics', 'process', 'breaking', 'text', 'paragraph', 'smaller', 'chunks', 'words', 'sentences', 'called', 'tokenization']

--- Stemming ---
['wait', 'wait', 'wait', 'wait']

--- Lemmatization ---
['study', 'studying', 'cry', 'cry']

--- POS Tagging ---
[('The', 'DT'), ('pink', 'NN'), ('sweater', 'NN'), ('fit', 'VBP'), ('her', 'PRP$'), ('perfectly', 'RB')]
