In [1]:
!pip install nltk



In [2]:
import nltk
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer, TweetTokenizer, MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
# Fetch the NLTK data packages this notebook asks for.
# NOTE(review): none of the tokenizers used in the visible cells appear to
# need 'punkt'; confirm whether a later cell relies on it.
for resource_name in ("punkt", "wordnet"):
    nltk.download(resource_name)

# Kept as the final bare expression so the cell still displays the return
# value of the last download.
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [5]:
# Sample sentence fed to every tokenizer, stemmer and lemmatizer below.
# The odd spacing (" .Don't") and the misspelling ("stdying") are part of the
# input and are reflected in the recorded outputs, so they are kept verbatim.
text = (
    "This is Assignment no one of NLP subject .Don't hesitate to ask questions! "
    "NLP is fun i am a student of Cse AI stdying NLP"
)

# Echo the raw input, followed by a 60-character separator rule.
print("Original Text:\n", text)
print("=" * 60)

Original Text:
 This is Assignment no one of NLP subject .Don't hesitate to ask questions! NLP is fun i am a student of Cse AI stdying NLP


In [7]:
# Run the same sentence through five NLTK tokenizers so their handling of
# punctuation and contractions can be compared side by side.

# Splits on whitespace only.
whitespace_tokens = WhitespaceTokenizer().tokenize(text)

# Splits punctuation into separate tokens (e.g. "Don't" -> 'Don', "'", 't').
punct_tokens = WordPunctTokenizer().tokenize(text)

# Penn Treebank conventions (e.g. the contraction is split off as "n't").
treebank_tokens = TreebankWordTokenizer().tokenize(text)

# Tweet-oriented tokenizer; keeps "Don't" as a single token for this input.
tweet_tokens = TweetTokenizer().tokenize(text)

# Multi-word-expression tokenizer: operates on an already-split token list.
# NOTE(review): neither "natural language" nor "New York" occurs in `text`,
# so no tokens are merged for this input — consider registering an
# expression that is actually present to make the demo meaningful.
mwe_tokenizer = MWETokenizer([("natural", "language"), ("New", "York")])
mwe_tokens = mwe_tokenizer.tokenize(text.split())

# Report every result in the same "<label> <tokens>" layout.
for tokenizer_label, tokenizer_output in (
    ("Whitespace Tokenizer:", whitespace_tokens),
    ("Punctuation Tokenizer:", punct_tokens),
    ("Treebank Tokenizer:", treebank_tokens),
    ("Tweet Tokenizer:", tweet_tokens),
    ("MWE Tokenizer:", mwe_tokens),
):
    print(tokenizer_label, tokenizer_output)

Whitespace Tokenizer: ['This', 'is', 'Assignment', 'no', 'one', 'of', 'NLP', 'subject', ".Don't", 'hesitate', 'to', 'ask', 'questions!', 'NLP', 'is', 'fun', 'i', 'am', 'a', 'student', 'of', 'Cse', 'AI', 'stdying', 'NLP']
Punctuation Tokenizer: ['This', 'is', 'Assignment', 'no', 'one', 'of', 'NLP', 'subject', '.', 'Don', "'", 't', 'hesitate', 'to', 'ask', 'questions', '!', 'NLP', 'is', 'fun', 'i', 'am', 'a', 'student', 'of', 'Cse', 'AI', 'stdying', 'NLP']
Treebank Tokenizer: ['This', 'is', 'Assignment', 'no', 'one', 'of', 'NLP', 'subject', '.Do', "n't", 'hesitate', 'to', 'ask', 'questions', '!', 'NLP', 'is', 'fun', 'i', 'am', 'a', 'student', 'of', 'Cse', 'AI', 'stdying', 'NLP']
Tweet Tokenizer: ['This', 'is', 'Assignment', 'no', 'one', 'of', 'NLP', 'subject', '.', "Don't", 'hesitate', 'to', 'ask', 'questions', '!', 'NLP', 'is', 'fun', 'i', 'am', 'a', 'student', 'of', 'Cse', 'AI', 'stdying', 'NLP']
MWE Tokenizer: ['This', 'is', 'Assignment', 'no', 'one', 'of', 'NLP', 'subject', ".Don't",

In [8]:
# Stem the Treebank tokens with two stemmers for comparison.
print("\n--- STEMMING ---")

porter, snowball = PorterStemmer(), SnowballStemmer("english")

# Each stemmer is applied token by token, preserving token order.
porter_stems = list(map(porter.stem, treebank_tokens))
snowball_stems = list(map(snowball.stem, treebank_tokens))

# In the recorded output both stemmers lowercase their input; they differ on
# 'This' (Porter -> 'thi', Snowball -> 'this').
print("Porter Stemmer:", porter_stems)
print("Snowball Stemmer:", snowball_stems)


--- STEMMING ---
Porter Stemmer: ['thi', 'is', 'assign', 'no', 'one', 'of', 'nlp', 'subject', '.do', "n't", 'hesit', 'to', 'ask', 'question', '!', 'nlp', 'is', 'fun', 'i', 'am', 'a', 'student', 'of', 'cse', 'ai', 'stdi', 'nlp']
Snowball Stemmer: ['this', 'is', 'assign', 'no', 'one', 'of', 'nlp', 'subject', '.do', "n't", 'hesit', 'to', 'ask', 'question', '!', 'nlp', 'is', 'fun', 'i', 'am', 'a', 'student', 'of', 'cse', 'ai', 'stdi', 'nlp']


In [9]:
# Lemmatize the Treebank tokens with the WordNet lemmatizer.
print("\n--- LEMMATIZATION ---")

lemmatizer = WordNetLemmatizer()

# No part-of-speech is passed, so the lemmatizer's default is used.
# NOTE(review): in the recorded output only 'questions' -> 'question' changes,
# while verb forms such as 'is' / 'am' are left as-is; presumably the default
# POS is noun — confirm against the NLTK docs and pass pos= if verbs matter.
lemmas = list(map(lemmatizer.lemmatize, treebank_tokens))

print("WordNet Lemmatizer:", lemmas)


--- LEMMATIZATION ---
WordNet Lemmatizer: ['This', 'is', 'Assignment', 'no', 'one', 'of', 'NLP', 'subject', '.Do', "n't", 'hesitate', 'to', 'ask', 'question', '!', 'NLP', 'is', 'fun', 'i', 'am', 'a', 'student', 'of', 'Cse', 'AI', 'stdying', 'NLP']
