In [1]:
%pip install nltk textblob



In [2]:
import nltk
from nltk.tokenize import (
    WhitespaceTokenizer,
    wordpunct_tokenize,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import pos_tag

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
text = """During my summer vacation, I visited Goa with my family.
We enjoyed walking on the beautiful beaches, eating delicious seafood,
and watching the sunset. I felt very relaxed and happy throughout the trip.
The weather was pleasant, and the people were friendly, making the vacation memorable."""
print("Original Text:\n", text)


Original Text:
 During my summer vacation, I visited Goa with my family.
We enjoyed walking on the beautiful beaches, eating delicious seafood,
and watching the sunset. I felt very relaxed and happy throughout the trip.
The weather was pleasant, and the people were friendly, making the vacation memorable.


### **Tokenization**

In [4]:
wt = WhitespaceTokenizer()
print("Whitespace Tokenization:", wt.tokenize(text))

Whitespace Tokenization: ['During', 'my', 'summer', 'vacation,', 'I', 'visited', 'Goa', 'with', 'my', 'family.', 'We', 'enjoyed', 'walking', 'on', 'the', 'beautiful', 'beaches,', 'eating', 'delicious', 'seafood,', 'and', 'watching', 'the', 'sunset.', 'I', 'felt', 'very', 'relaxed', 'and', 'happy', 'throughout', 'the', 'trip.', 'The', 'weather', 'was', 'pleasant,', 'and', 'the', 'people', 'were', 'friendly,', 'making', 'the', 'vacation', 'memorable.']


In [5]:
print("Punctuation-based Tokenization:", wordpunct_tokenize(text))

Punctuation-based Tokenization: ['During', 'my', 'summer', 'vacation', ',', 'I', 'visited', 'Goa', 'with', 'my', 'family', '.', 'We', 'enjoyed', 'walking', 'on', 'the', 'beautiful', 'beaches', ',', 'eating', 'delicious', 'seafood', ',', 'and', 'watching', 'the', 'sunset', '.', 'I', 'felt', 'very', 'relaxed', 'and', 'happy', 'throughout', 'the', 'trip', '.', 'The', 'weather', 'was', 'pleasant', ',', 'and', 'the', 'people', 'were', 'friendly', ',', 'making', 'the', 'vacation', 'memorable', '.']


In [6]:
tbt = TreebankWordTokenizer()
print("Treebank Tokenization:", tbt.tokenize(text))

Treebank Tokenization: ['During', 'my', 'summer', 'vacation', ',', 'I', 'visited', 'Goa', 'with', 'my', 'family.', 'We', 'enjoyed', 'walking', 'on', 'the', 'beautiful', 'beaches', ',', 'eating', 'delicious', 'seafood', ',', 'and', 'watching', 'the', 'sunset.', 'I', 'felt', 'very', 'relaxed', 'and', 'happy', 'throughout', 'the', 'trip.', 'The', 'weather', 'was', 'pleasant', ',', 'and', 'the', 'people', 'were', 'friendly', ',', 'making', 'the', 'vacation', 'memorable', '.']


In [7]:
tweet_tokenizer = TweetTokenizer()
print("Tweet Tokenization:", tweet_tokenizer.tokenize(text))

Tweet Tokenization: ['During', 'my', 'summer', 'vacation', ',', 'I', 'visited', 'Goa', 'with', 'my', 'family', '.', 'We', 'enjoyed', 'walking', 'on', 'the', 'beautiful', 'beaches', ',', 'eating', 'delicious', 'seafood', ',', 'and', 'watching', 'the', 'sunset', '.', 'I', 'felt', 'very', 'relaxed', 'and', 'happy', 'throughout', 'the', 'trip', '.', 'The', 'weather', 'was', 'pleasant', ',', 'and', 'the', 'people', 'were', 'friendly', ',', 'making', 'the', 'vacation', 'memorable', '.']


In [8]:
mwe = MWETokenizer([('New', 'York'), ('summer', 'vacation')], separator='_')
print("MWE Tokenization:", mwe.tokenize(wordpunct_tokenize(text)))

MWE Tokenization: ['During', 'my', 'summer_vacation', ',', 'I', 'visited', 'Goa', 'with', 'my', 'family', '.', 'We', 'enjoyed', 'walking', 'on', 'the', 'beautiful', 'beaches', ',', 'eating', 'delicious', 'seafood', ',', 'and', 'watching', 'the', 'sunset', '.', 'I', 'felt', 'very', 'relaxed', 'and', 'happy', 'throughout', 'the', 'trip', '.', 'The', 'weather', 'was', 'pleasant', ',', 'and', 'the', 'people', 'were', 'friendly', ',', 'making', 'the', 'vacation', 'memorable', '.']


# **Stemming**

In [10]:
tokens = wordpunct_tokenize(text)

# Porter Stemmer
porter = PorterStemmer()
print("Porter Stemmer:", [porter.stem(word) for word in tokens])

# Snowball Stemmer
snowball = SnowballStemmer("english")
print("Snowball Stemmer:", [snowball.stem(word) for word in tokens])

Porter Stemmer: ['dure', 'my', 'summer', 'vacat', ',', 'i', 'visit', 'goa', 'with', 'my', 'famili', '.', 'we', 'enjoy', 'walk', 'on', 'the', 'beauti', 'beach', ',', 'eat', 'delici', 'seafood', ',', 'and', 'watch', 'the', 'sunset', '.', 'i', 'felt', 'veri', 'relax', 'and', 'happi', 'throughout', 'the', 'trip', '.', 'the', 'weather', 'wa', 'pleasant', ',', 'and', 'the', 'peopl', 'were', 'friendli', ',', 'make', 'the', 'vacat', 'memor', '.']
Snowball Stemmer: ['dure', 'my', 'summer', 'vacat', ',', 'i', 'visit', 'goa', 'with', 'my', 'famili', '.', 'we', 'enjoy', 'walk', 'on', 'the', 'beauti', 'beach', ',', 'eat', 'delici', 'seafood', ',', 'and', 'watch', 'the', 'sunset', '.', 'i', 'felt', 'veri', 'relax', 'and', 'happi', 'throughout', 'the', 'trip', '.', 'the', 'weather', 'was', 'pleasant', ',', 'and', 'the', 'peopl', 'were', 'friend', ',', 'make', 'the', 'vacat', 'memor', '.']


# **Lemmatization**

In [11]:
lemmatizer = WordNetLemmatizer()


print("Lemmatization:", [lemmatizer.lemmatize(word) for word in tokens])

Lemmatization: ['During', 'my', 'summer', 'vacation', ',', 'I', 'visited', 'Goa', 'with', 'my', 'family', '.', 'We', 'enjoyed', 'walking', 'on', 'the', 'beautiful', 'beach', ',', 'eating', 'delicious', 'seafood', ',', 'and', 'watching', 'the', 'sunset', '.', 'I', 'felt', 'very', 'relaxed', 'and', 'happy', 'throughout', 'the', 'trip', '.', 'The', 'weather', 'wa', 'pleasant', ',', 'and', 'the', 'people', 'were', 'friendly', ',', 'making', 'the', 'vacation', 'memorable', '.']
