Perform tokenization (whitespace, punctuation-based, Treebank, Tweet, MWE) using the NLTK
library. Use the Porter stemmer and the Snowball stemmer for stemming. Use any technique for
lemmatization.

In [None]:
!pip install nltk



In [None]:
import nltk

# Data packages requested up front so the notebook runs on a fresh kernel.
# 'wordnet' and 'omw-1.4' back the WordNet lemmatizer used at the end of the
# notebook; 'punkt' is not needed by any tokenizer in the cells shown here and
# is kept only in case other cells rely on it.
NLTK_RESOURCES = ('punkt', 'wordnet', 'omw-1.4')

# nltk.download returns True/False per package; keep every result so that a
# failure of an early package is not hidden by a later success.
download_status = {resource: nltk.download(resource) for resource in NLTK_RESOURCES}

# Cell result: True only when every requested package is available.
all(download_status.values())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Sample social-media style sentence. The emoji, repeated punctuation and
# hashtags give each tokenizer below something to treat differently.
# The emoji are written as real Unicode characters (robot, globe + tears of
# joy, rocket + fire) rather than mis-decoded byte sequences.
text = "OMG!!! 🤖 AI is taking over the world 🌍😂 I love machine learning and human language processing!!! #NLP #AI #TechLife 🚀🔥"
print("Original Text:\n", text)

Original Text:
 OMG!!! 🤖 AI is taking over the world 🌍😂 I love machine learning and human language processing!!! #NLP #AI #TechLife 🚀🔥


## Whitespace Tokenization

In [None]:
from nltk.tokenize import WhitespaceTokenizer

# Baseline tokenizer: the text is cut at whitespace only, so punctuation and
# '#' remain attached to the neighbouring word.
wt = WhitespaceTokenizer()
whitespace_tokens = wt.tokenize(text)

# Header first, then the token list after a single separating space.
print("\nWhitespace Tokenization:\n", end=" ")
print(whitespace_tokens)


Whitespace Tokenization:
 ['OMG!!!', '🤖', 'AI', 'is', 'taking', 'over', 'the', 'world', '🌍😂', 'I', 'love', 'machine', 'learning', 'and', 'human', 'language', 'processing!!!', '#NLP', '#AI', '#TechLife', '🚀🔥']


## Punctuation Based Tokenization

In [None]:
from nltk.tokenize import WordPunctTokenizer

# Separates word characters from punctuation: a run such as '!!!' becomes its
# own token and '#' is detached from the hashtag text.
wpt = WordPunctTokenizer()
punct_tokens = wpt.tokenize(text)

print(
    "\nPunctuation-based Tokenization:\n",
    str(punct_tokens),
)


Punctuation-based Tokenization:
 ['OMG', '!!!', '🤖', 'AI', 'is', 'taking', 'over', 'the', 'world', '🌍😂', 'I', 'love', 'machine', 'learning', 'and', 'human', 'language', 'processing', '!!!', '#', 'NLP', '#', 'AI', '#', 'TechLife', '🚀🔥']


## Treebank Tokenization

In [None]:
from nltk.tokenize import TreebankWordTokenizer

# Penn Treebank conventions: every punctuation mark is emitted as a separate
# token. These tokens are reused by the stemming and lemmatization cells.
tbt = TreebankWordTokenizer()
treebank_tokens = tbt.tokenize(text)

print("\nTreebank Tokenization:\n", end=" ")
print(treebank_tokens)


Treebank Tokenization:
 ['OMG', '!', '!', '!', '🤖', 'AI', 'is', 'taking', 'over', 'the', 'world', '🌍😂', 'I', 'love', 'machine', 'learning', 'and', 'human', 'language', 'processing', '!', '!', '!', '#', 'NLP', '#', 'AI', '#', 'TechLife', '🚀🔥']


## Tweet Tokenization

In [None]:
from nltk.tokenize import TweetTokenizer

# Social-media aware tokenizer (default settings): hashtags are kept whole
# and each emoji becomes its own token.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)

print(
    "\nTweet Tokenization:\n",
    str(tweet_tokens),
)


Tweet Tokenization:
 ['OMG', '!', '!', '!', '🤖', 'AI', 'is', 'taking', 'over', 'the', 'world', '🌍', '😂', 'I', 'love', 'machine', 'learning', 'and', 'human', 'language', 'processing', '!', '!', '!', '#NLP', '#AI', '#TechLife', '🚀', '🔥']


## MWE (Multi-Word Expression) Tokenization

In [None]:
from nltk.tokenize import MWETokenizer

# Build the tokenizer empty, then register each multi-word expression.
# Consecutive tokens matching a registered expression are merged into one
# token joined by '_'.
mwe = MWETokenizer(separator='_')
mwe.add_mwe(('human', 'language'))
mwe.add_mwe(('machine', 'learning'))

# MWETokenizer works on an already tokenized sequence; a plain whitespace
# split of the text is used as its input.
mwe_tokens = mwe.tokenize(text.split())

print("\nMWE Tokenization:\n", end=" ")
print(mwe_tokens)


MWE Tokenization:
 ['OMG!!!', '🤖', 'AI', 'is', 'taking', 'over', 'the', 'world', '🌍😂', 'I', 'love', 'machine_learning', 'and', 'human_language', 'processing!!!', '#NLP', '#AI', '#TechLife', '🚀🔥']


# Stemming

## Porter Stemmer

In [None]:
from nltk.stem import PorterStemmer

# Porter stemming over the Treebank tokens: suffixes are stripped by rule, so
# results need not be dictionary words (e.g. 'machine' -> 'machin').
ps = PorterStemmer()
porter_stems = list(map(ps.stem, treebank_tokens))

print("\nPorter Stemmer:\n", end=" ")
print(porter_stems)


Porter Stemmer:
 ['omg', '!', '!', '!', '🤖', 'ai', 'is', 'take', 'over', 'the', 'world', '🌍😂', 'i', 'love', 'machin', 'learn', 'and', 'human', 'languag', 'process', '!', '!', '!', '#', 'nlp', '#', 'ai', '#', 'techlif', '🚀🔥']


## Snowball Stemmer

In [None]:
from nltk.stem import SnowballStemmer

# Snowball stemmer configured for English, applied to the same Treebank
# tokens as the Porter cell so the two results can be compared side by side.
ss = SnowballStemmer("english")
snowball_stems = [*map(ss.stem, treebank_tokens)]

print(
    "\nSnowball Stemmer:\n",
    str(snowball_stems),
)


Snowball Stemmer:
 ['omg', '!', '!', '!', '🤖', 'ai', 'is', 'take', 'over', 'the', 'world', '🌍😂', 'i', 'love', 'machin', 'learn', 'and', 'human', 'languag', 'process', '!', '!', '!', '#', 'nlp', '#', 'ai', '#', 'techlif', '🚀🔥']


# Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer.lemmatize() treats every word as a noun unless a part of
# speech is supplied, which leaves verb forms such as 'taking' untouched.
# Tag the tokens first and hand the matching WordNet POS to the lemmatizer.

# pos_tag needs a tagger model. The package id differs between NLTK releases
# (newer ones look for the '_eng' variant), so request both; an unknown id is
# reported by nltk.download without raising.
for tagger_resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(tagger_resource, quiet=True)

# First letter of a Penn Treebank tag -> WordNet POS code
# (adjective, verb, noun, adverb). Anything else falls back to noun, which is
# the lemmatizer's own default.
TREEBANK_TO_WORDNET_POS = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

lemmatizer = WordNetLemmatizer()
lemmas = [
    lemmatizer.lemmatize(word, pos=TREEBANK_TO_WORDNET_POS.get(tag[:1], 'n'))
    for word, tag in nltk.pos_tag(treebank_tokens)
]
print("\nLemmatization:\n", lemmas)


Lemmatization:
 ['OMG', '!', '!', '!', '🤖', 'AI', 'is', 'taking', 'over', 'the', 'world', '🌍😂', 'I', 'love', 'machine', 'learning', 'and', 'human', 'language', 'processing', '!', '!', '!', '#', 'NLP', '#', 'AI', '#', 'TechLife', '🚀🔥']
