In [26]:
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer, TweetTokenizer, RegexpTokenizer, MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Download necessary resources
nltk.download('punkt')
nltk.download('wordnet')     
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saurabh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Saurabh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Saurabh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [27]:
# sample tweet text
text = "I'm getting on Borderlands, and I will murder you all! 😈 #gaming #fun @user"
print("Original Text:\n", text)

Original Text:
 I'm getting on Borderlands, and I will murder you all! 😈 #gaming #fun @user


In [28]:
# Split by whitespace
whitespace_tokens = text.split()
print("Whitespace Tokenization:\n", whitespace_tokens)

Whitespace Tokenization:
 ["I'm", 'getting', 'on', 'Borderlands,', 'and', 'I', 'will', 'murder', 'you', 'all!', '😈', '#gaming', '#fun', '@user']


In [29]:
# Regex tokenizer (splits on non-word characters)
regex_tokenizer = RegexpTokenizer(r'\w+')
regex_tokens = regex_tokenizer.tokenize(text)
print("Punctuation-Based Tokenization:\n", regex_tokens)

Punctuation-Based Tokenization:
 ['I', 'm', 'getting', 'on', 'Borderlands', 'and', 'I', 'will', 'murder', 'you', 'all', 'gaming', 'fun', 'user']


In [30]:
# Treebank Tokenizer (standard for grammatical text)
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
print("Treebank Tokenization:\n", treebank_tokens)


Treebank Tokenization:
 ['I', "'m", 'getting', 'on', 'Borderlands', ',', 'and', 'I', 'will', 'murder', 'you', 'all', '!', '😈', '#', 'gaming', '#', 'fun', '@', 'user']


In [31]:
# Tweet Tokenizer (handles emojis, mentions, hashtags better)
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
print("Tweet Tokenization:\n", tweet_tokens)

Tweet Tokenization:
 ["I'm", 'getting', 'on', 'Borderlands', ',', 'and', 'I', 'will', 'murder', 'you', 'all', '!', '😈', '#gaming', '#fun', '@user']


In [32]:
# Define some multi-word expressions
mwe_tokenizer = MWETokenizer([('getting', 'on'), ('Borderlands', ',')])
mwe_tokens = mwe_tokenizer.tokenize(text.split())
print("Multi-Word Expression Tokenization:\n", mwe_tokens)

Multi-Word Expression Tokenization:
 ["I'm", 'getting_on', 'Borderlands,', 'and', 'I', 'will', 'murder', 'you', 'all!', '😈', '#gaming', '#fun', '@user']


In [33]:
porter = PorterStemmer()
snowball = SnowballStemmer("english")

tokens = regex_tokens  # using regex-based clean tokens

porter_stems = [porter.stem(token) for token in tokens]
snowball_stems = [snowball.stem(token) for token in tokens]

print("Porter Stemmer:\n", porter_stems)
print("Snowball Stemmer:\n", snowball_stems)

Porter Stemmer:
 ['i', 'm', 'get', 'on', 'borderland', 'and', 'i', 'will', 'murder', 'you', 'all', 'game', 'fun', 'user']
Snowball Stemmer:
 ['i', 'm', 'get', 'on', 'borderland', 'and', 'i', 'will', 'murder', 'you', 'all', 'game', 'fun', 'user']


In [34]:
lemmatizer = WordNetLemmatizer()

# Lemmatization (default is noun unless specified)
lemmas = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmatized Tokens:\n", lemmas)


Lemmatized Tokens:
 ['I', 'm', 'getting', 'on', 'Borderlands', 'and', 'I', 'will', 'murder', 'you', 'all', 'gaming', 'fun', 'user']
