In [1]:
import nltk
from nltk.tokenize import (
    WhitespaceTokenizer,
    WordPunctTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)

In [2]:
# Demo sentence shared by every tokenizer below. It deliberately mixes:
#   - a contraction                -> don't
#   - punctuation glued to words   -> NLP?  York!
#   - social-media markers         -> #NLP  @Rohan
#   - a multi-word expression      -> Natural Language
text = (
    "I don't like #NLP? @Rohan said "
    "Natural Language is hard in New York!"
)

In [3]:
# 1. Whitespace tokenizer: cuts only at whitespace, so punctuation stays
#    attached to the neighbouring word (e.g. "#NLP?", "York!").
whitespace_tokenizer = WhitespaceTokenizer()
ws_tokens = whitespace_tokenizer.tokenize(text)

In [4]:
# 2. Word/punctuation tokenizer: punctuation marks become tokens of their own,
#    which also breaks "don't" into "don", "'", "t".
word_punct_tokenizer = WordPunctTokenizer()
punct_tokens = word_punct_tokenizer.tokenize(text)

In [5]:
# 3. Treebank tokenizer: separates punctuation and splits the contraction
#    "don't" into "do" + "n't".
treebank_tokenizer = TreebankWordTokenizer()
tree_tokens = treebank_tokenizer.tokenize(text)

In [6]:
# 4. Tweet tokenizer: keeps "#NLP", "@Rohan" and "don't" as single tokens
#    while still splitting off "?" and "!".
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)

In [7]:
# 5. MWE(Multi-Word Expression) Tokenizer (Requires predefined expressions)
mwe_tokenizer = MWETokenizer([('Natural', 'Language'), ('New', 'York')])
# Note: MWE usually requires a pre-tokenized list
mwe_tokens = mwe_tokenizer.tokenize(ws_tokens)

In [8]:
# Displaying Results
print(f"Original: {text}\n" + "-"*30)
print(f"Whitespace: {ws_tokens}")
print(f"Punctuation: {punct_tokens}")
print(f"Treebank:   {tree_tokens}")
print(f"Tweet:      {tweet_tokens}")
print(f"MWE:        {mwe_tokens}")

Original: I don't like #NLP? @Rohan said Natural Language is hard in New York!
------------------------------
Whitespace: ['I', "don't", 'like', '#NLP?', '@Rohan', 'said', 'Natural', 'Language', 'is', 'hard', 'in', 'New', 'York!']
Punctuation: ['I', 'don', "'", 't', 'like', '#', 'NLP', '?', '@', 'Rohan', 'said', 'Natural', 'Language', 'is', 'hard', 'in', 'New', 'York', '!']
Treebank:   ['I', 'do', "n't", 'like', '#', 'NLP', '?', '@', 'Rohan', 'said', 'Natural', 'Language', 'is', 'hard', 'in', 'New', 'York', '!']
Tweet:      ['I', "don't", 'like', '#NLP', '?', '@Rohan', 'said', 'Natural', 'Language', 'is', 'hard', 'in', 'New', 'York', '!']
MWE:        ['I', "don't", 'like', '#NLP?', '@Rohan', 'said', 'Natural_Language', 'is', 'hard', 'in', 'New', 'York!']


In [9]:
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer

In [10]:
# 1. Build the two stemmers compared below.
# Snowball covers several languages, so the language must be named explicitly.
snowball = SnowballStemmer("english")
# Porter takes no arguments here.
porter = PorterStemmer()

In [11]:
# 2. Test vocabulary for the stemmer comparison, grouped by the kind of
#    variation each group exercises (tense, plurality, adverbs).
words = [
    *("generous", "generously", "generation"),  # same opening letters, different endings
    *("running", "ran", "runs"),                # verb forms of "run"
    *("fairly", "fairness"),                    # adverb / noun built on "fair"
    *("ponies", "caresses"),                    # plurals
]

In [12]:
# 3. One shared template lays out the header and every data row:
#    three left-aligned columns padded to 15 characters, separated by " | ".
stem_row = "{:<15} | {:<15} | {:<15}".format
print(stem_row("Original Word", "Porter Stemmer", "Snowball Stemmer"))
print(50 * "-")

# 4. Stem each word with both algorithms and print one row per word
for original in words:
    print(stem_row(original, porter.stem(original), snowball.stem(original)))

Original Word   | Porter Stemmer  | Snowball Stemmer
--------------------------------------------------
generous        | gener           | generous       
generously      | gener           | generous       
generation      | gener           | generat        
running         | run             | run            
ran             | ran             | ran            
runs            | run             | run            
fairly          | fairli          | fair           
fairness        | fair            | fair           
ponies          | poni            | poni           
caresses        | caress          | caress         


In [13]:
# 5. Stemming a whole sentence: split on whitespace, stem each token with
#    Snowball, then glue the stems back together with single spaces.
sentence = "The dogs are running quickly through the leaves"
sentence_tokens = sentence.split()
stemmed_sentence = list(map(snowball.stem, sentence_tokens))

print("\nOriginal Sentence:", sentence)
print("Snowball Stemmed:", " ".join(stemmed_sentence))


Original Sentence: The dogs are running quickly through the leaves
Snowball Stemmed: the dog are run quick through the leav


In [14]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [15]:
# Download the necessary datasets for WordNet.
# Two NLTK data packages are requested: 'wordnet' and 'omw-1.4'.
# Both calls are bare expressions, so the return value of the last one is
# what the cell displays as its result.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [16]:
# 1. Initialize the Lemmatizer
# A single WordNetLemmatizer instance is created here and reused for every
# lemmatize() call in the comparison table further down.
lemmatizer = WordNetLemmatizer()

In [17]:
# 2. Candidate words for lemmatization.
#    Note: this rebinds `words`, replacing the list used in the stemming section.
words = ["feet", "cacti", "geese", "rocks"]   # plural nouns
words += ["running", "was", "better"]         # verb forms and a comparative

In [19]:
# Header and data rows share one template: three left-aligned columns padded
# to 15 characters, separated by " | ".
lemma_row = "{:<15} | {:<15} | {:<15}".format
print(lemma_row("Original Word", "Lemma (Default)", "Lemma (with POS)"))
print(50 * "-")

# 3. Show how much the POS tag matters.
# Each entry pairs a word with the WordNet POS constant to lemmatize it under
# (VERB, ADJ or NOUN). "leaves" is listed twice so the same word can be seen
# under two different tags.
test_data = [
    ("running", wordnet.VERB),
    ("was", wordnet.VERB),
    ("better", wordnet.ADJ),
    ("feet", wordnet.NOUN),
    ("leaves", wordnet.NOUN),
    ("leaves", wordnet.VERB),
]

for surface, pos_tag in test_data:
    print(lemma_row(
        surface,
        # No POS given: the lemmatizer falls back to treating the word as a noun
        lemmatizer.lemmatize(surface),
        # Explicit POS: the lookup is done under the supplied tag
        lemmatizer.lemmatize(surface, pos=pos_tag),
    ))

Original Word   | Lemma (Default) | Lemma (with POS)
--------------------------------------------------
running         | running         | run            
was             | wa              | be             
better          | better          | good           
feet            | foot            | foot           
leaves          | leaf            | leaf           
leaves          | leaf            | leave          
