In [None]:
# Install NLTK into the environment of the running kernel. The explicit %pip
# magic is used instead of relying on IPython automagic for a bare `pip`.
# TODO: pin the NLTK version (nltk==X.Y.Z) once the target version is agreed.
%pip install nltk


In [2]:
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

In [3]:
# Fetch the NLTK data packages used by this notebook.
#   punkt / punkt_tab : sentence-splitting models used by word_tokenize;
#                       recent NLTK releases (3.9+) look up 'punkt_tab'
#                       rather than the older pickled 'punkt' package.
#   wordnet / omw-1.4 : data for WordNetLemmatizer. NOTE(review): 'omw-1.4'
#                       is only required by some NLTK versions - confirm
#                       against the installed release.
#   stopwords         : word lists behind nltk.corpus.stopwords.
# An unknown package id is expected to be reported by the downloader rather
# than raised, so the extra ids should be harmless on older NLTK versions -
# TODO confirm against the installed release.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:

# Two-sentence demo string used as input by the tokenization and
# stemming/lemmatization cells.
text = (
    "Tokenization is the process of splitting text into tokens. "
    "It's an important step in natural language processing!"
)


In [5]:

# Five ways of cutting the same sample `text` into tokens.

# Tokenizer objects are built up front; each is then applied to `text` below.
treebank_tokenizer = TreebankWordTokenizer()
tweet_tokenizer = TweetTokenizer()
mwe_tokenizer = MWETokenizer([('natural', 'language'), ('important', 'step')])

# 1. Whitespace: split on runs of whitespace only, punctuation stays attached.
whitespace_tokens = text.split()

# 2. Punctuation-based: trim leading/trailing punctuation from every
#    whitespace-delimited chunk and drop chunks that end up empty.
punctuation_tokens = [
    stripped
    for stripped in (chunk.strip(string.punctuation) for chunk in text.split())
    if stripped
]

# 3. Treebank tokenizer applied to the whole text.
treebank_tokens = treebank_tokenizer.tokenize(text)

# 4. Tweet tokenizer (designed for social-media text) applied to the whole text.
tweet_tokens = tweet_tokenizer.tokenize(text)

# 5. Multi-word expressions: word-tokenize first, then let the MWE tokenizer
#    merge the registered word pairs into single tokens.
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(text))


In [6]:

# Stemming and lemmatization of the sample text.
# Word-tokenize once and reuse the result: all three normalizers consume the
# same token sequence, so repeating word_tokenize(text) per list is wasted work.
word_tokens = word_tokenize(text)

# Stemming using Porter and Snowball stemmers
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer('english')

porter_stems = [porter_stemmer.stem(token) for token in word_tokens]
snowball_stems = [snowball_stemmer.stem(token) for token in word_tokens]

# Lemmatization using WordNet lemmatizer
# lemmatize() is called without a part-of-speech argument, so NLTK's default
# (noun) applies to every token; verb forms are generally left unreduced
# unless pos='v' is passed.
lemmatizer = WordNetLemmatizer()

lemmas = [lemmatizer.lemmatize(token) for token in word_tokens]

In [7]:

# Print the sample text followed by each tokenizer / stemmer / lemmatizer
# result, one labelled line per method, grouped under section headings.
print("Original Text:", text, sep="\n")

print(
    "\nTokenization:",
    f"Whitespace Tokens: {whitespace_tokens}",
    f"Punctuation Tokens: {punctuation_tokens}",
    f"Treebank Tokens: {treebank_tokens}",
    f"Tweet Tokens: {tweet_tokens}",
    f"MWE Tokens: {mwe_tokens}",
    sep="\n",
)

print(
    "\nStemming:",
    f"Porter Stems: {porter_stems}",
    f"Snowball Stems: {snowball_stems}",
    sep="\n",
)

print("\nLemmatization:", f"Lemmas: {lemmas}", sep="\n")

Original Text:
Tokenization is the process of splitting text into tokens. It's an important step in natural language processing!

Tokenization:
Whitespace Tokens: ['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'tokens.', "It's", 'an', 'important', 'step', 'in', 'natural', 'language', 'processing!']
Punctuation Tokens: ['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'tokens', "It's", 'an', 'important', 'step', 'in', 'natural', 'language', 'processing']
Treebank Tokens: ['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'tokens.', 'It', "'s", 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', '!']
Tweet Tokens: ['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'tokens', '.', "It's", 'an', 'important', 'step', 'in', 'natural', 'language', 'processing', '!']
MWE Tokens: ['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'tokens', '.', 'It',