**Whitespace based Tokenization**

Syntax : tokenize.WhitespaceTokenizer()

Returns : a tokenizer object whose `tokenize()` method returns the list of tokens obtained by splitting a string on whitespace

In [1]:
# Bring in the WhitespaceTokenizer class from nltk
from nltk.tokenize import WhitespaceTokenizer

# Sample input: the words are separated by a mix of spaces, a newline and a tab
text = "Welcome to the I2IT-NLP Page. \n Good Morning \t"
print("\nOriginal string:", text, sep="\n")

# Build the tokenizer, then split the sample wherever whitespace occurs
wt = WhitespaceTokenizer()
tokenized_text = wt.tokenize(text)
print("\nSplitting using whitespece into separate tokens:", tokenized_text, sep="\n")


Original string:
Welcome to the I2IT-NLP Page. 
 Good Morning 	

Splitting using whitespece into separate tokens:
['Welcome', 'to', 'the', 'I2IT-NLP', 'Page.', 'Good', 'Morning']


**Punctuation-based tokenizer**

In [2]:
from nltk.tokenize import WordPunctTokenizer

# Sample sentence mixing words, punctuation and several kinds of whitespace
text = "Welcome to the I2IT-NLP Page. \n Good Morning \t"
print("\nOriginal string:", text, sep="\n")

# WordPunctTokenizer emits punctuation as tokens of their own,
# e.g. "I2IT-NLP" becomes "I2IT", "-", "NLP"
punct_tokenizer = WordPunctTokenizer()
result = punct_tokenizer.tokenize(text)
print("\nSplit all punctuation into separate tokens:", result, sep="\n")


Original string:
Welcome to the I2IT-NLP Page. 
 Good Morning 	

Split all punctuation into separate tokens:
['Welcome', 'to', 'the', 'I2IT', '-', 'NLP', 'Page', '.', 'Good', 'Morning']


**Treebank Tokenizer**

In [3]:
from nltk.tokenize import TreebankWordTokenizer

# Tokenize the sample sentence held in `text` (assigned in an earlier cell).
# The bare expression on the last line makes the notebook display the tokens.
tokenizer = TreebankWordTokenizer()
treebank_tokens = tokenizer.tokenize(text)
treebank_tokens

['Welcome', 'to', 'the', 'I2IT-NLP', 'Page.', 'Good', 'Morning']

**Tweet Tokenizer**

When we tokenize text such as tweets, the tokenizers described above do not produce practical tokens. To address this, NLTK provides a rule-based tokenizer designed specifically for tweets. It can split emojis into separate tokens, which is useful for tasks like sentiment analysis.

In [4]:
from nltk.tokenize import TweetTokenizer

# Tweet-style sample containing emojis, two of which are adjacent with no space between them
sample_tweet = "Who is your favourite cryptocurrency influencer? 🗣🏆 Tag them below! 👇"

# Tokenize the tweet; each emoji comes out as its own token
tweet_tokenize = TweetTokenizer()
tweet_tokens = tweet_tokenize.tokenize(sample_tweet)
print(tweet_tokens)

['Who', 'is', 'your', 'favourite', 'cryptocurrency', 'influencer', '?', '🗣', '🏆', 'Tag', 'them', 'below', '!', '👇']


**Multi-Word Expression Tokenizer**


In [5]:
# Bring in the MWETokenizer class from nltk
from nltk.tokenize import MWETokenizer

# Multi-word expressions to merge, each written as a tuple of its component words
initial_mwes = [('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')]
tokenizer = MWETokenizer(initial_mwes)

# More expressions can be registered after the tokenizer has been created
tokenizer.add_mwe(('in', 'spite', 'of'))

# The tokenizer takes an already-split list of words and joins the registered
# expressions into single tokens; the last expression is displayed by the notebook
mwe_input_words = 'In a little or a little bit or a lot in spite of'.split()
tokenizer.tokenize(mwe_input_words)

['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']

In [6]:
from nltk.tokenize import MWETokenizer

# Register the two-word name so it is merged into one token
tokenizer = MWETokenizer([('Steven', 'Spielberg')])

# Split the sentence on whitespace first, then merge the registered expression
sentence_tokens = 'Steven Spielberg is an American writer producer director '.split()
tokenizer.tokenize(sentence_tokens)

['Steven_Spielberg', 'is', 'an', 'American', 'writer', 'producer', 'director']

# Stemming

In [7]:
import nltk
from nltk.stem import PorterStemmer

# Several inflected and derived forms built on "connect"
example_words = ["connector", "connection", "connects", "connecting", "connected"]

# Stem every word with the Porter stemmer and print one stem per line
ps = PorterStemmer()
porter_stems = [ps.stem(w) for w in example_words]
print("\n".join(porter_stems))

connector
connect
connect
connect
connect


**Snowball Stemmer**

In [8]:
from nltk.stem import SnowballStemmer

# Words to stem with the English Snowball stemmer
words = ['generous', 'generate', 'generously', 'generation']
snowball = SnowballStemmer(language='english')

# Pair each word with its stem and print them side by side
for word, stemmed in zip(words, map(snowball.stem, words)):
    print(word, "--->", stemmed)

generous ---> generous
generate ---> generat
generously ---> generous
generation ---> generat


**Lancaster Stemmer**

In [9]:
from nltk.stem import LancasterStemmer

# Verb forms to stem with the Lancaster stemmer
words = ['eating', 'eats', 'eaten', 'puts', 'putting']
lancaster = LancasterStemmer()

# Compute all stems first, then print each word next to its stem
lancaster_stems = [lancaster.stem(word) for word in words]
for word, stemmed in zip(words, lancaster_stems):
    print(word, "--->", stemmed)

eating ---> eat
eats ---> eat
eaten ---> eat
puts ---> put
putting ---> put


**Regex Stemmer**

In [10]:
from nltk.stem import RegexpStemmer

# Suffixes to strip, written as one regular expression anchored at the end of the word
suffix_pattern = 'ing$|s$|e$|able$'

# min=4: words shorter than four characters are left as they are
# (see 'was' and 'bee' in the output)
regexp = RegexpStemmer(suffix_pattern, min=4)

words = ['mass', 'was', 'bee', 'computer', 'advisable']
for word in words:
    stemmed = regexp.stem(word)
    print(word, "--->", stemmed)

mass ---> mas
was ---> was
bee ---> bee
computer ---> computer
advisable ---> advis


**Porter Vs Snowball Vs Lancaster Vs Regex Stemmers**



In [11]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, RegexpStemmer

# One instance of each stemmer being compared
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language='english')
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)

word_list = ['generous', 'generate', 'generously', 'generation']

# A single row layout shared by the header and every data row keeps the columns aligned
row_layout = "{0:20}{1:20}{2:20}{3:30}{4:40}"
print(row_layout.format("Word", "Porter Stemmer", "Snowball Stemmer", "Lancaster Stemmer", 'Regexp Stemmer'))

# Column order matches the header: Porter, Snowball, Lancaster, Regexp
for word in word_list:
    stems = [stemmer.stem(word) for stemmer in (porter, snowball, lancaster, regexp)]
    print(row_layout.format(word, *stems))

Word                Porter Stemmer      Snowball Stemmer    Lancaster Stemmer             Regexp Stemmer                          
generous            gener               generous            gen                           generou                                 
generate            gener               generat             gen                           generat                                 
generously          gener               generous            gen                           generously                              
generation          gener               generat             gen                           generation                              


# Lemmatization

**Use any technique for lemmatization.**

**Using NLTK Library for Lemmatization**

In [12]:
import nltk

# NLTK data packages fetched for the lemmatization example below
required_packages = ['punkt', 'wordnet', 'omw-1.4']
download_results = [nltk.download(package) for package in required_packages]

# Display the result of the final download, as the notebook did originally
download_results[-1]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"

# Build a new list rather than calling remove() on the list being iterated:
# removing during iteration shifts the remaining items, so the token right
# after each removed one is never examined (consecutive punctuation marks
# would survive). Membership is tested against a set of single characters so
# that only a token that IS a punctuation mark is dropped, not any token that
# merely happens to be a substring of `punctuations`.
punctuation_marks = set(punctuations)
sentence_words = [
    token for token in nltk.word_tokenize(sentence)
    if token not in punctuation_marks
]

# Print each remaining token next to its lemma in two 20-character columns.
# lemmatize() is called without a part-of-speech argument, so each token is
# looked up as a noun (NLTK's documented default, pos="n"); that is why
# 'was' -> 'wa' and 'has' -> 'ha' while verbs such as 'running' stay unchanged.
# Pass pos="v" (or a tag derived from a POS tagger) to get verb lemmas.
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))

Word                Lemma               
He                  He                  
was                 wa                  
running             running             
and                 and                 
eating              eating              
at                  at                  
same                same                
time                time                
He                  He                  
has                 ha                  
bad                 bad                 
habit               habit               
of                  of                  
swimming            swimming            
after               after               
playing             playing             
long                long                
hours               hour                
in                  in                  
the                 the                 
Sun                 Sun                 
