In [None]:
'''
What is Lemmatization ?

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.
Lemmatization is similar to stemming, but it takes the context of a word (such as its part of speech) into account, so it maps words with the same meaning to a single base form (the lemma).

Text preprocessing can include both stemming and lemmatization. People often confuse these two terms,
and some treat them as the same thing. In practice, lemmatization is usually preferred over stemming because it performs
a morphological analysis of the words.

Applications of lemmatization are:

-Used in comprehensive retrieval systems like search engines.
-Used in compact indexing
'''

In [1]:
from nltk.stem import WordNetLemmatizer

In [2]:
# Shared WordNetLemmatizer instance used by the example loops in the cells below.
lemmatizer = WordNetLemmatizer()

In [3]:
# Sample vocabulary fed to the lemmatizer in the cells below.
example_words = [
    "word",
    "wordy",
    "wording",
    "cacti",
    "rocks",
    "catty",
    "demonic",
    "geese",
    "ravishing",
    "better",
    "best",
    "run",
]

In [4]:
# Lemmatize every sample word with the POS forced to "a" (adjective).
for word in example_words:
    lemma = lemmatizer.lemmatize(word, pos="a")
    print(lemma)

word
wordy
wording
cacti
rocks
catty
demonic
geese
ravishing
good
best
run


In [5]:
# Lemmatize every sample word with the POS forced to "v" (verb).
for word in example_words:
    lemma = lemmatizer.lemmatize(word, pos="v")
    print(lemma)

word
wordy
word
cacti
rock
catty
demonic
geese
ravish
better
best
run


In [6]:
# Lemmatize every sample word without passing a POS, i.e. with lemmatize()'s default.
for word in example_words:
    lemma = lemmatizer.lemmatize(word)
    print(lemma)

word
wordy
wording
cactus
rock
catty
demonic
goose
ravishing
better
best
run


In [7]:
## Additional
# i) WordNetLemmatizer() without POS tag

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# No POS tag is passed to lemmatize() in this cell; each token is lemmatized
# with the lemmatizer's default part of speech.
text = "She jumped into the river and breathed heavily"
wordnet = WordNetLemmatizer()
tokens = word_tokenize(text)

for token in tokens:
    lemma = wordnet.lemmatize(token)
    print(f"{token} ---> {lemma}")

She ---> She
jumped ---> jumped
into ---> into
the ---> the
river ---> river
and ---> and
breathed ---> breathed
heavily ---> heavily


In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Second sentence, again lemmatized token by token without passing a POS tag.
text = "I am running and I usually use to runs"

wordnet = WordNetLemmatizer()
tokens = word_tokenize(text)

for token in tokens:
    lemma = wordnet.lemmatize(token)
    print(f"{token} ---> {lemma}")

I ---> I
am ---> am
running ---> running
and ---> and
I ---> I
usually ---> usually
use ---> use
to ---> to
runs ---> run


In [10]:
#ii) WordNetLemmatizer() with POS tags

In [3]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

text = "She jumped into the river and breathed heavily"
wordnet = WordNetLemmatizer()

# First letter of the Penn Treebank tag produced by pos_tag -> WordNet POS code.
# Adjective tags start with "J" (JJ, JJR, JJS), so they have to be mapped to "a"
# explicitly: lower-casing the first letter alone can never produce "a".
penn_to_wordnet = {"J": "a", "V": "v", "N": "n", "R": "r"}

for token,tag in pos_tag(word_tokenize(text)):
    # Tags with no WordNet counterpart (pronouns, determiners, ...) fall back to noun.
    pos = penn_to_wordnet.get(tag[:1], "n")

    print(token,"--->",wordnet.lemmatize(token,pos))

She ---> She
jumped ---> jump
into ---> into
the ---> the
river ---> river
and ---> and
breathed ---> breathe
heavily ---> heavily


In [2]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

text = "I am running and I usually use to runs"
wordnet = WordNetLemmatizer()

# First letter of the Penn Treebank tag produced by pos_tag -> WordNet POS code.
# Adjective tags start with "J" (JJ, JJR, JJS), so they have to be mapped to "a"
# explicitly: lower-casing the first letter alone can never produce "a".
penn_to_wordnet = {"J": "a", "V": "v", "N": "n", "R": "r"}

for token,tag in pos_tag(word_tokenize(text)):
    # Tags with no WordNet counterpart (pronouns, determiners, ...) fall back to noun.
    pos = penn_to_wordnet.get(tag[:1], "n")

    print(token,"--->",wordnet.lemmatize(token,pos))

I ---> I
am ---> be
running ---> run
and ---> and
I ---> I
usually ---> usually
use ---> use
to ---> to
runs ---> run


In [13]:
'''
Stemming vs Lemmatization
Although both look quite similar there are key differences between Stemming vs Lemmatization –

-The output of lemmatization is an actual word like Changing -> Change
but stemming may not produce an actual English word like Changing -> Chang.

-The stemming process just follows the step-by-step implementation of algorithms like SnowBall, Porter, etc.
to derive the stem. Whereas lemmatization makes use of a lookup database like WordNet to derive lemma. 
For example, the lemmatization of “better” is “well” and this another word is derived as lemma as it looks up in the dictionary.
But the stemming result will come as “better” only without a lookup.
However, this lookup can at times slow down the lemmatization process.

-Stemming does not take the context of the word into account, for example, “meeting” can be a verb or noun based on the context. 
But lemmatization does consider the context of the word before generating its lemma.

Stemming vs Lemmatization Example
In the example code below we first tokenize the text and then with the help of for loop stemmed the token with Snowball Stemmer
and Porter Stemmer. At the same time, we also Lemmatize the text and convert it into a lemma with the help of Wordnet Lemmatizer.
'''

'\nStemming vs Lemmatization\nAlthough both look quite similar there are key differences between Stemming vs Lemmatization –\n\n-The output of lemmatization is an actual word like Changing -> Change\nbut stemming may not produce an actual English word like Changing -> Chang.\n\n-The stemming process just follows the step-by-step implementation of algorithms like SnowBall, Porter, etc.\nto derive the stem. Whereas lemmatization makes use of a lookup database like WordNet to derive lemma. \nFor example, the lemmatization of “better” is “well” and this another word is derived as lemma as it looks up in the dictionary.\nBut the stemming result will come as “better” only without a lookup.\nHowever, this lookup can at times slow down the lemmatization process.\n\n-Stemming does not take the context of the word into account, for example, “meeting” can be a verb or noun based on the context. \nBut lemmatization does consider the context of the word before generating its lemma.\n\nStemming vs L

In [9]:
from nltk.stem import SnowballStemmer, PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

snowball = SnowballStemmer(language='english')
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

text = ["better", "Caring", "are", "am", "worse", "struggling", 'meeting']

# First letter of the Penn Treebank tag produced by pos_tag -> WordNet POS code.
# Adjective tags start with "J" (JJ, JJR, JJS), so they have to be mapped to "a"
# explicitly: lower-casing the first letter alone can never produce "a".
penn_to_wordnet = {"J": "a", "V": "v", "N": "n", "R": "r"}

# One row layout shared by the header and every data row so the columns line up.
row_format = "{:<12}{:<20}{:<20}{:<20}"

print(row_format.format("Word", "Snowball Stemmer", "Porter Stemmer", "Wordnet Lemmatizer"))

for token, tag in pos_tag(text):
    # Tags with no WordNet counterpart fall back to noun.
    pos = penn_to_wordnet.get(tag[:1], "n")
    print(row_format.format(
        token,
        snowball.stem(token),
        porter.stem(token),
        wordnet.lemmatize(token, pos)
    ))

Word        Snowball Stemmer    Porter Stemmer      Wordnet Lemmatizer  
better      better              better              well                
Caring      care                care                Caring              
are         are                 are                 be                  
am          am                  am                  be                  
worse       wors                wors                worse               
struggling  struggl             struggl             struggle            
meeting     meet                meet                meeting             
