# Stemming and Lemmatization

In [1]:
import nltk  # NLTK supplies the tokenizer, POS tagger and WordNet data used in this notebook
# Download the supporting data files for NLTK, one package at a time
# (same packages, same order as the download log below).
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import word_tokenize, pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shashi.singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shashi.singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shashi.singh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Stemming <br>
A stemming algorithm works by cutting the suffix off a word. In a broader sense, it cuts off either the beginning or the end of the word.
1. Porter Stemmer
2. Lancaster Stemmer


In [2]:
from nltk.stem import PorterStemmer
poster=PorterStemmer()
# Stem several inflections of "go" and print each as "<word>: <stem>"
for verb_form in ('go', 'goes', 'went', 'gone'):
    print('{}: {}'.format(verb_form, poster.stem(verb_form)))

go: go
goes: goe
went: went
gone: gone


In [3]:
from nltk.stem import LancasterStemmer
lancaster=LancasterStemmer()
# Stem the same inflections of "go" with the Lancaster stemmer,
# printing each as "<word>: <stem>"
for verb_form in ('go', 'goes', 'went', 'gone'):
    print('{}: {}'.format(verb_form, lancaster.stem(verb_form)))

go: go
goes: goe
went: went
gone: gon


### What is Lemmatization? <br> 
Lemmatization is the algorithmic process of finding the lemma of a word depending on its meaning. Lemmatization usually refers to the morphological analysis of words, which aims to remove inflectional endings. It helps in returning the base or dictionary form of a word, which is known as the lemma. The NLTK lemmatization method is based on WordNet's built-in morph function. Text preprocessing includes both stemming and lemmatization.

In [4]:
# Lemmatization without passing a POS tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()  # shared lemmatizer instance

# lemmatize() is called with the word only, so no POS hint is supplied here.
for verb_form in ('go', 'goes', 'went', 'gone'):
    print('{}: {}'.format(verb_form, lemmatizer.lemmatize(verb_form)))

go: go
goes: go
went: went
gone: gone


## WordNet <br>
WordNet is a large, freely and publicly available lexical database for the English language that aims to establish
structured semantic relationships between words. It offers lemmatization capabilities as well and is one of the
earliest and most commonly used lemmatizers.

In [27]:
# Optional helpers, left disabled:
#nltk.download('tagsets')
#nltk.help.upenn_tagset()   # prints the tagset documentation
#nltk.download('wordnet')
from collections import defaultdict
from nltk.corpus import wordnet as wn   # WordNet corpus reader
from nltk.tag import pos_tag
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb,
# adjective, etc. tag_map is keyed by the FIRST LETTER of a pos_tag() tag and
# yields the matching WordNet POS constant; any other letter falls back to NOUN.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)
# Display the mapping; it is used later inside the comparison loop.
tag_map


defaultdict(<function __main__.<lambda>()>, {'J': 'a', 'V': 'v', 'R': 'r'})

In [23]:
# Lemmatization with an explicit POS tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Treat every form of "go" as a verb. The POS is looked up through tag_map
# with the explicit key 'V' instead of a leftover `tag` loop variable, so the
# cell does not depend on another cell having been run first (it would raise
# NameError on a fresh kernel otherwise).
verb_pos = tag_map['V']
for verb_form in ('go', 'goes', 'went', 'gone'):
    print('{}: {}'.format(verb_form, lemmatizer.lemmatize(verb_form, verb_pos)))

# Same word, different POS hints: verb, noun, adverb, adjective.
for pos_code in ('v', 'n', 'r', 'a'):
    print('goes: {}'.format(lemmatizer.lemmatize('goes', pos_code)))

go: go
goes: go
went: go
gone: go
goes: go
goes: go
goes: goes
goes: goes


### Why is Lemmatization better than Stemming? <br>

A stemming algorithm works by cutting the suffix off a word. In a broader sense, it cuts off either the beginning or the end of the word.

Lemmatization, by contrast, is a more powerful operation that takes the morphological analysis of words into consideration. It returns the lemma, which is the base form of all of a word's inflectional forms. In-depth linguistic knowledge is required to create the dictionaries and look up the proper form of the word. Stemming is a general operation, while lemmatization is an intelligent operation in which the proper form is looked up in the dictionary. Hence, lemmatization helps in forming better machine learning features.

In [8]:
# Porter stemmer vs Lancaster stemmer vs lemmatization (without / with POS tag)

word_list = ["---------------","detections","detected","detection","detecting","go","went","gone","going","goa","send","sent","sending","console","consoling","run","ran","running"]
# One table row: five left-aligned columns, each 22 characters wide.
row_format = "{0:22}{1:22}{2:22}{3:22}{4:22}"
print(row_format.format("Word","Poster Stemmer","Lancaster Stemmer","Lemmatization W/O POS","Lemmatization with POS"))
for word,tag in pos_tag(word_list):
    porter_stem = poster.stem(word)
    lancaster_stem = lancaster.stem(word)
    lemma_plain = lemmatizer.lemmatize(word)
    # The first letter of the tag selects the WordNet POS via tag_map.
    lemma_tagged = lemmatizer.lemmatize(word,tag_map[tag[0]])
    print(row_format.format(word,porter_stem,lancaster_stem,lemma_plain,lemma_tagged))
print("")
print(" It returned going as such without converting it to the root form go. This is because the lemmatization process depends on the POS tag to come up with the correct lemma.", 
 "Now let us lemmatize again by providing the POS tag for the word.")

Word                  Poster Stemmer        Lancaster Stemmer     Lemmatization W/O POS Lemmatization with POS
---------------       ---------------       ---------------       ---------------       ---------------       
detections            detect                detect                detection             detection             
detected              detect                detect                detected              detect                
detection             detect                detect                detection             detection             
detecting             detect                detect                detecting             detect                
go                    go                    go                    go                    go                    
went                  went                  went                  went                  go                    
gone                  gone                  gon                   gone                  go                    
g

In [9]:
# One-time setup, run from a shell if spaCy or its model is missing:
#   pip install spacy
#   python -m spacy download en_core_web_sm

import spacy
SPACY_MODEL = 'en_core_web_sm'  # name of the spaCy model package to load
nlp = spacy.load(SPACY_MODEL)

In [10]:
# spaCy: print text, lemma, POS and tag for every token

word_list = "detections detected detection detecting go went gone going goa send sent sending console consoling run ran running"
# Alternative input (a full sentence), left disabled:
#word_list=("My name is Shaurya Uppal.  I enjoy writing articles on GeeksforGeeks checkout my other article by going to my profile section.")
dd = nlp(word_list)
# One table row: four left-aligned columns, each 22 characters wide.
column_format = "{0:22}{1:22}{2:22}{3:22}"
print(column_format.format("Text", "Lemma","POS", "tag"))
for token in dd:
    print(column_format.format(token.text, token.lemma_, token.pos_, token.tag_))


Text                  Lemma                 POS                   tag                   
detections            detection             NOUN                  NNS                   
detected              detect                VERB                  VBD                   
detection             detection             NOUN                  NN                    
detecting             detect                VERB                  VBG                   
go                    go                    VERB                  VB                    
went                  go                    VERB                  VBD                   
gone                  go                    VERB                  VBN                   
going                 go                    VERB                  VBG                   
goa                   goa                   PROPN                 NNP                   
send                  send                  VERB                  VB                    
sent                 

### References:-
1. https://www.guru99.com/stemming-lemmatization-python-nltk.html
2. https://www.geeksforgeeks.org/python-pos-tagging-and-lemmatization-using-spacy/
3. https://www.nltk.org/
4. https://spacy.io/
5. https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
6. https://wordnet.princeton.edu/