In [15]:
# Stemming: reducing inflected words to their root (stem) form
import nltk
from nltk.stem import LancasterStemmer

In [1]:
# Porter stemmer: one of the oldest stemmers, originally developed in 1979.
# It works purely by suffix stripping.
from nltk.stem import PorterStemmer

sp = PorterStemmer()

# Things to notice in the result:
# - 'tries' loses its 'es' and becomes 'tri', which is not the correct root word.
# - The Porter stemmer does not follow linguistics; it applies a fixed sequence
#   of five suffix-stripping steps, which is why its stems are often not
#   actual English words.
# - It is still widely used because of its simplicity and speed.
[sp.stem(word) for word in ('sing', 'lying', 'tries', 'affectionately')]

['sing', 'lie', 'tri', 'affection']

In [None]:
# LancasterStemmer is simple but aggressive: it applies its rules iteratively, so over-stemming may occur.
# Over-stemming produces stems that are not linguistic roots and may have no meaning.

In [16]:
# Stemming is commonly used in Information Retrieval (IR) environments
# for fast recall and fetching of search queries.
# Below, the Porter and Lancaster stems of a few words are printed side by side.
porter = PorterStemmer()
lancaster = LancasterStemmer()

# The list of words to be stemmed
word_list = [
    "friend", "friendship", "friends", "friendships",
    "stabil", "destabilize", "misunderstanding",
    "railroad", "moonlight", "football",
]

# Every row (header included) uses the same three 20-character columns.
row_format = "{0:20}{1:20}{2:20}"
print(row_format.format("Word", "Porter Stemmer", "lancaster Stemmer"))
for word in word_list:
    porter_stem = porter.stem(word)
    lancaster_stem = lancaster.stem(word)
    print(row_format.format(word, porter_stem, lancaster_stem))

Word                Porter Stemmer      lancaster Stemmer   
friend              friend              friend              
friendship          friendship          friend              
friends             friend              friend              
friendships         friendship          friend              
stabil              stabil              stabl               
destabilize         destabil            dest                
misunderstanding    misunderstand       misunderstand       
railroad            railroad            railroad            
moonlight           moonlight           moonlight           
football            footbal             footbal             


In [2]:
# RegexpStemmer uses a regular expression to identify morphological affixes:
# every substring that matches the expression is removed from the word.
from nltk.stem import RegexpStemmer

# The pattern here is unanchored, so *every* 'ing' in the word is stripped,
# not only a trailing suffix -- 'singing' collapses to 's'.
# The stemmer does exactly what the pattern says and applies no linguistic
# knowledge; anchoring the pattern and setting a minimum word length, e.g.
# RegexpStemmer('ing$', min=4), would yield 'sing' instead.
affix_pattern = 'ing'
sre = RegexpStemmer(affix_pattern)
sre.stem('singing')

's'

In [17]:
# SnowballStemmer is a very useful stemmer because it supports many languages.
# Imported in this cell so it also runs on a fresh kernel (Restart & Run All);
# previously the name was only imported by a later cell.
from nltk.stem import SnowballStemmer

# Show which languages are supported. The list has 16 entries; one of them,
# "porter", selects the original English Porter algorithm rather than a language.
print(" ".join(SnowballStemmer.languages))

arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [18]:
# Snowball lets stemming rules be written for any language, which is why
# NLTK provides SnowballStemmer for building non-English stemmers.
from nltk.stem import SnowballStemmer

# Number of entries in the supported-languages collection. (A bare
# `SnowballStemmer.languages` line used to precede this; its value was
# discarded because only a cell's last expression is displayed.)
len(SnowballStemmer.languages)

16

In [20]:
# Selecting the stemmer by language name is useful when the language
# to be stemmed is not known until runtime.
stemmer = SnowballStemmer(language="german")  # choose a language

# Stem a word: the root of "Autobahnen" is "autobahn", meaning highway.
stemmer.stem("Autobahnen")

'autobahn'

In [6]:
# Lemmatization maps an inflected form to its dictionary form (lemma);
# here the irregular plural 'oxen' is looked up as a noun.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('oxen', pos='n')  # 'n' (noun) is also the default pos

'ox'

In [None]:
# The main thing to note is that lemmatize() takes
# a part-of-speech parameter, "pos".
# If it is not supplied, the default is "n" (noun). This means the word is
# looked up as a noun, which can give unexpected results for verbs, adjectives and adverbs.

In [21]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# (word, part of speech) pairs to lemmatize, one printed lemma per pair.
# "n" (noun) is lemmatize()'s default pos; it is written out so the contrast
# with the adjective ("a") and verb ("v") lookups is explicit.
# The "best"/"a" example used to be printed twice; the duplicate is removed so
# the cell produces exactly one line per example.
examples = [
    ("cats", "n"),
    ("cacti", "n"),
    ("geese", "n"),
    ("rocks", "n"),
    ("python", "n"),
    ("better", "a"),  # with pos given, the comparative "better" maps to "good"
    ("best", "a"),    # the superlative "best" is returned unchanged
    ("run", "n"),
    ("run", "v"),
]

for word, pos in examples:
    print(lemmatizer.lemmatize(word, pos=pos))

cat
cactus
goose
rock
python
good
best
run
run


In [11]:
# The lemmatizer returns this input unchanged.
# NOTE(review): 'affectionsately' looks like a typo for 'affectionately', so it
# is presumably not found in WordNet at all. Even when spelled correctly, the
# result would not be 'affection': lemmatization undoes inflection only and does
# not strip derivational suffixes the way a stemmer does.
misspelled_word = 'affectionsately'
lemmatizer.lemmatize(misspelled_word, pos='a')

'affectionsately'

In [22]:
# Let's distinguish between stemming and lemmatization

In [24]:
import nltk
from nltk.stem.porter import PorterStemmer

# Tokenize a short sample sentence and report the Porter stem of each token.
porter_stemmer = PorterStemmer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)

stem_report = "Stemming for {} is {}"
for token in tokenization:
    stem = porter_stemmer.stem(token)
    print(stem_report.format(token, stem))

Stemming for studies is studi
Stemming for studying is studi
Stemming for cries is cri
Stemming for cry is cri


In [27]:
import nltk
from nltk.stem import WordNetLemmatizer

# Tokenize the same sample sentence and report the WordNet lemma of each token
# (no pos is passed, so lemmatize() falls back to its default).
wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)

lemma_report = "Lemma for {} is {}"
for token in tokenization:
    lemma = wordnet_lemmatizer.lemmatize(token)
    print(lemma_report.format(token, lemma))

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


In [None]:
# Comparing the two outputs: stemming maps both "studies" and "studying" to the
# same stem ("studi"), whereas the lemmatizer returns "study" for "studies" and
# leaves "studying" unchanged (no pos was given, so it is looked up as a noun).
# Lemmas are real dictionary words, so lemmatization is often the better choice
# when building a feature set to train a machine-learning model.