# Stemming

In [1]:
# Import the toolkit and the full Porter Stemmer liberary
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
p_stemmer = PorterStemmer()

In [3]:
words = ['run','runner','running','ran','runs','easily','fairly']

In [4]:
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [5]:
from nltk.stem.snowball import SnowballStemmer
# The Snowball Stemmer requires that you pass a language parameter 
s_stemmer = SnowballStemmer(language='english')

In [6]:
words = ['run','runner','running','ran','runs','easily','fairly']

In [7]:
for word in words:
    print(word+' --> '+s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


In [8]:
word = ['consolingly']

In [9]:
print('Porter Stemmer:')
for word in word:
    print((word+' --> '+p_stemmer.stem(word)))

Porter Stemmer:
consolingly --> consolingli


In [10]:
print('Snowball Stemmer:')
for word in word:
    print((word+' --> '+s_stemmer.stem(word)))

Snowball Stemmer:
c --> c
o --> o
n --> n
s --> s
o --> o
l --> l
i --> i
n --> n
g --> g
l --> l
y --> y


In [11]:
phrase = 'I am meeting his tomorrow at the meeting'
for word in phrase.split():
    print(word+' --> '+p_stemmer.stem(word))

I --> i
am --> am
meeting --> meet
his --> hi
tomorrow --> tomorrow
at --> at
the --> the
meeting --> meet


# Lemmatization

In [12]:
# Perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
doc1 = nlp(u'I am a runner running in a race because I love to run since I ran today')
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_) 

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


# Function to display lemmas

In [19]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [20]:
doc2 = nlp(u'I saw eighteen mice today!')
show_lemmas(doc2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [21]:
doc3 = nlp(u'I am meeting him tomorrow at the meeting')
show_lemmas(doc3)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   1655312771067108281    he
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting


In [22]:
doc4 = nlp(u"That's an enormous automobile")
show_lemmas(doc4)

That         PRON   4380130941430378203    that
's           AUX    10382539506755952630   be
an           DET    15099054000809333061   an
enormous     ADJ    17917224542039855524   enormous
automobile   NOUN   7211811266693931283    automobile
