In [1]:
import spacy

- Stemming applies a fixed set of rules to reduce a word to its base form, for example by stripping suffixes such as 'ing' or 'able' (e.g. eating, sleeping, adjustable). It requires no knowledge of the language; it only applies simple rules.
- Lemmatization also converts a word to its base form (called the lemma), but it requires knowledge of the language, e.g. to convert "ate" to "eat".
- spaCy does not provide a stemmer, so stemming is done with NLTK in this notebook.

In [2]:
from nltk.stem import PorterStemmer

In [3]:
# Create the NLTK Porter stemmer that the next cell uses to stem each word.
stemmer = PorterStemmer()

In [4]:

words = [
    "eating", "eats", "eat", "ate",
    "adjustable", "rafting", "ability", "meeting",
]

# Print every word beside its Porter stem as "word | stem".
for word, stem in zip(words, map(stemmer.stem, words)):
    print(word, "|", stem)

# "ate" comes back unchanged: the stemmer has no rule for it and no
# knowledge of the language, so it cannot tell that "ate" is a form of "eat".
# That makes stemming cruder than lemmatization, but it is still useful in
# NLP because it is fast (it only applies very simple rules).

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


In [11]:
# Load the spaCy "en_core_web_sm" pipeline; calling `nlp(text)` in the cells
# below yields tokens that expose their lemma via `token.lemma_`.
nlp = spacy.load("en_core_web_sm")

In [6]:
sentence = "eating eats eat ate adjustable rafting ability meeting better"
doc = nlp(sentence)

# One line per token: the token itself, then the lemma spaCy assigned to it.
for token in doc:
    print(token, token.lemma_, sep="  |  ")

eating  |  eating
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rafting  |  raft
ability  |  ability
meeting  |  meet
better  |  well


In [8]:
sentence = "Mando talked for 3 hours although talking isn't his thing he became talkative"
doc = nlp(sentence)

# Lemmatize a full sentence: print each token followed by its lemma.
for token in doc:
    print(token, token.lemma_, sep="  |  ")

Mando  |  Mando
talked  |  talk
for  |  for
3  |  3
hours  |  hour
although  |  although
talking  |  talk
is  |  be
n't  |  not
his  |  his
thing  |  thing
he  |  he
became  |  become
talkative  |  talkative


### Custom lemmatizer

In [9]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [12]:
# "Bro" and "Brah" are slang, so spaCy does not know that their base word is
# "brother". This cell shows the default lemmas; a custom rule can fix them.

text = "Bro, you wanna go? Brah, don't say no! I am exhausted"
doc = nlp(text)
for token in doc:
    print(token.text, token.lemma_, sep=" | ")

Bro | Bro
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brah
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [13]:
# Fetch the pipeline's attribute_ruler component so a custom rule can be
# registered on it.
ar = nlp.get_pipe('attribute_ruler')

# Map the slang tokens to the lemma "Brother". Matching on LOWER instead of
# TEXT makes the rule case-insensitive, so "bro"/"brah" are covered as well
# as the capitalised "Bro"/"Brah".
ar.add([[{"LOWER": "bro"}], [{"LOWER": "brah"}]], {"LEMMA": "Brother"})

# Re-run the same sentence to confirm that the custom lemma is now applied.
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust
