<a href="https://colab.research.google.com/github/saddarudin/google_colab/blob/main/nlp_stemming_lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import spacy
import nltk
from nltk.stem import PorterStemmer

## Stemming in NLTK

In [3]:
stemmer = PorterStemmer()
words = ['eating','eats','eat','ate','adjustable','rafting','ability','meeting']

for word in words:
  print(word, '|', stemmer.stem(word))

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


## Lemmatization in NLTK

In [12]:
#Lemmatization in NLTK
nltk.download('wordnet')
nltk.download('omw-1.4') # For additional language support
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words = ['eating','eats','eat','ate','adjustable','rafting','ability','meeting']

for word in words:
  print(word, '|', lemmatizer.lemmatize(word))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


eating | eating
eats | eats
eat | eat
ate | ate
adjustable | adjustable
rafting | rafting
ability | ability
meeting | meeting


## Lemmatization in spaCy

In [6]:
nlp = spacy.load('en_core_web_sm')

doc = nlp('eating eats eat ate adjustable rafting ability meeting better')

for token in doc:
  print(token, '|', token.lemma_, '|', token.lemma)

eating | eat | 9837207709914848172
eats | eat | 9837207709914848172
eat | eat | 9837207709914848172
ate | eat | 9837207709914848172
adjustable | adjustable | 6033511944150694480
rafting | raft | 7154368781129989833
ability | ability | 11565809527369121409
meeting | meet | 6880656908171229526
better | well | 4525988469032889948


In [7]:
doc = nlp("Mando talked 3 hours although talking isn't his thing he became talkative")
for token in doc:
  print(token, '|', token.lemma_)

Mando | Mando
talked | talk
3 | 3
hours | hour
although | although
talking | talk
is | be
n't | not
his | his
thing | thing
he | he
became | become
talkative | talkative


## Customizing the model

In [8]:
doc = nlp("Bro, you wanna go? Brah, don't say no! I'm exhausted")
# In above sentence Bro, Brah both point to Brother but the model treat them as as they are

for token in doc:
  print(token, '|', token.lemma_)

Bro | Bro
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brah
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
'm | be
exhausted | exhaust


In [11]:
ar = nlp.get_pipe('attribute_ruler')
ar.add([[{'Text':'Bro'}],[{'Text':'Brah'}]],{'LEMMA':'Brother'})

doc = nlp("Bro, you wanna go? Brah, don't say no! I'm exhausted")
for token in doc:
  print(token, '|', token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
'm | be
exhausted | exhaust
