<a href="https://colab.research.google.com/github/rajdas2001/NLP-Workbook/blob/main/4_Stemming_and_Lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stemming in NLTK

In [15]:
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting", "dogs", "abaci"]

In [16]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
for word in words:
    print(word, "-->", stemmer.stem(word))

eating --> eat
eats --> eat
eat --> eat
ate --> ate
adjustable --> adjust
rafting --> raft
ability --> abil
meeting --> meet
dogs --> dog
abaci --> abaci


In [17]:
from nltk.stem.snowball import SnowballStemmer

#the stemmer requires a language parameter
snow_stemmer = SnowballStemmer(language='english')
for word in words:
  print(word, "-->", snow_stemmer.stem(word))

eating --> eat
eats --> eat
eat --> eat
ate --> ate
adjustable --> adjust
rafting --> raft
ability --> abil
meeting --> meet
dogs --> dog
abaci --> abaci


# Lemmatization in NLTK

In [18]:
# import these modules
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))

# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("better", pos="a"))


rocks : rock
corpora : corpus
better : good


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
for word in words:
  print(word, "-->", lemmatizer.lemmatize(word))

eating --> eating
eats --> eats
eat --> eat
ate --> ate
adjustable --> adjustable
rafting --> rafting
ability --> ability
meeting --> meeting
dogs --> dog
abaci --> abacus


# Lemmatization in Spacy

In [20]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("Mando talked for 3 hours although talking isn't his thing")
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")
for token in doc:
    print(token, " | ", token.lemma_)

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rafting  |  raft
ability  |  ability
meeting  |  meeting
better  |  well


In [25]:
new_doc = ""
for word in words:
  new_doc = new_doc + " " + word

new_doc = nlp(new_doc)

for word in new_doc:
  print(word, "-->", word.lemma_)

  -->  
eating --> eat
eats --> eat
eat --> eat
ate --> eat
adjustable --> adjustable
rafting --> raft
ability --> ability
meeting --> meeting
dogs --> dog
abaci --> abaci


# Customizing lemmatizer

In [26]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [27]:
ar = nlp.get_pipe('attribute_ruler')

ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Bruh"}]],{"LEMMA":"Brother"})

doc = nlp("Bro, you wanna go? Bruh, don't say no! I am exhausted")
for token in doc:
    print(token.text, "|", token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Bruh | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


In [28]:
doc[6]

Bruh

In [29]:
doc[6].lemma_

'Brother'