In [23]:
#Stemming and Lemmatization meanings:
#Stemming is a text normalization technique that removes suffixes (and sometimes prefixes) from words to obtain a common base form known as the stem.
#A stem is not guaranteed to be a valid word on its own, and it may not represent the core meaning of the word that was stemmed.

In [24]:
import string
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import RegexpStemmer

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet


In [25]:
# Sample passage used as input for the stemming and lemmatization cells.
# NOTE(review): "themysteries" and "akward" look like typos in the sample; they are
# kept verbatim because the recorded outputs were produced from this exact string.
text = (
    "Their research involves exploring the intricacies of artificial"
    " intelligence , delving into themysteries of quantum computing , and "
    " deciphering the secrets of genetic coding . Their work has been called "
    "awesome , awful and akward "
)
text

'Their research involves exploring the intricacies of artificial intelligence , delving into themysteries of quantum computing , and  deciphering the secrets of genetic coding . Their work has been called awesome , awful and akward '

In [26]:
# Make sure the 'punkt_tab' tokenizer data is available before tokenizing.
nltk.download('punkt_tab')

# Split the passage into word and punctuation tokens.
words = word_tokenize(text)
words

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['Their',
 'research',
 'involves',
 'exploring',
 'the',
 'intricacies',
 'of',
 'artificial',
 'intelligence',
 ',',
 'delving',
 'into',
 'themysteries',
 'of',
 'quantum',
 'computing',
 ',',
 'and',
 'deciphering',
 'the',
 'secrets',
 'of',
 'genetic',
 'coding',
 '.',
 'Their',
 'work',
 'has',
 'been',
 'called',
 'awesome',
 ',',
 'awful',
 'and',
 'akward']

In [27]:
#1.Porter Stemmer
# Create a single stemmer instance that the following cells reuse.
porter_stemmer = PorterStemmer()
porter_stemmer

<PorterStemmer>

In [28]:
# Stem every token with Porter and show the result next to the original word.
porter_stemmed_words = list(map(porter_stemmer.stem, words))
df = pd.DataFrame(
    {
        'Original Words': words,
        'Porter Stemmed Words': porter_stemmed_words,
    }
)
df

Unnamed: 0,Original Words,Porter Stemmed Words
0,Their,their
1,research,research
2,involves,involv
3,exploring,explor
4,the,the
5,intricacies,intricaci
6,of,of
7,artificial,artifici
8,intelligence,intellig
9,",",","


In [29]:
#The Porter stemmer trims suffixes. Its stems are not always the best ones (or even real words), but it is still widely used because it is simple, easy to apply, and generally preserves the meaning of the word.

In [30]:
#2.Lancaster Stemmer
lancaster_stemmer = LancasterStemmer()
lancaster_stemmed_words = list(map(lancaster_stemmer.stem, words))

# Side-by-side view: original token, Porter stem, Lancaster stem.
df = pd.DataFrame(
    {
        'Original Words': words,
        'Porter Stemmed Words': porter_stemmed_words,
        'Lancaster Stemmed Words': lancaster_stemmed_words,
    }
)
df

Unnamed: 0,Original Words,Porter Stemmed Words,Lancaster Stemmed Words
0,Their,their,their
1,research,research,research
2,involves,involv,involv
3,exploring,explor,expl
4,the,the,the
5,intricacies,intricaci,int
6,of,of,of
7,artificial,artifici,art
8,intelligence,intellig,intellig
9,",",",",","


In [31]:
#We use the Lancaster stemmer for speed and efficiency; it is far more aggressive than the Porter stemmer.

In [32]:
#3.Snowball Stemmer (developed by the same person who created the Porter stemmer; the notable difference is that it supports multiple languages)
# List the languages the Snowball stemmer can be constructed for.
print(SnowballStemmer.languages)

# Build the English variant and stem every token with it.
snowball_stemmer = SnowballStemmer('english')
snowball_stemmed_words = list(map(snowball_stemmer.stem, words))
print(snowball_stemmed_words)


('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')
['their', 'research', 'involv', 'explor', 'the', 'intricaci', 'of', 'artifici', 'intellig', ',', 'delv', 'into', 'themysteri', 'of', 'quantum', 'comput', ',', 'and', 'deciph', 'the', 'secret', 'of', 'genet', 'code', '.', 'their', 'work', 'has', 'been', 'call', 'awesom', ',', 'aw', 'and', 'akward']


In [33]:
# Compare the three stemmers column by column against the original tokens.
df = pd.DataFrame(
    {
        'Original Words': words,
        'Porter Stemmed Words': porter_stemmed_words,
        'Lancaster Stemmed Words': lancaster_stemmed_words,
        'Snowball Stemmed Words': snowball_stemmed_words,
    }
)
df
# Stemming Algorithm Comparison:
# ------------------------------
# Porter Stemmer:
#   - Pros: Simple, fast, widely used, generally preserves meaning.
#   - Cons: Can be aggressive, may produce non-words, less accurate than newer algorithms.
#   - Example: Stems "involves" to "involv".
#
# Lancaster Stemmer:
#   - Pros: Very fast and efficient, more aggressive than Porter.
#   - Cons: Can be overly aggressive, leading to short and obscure stems, potential loss of meaning.
#   - Example: Stems "exploring" to "expl" (Porter and Snowball give "explor").
#
# Snowball Stemmer:
#   - Pros: More accurate than Porter, supports multiple languages.
#   - Cons: Might be slightly slower than Porter or Lancaster.
#   - Example: Stems "involves" to "involv", but preserves "research" as "research".
#
# Summary:
#   - Porter: Good for simplicity and speed.
#   - Lancaster: Best for speed and aggressive stemming.
#   - Snowball: Recommended for accuracy and multi-language support.


Unnamed: 0,Original Words,Porter Stemmed Words,Lancaster Stemmed Words,Snowball Stemmed Words
0,Their,their,their,their
1,research,research,research,research
2,involves,involv,involv,involv
3,exploring,explor,expl,explor
4,the,the,the,the
5,intricacies,intricaci,int,intricaci
6,of,of,of,of
7,artificial,artifici,art,artifici
8,intelligence,intellig,intellig,intellig
9,",",",",",",","


In [34]:
# RegexpStemmer lets us build our own stemmer by listing the suffix patterns to strip.
regexp_stemmer = RegexpStemmer('ing$|e$|ed$|able$|es$')

# Tokens are lower-cased first so the output lines up with the other stemmers.
regexp_stemmed_words = [
    regexp_stemmer.stem(token)
    for token in map(str.lower, words)
]
print(regexp_stemmed_words)

['their', 'research', 'involv', 'explor', 'th', 'intricaci', 'of', 'artificial', 'intelligenc', ',', 'delv', 'into', 'themysteri', 'of', 'quantum', 'comput', ',', 'and', 'decipher', 'th', 'secrets', 'of', 'genetic', 'cod', '.', 'their', 'work', 'has', 'been', 'call', 'awesom', ',', 'awful', 'and', 'akward']


In [35]:
# Extend the comparison table with the custom regular-expression stemmer.
df = pd.DataFrame(
    {
        'Original Words': words,
        'Porter Stemmed Words': porter_stemmed_words,
        'Lancaster Stemmed Words': lancaster_stemmed_words,
        'Snowball Stemmed Words': snowball_stemmed_words,
        'Regexp Stemmed Words': regexp_stemmed_words,
    }
)
df

Unnamed: 0,Original Words,Porter Stemmed Words,Lancaster Stemmed Words,Snowball Stemmed Words,Regexp Stemmed Words
0,Their,their,their,their,their
1,research,research,research,research,research
2,involves,involv,involv,involv,involv
3,exploring,explor,expl,explor,explor
4,the,the,the,the,th
5,intricacies,intricaci,int,intricaci,intricaci
6,of,of,of,of,of
7,artificial,artifici,art,artifici,artificial
8,intelligence,intellig,intellig,intellig,intelligenc
9,",",",",",",",",","


In [36]:
# What happens when a word contains more than one of the patterns in the regular expression?
regexp_stemmer = RegexpStemmer('ing$|e$|ed$|able$|es$|s$')
regexp_stemmed_words = [
    regexp_stemmer.stem(token)
    for token in map(str.lower, ['playings', 'played', 'play'])
]
print(regexp_stemmed_words)
# Every alternative is anchored with `$`, so it can only match at the very end of the word.
# E.g. 'playings' ends in -s, not -ing (and no -ings pattern is defined), so only the trailing
# -s is removed; the -ing in the middle of the word is left untouched.

['playing', 'play', 'play']


In [43]:
#Lemmatization
# Imports for the lemmatization section, followed by the NLTK data downloads it needs.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# NOTE(review): wordnet2021 is not referenced in the visible cells (get_wordnet_pos uses
# `wordnet`) - confirm whether this import is still needed.
from nltk.corpus import wordnet2021


# WordNet data, the POS-tagger models, and the mapping used for tagset='universal'.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
# Left as the last expression so the cell displays the download's return value.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [44]:
# Build the lemmatizer once and reuse it, instead of constructing a new
# WordNetLemmatizer for every token inside the comprehension.
wn_lemmatizer = WordNetLemmatizer()

# No POS is passed here, so lemmatize() falls back to its default part of speech (noun).
# That is why verbs such as 'involves' stay unchanged and 'has' comes out as 'ha'.
wn_lemmatized_words = [wn_lemmatizer.lemmatize(word) for word in words]
print(wn_lemmatized_words)

['Their', 'research', 'involves', 'exploring', 'the', 'intricacy', 'of', 'artificial', 'intelligence', ',', 'delving', 'into', 'themysteries', 'of', 'quantum', 'computing', ',', 'and', 'deciphering', 'the', 'secret', 'of', 'genetic', 'coding', '.', 'Their', 'work', 'ha', 'been', 'called', 'awesome', ',', 'awful', 'and', 'akward']


In [45]:
# Show each token beside its default (POS-unaware) WordNet lemma.
df1 = pd.DataFrame(
    {
        'Original Words': words,
        'Wordnet Lemmatized Words': wn_lemmatized_words,
    }
)
df1

Unnamed: 0,Original Words,Wordnet Lemmatized Words
0,Their,Their
1,research,research
2,involves,involves
3,exploring,exploring
4,the,the
5,intricacies,intricacy
6,of,of
7,artificial,artificial
8,intelligence,intelligence
9,",",","


In [46]:
wn_lemmatizer = WordNetLemmatizer()  # Initialize the lemmatizer
wn_lemmatized_words = [wn_lemmatizer.lemmatize(word) for word in words]

# Tag every token with the coarse 'universal' tagset (NOUN, VERB, ADJ, ...).
# pos_tag is not imported by name in the visible cells, so it is called through the
# already-imported nltk package to avoid a NameError on a fresh kernel.
wn_pos_tags = nltk.pos_tag(words, tagset='universal')
wn_pos_tags

[('Their', 'PRON'),
 ('research', 'NOUN'),
 ('involves', 'VERB'),
 ('exploring', 'VERB'),
 ('the', 'DET'),
 ('intricacies', 'NOUN'),
 ('of', 'ADP'),
 ('artificial', 'ADJ'),
 ('intelligence', 'NOUN'),
 (',', '.'),
 ('delving', 'VERB'),
 ('into', 'ADP'),
 ('themysteries', 'NOUN'),
 ('of', 'ADP'),
 ('quantum', 'NOUN'),
 ('computing', 'NOUN'),
 (',', '.'),
 ('and', 'CONJ'),
 ('deciphering', 'VERB'),
 ('the', 'DET'),
 ('secrets', 'NOUN'),
 ('of', 'ADP'),
 ('genetic', 'ADJ'),
 ('coding', 'NOUN'),
 ('.', '.'),
 ('Their', 'PRON'),
 ('work', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('called', 'VERB'),
 ('awesome', 'ADJ'),
 (',', '.'),
 ('awful', 'ADJ'),
 ('and', 'CONJ'),
 ('akward', 'NOUN')]

In [47]:
wn_lemmatizer = WordNetLemmatizer()  # Initialize the lemmatizer
wn_lemmatized_words = [wn_lemmatizer.lemmatize(word) for word in words]

# Tag every token with the tagger's default fine-grained tags (NN, VBZ, JJ, ...),
# which is the form get_wordnet_pos expects.
# pos_tag is not imported by name in the visible cells, so it is called through the
# already-imported nltk package to avoid a NameError on a fresh kernel.
wn_pos_tags = nltk.pos_tag(words)
wn_pos_tags

[('Their', 'PRP$'),
 ('research', 'NN'),
 ('involves', 'VBZ'),
 ('exploring', 'VBG'),
 ('the', 'DT'),
 ('intricacies', 'NNS'),
 ('of', 'IN'),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 (',', ','),
 ('delving', 'VBG'),
 ('into', 'IN'),
 ('themysteries', 'NNS'),
 ('of', 'IN'),
 ('quantum', 'NN'),
 ('computing', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('deciphering', 'VBG'),
 ('the', 'DT'),
 ('secrets', 'NNS'),
 ('of', 'IN'),
 ('genetic', 'JJ'),
 ('coding', 'NN'),
 ('.', '.'),
 ('Their', 'PRP$'),
 ('work', 'NN'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('called', 'VBN'),
 ('awesome', 'JJ'),
 (',', ','),
 ('awful', 'JJ'),
 ('and', 'CC'),
 ('akward', 'NN')]

In [49]:
# nltk is already imported in the imports cell; this re-import is redundant but harmless.
import nltk
# Download the tagset documentation data used by the nltk.help lookup in the next cell.
nltk.download('tagsets_json')

[nltk_data] Downloading package tagsets_json to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets_json.zip.


True

In [50]:
# Print the description and example words for the 'JJ' (adjective) tag.
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [51]:
def get_wordnet_pos(treebank_tag):
  """Translate a Treebank-style POS tag into the matching WordNet POS constant.

  Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
  adverb respectively. Any other tag falls back to noun.
  """
  prefix_to_pos = (
    ('J', wordnet.ADJ),
    ('V', wordnet.VERB),
    ('N', wordnet.NOUN),
    ('R', wordnet.ADV),
  )
  for prefix, wordnet_pos in prefix_to_pos:
    if treebank_tag.startswith(prefix):
      return wordnet_pos
  # Unrecognised tags (determiners, punctuation, ...) are treated as nouns.
  return wordnet.NOUN

In [52]:
# Lemmatize again, this time passing each token's POS (converted to WordNet's
# constants) so that verbs such as 'involves' reduce to 'involve'.
wn_lemmatized_words = [
    wn_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
    for token, tag in wn_pos_tags
]
df = pd.DataFrame(
    {
        'Original Word': words,
        'POS Tags': [tag for _, tag in wn_pos_tags],
        'WordNet Lemmatized words': wn_lemmatized_words,
    }
)
df

Unnamed: 0,Original Word,POS Tags,WordNet Lemmatized words
0,Their,PRP$,Their
1,research,NN,research
2,involves,VBZ,involve
3,exploring,VBG,explore
4,the,DT,the
5,intricacies,NNS,intricacy
6,of,IN,of
7,artificial,JJ,artificial
8,intelligence,NN,intelligence
9,",",",",","
