# Stemming:
Stemming removes common suffixes from the end of word tokens.

e.g. boating --- boat ("boat" is the stem of "boating")

In [1]:
#importing the nltk package
import nltk

Natural Language Toolkit (NLTK) is a suite of libraries and programs for symbolic and statistical natural language processing for English, written in the Python programming language.
NLTK also supports classification, tokenization, stemming, tagging, parsing, and semantic reasoning functionalities.

In [2]:
#importing all public names (including PorterStemmer) from the nltk.stem.porter module
from nltk.stem.porter import *

In [3]:
#creating a PorterStemmer instance; its stem() method is used below to reduce each word to its stem
p_stemmer = PorterStemmer()

In [4]:
#creating a list of words to stem
the_words = ['run','runner','running','runs','ran','runnable','fairly']

In [5]:
#looping over the list of words and printing each word next to its Porter stem
for word in the_words:
    print(f"{word} --- {p_stemmer.stem(word)}")

run --- run
runner --- runner
running --- run
runs --- run
ran --- ran
runnable --- runnabl
fairly --- fairli


In [6]:
#importing SnowballStemmer from the NLTK package
from nltk.stem.snowball import SnowballStemmer

In [7]:
#creating a Snowball stemmer configured for English (no word is stemmed yet)
s_stemmer = SnowballStemmer(language='english')

In [8]:
#creating a second list holding the same words as the_words, which makes the two stemmers' outputs directly comparable
the_wordss = ['run','runner','running','runs','ran','runnable','fairly']

In [9]:
#looping over the list and printing each word next to its Snowball stem
for word in the_wordss:
    print(f"{word} --- {s_stemmer.stem(word)}")

run --- run
runner --- runner
running --- run
runs --- run
ran --- ran
runnable --- runnabl
fairly --- fair


# Lemmatization:

Lemmatization ensures that the output word is an existing, normalised form of the word (its lemma).

ex. was --- be

mice --- mouse

In [10]:
#importing the spacy module
import spacy

In [11]:
#loading spaCy's 'en_core_web_sm' English language model
nlp = spacy.load('en_core_web_sm')

In [12]:
#running the loaded model on an English sentence; the result is iterated token by token below
doc_1 = nlp("I am a runner running in a running race because I love to run, and since I ran today.")

In [13]:
#looping over the tokens of the sentence and printing, tab-separated, each token's text, part of speech, numeric lemma ID and lemma string
for token in doc_1:
    print(token.text, "\t", token.pos_, "\t", token.lemma, "\t", token.lemma_)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
running 	 VERB 	 12767647472892411841 	 run
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
, 	 PUNCT 	 2593208677638477497 	 ,
and 	 CCONJ 	 2283656566040971221 	 and
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [14]:
#defining a function that prints an aligned lemma table for the tokens it is given
def show_lemmas(text):
    """Print one fixed-width row per token in ``text``.

    Each row holds the token's text (width 12), its part of speech (width 6),
    its numeric lemma value (left-aligned, width 22) and its lemma string.
    ``text`` is any iterable of objects exposing ``text``, ``pos_``, ``lemma``
    and ``lemma_``. Nothing is returned.
    """
    rows = (
        f'{tok.text:12} {tok.pos_:6} {tok.lemma:<22} {tok.lemma_}'
        for tok in text
    )
    # rows are produced lazily, so output still appears one token at a time
    for row in rows:
        print(row)

In [15]:
#processing a second sentence that contains an irregular past tense ('saw') and an irregular plural ('mice')
doc_2 = nlp('I saw twenty mice today.')

In [16]:
#printing the aligned lemma table for the tokens of doc_2
show_lemmas(doc_2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
twenty       NUM    8304598090389628520    twenty
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


# Stop Words:
Stop words are a collection of commonly used words in the English language.

Stop words are most often removed in text mining and NLP, because they occur so frequently that they carry very little useful information.

ex. A, The

In [17]:
#importing the spacy module and loading the 'en_core_web_sm' English language model (the same import and load as in the lemmatization cells, repeated here)
import spacy
nlp = spacy.load('en_core_web_sm')

In [18]:
#printing the set of default English stop words
print(nlp.Defaults.stop_words)

{'also', 'at', 'fifteen', 'move', 'else', 'somehow', 'because', 'how', 'sixty', 'ever', 'always', 'had', 'sometime', 'someone', 'toward', 'on', 'anyone', 'seem', 'for', 'become', 'besides', 'could', 'among', 'they', 'anyway', 'so', 'nowhere', 'well', 'mostly', 'out', "'d", 'each', 'except', 'hereafter', 'rather', 'put', 'still', 'perhaps', 'keep', 'together', 'does', 'just', 'almost', 'without', 'whose', 'here', 'and', 'throughout', 'namely', 'anywhere', '‘d', 'full', 'back', 'against', 'less', 'very', 'seeming', 'thereupon', 'nobody', 'only', 'anything', 'third', 'whence', 'becoming', 'do', 'from', 'us', '’re', 'again', 'herself', 'his', 'has', 'everywhere', '‘m', 'twelve', 'upon', 'when', 'several', 'seems', 'across', 'enough', 'otherwise', 'hundred', 'after', 'she', 'make', 'everything', 'is', 'really', '’ve', 'thus', 'yours', 'amongst', 'whether', 'own', 'sometimes', 'during', 'been', 'may', 'twenty', 'noone', 'cannot', "'ll", 'ourselves', 'name', 'more', 'former', 'beside', 'every

In [19]:
#checking whether the vocabulary entry for 'myself' is flagged as a stop word
nlp.vocab['myself'].is_stop

True

In [20]:
#checking whether the vocabulary entry for 'although' is flagged as a stop word
nlp.vocab['although'].is_stop

True

In [21]:
#adding the word 'urselves' to the set of default English stop words
nlp.Defaults.stop_words.add('urselves')
#the is_stop flag is stored on the vocabulary entry itself, so it is also set explicitly;
#this keeps the flag correct even if the entry for 'urselves' already existed before the set was changed
nlp.vocab['urselves'].is_stop = True

In [22]:
#confirming that the vocabulary entry for 'urselves' is now reported as a stop word
nlp.vocab['urselves'].is_stop

True

In [23]:
#printing the stop words again; 'urselves' now appears in the set
print(nlp.Defaults.stop_words)

{'also', 'at', 'fifteen', 'move', 'else', 'somehow', 'because', 'how', 'sixty', 'ever', 'always', 'had', 'sometime', 'someone', 'toward', 'on', 'anyone', 'seem', 'for', 'become', 'besides', 'could', 'among', 'they', 'anyway', 'so', 'nowhere', 'well', 'mostly', 'out', "'d", 'each', 'except', 'hereafter', 'rather', 'put', 'still', 'perhaps', 'keep', 'together', 'does', 'just', 'almost', 'without', 'whose', 'here', 'and', 'throughout', 'namely', 'anywhere', '‘d', 'full', 'back', 'against', 'less', 'very', 'seeming', 'thereupon', 'nobody', 'only', 'anything', 'third', 'whence', 'becoming', 'urselves', 'do', 'from', 'us', '’re', 'again', 'herself', 'his', 'has', 'everywhere', '‘m', 'twelve', 'upon', 'when', 'several', 'seems', 'across', 'enough', 'otherwise', 'hundred', 'after', 'she', 'make', 'everything', 'is', 'really', '’ve', 'thus', 'yours', 'amongst', 'whether', 'own', 'sometimes', 'during', 'been', 'may', 'twenty', 'noone', 'cannot', "'ll", 'ourselves', 'name', 'more', 'former', 'bes

The word 'urselves' has now been added to the list of English stop words.

Similarly, a word can also be removed from the list of English stop words.

In [24]:
#removing the word 'urselves' from the set of default English stop words
nlp.Defaults.stop_words.remove('urselves')
#the is_stop flag is stored on the vocabulary entry itself and is not updated when the set changes,
#so it is cleared explicitly; otherwise nlp.vocab['urselves'].is_stop would keep returning True
nlp.vocab['urselves'].is_stop = False

In [25]:
#printing the stop words once more after removing 'urselves'
print(nlp.Defaults.stop_words)

{'also', 'at', 'fifteen', 'move', 'else', 'somehow', 'because', 'how', 'sixty', 'ever', 'always', 'had', 'sometime', 'someone', 'toward', 'on', 'anyone', 'seem', 'for', 'become', 'besides', 'could', 'among', 'they', 'anyway', 'so', 'nowhere', 'well', 'mostly', 'out', "'d", 'each', 'except', 'hereafter', 'rather', 'put', 'still', 'perhaps', 'keep', 'together', 'does', 'just', 'almost', 'without', 'whose', 'here', 'and', 'throughout', 'namely', 'anywhere', '‘d', 'full', 'back', 'against', 'less', 'very', 'seeming', 'thereupon', 'nobody', 'only', 'anything', 'third', 'whence', 'becoming', 'do', 'from', 'us', '’re', 'again', 'herself', 'his', 'has', 'everywhere', '‘m', 'twelve', 'upon', 'when', 'several', 'seems', 'across', 'enough', 'otherwise', 'hundred', 'after', 'she', 'make', 'everything', 'is', 'really', '’ve', 'thus', 'yours', 'amongst', 'whether', 'own', 'sometimes', 'during', 'been', 'may', 'twenty', 'noone', 'cannot', "'ll", 'ourselves', 'name', 'more', 'former', 'beside', 'every