# Stemming_and_Lemmatizing

In [71]:
import numpy as np
from collections import Counter
import string

from nltk import word_tokenize
from nltk.util import ngrams
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

In [40]:
stop_words = stopwords.words('english')
len(stop_words)

179

In [41]:
mytext = open('alicesadventuresinwonderland.txt').read()
mytext[:300]

"Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online "

In [42]:
# Cleaning text

mytext = mytext.replace('\n', ' ')
mytext = mytext.replace('  ', ' ')
mytext = mytext.lower()

len(mytext)

162143

In [43]:
# Removing Punctuation
for i in list(string.punctuation):
    if i in mytext:
        mytext = mytext.replace(i, ' ')
len(mytext)

162143

In [44]:
mylist = mytext.split()
len(mylist)

30528

In [45]:
new_list = []
for item in mylist:
    if item not in stop_words:
        new_list.append(item)
len(new_list)

14131

In [46]:
mybook = ' '.join(new_list)
len(mybook)

91602

In [47]:
mytoken = word_tokenize(mybook)
len(mytoken)

14136

In [48]:
mydict = Counter(mytoken)
print(mydict)



In [49]:
len(mydict)

2895

### Stemming

In [52]:
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer("english")

In [53]:
# Sample test
print(porter.stem('Re-testing'), lancaster.stem('Re-testing'), snowball.stem('Re-testing'))

re-test re-testing re-test


#### Porter Stemming

In [60]:
port_stem = porter.stem(mybook)
port_list = port_stem.split()

In [65]:
Counter(port_list)

Counter({'project': 87,
         'gutenberg': 93,
         'alice': 403,
         'adventures': 12,
         'wonderland': 8,
         'lewis': 4,
         'carroll': 4,
         'ebook': 10,
         'use': 31,
         'anyone': 5,
         'anywhere': 3,
         'cost': 4,
         'almost': 8,
         'restrictions': 2,
         'whatsoever': 2,
         'may': 28,
         'copy': 12,
         'give': 16,
         'away': 28,
         'terms': 22,
         'license': 16,
         'included': 3,
         'online': 4,
         'www': 6,
         'org': 13,
         'title': 1,
         'author': 1,
         'posting': 1,
         'date': 4,
         'june': 1,
         '25': 1,
         '2008': 1,
         '11': 4,
         'release': 1,
         'march': 35,
         '1994': 1,
         'language': 1,
         'english': 7,
         'character': 2,
         'set': 23,
         'encoding': 1,
         'ascii': 3,
         'start': 3,
         'millennium': 1,
         'fulcrum': 1

#### Lancaster Stemmer


In [70]:
lan_stem = lancaster.stem(mybook)
lan_list = lan_stem.split()
Counter(lan_list)


Counter({'project': 87,
         'gutenberg': 93,
         'alice': 403,
         'adventures': 12,
         'wonderland': 8,
         'lewis': 4,
         'carroll': 4,
         'ebook': 9,
         'use': 31,
         'anyone': 5,
         'anywhere': 3,
         'cost': 4,
         'almost': 8,
         'restrictions': 2,
         'whatsoever': 2,
         'may': 28,
         'copy': 12,
         'give': 16,
         'away': 28,
         'terms': 22,
         'license': 16,
         'included': 3,
         'online': 4,
         'www': 6,
         'org': 13,
         'title': 1,
         'author': 1,
         'posting': 1,
         'date': 4,
         'june': 1,
         '25': 1,
         '2008': 1,
         '11': 4,
         'release': 1,
         'march': 35,
         '1994': 1,
         'language': 1,
         'english': 7,
         'character': 2,
         'set': 23,
         'encoding': 1,
         'ascii': 3,
         'start': 3,
         'millennium': 1,
         'fulcrum': 1,

#### Snowball Stemmer

In [67]:
snow_stem = snowball.stem(mybook)
snow_list = snow_stem.split()
Counter(snow_list)

Counter({'project': 87,
         'gutenberg': 93,
         'alice': 403,
         'adventures': 12,
         'wonderland': 8,
         'lewis': 4,
         'carroll': 4,
         'ebook': 10,
         'use': 31,
         'anyone': 5,
         'anywhere': 3,
         'cost': 4,
         'almost': 8,
         'restrictions': 2,
         'whatsoever': 2,
         'may': 28,
         'copy': 12,
         'give': 16,
         'away': 28,
         'terms': 22,
         'license': 16,
         'included': 3,
         'online': 4,
         'www': 6,
         'org': 13,
         'title': 1,
         'author': 1,
         'posting': 1,
         'date': 4,
         'june': 1,
         '25': 1,
         '2008': 1,
         '11': 4,
         'release': 1,
         'march': 35,
         '1994': 1,
         'language': 1,
         'english': 7,
         'character': 2,
         'set': 23,
         'encoding': 1,
         'ascii': 3,
         'start': 3,
         'millennium': 1,
         'fulcrum': 1

In [68]:
# All lengths are same

### Lemmatizing

Lemmatization aims to achieve a similar base "stem" for a word, but aims to derive the genuine dictionary root word, not just a trunctated version of the word.

In [72]:
wnl = WordNetLemmatizer()

print(wnl.lemmatize(mybook))



In [76]:
print(len((wnl.lemmatize(mybook).split())))

14131


In [79]:
len(set(wnl.lemmatize(mybook).split()))

2894

In [80]:
# This length is same as that of Stemming

In [81]:
len(Counter(snow_list))

2894

## Examples

In [82]:
wnl.lemmatize('brightening', pos='v')

'brighten'

In [86]:
wnl.lemmatize('shining', pos='v')

'shin'

In [90]:
porter.stem('shining')

'shine'

In [88]:
lancaster.stem('shining')

'shin'

In [87]:
snowball.stem('shining')

'shine'