In [1]:
import nltk

In [2]:
# Sample words fed to the stemmers in the following cells. Despite the variable
# name, the list mixes plurals with past-tense, -ing, derived and adverb forms.
# NOTE(review): 'siezing' looks like a misspelling of 'seizing'; the recorded
# stemmer outputs ('siez') were produced from this spelling.
plurals = ['caresses', 'flies', 'dies', 'mules', 'died', 'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',
           'siezing', 'itemization', 'traditional', 'reference', 'colonizer', 'plotted', 'having', 'generously']

## Porter Stemmer

In [3]:
from nltk.stem.porter import PorterStemmer

# Run every sample word through the Porter stemmer and print the stems
# on a single space-separated line.
stemmer = PorterStemmer()
singles = list(map(stemmer.stem, plurals))
print(*singles)

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have gener


## Snowball Stemmer

In [4]:
from nltk.stem.snowball import SnowballStemmer
# Print the language names listed in SnowballStemmer.languages.
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [5]:
# Same sample words, this time through the English Snowball stemmer.
stemmer2 = SnowballStemmer(language='english')
singles = list(map(stemmer2.stem, plurals))
print(*singles)

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have generous


## Wordnet Lemmatizer


In [6]:
# Fetch the 'wordnet' package through the NLTK downloader before importing
# the lemmatizer class used in the next cell.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Leapfrog\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
lemmatizer = WordNetLemmatizer()
s = "We are putting in efforts to enhance our understanding of Lemmatization"

# Plain whitespace split is used as the tokenizer for this demo sentence.
token_list = s.split()
print("The tokens are: ", token_list)

# Each token is lemmatized without passing any part-of-speech argument.
lemmatized_output = ' '.join(map(lemmatizer.lemmatize, token_list))
print("The lemmatized output is: ", lemmatized_output)

The tokens are:  ['We', 'are', 'putting', 'in', 'efforts', 'to', 'enhance', 'our', 'understanding', 'of', 'Lemmatization']
The lemmatized output is:  We are putting in effort to enhance our understanding of Lemmatization


## POS Tagging

In [8]:
# Download the tagger package, then tag the full token list in a single call.
nltk.download('averaged_perceptron_tagger')
pos_tags = nltk.pos_tag(token_list)
# Bare expression so the notebook displays the list of (token, tag) pairs.
pos_tags

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Leapfrog\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('We', 'PRP'),
 ('are', 'VBP'),
 ('putting', 'VBG'),
 ('in', 'IN'),
 ('efforts', 'NNS'),
 ('to', 'TO'),
 ('enhance', 'VB'),
 ('our', 'PRP$'),
 ('understanding', 'NN'),
 ('of', 'IN'),
 ('Lemmatization', 'NN')]

## POS tag Mapping

In [9]:
from  nltk.corpus import wordnet

def get_part_of_speech_tags(token):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``token``.

    The token is tagged on its own (a one-element list is handed to
    ``nltk.pos_tag``), and the first letter of the resulting tag is looked up:
    "J" -> adjective, "N" -> noun, "V" -> verb, "R" -> adverb.
    Any other first letter falls back to noun.
    """
    wordnet_pos_by_initial = dict(
        J=wordnet.ADJ,
        N=wordnet.NOUN,
        V=wordnet.VERB,
        R=wordnet.ADV,
    )

    # pos_tag returns a list of (word, tag) pairs; only the single pair's tag matters.
    tagged_token = nltk.pos_tag([token])[0]
    initial = tagged_token[1][0].upper()

    if initial in wordnet_pos_by_initial:
        return wordnet_pos_by_initial[initial]
    return wordnet.NOUN

## Wordnet Lemmatizer with POS Tag Information

In [10]:
# Lemmatize each token, passing the WordNet POS chosen by get_part_of_speech_tags.
lemmatized_output_with_POS_information = [
    lemmatizer.lemmatize(token, get_part_of_speech_tags(token))
    for token in token_list
]
print(*lemmatized_output_with_POS_information)

We be put in effort to enhance our understand of Lemmatization


## Lemmatization vs Stemming

In [39]:
# Stem the demo sentence so the result can be compared with the lemmatized versions.
stemmer2 = SnowballStemmer(language='english')
stemmed_sentence = list(map(stemmer2.stem, token_list))
print(*stemmed_sentence)

we are put in effort to enhanc our understand of lemmat


## spaCy Lemmatizer

In [16]:
!pip install spacy
!python -m spacy download en_core_web_lg 






Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.0/587.7 MB 640.0 kB/s eta 0:15:19
     -------------------------------------- 0.0/587.7 MB 640.0 kB/s eta 0:15:19
     -------------------------------------- 0.1/587.7 MB 372.4 kB/s eta 0:26:19
     -------------------------------------- 0.1/587.7 MB 456.6 kB/s eta 0:21:28
     -------------------------------------- 0.1/587.7 MB 561.1 kB/s eta 0:17:28
     -------------------------------------- 0.2/587.7 MB 577.4 kB/s eta 0:16:58
     -------------------------------------- 0.2/587.7 MB 654.3 kB/s eta 0:14:58
     -------------------------------------- 0.2/587.7 MB 684.7 kB/s eta 0:14:19
     -------------------------------------- 0.3/587.7 MB 704.5 kB/s eta 0:13:54
     -------------------------



In [20]:
import spacy
# Load the en_core_web_lg pipeline (downloaded in the previous cell) and run it
# on the same demo sentence used for the NLTK examples.
nlp = spacy.load('en_core_web_lg')
doc = nlp("We are putting in efforts to enhance our understanding of Lemmatization")
# Join the lemma_ attribute of every token in the processed doc.
" ".join([token.lemma_ for token in doc])

'we be put in effort to enhance our understanding of Lemmatization'

## Stopwords

In [22]:
nltk.download('stopwords')
from nltk.corpus import stopwords
# Keep the English stop words in a set.
stop = set(stopwords.words('english'))
# A set has no defined iteration order, so the order of the words in the
# joined string is arbitrary and may differ from one run to the next.
", ".join(stop)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leapfrog\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"shan't, being, which, were, but, here, re, some, wasn, m, than, same, if, at, had, how, after, having, very, shan, yours, hers, herself, just, while, between, because, was, off, any, can, mustn, both, mightn, again, won, myself, up, wouldn, our, on, more, above, needn't, once, where, aren, haven't, do, before, isn, mustn't, we, i, out, against, ourselves, have, they, d, until, over, ll, didn, couldn, no, my, the, wasn't, haven, me, these, yourselves, hasn, during, a, about, ain, he, doing, that, such, theirs, yourself, few, below, be, shouldn't, you'd, as, only, weren't, you, what, when, you're, an, and, o, should've, themselves, them, who, don, been, you've, down, aren't, she's, is, in, ma, this, by, couldn't, your, ours, don't, hadn't, with, nor, mightn't, itself, you'll, will, its, didn't, that'll, each, doesn, himself, y, there, am, under, him, it, for, from, doesn't, those, t, it's, hadn, whom, too, their, now, of, own, wouldn't, into, then, isn't, all, hasn't, her, to, or, ve, h

In [24]:
wh_words = ['who','what', 'when', 'why', 'how', 'which','where', 'whom']

# Keep question words in the text: take them out of the stop-word set.
# Set difference is used instead of repeated stop.remove(word) so that a
# wh-word missing from the stop-word list does not raise KeyError.
stop = set(stopwords.words('english')) - set(wh_words)

sentence = "how are we putting in efforts to enhance our understanding of Lemmatization"

# Compare case-insensitively (token.lower()) so capitalised stop words such
# as "We" are removed too; the surviving tokens keep their original casing.
sentence_after_stopword_removal = [token for token in sentence.split() if token.lower() not in stop]
" ".join(sentence_after_stopword_removal)

'how putting efforts enhance understanding Lemmatization'

## Case Folding

In [27]:
# Case folding: lower-case the whole demo sentence in one step.
s = "We are putting in efforts to enhance our understanding of Lemmatization".lower()
s

'we are putting in efforts to enhance our understanding of lemmatization'

## N-grams

In [32]:
from nltk.util import ngrams

# Build all 2-token n-grams of the sentence and display each one as text.
s = "Natural Language Processing is the way to go"
tokens = s.split()
bigrams = list(ngrams(tokens, 2))
list(map(" ".join, bigrams))

['Natural Language',
 'Language Processing',
 'Processing is',
 'is the',
 'the way',
 'way to',
 'to go']

In [34]:
# NOTE(review): "Lanuage" looks like a typo for "Language"; it is left as-is
# because the recorded output of this cell reproduces the same spelling.
s = "Natural Lanuage Processing is the way to go"
tokens = s.split()
# Same call as for bigrams, with a window of three tokens.
trigrams = list(ngrams(tokens, 3))
# Each n-gram is a sequence of tokens; join it back into a phrase for display.
[" ".join(token) for token in trigrams]

['Natural Lanuage Processing',
 'Lanuage Processing is',
 'Processing is the',
 'is the way',
 'the way to',
 'way to go']

## Building a basic vocabulary

In [36]:
s = "Natural Language Processing is the way to go"

# Collect the distinct whitespace-separated tokens, then order them to form
# the vocabulary list.
tokens = {word for word in s.split()}
vocabulary = list(tokens)
vocabulary.sort()
vocabulary

['Language', 'Natural', 'Processing', 'go', 'is', 'the', 'to', 'way']

## Removing HTML Tags

In [38]:
from bs4 import BeautifulSoup

html = "<!DOCTYPE html><html><body><h1>My First Heading</h1><p>My first paragraph.</p></body></html>"

# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# is installed and emits a warning, so results can vary between environments.
# "html.parser" is the parser bundled with Python's standard library.
soup = BeautifulSoup(html, "html.parser")

# get_text() returns the document's text content with the markup removed.
text = soup.get_text()
text

'My First HeadingMy first paragraph.'

## Summary:
In this assignment we applied several preprocessing steps, such as stemming and lemmatization, that are needed to build a natural language vocabulary. Here is a brief overview of the theory behind them:

### Stemming:
Stemming is the process of removing inflectional forms from a word. The resulting base form is called the *stem*, and the chopped-off pieces are referred to as *affixes*. The Porter and Snowball stemmers are commonly used. The Porter stemmer supports only the English language, whereas the Snowball stemmer, which is an improvement on the Porter stemmer, supports multiple languages.

### Lemmatization:
Lemmatization is a process in which context is used to convert a word to its meaningful base form. It helps in grouping together words that have a common base form so that they can be identified as a single item. The base form is referred to as the lemma of the word and is also sometimes known as the dictionary form. In this tutorial we used three lemmatization approaches: the WordNet lemmatizer, the WordNet lemmatizer with POS tag information, and the spaCy lemmatizer.

In addition to stemming and lemmatization, we implemented some other steps such as stop word removal, HTML tag removal, case folding, and n-gram tokenization.

