# Exploring Tokenization

@author: Aman Kedia

In [1]:
import nltk

In [2]:
plurals = ['caresses', 'flies', 'dies', 'mules', 'died', 'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',
           'siezing', 'itemization', 'traditional', 'reference', 'colonizer', 'plotted', 'having', 'generously']

# Porter Stemmer

In [3]:
from nltk.stem.porter import PorterStemmer 
stemmer = PorterStemmer()
singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have gener


# Snowball Stemmer

In [4]:
from nltk.stem.snowball import SnowballStemmer
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [5]:
stemmer2 = SnowballStemmer(language='english')
singles = [stemmer2.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have generous


# Wordnet Lemmatizer

In [6]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amankedia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
lemmatizer = WordNetLemmatizer()
s = "We are putting in efforts to enhance our understanding of Lemmatization"
token_list = s.split()
print("The tokens are: ", token_list)
lemmatized_output = ' '.join([lemmatizer.lemmatize(token) for token in token_list])
print("The lemmatized output is: ", lemmatized_output)

The tokens are:  ['We', 'are', 'putting', 'in', 'efforts', 'to', 'enhance', 'our', 'understanding', 'of', 'Lemmatization']
The lemmatized output is:  We are putting in effort to enhance our understanding of Lemmatization


## POS Tagging

In [8]:
nltk.download('averaged_perceptron_tagger')
pos_tags = nltk.pos_tag(token_list)
pos_tags

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/amankedia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('We', 'PRP'),
 ('are', 'VBP'),
 ('putting', 'VBG'),
 ('in', 'IN'),
 ('efforts', 'NNS'),
 ('to', 'TO'),
 ('enhance', 'VB'),
 ('our', 'PRP$'),
 ('understanding', 'NN'),
 ('of', 'IN'),
 ('Lemmatization', 'NN')]

## POS tag Mapping

In [9]:
from nltk.corpus import wordnet

##This is a common method which is widely used across the NLP community of practitioners and readers

def get_part_of_speech_tags(token):
    
    """Maps POS tags to first character lemmatize() accepts.
    We are focussing on Verbs, Nouns, Adjectives and Adverbs here."""

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    tag = nltk.pos_tag([token])[0][1][0].upper()
    
    return tag_dict.get(tag, wordnet.NOUN)

## Wordnet Lemmatizer with POS Tag Information

In [10]:
lemmatized_output_with_POS_information = [lemmatizer.lemmatize(token, get_part_of_speech_tags(token)) for token in token_list]
print(' '.join(lemmatized_output_with_POS_information))

We be put in effort to enhance our understand of Lemmatization


## Lemmatization vs Stemming

In [11]:
stemmer2 = SnowballStemmer(language='english')
stemmed_sentence = [stemmer2.stem(token) for token in token_list]
print(' '.join(stemmed_sentence))

we are put in effort to enhanc our understand of lemmat


# spaCy Lemmatizer

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp("We are putting in efforts to enhance our understanding of Lemmatization")
" ".join([token.lemma_ for token in doc])

'-PRON- be put in effort to enhance -PRON- understanding of lemmatization'

# Stopwords

In [13]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
", ".join(stop)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amankedia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"it's, yours, an, doing, any, mightn't, you, having, wasn't, themselves, just, over, below, needn't, a, this, shan't, them, isn't, was, wouldn't, as, only, his, or, shan, wouldn, don, where, own, were, he, out, do, it, am, won, isn, there, hers, to, ll, most, for, weren, have, by, while, the, re, that, down, haven, has, is, here, itself, all, didn, herself, shouldn, him, ve, who, doesn, m, hadn't, after, further, weren't, at, hadn, should've, too, because, can, now, same, more, she's, wasn, these, yourself, himself, being, very, until, myself, few, so, which, ourselves, they, t, you'd, did, o, aren, but, that'll, such, whom, of, s, you'll, those, doesn't, my, what, aren't, during, hasn, through, will, couldn, i, mustn, needn, mustn't, d, had, me, under, won't, haven't, its, with, when, their, between, if, once, against, before, on, not, you're, each, yourselves, in, and, are, shouldn't, some, nor, her, does, she, off, how, both, our, then, why, again, we, no, y, be, other, ma, from, up

In [14]:
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']

stop = set(stopwords.words('english'))

sentence = "how are we putting in efforts to enhance our understanding of Lemmatization"

for word in wh_words:
    stop.remove(word)

sentence_after_stopword_removal = [token for token in sentence.split() if token not in stop]
" ".join(sentence_after_stopword_removal)

'how putting efforts enhance understanding Lemmatization'

# Case Folding

In [15]:
s = "We are putting in efforts to enhance our understanding of Lemmatization"
s = s.lower()
s

'we are putting in efforts to enhance our understanding of lemmatization'

# N-grams

In [16]:
from nltk.util import ngrams
s = "Natural Language Processing is the way to go"
tokens = s.split()
bigrams = list(ngrams(tokens, 2))
[" ".join(token) for token in bigrams]

['Natural Language',
 'Language Processing',
 'Processing is',
 'is the',
 'the way',
 'way to',
 'to go']

In [17]:
s = "Natural Language Processing is the way to go"
tokens = s.split()
trigrams = list(ngrams(tokens, 3))
[" ".join(token) for token in trigrams]

['Natural Language Processing',
 'Language Processing is',
 'Processing is the',
 'is the way',
 'the way to',
 'way to go']

# Building a basic vocabulary

In [18]:
s = "Natural Language Processing is the way to go"
tokens = set(s.split())
vocabulary = sorted(tokens)
vocabulary

['Language', 'Natural', 'Processing', 'go', 'is', 'the', 'to', 'way']

# Removing HTML Tags

In [19]:
html = "<!DOCTYPE html><html><body><h1>My First Heading</h1><p>My first paragraph.</p></body></html>"
from bs4 import BeautifulSoup

soup = BeautifulSoup(html)
text = soup.get_text()
print(text)

My First HeadingMy first paragraph.
