# Text Pre-processing using NLTK

## Tokenization

In [None]:
# One-time setup: uncomment and run these two lines if the tokenizer models
# used by sent_tokenize / word_tokenize below are not installed yet.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead of
# 'punkt' - confirm against the installed version.
# import nltk
# nltk.download('punkt')

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample paragraph reused by the cells below. It is built from adjacent
# string literals, which the parser concatenates, so the source indentation
# never ends up inside the text (a backslash continuation placed inside the
# quotes would embed the next line's leading spaces).
text = (
    "Tokenization is the process by which big quantity of text is divided "
    "into smaller parts called tokens. "
    "Natural language processing is used for building applications such as "
    "Text classification, intelligent chatbot, sentimental analysis, "
    "language translation, etc. "
    "It becomes vital to understand the pattern in the text to achieve the "
    "above-stated purpose. "
    "These tokens are very useful for finding such patterns as well as is "
    "considered as a base step for stemming and lemmatization."
)

# Tokenize sentences - split the paragraph into a list of sentence strings.
sent_tokenize(text)

['Tokenization is the process by which big quantity of text is divided into smaller parts called tokens.',
 'Natural language processing is used for building applications such as Text classification, intelligent chatbot, sentimental analysis, language translation, etc.',
 'It becomes vital to understand the pattern in the text to achieve the above-stated purpose.',
 'These tokens are very useful for finding such patterns as well as is considered as a base step for stemming and lemmatization.']

In [5]:
# Tokenize words - split the whole text into word tokens. Punctuation marks
# such as '.' and ',' come back as separate tokens (see the output below).
word_tokenize(text)

['Tokenization',
 'is',
 'the',
 'process',
 'by',
 'which',
 'big',
 'quantity',
 'of',
 'text',
 'is',
 'divided',
 'into',
 'smaller',
 'parts',
 'called',
 'tokens',
 '.',
 'Natural',
 'language',
 'processing',
 'is',
 'used',
 'for',
 'building',
 'applications',
 'such',
 'as',
 'Text',
 'classification',
 ',',
 'intelligent',
 'chatbot',
 ',',
 'sentimental',
 'analysis',
 ',',
 'language',
 'translation',
 ',',
 'etc',
 '.',
 'It',
 'becomes',
 'vital',
 'to',
 'understand',
 'the',
 'pattern',
 'in',
 'the',
 'text',
 'to',
 'achieve',
 'the',
 'above-stated',
 'purpose',
 '.',
 'These',
 'tokens',
 'are',
 'very',
 'useful',
 'for',
 'finding',
 'such',
 'patterns',
 'as',
 'well',
 'as',
 'is',
 'considered',
 'as',
 'a',
 'base',
 'step',
 'for',
 'stemming',
 'and',
 'lemmatization',
 '.']

## Stemming

In [7]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# The inflected forms of "play" all reduce to the same stem, while
# "increase" shows that a stem need not be a dictionary word ("increas").
for word in ("playing", "plays", "played", "increase"):
    print(stemmer.stem(word))

play
play
play
increas


## Lemmatization

In [9]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory; the
# lemmatizer and wordnet lookups in the following cells presumably rely on it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Resh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [22]:
from nltk.stem import WordNetLemmatizer

lemm = WordNetLemmatizer()

# (word, part-of-speech) pairs to lemmatize. "n" (noun) is spelled out here,
# but it is also what lemmatize() assumes when no pos is given.
#   "Increases",   "n" -> comes back unchanged; NOTE(review): most likely
#                         because the capitalised form is not matched in
#                         WordNet - try "increases" to confirm.
#   "running",     "n" -> unchanged when treated as a noun
#   "running",     "v" -> "run" once the lemmatizer is told it is a verb
#   "controlling", "n" -> unchanged when treated as a noun
for word, pos in (
    ("Increases", "n"),
    ("running", "n"),
    ("running", "v"),
    ("controlling", "n"),
):
    print(lemm.lemmatize(word, pos=pos))

Increases
running
run
controlling


In [14]:
# Fetch the perceptron tagger model; presumably this is what pos_tag() in the
# "Part of Speech" cell loads.
# NOTE(review): newer NLTK releases may name this resource
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Resh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

## Part of Speech

In [15]:
from nltk import pos_tag

# Tag every token of `text` with its part of speech. The result is a list of
# (token, tag) pairs such as ('is', 'VBZ'); the tags look like Penn Treebank
# codes (NN = noun, VBZ = verb 3rd person singular, JJ = adjective, ...).
tokens = word_tokenize(text)
pos_tag(tokens)

[('Tokenization', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('process', 'NN'),
 ('by', 'IN'),
 ('which', 'WDT'),
 ('big', 'JJ'),
 ('quantity', 'NN'),
 ('of', 'IN'),
 ('text', 'NN'),
 ('is', 'VBZ'),
 ('divided', 'VBN'),
 ('into', 'IN'),
 ('smaller', 'JJR'),
 ('parts', 'NNS'),
 ('called', 'VBD'),
 ('tokens', 'NNS'),
 ('.', '.'),
 ('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('is', 'VBZ'),
 ('used', 'VBN'),
 ('for', 'IN'),
 ('building', 'VBG'),
 ('applications', 'NNS'),
 ('such', 'JJ'),
 ('as', 'IN'),
 ('Text', 'NNP'),
 ('classification', 'NN'),
 (',', ','),
 ('intelligent', 'JJ'),
 ('chatbot', 'NN'),
 (',', ','),
 ('sentimental', 'JJ'),
 ('analysis', 'NN'),
 (',', ','),
 ('language', 'NN'),
 ('translation', 'NN'),
 (',', ','),
 ('etc', 'FW'),
 ('.', '.'),
 ('It', 'PRP'),
 ('becomes', 'VBZ'),
 ('vital', 'JJ'),
 ('to', 'TO'),
 ('understand', 'VB'),
 ('the', 'DT'),
 ('pattern', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('text', 'NN'),
 ('to', 'TO'),
 ('achieve', 'VB'),
 ('the'

In [16]:
from nltk.corpus import wordnet


# Look up every WordNet synset (sense) that the word 'good' belongs to.
# The call returns Synset objects named <lemma>.<pos>.<sense number>, not the
# synonyms themselves; those are the lemma names of each synset
# (e.g. synset.lemma_names()).
wordnet.synsets('good')

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

In [21]:
# Look up every WordNet synset (sense) that the word 'computer' belongs to.
# NOTE(review): the output recorded below this cell lists control/operate/
# controlling synsets and nothing for 'computer'; it looks like it came from
# an earlier run with a different word (possibly 'controlling'). Re-run the
# cell to refresh the output.
wordnet.synsets('computer')

[Synset('control.v.01'),
 Synset('control.v.02'),
 Synset('operate.v.03'),
 Synset('manipulate.v.05'),
 Synset('control.v.05'),
 Synset('control.v.06'),
 Synset('see.v.10'),
 Synset('master.v.04'),
 Synset('controlling.s.01')]

In [20]:
from nltk import ngrams

sentence = "I love to play football"

# Window size: n = 2 produces bigrams (pairs of neighbouring tokens).
n = 2

# Slide a window of n consecutive tokens over the tokenized sentence and
# print each window as a tuple.
for gram in ngrams(word_tokenize(sentence), n):
    print(gram)

('I', 'love')
('love', 'to')
('to', 'play')
('play', 'football')
