## Tokenizing using NLTK


In [8]:
import nltk
data = "India (Hindi: Bhārat), officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia."
nltk.sent_tokenize(data)
# nltk.download('punkt')

['India (Hindi: Bhārat), officially the Republic of India, is a country in South Asia.',
 'It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world.',
 'Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east.',
 'In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia.']

In [9]:
nltk.word_tokenize(data)

['India',
 '(',
 'Hindi',
 ':',
 'Bhārat',
 ')',
 ',',
 'officially',
 'the',
 'Republic',
 'of',
 'India',
 ',',
 'is',
 'a',
 'country',
 'in',
 'South',
 'Asia',
 '.',
 'It',
 'is',
 'the',
 'seventh-largest',
 'country',
 'by',
 'area',
 ',',
 'the',
 'second-most',
 'populous',
 'country',
 ',',
 'and',
 'the',
 'most',
 'populous',
 'democracy',
 'in',
 'the',
 'world',
 '.',
 'Bounded',
 'by',
 'the',
 'Indian',
 'Ocean',
 'on',
 'the',
 'south',
 ',',
 'the',
 'Arabian',
 'Sea',
 'on',
 'the',
 'southwest',
 ',',
 'and',
 'the',
 'Bay',
 'of',
 'Bengal',
 'on',
 'the',
 'southeast',
 ',',
 'it',
 'shares',
 'land',
 'borders',
 'with',
 'Pakistan',
 'to',
 'the',
 'west',
 ';',
 'China',
 ',',
 'Nepal',
 ',',
 'and',
 'Bhutan',
 'to',
 'the',
 'north',
 ';',
 'and',
 'Bangladesh',
 'and',
 'Myanmar',
 'to',
 'the',
 'east',
 '.',
 'In',
 'the',
 'Indian',
 'Ocean',
 ',',
 'India',
 'is',
 'in',
 'the',
 'vicinity',
 'of',
 'Sri',
 'Lanka',
 'and',
 'the',
 'Maldives',
 ';',
 'i

## POS Tags

In [11]:
#nltk.download('averaged_perceptron_tagger')
data =' We will see an example of POS tagging.'
nltk.pos_tag(nltk.word_tokenize(data))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sohel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('We', 'PRP'),
 ('will', 'MD'),
 ('see', 'VB'),
 ('an', 'DT'),
 ('example', 'NN'),
 ('of', 'IN'),
 ('POS', 'NNP'),
 ('tagging', 'NN'),
 ('.', '.')]

## Chunking

In [16]:
data =' We will see an example of POS tagging.'

pos = nltk.pos_tag(nltk.word_tokenize(data))

# now once the POS tag has been done. Let's say we want to further structure data such that Nouns are
# categorized under one specific node defined by us :
my_node='MN : {<NN>*<NNP>}'
chunk=nltk.RegexpParser(my_node)
result=chunk.parse(pos)
print(result)
result.draw()    # It will draw the pattern graphically which can be seen in Noun Phrase chunking

(S
  We/PRP
  will/MD
  see/VB
  an/DT
  example/NN
  of/IN
  (MN POS/NNP)
  tagging/NN
  ./.)


## Stop Words

In [23]:
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sohel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Punctuation

In [24]:
# We also have punctuations which we can ignore from our set of words just like stopwords.

import string

punct =string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [28]:
# Let's word tokenize the given sample after we remove the stopwords and punctuation. 

import nltk
import string
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
punct =string.punctuation

data = "India (Hindi: Bhārat), officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia."

clean_data=[]
for word in nltk.word_tokenize(data):
    if word not in stop_words:
        if word not in punct:
            clean_data.append(word)
clean_data

['India',
 'Hindi',
 'Bhārat',
 'officially',
 'Republic',
 'India',
 'country',
 'South',
 'Asia',
 'It',
 'seventh-largest',
 'country',
 'area',
 'second-most',
 'populous',
 'country',
 'populous',
 'democracy',
 'world',
 'Bounded',
 'Indian',
 'Ocean',
 'south',
 'Arabian',
 'Sea',
 'southwest',
 'Bay',
 'Bengal',
 'southeast',
 'shares',
 'land',
 'borders',
 'Pakistan',
 'west',
 'China',
 'Nepal',
 'Bhutan',
 'north',
 'Bangladesh',
 'Myanmar',
 'east',
 'In',
 'Indian',
 'Ocean',
 'India',
 'vicinity',
 'Sri',
 'Lanka',
 'Maldives',
 'Andaman',
 'Nicobar',
 'Islands',
 'share',
 'maritime',
 'border',
 'Thailand',
 'Indonesia']

In [29]:
nltk.pos_tag(clean_data)

[('India', 'NNP'),
 ('Hindi', 'NNP'),
 ('Bhārat', 'NNP'),
 ('officially', 'RB'),
 ('Republic', 'NNP'),
 ('India', 'NNP'),
 ('country', 'NN'),
 ('South', 'NNP'),
 ('Asia', 'IN'),
 ('It', 'PRP'),
 ('seventh-largest', 'JJ'),
 ('country', 'NN'),
 ('area', 'NN'),
 ('second-most', 'RB'),
 ('populous', 'JJ'),
 ('country', 'NN'),
 ('populous', 'JJ'),
 ('democracy', 'NN'),
 ('world', 'NN'),
 ('Bounded', 'NNP'),
 ('Indian', 'JJ'),
 ('Ocean', 'NNP'),
 ('south', 'NN'),
 ('Arabian', 'NNP'),
 ('Sea', 'NNP'),
 ('southwest', 'JJS'),
 ('Bay', 'NNP'),
 ('Bengal', 'NNP'),
 ('southeast', 'NN'),
 ('shares', 'NNS'),
 ('land', 'VBP'),
 ('borders', 'NNS'),
 ('Pakistan', 'NNP'),
 ('west', 'JJS'),
 ('China', 'NNP'),
 ('Nepal', 'NNP'),
 ('Bhutan', 'NNP'),
 ('north', 'JJ'),
 ('Bangladesh', 'NNP'),
 ('Myanmar', 'NNP'),
 ('east', 'NN'),
 ('In', 'IN'),
 ('Indian', 'JJ'),
 ('Ocean', 'NNP'),
 ('India', 'NNP'),
 ('vicinity', 'NN'),
 ('Sri', 'NNP'),
 ('Lanka', 'NNP'),
 ('Maldives', 'NNP'),
 ('Andaman', 'NNP'),
 ('Nicoba

## Stemming

In [30]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer, SnowballStemmer

lancaster = LancasterStemmer()
porter = PorterStemmer()
Snowball = SnowballStemmer("english")
print('Porter stemmer')
print(porter.stem("hobby"))
print(porter.stem("hobbies"))
print(porter.stem("computer"))
print(porter.stem("computation"))
print("**************************")  
print('lancaster stemmer')
print(lancaster.stem("hobby"))
print(lancaster.stem("hobbies"))
print(lancaster.stem("computer"))
print(porter.stem("computation"))
print("**************************")  
print('Snowball stemmer')
print(Snowball.stem("hobby"))
print(Snowball.stem("hobbies"))
print(Snowball.stem("computer"))
print(Snowball.stem("computation"))

Porter stemmer
hobbi
hobbi
comput
comput
**************************
lancaster stemmer
hobby
hobby
comput
comput
**************************
Snowball stemmer
hobbi
hobbi
comput
comput


In [31]:
sent = "I was going to the office on my bike when i saw a car passing by hit the tree."
token = list(nltk.word_tokenize(sent))
for stemmer in (Snowball, lancaster, porter):
    stemm = [stemmer.stem(t) for t in token]
    print(" ".join(stemm))

i was go to the offic on my bike when i saw a car pass by hit the tree .
i was going to the off on my bik when i saw a car pass by hit the tre .
I wa go to the offic on my bike when i saw a car pass by hit the tree .


In [32]:
print(porter.stem("running"))
print(porter.stem("runs"))
print(porter.stem("ran"))

run
run
ran


## Lemmatization

In [35]:
#Here, we can see the lemma has changed for the words with same base.
#This is because, we haven’t given any context to the Lemmatizer.

#nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

print(lemma.lemmatize('running'))
print(lemma.lemmatize('runs'))
print(lemma.lemmatize('ran'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sohel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


running
run
ran


In [36]:
#Generally, it is given by passing the POS tags for the words in a sentence. e.g.

print(lemma.lemmatize('running',pos='v'))
print(lemma.lemmatize('runs',pos='v'))
print(lemma.lemmatize('ran',pos='v'))

run
run
run


## Named Entity Recognition(NER)

In [39]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
sent = "India, officially the Republic of India, is a country in South Asia."

words = nltk.word_tokenize(sent)
pos_tag = nltk.pos_tag(words)
namedEntity = nltk.ne_chunk(pos_tag)
print(namedEntity)
namedEntity.draw()

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\sohel\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sohel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


(S
  (GPE India/NNP)
  ,/,
  officially/RB
  the/DT
  (ORGANIZATION Republic/NNP)
  of/IN
  (GPE India/NNP)
  ,/,
  is/VBZ
  a/DT
  country/NN
  in/IN
  (GPE South/NNP Asia/NNP)
  ./.)


## Parsing

In [40]:
#there are may liabrary for pasring , list is below:
dir(nltk.parse)

['BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'ChartParser',
 'CoreNLPDependencyParser',
 'CoreNLPParser',
 'DependencyEvaluator',
 'DependencyGraph',
 'EarleyChartParser',
 'FeatureBottomUpChartParser',
 'FeatureBottomUpLeftCornerChartParser',
 'FeatureChartParser',
 'FeatureEarleyChartParser',
 'FeatureIncrementalBottomUpChartParser',
 'FeatureIncrementalBottomUpLeftCornerChartParser',
 'FeatureIncrementalChartParser',
 'FeatureIncrementalTopDownChartParser',
 'FeatureTopDownChartParser',
 'IncrementalBottomUpChartParser',
 'IncrementalBottomUpLeftCornerChartParser',
 'IncrementalChartParser',
 'IncrementalLeftCornerChartParser',
 'IncrementalTopDownChartParser',
 'InsideChartParser',
 'LeftCornerChartParser',
 'LongestChartParser',
 'MaltParser',
 'NaiveBayesDependencyScorer',
 'NonprojectiveDependencyParser',
 'ParserI',
 'ProbabilisticNonprojectiveParser',
 'ProbabilisticProjectiveDependencyParser',
 'ProjectiveDepe

In [41]:
grammar = nltk.CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP
  PP -> P NP
  V -> "saw" | "slept" | "walked"
  NP -> "Rahul" | "Anjali" | Det N | Det N PP
  Det -> "a" | "an" | "the" | "my"
  N -> "man" | "dog" | "cat" | "telescope" | "park"
  P -> "in" | "on" | "by" | "with"
  """)

sent = "Rahul saw Anjali with a dog".split()
parser = nltk.RecursiveDescentParser(grammar)
for tree in parser.parse(sent):
    print(tree) 
    tree.draw()

(S
  (NP Rahul)
  (VP (V saw) (NP Anjali) (PP (P with) (NP (Det a) (N dog)))))


## Word Vectorization (Word Embedding)

### Bag Of Word Model

In [46]:
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.tokenize import word_tokenize

string = ["This is an example of bag of words!"]
vect1 = CountVectorizer()
vect1.fit_transform(string)
print("bag of words :",vect1.get_feature_names())

bag of words : ['an', 'bag', 'example', 'is', 'of', 'this', 'words']


### N-Grams

In [48]:
# n-grams
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.tokenize import word_tokenize

string = ["This is an example of n-gram!"]
vect1 = CountVectorizer(ngram_range=(1,1))
vect1.fit_transform(string)
vect2 = CountVectorizer(ngram_range=(2,2))
vect2.fit_transform(string)
vect3 = CountVectorizer(ngram_range=(3,3))
vect3.fit_transform(string)
vect4 = CountVectorizer(ngram_range=(4,4))
vect4.fit_transform(string)
print("1-gram  :",vect1.get_feature_names())
print("2-gram  :",vect2.get_feature_names())
print("3-gram  :",vect3.get_feature_names())
print("4-gram  :",vect4.get_feature_names())

1-gram  : ['an', 'example', 'gram', 'is', 'of', 'this']
2-gram  : ['an example', 'example of', 'is an', 'of gram', 'this is']
3-gram  : ['an example of', 'example of gram', 'is an example', 'this is an']
4-gram  : ['an example of gram', 'is an example of', 'this is an example']


## TF-IDF

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

tfid = TfidfVectorizer()

doc= ["Let's use python!", "Sklearn has package for Tf-idf.","Vectorization is fun!"]

doc_vector = tfid.fit_transform(doc)
df= pd.DataFrame(doc_vector.todense(),columns=tfid.get_feature_names())
df


Unnamed: 0,for,fun,has,idf,is,let,package,python,sklearn,tf,use,vectorization
0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.57735,0.0,0.0,0.57735,0.0
1,0.408248,0.0,0.408248,0.408248,0.0,0.0,0.408248,0.0,0.408248,0.408248,0.0,0.0
2,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735


In [57]:
print(doc_vector,end='\n')
print('************************************************************************************************')
print(tfid.get_feature_names())

  (0, 7)	0.5773502691896257
  (0, 10)	0.5773502691896257
  (0, 5)	0.5773502691896257
  (1, 3)	0.4082482904638631
  (1, 9)	0.4082482904638631
  (1, 0)	0.4082482904638631
  (1, 6)	0.4082482904638631
  (1, 2)	0.4082482904638631
  (1, 8)	0.4082482904638631
  (2, 1)	0.5773502691896257
  (2, 4)	0.5773502691896257
  (2, 11)	0.5773502691896257
************************************************************************************************
['for', 'fun', 'has', 'idf', 'is', 'let', 'package', 'python', 'sklearn', 'tf', 'use', 'vectorization']
