# NLTK Introduction

# About NLTK library

- NLTK is a leading platform for building Python programs to work with human language data
- https://www.nltk.org/index.html

# NLTK library installation

In [1]:
# !pip install nltk

# Some vocabulary for NLP

In [31]:
import nltk as nlp
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,SnowballStemmer,RegexpStemmer
import matplotlib.pyplot as plt
import re

# NLTK version

In [6]:
nlp.__version__

'3.6.5'

# How to download nltk corpus data

In [5]:
# nlp.download()

# Sample Text

In [7]:
txt = """Albert Einstein (/ˈaɪnstaɪn/ EYEN-styne;[6] German: [ˈalbɛʁt ˈʔaɪnʃtaɪn] (audio speaker iconlisten); 14 March 1879 – 18 April 1955) was a German-born theoretical physicist,[7] widely acknowledged to be one of the greatest physicists of all time. Einstein is best known for developing the theory of relativity, but he also made important contributions to the development of the theory of quantum mechanics. Relativity and quantum mechanics are together the two pillars of modern physics.[3][8] His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been dubbed "the world's most famous equation".[9] His work is also known for its influence on the philosophy of science.[10][11] He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect",[12] a pivotal step in the development of quantum theory. His intellectual achievements and originality resulted in "Einstein" becoming synonymous with "genius".[13]
"""

In [2]:
print(txt)

Albert Einstein (/ˈaɪnstaɪn/ EYEN-styne;[6] German: [ˈalbɛʁt ˈʔaɪnʃtaɪn] (audio speaker iconlisten); 14 March 1879 – 18 April 1955) was a German-born theoretical physicist,[7] widely acknowledged to be one of the greatest physicists of all time. Einstein is best known for developing the theory of relativity, but he also made important contributions to the development of the theory of quantum mechanics. Relativity and quantum mechanics are together the two pillars of modern physics.[3][8] His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been dubbed "the world's most famous equation".[9] His work is also known for its influence on the philosophy of science.[10][11] He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect",[12] a pivotal step in the development of quantum theory. His intellectual achievements and originality resulted in "Einstein" becom

# Part-4

# Get the path of Copora from local disk

In [8]:
from nltk.corpus import state_union

In [9]:
state_union.abspath # All the corpus is loaded here

<bound method CorpusReader.abspath of <PlaintextCorpusReader in '/Users/pankajkumarsoni/nltk_data/corpora/state_union'>>

In [11]:
# !ls -al /Users/pankajkumarsoni/nltk_data/corpora/state_union

In [15]:
# file = state_union.raw("2003-GWBush.txt")
# print(file)

# POS (Part of Speech tagging)
- https://www.scikit-yb.org/en/latest/api/text/postag.html

In [16]:
a = "I am going to market, and will bug a Dog."
print(a)

I am going to market, and will bug a Dog.


In [17]:
a = "I am going to market, and will bug a Dog."
wt = word_tokenize(a)
print(wt)

['I', 'am', 'going', 'to', 'market', ',', 'and', 'will', 'bug', 'a', 'Dog', '.']


In [18]:
nlp.pos_tag(wt)

[('I', 'PRP'),
 ('am', 'VBP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('market', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('will', 'MD'),
 ('bug', 'VB'),
 ('a', 'DT'),
 ('Dog', 'NNP'),
 ('.', '.')]

# Get meaning of POS

In [20]:
nlp.help.upenn_tagset("CC")

CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet


In [21]:
nlp.help.upenn_tagset("NNP")

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [22]:
nlp.help.upenn_tagset("NN")

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


# How to filter only given POS

In [24]:
wt2 = word_tokenize(txt)
print(wt2)

['Albert', 'Einstein', '(', '/ˈaɪnstaɪn/', 'EYEN-styne', ';', '[', '6', ']', 'German', ':', '[', 'ˈalbɛʁt', 'ˈʔaɪnʃtaɪn', ']', '(', 'audio', 'speaker', 'iconlisten', ')', ';', '14', 'March', '1879', '–', '18', 'April', '1955', ')', 'was', 'a', 'German-born', 'theoretical', 'physicist', ',', '[', '7', ']', 'widely', 'acknowledged', 'to', 'be', 'one', 'of', 'the', 'greatest', 'physicists', 'of', 'all', 'time', '.', 'Einstein', 'is', 'best', 'known', 'for', 'developing', 'the', 'theory', 'of', 'relativity', ',', 'but', 'he', 'also', 'made', 'important', 'contributions', 'to', 'the', 'development', 'of', 'the', 'theory', 'of', 'quantum', 'mechanics', '.', 'Relativity', 'and', 'quantum', 'mechanics', 'are', 'together', 'the', 'two', 'pillars', 'of', 'modern', 'physics', '.', '[', '3', ']', '[', '8', ']', 'His', 'mass–energy', 'equivalence', 'formula', 'E', '=', 'mc2', ',', 'which', 'arises', 'from', 'relativity', 'theory', ',', 'has', 'been', 'dubbed', '``', 'the', 'world', "'s", 'most', 'f

In [31]:
pos = nlp.pos_tag(wt2)
# pos

In [30]:
for tag in pos:
    if tag[-1] == "NNP":
        print(tag)

('Albert', 'NNP')
('Einstein', 'NNP')
('EYEN-styne', 'NNP')
('ˈalbɛʁt', 'NNP')
('ˈʔaɪnʃtaɪn', 'NNP')
(']', 'NNP')
('March', 'NNP')
('–', 'NNP')
('April', 'NNP')
('Einstein', 'NNP')
('Relativity', 'NNP')
(']', 'NNP')
('E', 'NNP')
('=', 'NNP')
(']', 'NNP')
('Nobel', 'NNP')
('Prize', 'NNP')
('Physics', 'NNP')
('Einstein', 'NNP')


In [32]:
for tag in pos:
    if tag[-1] == "VBG":
        print(tag)

('developing', 'VBG')
('becoming', 'VBG')


In [33]:
for tag in pos:
    if tag[-1] == "CD":
        print(tag)

('6', 'CD')
('14', 'CD')
('1879', 'CD')
('18', 'CD')
('1955', 'CD')
('7', 'CD')
('one', 'CD')
('two', 'CD')
('3', 'CD')
('8', 'CD')
('9', 'CD')
('10', 'CD')
('11', 'CD')
('1921', 'CD')
('12', 'CD')
('13', 'CD')


# Chunking

In [1]:
a = "I am going to market to buy a Dog. Then go for Dinner"
print(a)

I am going to market to buy a Dog. Then go for Dinner


## How to draw chunk tree

In [6]:
wt = word_tokenize(a)
print(wt)

['I', 'am', 'going', 'to', 'market', 'to', 'buy', 'a', 'Dog', '.', 'Then', 'go', 'for', 'Dinner']


In [7]:
pos_tag = nlp.pos_tag(wt)
print(pos_tag)

[('I', 'PRP'), ('am', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('market', 'NN'), ('to', 'TO'), ('buy', 'VB'), ('a', 'DT'), ('Dog', 'NNP'), ('.', '.'), ('Then', 'RB'), ('go', 'VB'), ('for', 'IN'), ('Dinner', 'NNP')]


## How to define chunk grammer

In [19]:
# Syntax = "<grp_name>:{<POS><POS>..}"
grammar = """Branch: {<VBG><TO><NN>|<RB><VB><IN>|<TO><VB>}"""
chunkParser = nlp.RegexpParser(grammar)
# chunkGrammar
tree = chunkParser.parse(pos_tag)
tree.draw()

## How to filter given chunk grammer from tree

In [18]:
for st in tree.subtrees():
    print(st)

(S
  I/PRP
  am/VBP
  (Branch going/VBG to/TO market/NN)
  (Branch to/TO buy/VB)
  a/DT
  Dog/NNP
  ./.
  (Branch Then/RB go/VB for/IN)
  Dinner/NNP)
(Branch going/VBG to/TO market/NN)
(Branch to/TO buy/VB)
(Branch Then/RB go/VB for/IN)


In [21]:
for st in tree.subtrees():
    if st._label == "Branch":
        print(st)

(Branch going/VBG to/TO market/NN)
(Branch to/TO buy/VB)
(Branch Then/RB go/VB for/IN)


# Chinking
Removing something from the `Chunk`

In [24]:
# Syntax = "<grp_name>:}<POS><POS>..{"       # notice the reverse order of curly braces.
grammar = """Branch: {<RB><VB><IN>|<TO><VB>}
            }<VBG><TO><NN>{"""
chunkParser = nlp.RegexpParser(grammar)
# chunkGrammar
tree = chunkParser.parse(pos_tag)
tree.draw()

# NER (Named entity recognition)

In [35]:
a = "My name is Pankaj and I am going to visit USA on 23rd DECEMBER. Will go to GOOGLE office."
print(a)

My name is Pankaj and I am going to visit USA on 23rd DECEMBER. Will go to GOOGLE office.


In [37]:
a = "I work in PyCSR and going to USA in December."
print(a)

I work in PyCSR and going to USA in December.


In [39]:
wt = nlp.pos_tag(word_tokenize(a))
ner_tag = nlp.ne_chunk(wt)
print(ner_tag)
ner_tag.draw()

(S
  I/PRP
  work/VBP
  in/IN
  (ORGANIZATION PyCSR/NNP)
  and/CC
  going/VBG
  to/TO
  (ORGANIZATION USA/NNP)
  in/IN
  December/NNP
  ./.)


## For Reference: 

![image.png](attachment:image.png)