# Chunking

In [2]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [3]:
# Load two State of the Union addresses from the NLTK corpus as raw strings.
# The 2005 speech trains the sentence tokenizer; the 2006 speech is the text
# that is tokenized and chunked in the cells that follow.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [4]:
# Passing text to the constructor trains the Punkt sentence tokenizer on it
# (here: the 2005 speech).
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)


In [5]:
# Split the 2006 speech into sentences using the tokenizer trained above.
tokenized = custom_sent_tokenizer.tokenize(sample_text)


In [6]:
# Sanity check: the tokenizer output is a plain Python list.
type(tokenized)

list

In [7]:
# Number of sentences the tokenizer found in the sample text.
len(tokenized)

346

In [8]:
# Sanity check: the raw corpus text is a single string.
type(train_text)

str

In [9]:
def process_content():
    """Chunk every sentence in ``tokenized`` and print the resulting tree.

    Each sentence of the notebook-level ``tokenized`` list is word-tokenized
    and POS-tagged with NLTK, then parsed with a regular-expression chunker.

    The grammar defines a single chunk label, ``Chunk``, which matches, in
    order (Penn Treebank tags):

    * zero or more adverbs        (``<RB.?>*``)
    * zero or more verbs          (``<VB.?>*``)
    * one or more proper nouns    (``<NNP>+``)
    * an optional singular noun   (``<NN>?``)

    Returns:
        None. One parse tree per sentence is written to stdout.

    Note:
        Any exception raised while building the parser or processing a
        sentence is caught and its message is printed; processing stops at
        that point rather than propagating the error.
    """
    try:
        # The grammar and parser do not depend on the sentence, so build them
        # once instead of once per iteration.
        chunk_grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunk_parser = nltk.RegexpParser(chunk_grammar)

        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            chunked = chunk_parser.parse(tagged)
            print(chunked)
            # chunked.draw()  # uncomment to view the tree graphically
    except Exception as e:
        print(str(e))



In [10]:
process_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  (Chunk called/VBD America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  and/CC
  carried/VBD
  on/IN
  a/DT
  noble/JJ
  dream/NN
  ./.)
(S
  Tonight/NN
  we/PRP
  are/VBP
  comforted

In [11]:
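# The parse result is a tree: the root is the whole sentence ("S") and every "Chunk" is a subtree of it.
# Tokens that were not chunked are plain leaves of the tree, not subtrees.
# We can iterate over the subtrees by calling chunked.subtrees().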

In [12]:
# The grammar and parser are identical for every sentence, so build them once
# up front instead of once per loop iteration.
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)

for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunked = chunkParser.parse(tagged)
    # With no filter, subtrees() yields the whole sentence tree ("S") first
    # and then each "Chunk" subtree, so every chunk is printed twice:
    # once inside its sentence and once on its own.
    for subtree in chunked.subtrees():
        print(subtree)
            

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  g

In [13]:
# The grammar and parser are identical for every sentence, so build them once
# up front instead of once per loop iteration.
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)

for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunked = chunkParser.parse(tagged)
    # Keep only the subtrees labelled "Chunk"; this drops the enclosing
    # sentence-level "S" tree from the output.
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
        print(subtree)

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)
(Chunk Coretta/NNP Scott/NNP King/NNP)
(Chunk Applause/NNP)
(Chunk President/NNP George/NNP W./NNP Bush/NNP)
(Chunk State/NNP)
(Chunk Union/NNP Address/NNP)
(Chunk Capitol/NNP)
(Chunk Tuesday/NNP)
(Chunk Jan/NNP)
(Chunk White/NNP House/NNP photo/NN)
(Chunk Eric/NNP DraperEvery/NNP time/NN)
(Chunk Capitol/NNP dome/NN)
(Chunk have/VBP served/VBN America/NNP)
(Chunk Tonight/NNP)
(Chunk Union/NNP)
(Chunk Applause/NNP)
(Chunk United/NNP)
(Chunk America/NNP)
(Chunk Applause/NNP)
(Chunk America/NNP)
(Chunk September/NNP)
(Chunk Dictatorships/NNP shelter/NN)
(Chunk Applause/NNP)
(Chunk Afghanistan/NNP)
(

# Chinking

In [14]:
# Chinking grammar, built once because it is the same for every sentence:
#   {<.*>+}               chunk every run of tokens, whatever their tags
#   }<VB.?|IN|DT|TO>+{    then chink (remove from the chunk) runs of verbs,
#                         prepositions, determiners and "to"
chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
chunkParser = nltk.RegexpParser(chunkGram)

# tokenized[5:] skips the first five sentences of the speech.
for i in tokenized[5:]:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunked = chunkParser.parse(tagged)
    # Print only what is left inside "Chunk" nodes after chinking.
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
        print(subtree)

(Chunk 31/CD ,/, 2006/CD ./.)
(Chunk White/NNP House/NNP photo/NN)
(Chunk Eric/NNP DraperEvery/NNP time/NN I/PRP)
(Chunk invited/JJ)
(Chunk rostrum/NN ,/, I/PRP)
(Chunk privilege/NN ,/, and/CC mindful/NN)
(Chunk history/NN we/PRP)
(Chunk together/RB ./.)
(Chunk We/PRP)
(Chunk Capitol/NNP dome/NN)
(Chunk moments/NNS)
(Chunk national/JJ mourning/NN and/CC national/JJ achievement/NN ./.)
(Chunk We/PRP)
(Chunk America/NNP)
(Chunk one/CD)
(Chunk most/RBS consequential/JJ periods/NNS)
(Chunk our/PRP$ history/NN --/: and/CC it/PRP)
(Chunk my/PRP$ honor/NN)
(Chunk you/PRP ./.)
(Chunk system/NN)
(Chunk
  two/CD
  parties/NNS
  ,/,
  two/CD
  chambers/NNS
  ,/,
  and/CC
  two/CD
  elected/JJ
  branches/NNS
  ,/,
  there/EX
  will/MD
  always/RB)
(Chunk differences/NNS and/CC debate/NN ./.)
(Chunk But/CC even/RB tough/JJ debates/NNS can/MD)
(Chunk
  civil/JJ
  tone/NN
  ,/,
  and/CC
  our/PRP$
  differences/NNS
  can/MD
  not/RB)
(Chunk anger/NN ./.)
(Chunk great/JJ issues/NNS)
(Chunk us/PRP ,/, 

# Named Entity Recognition

In [15]:
# Run NLTK's named-entity chunker over each sentence, skipping the first five.
for i in tokenized[5:]:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    # binary=False keeps the specific entity categories; the output of this
    # cell shows labels such as PERSON, ORGANIZATION, FACILITY and GPE.
    namedEnt = nltk.ne_chunk(tagged, binary=False)
    print(namedEnt)

(S 31/CD ,/, 2006/CD ./.)
(S
  (FACILITY White/NNP)
  (ORGANIZATION House/NNP)
  photo/NN
  by/IN
  (PERSON Eric/NNP)
  DraperEvery/NNP
  time/NN
  I/PRP
  'm/VBP
  invited/JJ
  to/TO
  this/DT
  rostrum/NN
  ,/,
  I/PRP
  'm/VBP
  humbled/VBN
  by/IN
  the/DT
  privilege/NN
  ,/,
  and/CC
  mindful/NN
  of/IN
  the/DT
  history/NN
  we/PRP
  've/VBP
  seen/VBN
  together/RB
  ./.)
(S
  We/PRP
  have/VBP
  gathered/VBN
  under/IN
  this/DT
  Capitol/NNP
  dome/NN
  in/IN
  moments/NNS
  of/IN
  national/JJ
  mourning/NN
  and/CC
  national/JJ
  achievement/NN
  ./.)
(S
  We/PRP
  have/VBP
  served/VBN
  (GPE America/NNP)
  through/IN
  one/CD
  of/IN
  the/DT
  most/RBS
  consequential/JJ
  periods/NNS
  of/IN
  our/PRP$
  history/NN
  --/:
  and/CC
  it/PRP
  has/VBZ
  been/VBN
  my/PRP$
  honor/NN
  to/TO
  serve/VB
  with/IN
  you/PRP
  ./.)
(S
  In/IN
  a/DT
  system/NN
  of/IN
  two/CD
  parties/NNS
  ,/,
  two/CD
  chambers/NNS
  ,/,
  and/CC
  two/CD
  elected/JJ
  branches/NNS
