# IMPORTS

In [2]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

# Exercise 1

The IOB format categorizes tagged tokens as I, O and B. Why are three tags necessary? What problem would be caused if we used I and O tags exclusively?

We cannot determine where chunks actually start because there will be no borders between adjacent chunks

# Exercise 2

Write a tag pattern to match noun phrases containing plural head nouns, e.g. "many/JJ researchers/NNS", "two/CD weeks/NNS", "both/DT new/JJ positions/NNS". Try to do this by generalizing the tag pattern that handled singular noun phrases.

In [5]:
from nltk.corpus import brown

In [9]:
chunk_string = "NP: {<(JJ|CD|DT).*>+<NNS?>}"

In [12]:
def search_chunks(chunk_string, tagged_sents):
    chunk_label = chunk_string[:chunk_string.find(':')]
    cp = nltk.RegexpParser(chunk_string)
    for sent in tagged_sents:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == chunk_label: print(subtree)

In [13]:
search_chunks(chunk_string, brown.tagged_sents()[:50])

(NP recent/JJ primary/NN)
(NP any/DTI irregularities/NNS)
(NP over-all/JJ charge/NN)
(NP hard-fought/JJ primary/NN)
(NP relative/JJ handful/NN)
(NP such/JJ reports/NNS)
(NP widespread/JJ interest/NN)
(NP this/DT city/NN)
(NP these/DTS laws/NNS)
(NP grand/JJ jury/NN)
(NP best/JJT interest/NN)
(NP these/DTS two/CD offices/NNS)
(NP greater/JJR efficiency/NN)
(NP clerical/JJ personnel/NNS)
(NP this/DT problem/NN)
(NP outgoing/JJ jury/NN)
(NP effective/JJ date/NN)
(NP orderly/JJ implementation/NN)
(NP grand/JJ jury/NN)
(NP federal/JJ funds/NNS)
(NP foster/JJ homes/NNS)
(NP major/JJ items/NNS)
(NP general/JJ assistance/NN)
(NP these/DTS funds/NNS)
(NP this/DT money/NN)
(NP proportionate/JJ distribution/NN)
(NP these/DTS funds/NNS)
(NP this/DT program/NN)
(NP populous/JJ counties/NNS)
(NP some/DTI portion/NN)
(NP these/DTS available/JJ funds/NNS)
(NP disproportionate/JJ burden/NN)
(NP two/CD previous/JJ grand/JJ juries/NNS)
(NP These/DTS actions/NNS)
(NP undue/JJ costs/NNS)
(NP unmeritorious/

# Exercise 3

Pick one of the three chunk types in the CoNLL corpus. Inspect the CoNLL corpus and try to observe any patterns in the POS tag sequences that make up this kind of chunk. Develop a simple chunker using the regular expression chunker nltk.RegexpParser. Discuss any tag sequences that are difficult to chunk reliably.

In [17]:
from nltk.corpus import conll2000

In [27]:
def search_chunk_type(chunked_sents, chunk_type):
    for sent in chunked_sents:
        for subtree in sent.subtrees():
            if subtree.label() == chunk_type: print(subtree)

In [31]:
search_chunk_type(conll2000.chunked_sents('train.txt')[:50], 'VP')

(VP is/VBZ widely/RB expected/VBN to/TO take/VB)
(VP fail/VB to/TO show/VB)
(VP has/VBZ helped/VBN to/TO prevent/VB)
(VP reckon/VBP)
(VP has/VBZ been/VBN eroded/VBN)
(VP to/TO announce/VB)
(VP has/VBZ increased/VBN)
(VP being/VBG forced/VBN to/TO increase/VB)
(VP to/TO defend/VB)
(VP say/VBP)
(VP are/VBP)
(VP said/VBD)
(VP is/VBZ)
(VP could/MD be/VB)
(VP noted/VBD)
(VP range/VBP)
(VP expect/VBP)
(VP to/TO show/VB)
(VP reported/VBD)
(VP registered/VBN)
(VP are/VBP topped/VBN)
(VP said/VBD)
(VP is/VBZ)
(VP is/VBZ transforming/VBG)
(VP to/TO boost/VB)
(VP remains/VBZ)
(VP reckons/VBZ)
(VP will/MD narrow/VB)
(VP said/VBD)
(VP believes/VBZ)
(VP could/MD lead/VB)
(VP could/MD narrow/VB)
(VP forecasts/VBZ)
(VP warns/VBZ)
(VP are/VBP)
(VP wo/MD n't/RB advance/VB)
(VP will/MD want/VB to/TO see/VB)
(VP adjusting/VBG)
(VP noted/VBD)
(VP will/MD want/VB to/TO go/VB)
(VP remains/VBZ)
(VP warned/VBD)
(VP can/MD be/VB expected/VBN)
(VP takes/VBZ)
(VP are/VBP)
(VP released/VBD)
(VP do/VBP n't/RB sugge

In [56]:
cp = nltk.RegexpParser('VP: {<MD>?<V.*>*<RB>?<TO>?<V.*>+}')

In [57]:
print(cp.evaluate(conll2000.chunked_sents('test.txt', chunk_types=['VP'])))

ChunkParse score:
    IOB Accuracy:  97.3%%
    Precision:     82.6%%
    Recall:        88.7%%
    F-Measure:     85.6%%


# Exercise 4

An early definition of chunk was the material that occurs between chinks. Develop a chunker that starts by putting the whole sentence in a single chunk, and then does the rest of its work solely by chinking. Determine which tags (or tag sequences) are most likely to make up chinks with the help of your own utility program. Compare the performance and simplicity of this approach relative to a chunker based entirely on chunk rules.

In [6]:
from nltk.corpus import conll2000

In [41]:
chunk_string = """
    VP:
      {<.*>+}
      }<(JJ|NN|IN|CD|DT|\W|CC|PRP|W).*>+{
"""

In [42]:
cp = nltk.RegexpParser(chunk_string)

In [43]:
print(cp.evaluate(conll2000.chunked_sents('test.txt', chunk_types=['VP'])))

ChunkParse score:
    IOB Accuracy:  93.2%%
    Precision:     59.2%%
    Recall:        79.5%%
    F-Measure:     67.9%%


In the case of VP chunking, building chunk parser using chinking is not a good idea according to performance and simplicity perspective

# Exercise 5

Write a tag pattern to cover noun phrases that contain gerunds, e.g. "the/DT receiving/VBG end/NN", "assistant/NN managing/VBG editor/NN". Add these patterns to the grammar, one per line. Test your work using some tagged sentences of your own devising.

In [69]:
from nltk.corpus import brown

In [70]:
def test_pattern(pattern, n):
    label = pattern.split()[0][:-1]
    cp = nltk.RegexpParser(pattern)
    for sent in brown.tagged_sents()[:n]:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == label: 
                print(subtree)

In [71]:
np_vbg_pattern = """
    NP_VBG:
      {<DT><VBG><N.*>}
      {<N.*><VBG><N.*>}
"""

In [72]:
test_pattern(np_vbg_pattern, 1000)

(NP_VBG County/NN-TL purchasing/VBG departments/NNS)
(NP_VBG Dallas/NP authorizing/VBG establishment/NN)
(NP_VBG Galveston/NP authorizing/VBG establishment/NN)
(NP_VBG school/NN teaching/VBG certificate/NN)
(NP_VBG welfare/NN consulting/VBG firm/NN)
(NP_VBG cent/NN starting/VBG Jan./NP)
(NP_VBG days/NNS following/VBG discharge/NN)
(NP_VBG Community/NN visiting/VBG nurse/NN)
(NP_VBG law/NN providing/VBG grants/NNS)
(NP_VBG laws/NNS regulating/VBG Sunday/NR)
(NP_VBG ordinance/NN permitting/VBG motorists/NNS)
(NP_VBG state/NN financing/VBG aid/NN)
(NP_VBG 1920s/NNS following/VBG adoption/NN)
(NP_VBG Administration's/NN$-TL housing/VBG bill/NN)
(NP_VBG each/DT passing/VBG week/NN)
(NP_VBG points/NNS bordering/VBG Lafayette/NP-TL)
(NP_VBG another/DT vexing/VBG issue/NN)
(NP_VBG problem/NN confronting/VBG Davis/NP)
(NP_VBG vouchers/NNS certifying/VBG work/NN)
(NP_VBG bill/NN raising/VBG fees/NNS)
(NP_VBG dinner/NN honoring/VBG Sen./NN-TL)


# Exercise 6

Write one or more tag patterns to handle coordinated noun phrases, e.g. "July/NNP and/CC August/NNP", "all/DT your/PRP$ managers/NNS and/CC supervisors/NNS", "company/NN courts/NNS and/CC adjudicators/NNS".

In [73]:
np_cc_pattern = """
    NP_CC:
      {<N.*><CC><N.*>}
      {<DT><P.*><N.*><CC><N.*>}
      {<N.*><N.*><CC><N.*>}
"""

In [74]:
test_pattern(np_cc_pattern, 1000)

(NP_CC praise/NN and/CC thanks/NNS)
(NP_CC registration/NN and/CC election/NN)
(NP_CC Atlanta/NP and/CC Fulton/NP-TL)
(NP_CC guardians/NNS and/CC administrators/NNS)
(NP_CC fees/NNS and/CC compensation/NN)
(NP_CC intern/NN or/CC extern/NN)
(NP_CC night/NN and/CC weekend/NN)
(NP_CC administration/NN and/CC operation/NN)
(NP_CC Bellwood/NP and/CC Alpharetta/NP)
(NP_CC man/NN and/CC wife/NN)
(NP_CC principal/NN and/CC chairman/NN)
(NP_CC Davis/NP and/CC Bush/NP)
(NP_CC insurance/NN and/CC pipeline/NN)
(NP_CC Harlingen/NP and/CC Howard/NP)
(NP_CC Tarrant/NP and/CC El/NP)
(NP_CC Berry/NP and/CC Joe/NP)
(NP_CC gifts/NNS and/CC donations/NNS)
(NP_CC stocks/NNS and/CC bonds/NNS)
(NP_CC Legislature/NN-TL and/CC Congress/NP)
(NP_CC Dallas/NP and/CC Fort/NN-TL)
(NP_CC Dallas/NP and/CC Sen./NN-TL)
(NP_CC Newton/NP and/CC Joe/NP)
(NP_CC Dallas/NP and/CC Fort/NN-TL)
(NP_CC math/NN or/CC English/NP)
(NP_CC A/NN &/CC I/NN)
(NP_CC College/NN-TL and/CC Massachusetts/NP-TL)
(NP_CC teacher/NN and/CC princ

# Exercise 7

Carry out the following evaluation tasks for any of the chunkers you have developed earlier. (Note that most chunking corpora contain some internal inconsistencies, such that any reasonable rule-based approach will produce errors.)<div>
    a. Evaluate your chunker on 100 sentences from a chunked corpus, and report the precision, recall and F-measure.<div>
    b. Use the chunkscore.missed() and chunkscore.incorrect() methods to identify the errors made by your chunker. Discuss.<div>
    c. Compare the performance of your chunker to the baseline chunker discussed in the evaluation section of this chapter.<div>