# IMPORTS

In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

# Exercise 1

The IOB format categorizes tagged tokens as I, O and B. Why are three tags necessary? What problem would be caused if we used I and O tags exclusively?

We cannot determine where chunks actually start because there will be no borders between adjacent chunks

# Exercise 2

Write a tag pattern to match noun phrases containing plural head nouns, e.g. "many/JJ researchers/NNS", "two/CD weeks/NNS", "both/DT new/JJ positions/NNS". Try to do this by generalizing the tag pattern that handled singular noun phrases.

In [5]:
from nltk.corpus import brown

In [9]:
chunk_string = "NP: {<(JJ|CD|DT).*>+<NNS?>}"

In [12]:
def search_chunks(chunk_string, tagged_sents):
    chunk_label = chunk_string[:chunk_string.find(':')]
    cp = nltk.RegexpParser(chunk_string)
    for sent in tagged_sents:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == chunk_label: print(subtree)

In [13]:
search_chunks(chunk_string, brown.tagged_sents()[:50])

(NP recent/JJ primary/NN)
(NP any/DTI irregularities/NNS)
(NP over-all/JJ charge/NN)
(NP hard-fought/JJ primary/NN)
(NP relative/JJ handful/NN)
(NP such/JJ reports/NNS)
(NP widespread/JJ interest/NN)
(NP this/DT city/NN)
(NP these/DTS laws/NNS)
(NP grand/JJ jury/NN)
(NP best/JJT interest/NN)
(NP these/DTS two/CD offices/NNS)
(NP greater/JJR efficiency/NN)
(NP clerical/JJ personnel/NNS)
(NP this/DT problem/NN)
(NP outgoing/JJ jury/NN)
(NP effective/JJ date/NN)
(NP orderly/JJ implementation/NN)
(NP grand/JJ jury/NN)
(NP federal/JJ funds/NNS)
(NP foster/JJ homes/NNS)
(NP major/JJ items/NNS)
(NP general/JJ assistance/NN)
(NP these/DTS funds/NNS)
(NP this/DT money/NN)
(NP proportionate/JJ distribution/NN)
(NP these/DTS funds/NNS)
(NP this/DT program/NN)
(NP populous/JJ counties/NNS)
(NP some/DTI portion/NN)
(NP these/DTS available/JJ funds/NNS)
(NP disproportionate/JJ burden/NN)
(NP two/CD previous/JJ grand/JJ juries/NNS)
(NP These/DTS actions/NNS)
(NP undue/JJ costs/NNS)
(NP unmeritorious/

# Exercise 3

Pick one of the three chunk types in the CoNLL corpus. Inspect the CoNLL corpus and try to observe any patterns in the POS tag sequences that make up this kind of chunk. Develop a simple chunker using the regular expression chunker nltk.RegexpParser. Discuss any tag sequences that are difficult to chunk reliably.

In [17]:
from nltk.corpus import conll2000

In [27]:
def search_chunk_type(chunked_sents, chunk_type):
    for sent in chunked_sents:
        for subtree in sent.subtrees():
            if subtree.label() == chunk_type: print(subtree)

In [31]:
search_chunk_type(conll2000.chunked_sents('train.txt')[:50], 'VP')

(VP is/VBZ widely/RB expected/VBN to/TO take/VB)
(VP fail/VB to/TO show/VB)
(VP has/VBZ helped/VBN to/TO prevent/VB)
(VP reckon/VBP)
(VP has/VBZ been/VBN eroded/VBN)
(VP to/TO announce/VB)
(VP has/VBZ increased/VBN)
(VP being/VBG forced/VBN to/TO increase/VB)
(VP to/TO defend/VB)
(VP say/VBP)
(VP are/VBP)
(VP said/VBD)
(VP is/VBZ)
(VP could/MD be/VB)
(VP noted/VBD)
(VP range/VBP)
(VP expect/VBP)
(VP to/TO show/VB)
(VP reported/VBD)
(VP registered/VBN)
(VP are/VBP topped/VBN)
(VP said/VBD)
(VP is/VBZ)
(VP is/VBZ transforming/VBG)
(VP to/TO boost/VB)
(VP remains/VBZ)
(VP reckons/VBZ)
(VP will/MD narrow/VB)
(VP said/VBD)
(VP believes/VBZ)
(VP could/MD lead/VB)
(VP could/MD narrow/VB)
(VP forecasts/VBZ)
(VP warns/VBZ)
(VP are/VBP)
(VP wo/MD n't/RB advance/VB)
(VP will/MD want/VB to/TO see/VB)
(VP adjusting/VBG)
(VP noted/VBD)
(VP will/MD want/VB to/TO go/VB)
(VP remains/VBZ)
(VP warned/VBD)
(VP can/MD be/VB expected/VBN)
(VP takes/VBZ)
(VP are/VBP)
(VP released/VBD)
(VP do/VBP n't/RB sugge

In [56]:
cp = nltk.RegexpParser('VP: {<MD>?<V.*>*<RB>?<TO>?<V.*>+}')

In [57]:
print(cp.evaluate(conll2000.chunked_sents('test.txt', chunk_types=['VP'])))

ChunkParse score:
    IOB Accuracy:  97.3%%
    Precision:     82.6%%
    Recall:        88.7%%
    F-Measure:     85.6%%


# Exercise 4

An early definition of chunk was the material that occurs between chinks. Develop a chunker that starts by putting the whole sentence in a single chunk, and then does the rest of its work solely by chinking. Determine which tags (or tag sequences) are most likely to make up chinks with the help of your own utility program. Compare the performance and simplicity of this approach relative to a chunker based entirely on chunk rules.

In [6]:
from nltk.corpus import conll2000

In [41]:
chunk_string = """
    VP:
      {<.*>+}
      }<(JJ|NN|IN|CD|DT|\W|CC|PRP|W).*>+{
"""

In [42]:
cp = nltk.RegexpParser(chunk_string)

In [43]:
print(cp.evaluate(conll2000.chunked_sents('test.txt', chunk_types=['VP'])))

ChunkParse score:
    IOB Accuracy:  93.2%%
    Precision:     59.2%%
    Recall:        79.5%%
    F-Measure:     67.9%%


In the case of VP chunking, building chunk parser using chinking is not a good idea according to performance and simplicity perspective

# Exercise 5

Write a tag pattern to cover noun phrases that contain gerunds, e.g. "the/DT receiving/VBG end/NN", "assistant/NN managing/VBG editor/NN". Add these patterns to the grammar, one per line. Test your work using some tagged sentences of your own devising.

In [9]:
from nltk.corpus import conll2000

In [10]:
def filter_chunks(sent, pos):
    result = []
    for part in sent:
        if (isinstance(part, nltk.Tree) 
            and pos not in {t for w, t in part.leaves()}):
            result.extend(part.leaves())
        else:
            result.append(part)
    return nltk.Tree('S', result)

In [11]:
train_sents = [filter_chunks(sent, 'VBG') for sent in conll2000.chunked_sents('train.txt', chunk_types=['NP'])]

In [26]:
train_sents[53].draw()

In [22]:
for i, sent in enumerate(train_sents):
    for subtree in sent.subtrees():    
        if subtree.label() == 'NP':
            print(i)

2
11
32
53
63
69
74
83
90
108
109
139
175
207
210
217
250
255
274
291
306
324
331
348
362
368
370
373
378
382
391
397
398
410
474
498
513
524
541
542
556
557
560
583
594
597
628
688
716
725
738
778
781
798
806
827
828
841
847
859
867
867
878
882
889
897
901
902
963
964
971
998
1000
1015
1044
1050
1051
1057
1059
1082
1088
1119
1128
1144
1150
1156
1166
1169
1202
1208
1265
1282
1367
1398
1453
1494
1545
1547
1594
1610
1676
1678
1680
1681
1687
1688
1688
1703
1719
1719
1739
1763
1765
1850
1949
1975
1981
1987
2003
2070
2074
2085
2089
2134
2157
2189
2214
2216
2226
2230
2240
2280
2300
2304
2315
2317
2329
2341
2343
2346
2351
2354
2363
2375
2397
2405
2458
2478
2485
2499
2505
2582
2586
2615
2688
2689
2709
2714
2731
2743
2752
2757
2760
2763
2772
2773
2785
2832
2854
2856
2870
2873
2934
2973
2976
2978
3005
3006
3059
3096
3098
3172
3184
3186
3208
3214
3228
3231
3237
3275
3317
3346
3369
3406
3447
3453
3461
3469
3470
3478
3480
3586
3622
3636
3642
3642
3642
3648
3652
3655
3657
3679
3680
3683
3707
3740
37

In [None]:
train_sents = [
    nltk.Tree('S', [subtree.leaves() if 'VBG' in subtree else subtree for subtree in sent.subtrees()]) 
    for sent in conll2000.chunked_sents('train.txt', chunk_types=['NP'])
]

In [5]:
x = conll2000.chunked_sents('train.txt', chunk_types=['NP'])[0]

In [6]:
x

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [Tree('NP', [('Confidence', 'NN')]), ('in', 'IN'), Tree('NP', [('the', 'DT'), ('pound', 'NN')]), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), Tree('NP', [('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN')]), ('if', 'IN'), Tree('NP', [('trade', 'NN'), ('figures', 'NNS')]), ('for', 'IN'), Tree('NP', [('September', 'NNP')]), (',', ','), ('due', 'JJ'), ('for', 'IN'), Tree('NP', [('release', 'NN')]), Tree('NP', [('tomorrow', 'NN')]), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), Tree('NP', [('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN')]), ('from', 'IN'), Tree('NP', [('July', 'NNP'), ('and', 'CC'), ('August', 'NNP')]), Tree('NP', [("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS')]), ('.', '.')])

In [6]:
y = [subtree for subtree in x]

In [7]:
y

[Tree('NP', [('Confidence', 'NN')]),
 ('in', 'IN'),
 Tree('NP', [('the', 'DT'), ('pound', 'NN')]),
 ('is', 'VBZ'),
 ('widely', 'RB'),
 ('expected', 'VBN'),
 ('to', 'TO'),
 ('take', 'VB'),
 Tree('NP', [('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN')]),
 ('if', 'IN'),
 Tree('NP', [('trade', 'NN'), ('figures', 'NNS')]),
 ('for', 'IN'),
 Tree('NP', [('September', 'NNP')]),
 (',', ','),
 ('due', 'JJ'),
 ('for', 'IN'),
 Tree('NP', [('release', 'NN')]),
 Tree('NP', [('tomorrow', 'NN')]),
 (',', ','),
 ('fail', 'VB'),
 ('to', 'TO'),
 ('show', 'VB'),
 Tree('NP', [('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN')]),
 ('from', 'IN'),
 Tree('NP', [('July', 'NNP'), ('and', 'CC'), ('August', 'NNP')]),
 Tree('NP', [("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS')]),
 ('.', '.')]

In [None]:
and ('VBG' not in {t for w, t in y.leaves()})

In [32]:
filter_chunks(x, 'VBG').draw()

In [None]:
result = []
for sent in conll2000.chunked_sents('train.txt', chunk_types=['NP'])[:10]:
    for part in sent:
        if isinstance(part, nltk.Tree) and 'VBG' not in {t for w, t in part.leaves()}:
            

In [14]:
[y.leaves() if isinstance(y, nltk.Tree) and ('VBG' not in {t for w, t in y.leaves()}) else y for y in x]

[[('Confidence', 'NN')],
 ('in', 'IN'),
 [('the', 'DT'), ('pound', 'NN')],
 ('is', 'VBZ'),
 ('widely', 'RB'),
 ('expected', 'VBN'),
 ('to', 'TO'),
 ('take', 'VB'),
 [('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN')],
 ('if', 'IN'),
 [('trade', 'NN'), ('figures', 'NNS')],
 ('for', 'IN'),
 [('September', 'NNP')],
 (',', ','),
 ('due', 'JJ'),
 ('for', 'IN'),
 [('release', 'NN')],
 [('tomorrow', 'NN')],
 (',', ','),
 ('fail', 'VB'),
 ('to', 'TO'),
 ('show', 'VB'),
 [('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN')],
 ('from', 'IN'),
 [('July', 'NNP'), ('and', 'CC'), ('August', 'NNP')],
 [("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS')],
 ('.', '.')]

In [16]:
isinstance(x[2], nltk.Tree)

True

In [17]:
{t for w, t in x[2].leaves()}

{'DT', 'NN'}

In [34]:
y = x[0]

In [38]:
x.flatten()

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')])

In [41]:
x = nltk.Tree('S', [1, 2, 3, 4])

In [43]:
x.flatten()

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [1, 2, 3, 4])