In [2]:
# Lab Session 9
import nltk
import pandas as pd
from nltk import CFG, ChartParser

In [2]:
# Lecture example: a coordinated noun phrase.
# The grammar lets CC join two NNS or two NP, so the adjective can attach
# either to the whole coordination or to the first noun only; every tree the
# chart parser finds is printed.
tokenized_sent = 'small cats and mice'.split()
grammar = CFG.fromstring('''
    NP -> NNS | JJ NNS | NP CC NP
    NNS -> "cats" | "dogs" | "mice" | NNS CC NNS
    JJ -> "big" | "small"
    CC -> "and" | "or"
''')
parser = ChartParser(grammar)

parse = parser.parse(tokenized_sent)  # iterable of parse trees
for tree in parse:
    print(tree)

(NP (JJ small) (NNS (NNS cats) (CC and) (NNS mice)))
(NP (NP (JJ small) (NNS cats)) (CC and) (NP (NNS mice)))


In [3]:
# Problem example: a full sentence (S -> NP VP) whose verb phrase takes a
# prepositional phrase. Every tree found by the chart parser is printed.
tokenized_sent = 'lazy cats play with mice'.split()
grammar = CFG.fromstring('''
    S -> NP VP
    NP -> JJ NNS
    VP -> VBP PP
    JJ -> "lazy"
    NNS -> "cats" 
    VBP -> "play"
    PP -> IN NP
    IN -> "with"
    NP -> NNS
    NNS -> "mice"
''')
parser = ChartParser(grammar)

parse = parser.parse(tokenized_sent)  # iterable of parse trees
for tree in parse:
    print(tree)

(S
  (NP (JJ lazy) (NNS cats))
  (VP (VBP play) (PP (IN with) (NP (NNS mice)))))


In [4]:
# Joint grammar example
# The grammar of the non-probabilistic chart parser example is expanded so
# that one grammar covers both sentences parsed below:
# 'small cats and mice' and 'lazy cats play with mice'.
grammar = CFG.fromstring('''
    S -> NP VP | NP
    VP -> VBP PP
    JJ -> "lazy" 
    VBP -> "play"
    PP -> IN NP
    IN -> "with"
    NP -> NNS | JJ NNS | NP CC NP
    NNS -> "cats" | "dogs" | "mice" | NNS CC NNS
    JJ -> "big" | "small"
    CC -> "and" | "or"
''')
parser = ChartParser(grammar)

# Same report for each sentence: the tokens, then every tree preceded by a
# dashed separator, then a closing row of hashes.
for sentence in (
    ['small', 'cats', 'and', 'mice'],
    ['lazy', 'cats', 'play', 'with', 'mice'],
):
    parse = parser.parse(sentence)
    print(sentence)
    for tree in parse:
        print('-------')
        print(tree)
    print('##############')

['small', 'cats', 'and', 'mice']
-------
(S (NP (JJ small) (NNS (NNS cats) (CC and) (NNS mice))))
-------
(S (NP (NP (JJ small) (NNS cats)) (CC and) (NP (NNS mice))))
##############
['lazy', 'cats', 'play', 'with', 'mice']
-------
(S
  (NP (JJ lazy) (NNS cats))
  (VP (VBP play) (PP (IN with) (NP (NNS mice)))))
##############


In [10]:
# Perform the constituency parsing using a 
# BottomUpChartParser, BottomUpLeftCornerChartParser & LeftCornerChartParser
# For each one of them, provide the resulting tree, the
# number of edges and the list of explored edges.

def parse(parser, tokenized_sent):
    """Parse ``tokenized_sent`` with ``parser`` and print a report to stdout.

    The report lists the sentence, every complete parse tree, the number of
    edges in the chart and finally each chart edge on its own line.

    The sentence is parsed only once: the trees are read from the same chart
    that supplies the edge count and the edge list, instead of running the
    parser a second time just to obtain the trees.

    Args:
        parser: a chart parser exposing ``chart_parse(tokens)`` and
            ``grammar()``, such as the NLTK chart parsers this notebook
            passes in.
        tokenized_sent: list of word tokens to parse.

    Returns:
        None.
    """
    print('##############')
    chart = parser.chart_parse(tokenized_sent)
    # Complete trees rooted at the grammar's start symbol; this is what
    # ``parser.parse`` would yield, without building a second chart.
    trees = list(chart.parses(parser.grammar().start()))

    print(tokenized_sent)
    print('-------')
    print('Resulting tree:')
    if not trees:
        # Make a failed parse visible instead of printing an empty section.
        print('-------')
        print('(no parse found)')
    for tree in trees:
        print('-------')
        print(tree)
    print('##############')

    print('Number of edges:', chart.num_edges())
    print('-------')
    # This section lists the chart edges (the label is kept unchanged so
    # that previously recorded output still matches).
    print('Resulting tree with edges:')
    print('-------')
    for edge in chart.edges():
        print(edge)
    print('##############')
    return
    
from nltk.parse.chart import (
    BottomUpChartParser,
    BottomUpLeftCornerChartParser,
    LeftCornerChartParser,
)

# Spell the sentence out here rather than reading `tokenized_sent`, which two
# earlier cells assign to different sentences: the report would otherwise
# depend on which of those cells happened to run last.
lazy_cats_sent = ['lazy', 'cats', 'play', 'with', 'mice']

# Run every strategy on the same grammar and sentence so that the reported
# edge counts can be compared directly.
for parser_class in (
    BottomUpChartParser,
    BottomUpLeftCornerChartParser,
    LeftCornerChartParser,
):
    parse(parser_class(grammar), lazy_cats_sent)

##############
['lazy', 'cats', 'play', 'with', 'mice']
-------
Resulting tree:
-------
(S
  (NP (JJ lazy) (NNS cats))
  (VP (VBP play) (PP (IN with) (NP (NNS mice)))))
##############
Number of edges: 52
-------
Resulting tree with edges:
-------
[0:1] 'lazy'
[1:2] 'cats'
[2:3] 'play'
[3:4] 'with'
[4:5] 'mice'
[0:0] JJ -> * 'lazy'
[0:1] JJ -> 'lazy' *
[0:0] NP -> * JJ NNS
[0:1] NP -> JJ * NNS
[1:1] NNS -> * 'cats'
[1:2] NNS -> 'cats' *
[1:1] NP -> * NNS
[1:1] NNS -> * NNS CC NNS
[0:2] NP -> JJ NNS *
[1:2] NP -> NNS *
[1:2] NNS -> NNS * CC NNS
[1:1] S  -> * NP VP
[1:1] S  -> * NP
[1:1] NP -> * NP CC NP
[1:2] S  -> NP * VP
[1:2] S  -> NP *
[1:2] NP -> NP * CC NP
[0:0] S  -> * NP VP
[0:0] S  -> * NP
[0:0] NP -> * NP CC NP
[0:2] S  -> NP * VP
[0:2] S  -> NP *
[0:2] NP -> NP * CC NP
[2:2] VBP -> * 'play'
[2:3] VBP -> 'play' *
[2:2] VP -> * VBP PP
[2:3] VP -> VBP * PP
[3:3] IN -> * 'with'
[3:4] IN -> 'with' *
[3:3] PP -> * IN NP
[3:4] PP -> IN * NP
[4:4] NNS -> * 'mice'
[4:5] NNS -> 'mice' *

In [None]:
# Which parser is the most efficient for parsing the sentence? Which edges are
# filtered out by each parser and why?

In [None]:
# Dependency parsing. Consider the first three pairs of sentences from the
# training set of the evaluation framework of the project. 

In [51]:
# CoreNLP test: dependency-parse one sentence and print its triples.
# NOTE(review): presumably needs a CoreNLP server reachable at the URL below.
from nltk.parse import corenlp
parser = corenlp.CoreNLPDependencyParser(url='http://localhost:9000')

# Single-element unpacking: fails unless exactly one parse comes back.
(parse,) = parser.raw_parse('The quick brown fox jumps over the lazy dog.')

# One line per dependency: governor, relation, dependent.
for head, relation, child in parse.triples():
    print(head, relation, child)

('jumps', 'VBZ') nsubj ('fox', 'NN')
('fox', 'NN') det ('The', 'DT')
('fox', 'NN') amod ('quick', 'JJ')
('fox', 'NN') amod ('brown', 'JJ')
('jumps', 'VBZ') nmod ('dog', 'NN')
('dog', 'NN') case ('over', 'IN')
('dog', 'NN') det ('the', 'DT')
('dog', 'NN') amod ('lazy', 'JJ')
('jumps', 'VBZ') punct ('.', '.')


In [80]:
# IHLT framework: analysis of the first three sentence pairs.
# Each dependency triple (governor, relation, dependent) is turned into one
# string, and the two sentences of a pair are compared through the Jaccard
# distance between their sets of such strings.
from nltk.metrics import jaccard_distance

with open('IHLT-eval-framework/train/msr_paraphrase_train_input.txt', 'r') as f:
    first_lines = [next(f) for _ in range(3)]
# Each line holds the two sentences of a pair separated by a tab.
sent_pairs = [line.split('\t') for line in first_lines]

for pair in sent_pairs:
    print('\nSent pair')
    print(pair)
    sent_0, sent_1 = pair[0], pair[1]
    # `parser` is the CoreNLP dependency parser created in an earlier cell.
    (parse_0,) = parser.raw_parse(sent_0)
    (parse_1,) = parser.raw_parse(sent_1)
    # One concatenated string per triple.
    triples_0 = [''.join(map(str, triple)) for triple in parse_0.triples()]
    triples_1 = [''.join(map(str, triple)) for triple in parse_1.triples()]
    print('\nJaccard Distance')
    print(jaccard_distance(set(triples_0), set(triples_1)))
    print('###########################')


Sent pair
['Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence. ', ' Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.\n']

Jaccard Distance
0.6296296296296297
###########################

Sent pair
["Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion. ", " Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998.\n"]

Jaccard Distance
0.8387096774193549
###########################

Sent pair
['They had published an advertisement on the Internet on June 10, offering the cargo for sale, he added. ', " On June 10, the ship's owners had published an advertisement on the Internet, offering the explosives for sale.\n"]

Jaccard Distance
0.5555555555555556
###########################


In [77]:
# IHLT framework: analysis of the first three sentence pairs.
# Flattened array: every triple contributes its governor, its relation and
# its dependent as three separate items, so the Jaccard distance is computed
# over those individual items rather than over whole triples.
from nltk.metrics import jaccard_distance

with open('IHLT-eval-framework/train/msr_paraphrase_train_input.txt', 'r') as f:
    # Each line holds the two sentences of a pair separated by a tab.
    sent_pairs = [next(f).split('\t') for _ in range(3)]

for pair in sent_pairs:
    print('\nSent pair')
    print(pair)
    sent_0, sent_1 = pair[0], pair[1]
    # `parser` is the CoreNLP dependency parser created in an earlier cell.
    (parse_0,) = parser.raw_parse(sent_0)
    (parse_1,) = parser.raw_parse(sent_1)

    # Flatten in order: governor, relation, dependent of each triple.
    triples_0 = [str(part) for triple in parse_0.triples() for part in triple]
    triples_1 = [str(part) for triple in parse_1.triples() for part in triple]

    print('\nJaccard Distance')
    print(jaccard_distance(set(triples_0), set(triples_1)))
    print('###########################')


Sent pair
['Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence. ', ' Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.\n']

Jaccard Distance
0.3333333333333333
###########################

Sent pair
["Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion. ", " Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998.\n"]

Jaccard Distance
0.5135135135135135
###########################

Sent pair
['They had published an advertisement on the Internet on June 10, offering the cargo for sale, he added. ', " On June 10, the ship's owners had published an advertisement on the Internet, offering the explosives for sale.\n"]

Jaccard Distance
0.3235294117647059
###########################
