In [8]:
import pprint

pp = pprint.PrettyPrinter(indent=4)

def parse(parser, grammar, sent):
    parser = parser(grammar)
    parse = parser.parse(sent)
    ts = []
    for t in parse:
        ts.append(t)
        print(t)
        t.pretty_print()
    print('number of trees:', len(ts))
    parse = parser.chart_parse(sent)
    print("TD num edges = ", parse.num_edges())
    pp.pprint(parse.edges())

In [9]:
# MANDATORY 1

import nltk
from nltk import CFG, ChartParser, BottomUpChartParser, BottomUpLeftCornerChartParser, LeftCornerChartParser

grammar = CFG.fromstring('''
S -> NP VP
NP -> NNS | JJ NNS | NP CC NP 
VP -> VB NNS
NNS -> "cats" | "dogs" | "mice" | NNS CC NNS | IN NNS
JJ -> "big" | "small" | "lazy"
CC -> "and" | "or"
VB -> "play"
IN -> "with"
''')

sent = ['lazy', 'cats', 'play', 'with', 'mice']
parsers = [ChartParser, BottomUpChartParser, BottomUpLeftCornerChartParser, LeftCornerChartParser]

for parser in parsers:
    print('Parser: ', parser)
    parse(parser, grammar, sent)
    print('')


Parser:  <class 'nltk.parse.chart.ChartParser'>
(S
  (NP (JJ lazy) (NNS cats))
  (VP (VB play) (NNS (IN with) (NNS mice))))
               S                
       ________|____             
      |             VP          
      |         ____|____        
      NP       |        NNS     
  ____|___     |     ____|___    
 JJ      NNS   VB   IN      NNS 
 |        |    |    |        |   
lazy     cats play with     mice

number of trees: 1
TD num edges =  32
[   [Edge: [0:1] 'lazy'],
    [Edge: [1:2] 'cats'],
    [Edge: [2:3] 'play'],
    [Edge: [3:4] 'with'],
    [Edge: [4:5] 'mice'],
    [Edge: [0:1] JJ -> 'lazy' *],
    [Edge: [0:1] NP -> JJ * NNS],
    [Edge: [1:2] NNS -> 'cats' *],
    [Edge: [1:2] NP -> NNS *],
    [Edge: [1:2] NNS -> NNS * CC NNS],
    [Edge: [0:2] NP -> JJ NNS *],
    [Edge: [0:2] S  -> NP * VP],
    [Edge: [0:2] NP -> NP * CC NP],
    [Edge: [1:2] S  -> NP * VP],
    [Edge: [1:2] NP -> NP * CC NP],
    [Edge: [2:3] VB -> 'play' *],
    [Edge: [2:3] VP -> VB *

The most efficient parser for this case was the LeftCornerChartParser as it was able to calculate the chart with least amount of edges.

The edges filtered out by the BottomUpLeftCornerChartParser are the ones without any word subsumption, for example [Edge: [0:0] NP -> * JJ NNS] or edge [Edge: [1:1] NNS -> * 'cats'], which are edges that are present in the BottomUpChartParser, but not in this parser, because they are not generalizing. 

Then, the edges filtered out by the LeftCornerChartParser are the subsumptions that are already present, so basically any edges with CCs are filtered out since they are present in other edges. For example: [Edge: [1:2] NNS -> NNS * CC NNS] is filtered out

In [10]:
import nltk
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/santiago/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [11]:
# MANDATORY 2
import nltk
from nltk.wsd import lesk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.metrics import jaccard_distance
from nltk.corpus import wordnet as wn 
from nltk.parse.corenlp import CoreNLPDependencyParser
from scipy.stats import pearsonr
  
def lemmatize(p):
    if p[1][0] in {'N', 'V'}:
        return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
    return p[0]

def penn2morphy(penntag, returnNone=False):
    morphy_tag = {'NN':wn.NOUN, 'JJ':wn.ADJ,
                  'VB':wn.VERB, 'RB':wn.ADV}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return None if returnNone else ''

def getHypernym(s):
    if not s:
        return [s]
    if s.hypernyms() == []:
        return [None]
    return s.hypernyms()

def parseWithCoreNLP(s):
    # res = []
    # tree = next(parser.raw_parse(s))
    # for t in tree.triples():
    #     res.append(tuple([x[0] for x in t if isinstance(x, tuple)]))
    # return res
    res = []
    tree = next(parser.raw_parse(s))
    for t in tree.triples():
        res.append(tuple([lesk(s, x[0], pos=penn2morphy(x[1][0])) for x in t if isinstance(x, tuple)]))
    return res

wnl = WordNetLemmatizer()
parser = CoreNLPDependencyParser(url='http://localhost:9000')

input_file = 'trial/STS.input.txt'
with open(input_file) as f:
    input_data = f.readlines()
    
document_distances = []
lesks_distances = []
lesks_hyper_distances = []
morphology_distances = []
corenlp_distances = []
for i in input_data:
    sentences = nltk.sent_tokenize(i[4:])
    words = [nltk.word_tokenize(sent) for sent in sentences]
    pairs = [pos_tag(w) for w in words]
    l_words = [[lemmatize(p) for p in pair] for pair in pairs]
    synsets = [[[lesk(w, p[0], pos=penn2morphy(p[1][0])) for p in pair] for pair in pairs] for w in words]
    hypernyms = [[getHypernym(s) for s in synsets[c][c]] for c in range(len(synsets))]
    hypernyms = [[hy[0] for hy in hyp] for hyp in hypernyms]
    corenlp = [parseWithCoreNLP(s) for s in sentences]
    lesks_distances.append(jaccard_distance(set(synsets[0][0]), set(synsets[1][1])))
    morphology_distances.append(jaccard_distance(set(l_words[0]),set(l_words[1])))
    document_distances.append(jaccard_distance(set(words[0]),set(words[1])))
    lesks_hyper_distances.append(jaccard_distance(set(hypernyms[0]),set(hypernyms[1])))
    corenlp_distances.append(jaccard_distance(set(corenlp[0]),set(corenlp[1])))
            
            
    
print("Word distances: " + str(document_distances))
print("Lemmatized distances: " + str(morphology_distances))
print("Lesk distances: " + str(lesks_distances))
print("Lesk variant with hypernyms distances: " + str(lesks_hyper_distances))
print("CoreNLP dependency parser distances: " + str(corenlp_distances))
print("")

gold_file = 'trial/STS.gs.txt'
with open(gold_file) as f:
    gold_data = f.readlines()
gold = [int(g[4:5]) for g in gold_data]

doc_pearson = pearsonr(document_distances, gold)[0]
mor_pearson = pearsonr(morphology_distances, gold)[0]
lesk_pearson = pearsonr(lesks_distances, gold)[0]
hypernyms_pearson = pearsonr(lesks_hyper_distances, gold)[0]
corenlp_pearson = pearsonr(corenlp_distances, gold)[0]

print('Words correlation: ' + str(doc_pearson))
print('Lemmatized words correlation: ' + str(mor_pearson))
print('Lesk correlation: ' + str(lesk_pearson))
print('Lesk variant with hypernyms correlation: ' + str(hypernyms_pearson))
print("CoreNLP dependency parser correlation: " + str(corenlp_pearson))

Word distances: [0.6923076923076923, 0.7368421052631579, 0.6666666666666666, 0.5454545454545454, 0.7692307692307693, 0.8620689655172413]
Lemmatized distances: [0.6923076923076923, 0.6666666666666666, 0.6666666666666666, 0.5454545454545454, 0.7692307692307693, 0.8620689655172413]
Lesk distances: [0.7, 0.7857142857142857, 0.5, 0.8888888888888888, 0.9, 0.92]
Lesk variant with hypernyms distances: [0.7, 0.8, 0.4444444444444444, 0.8333333333333334, 0.875, 0.9]
CoreNLP dependency parser distances: [1.0, 0.8823529411764706, 0.9411764705882353, 0.5714285714285714, 1.0, 0.9655172413793104]

Words correlation: 0.4143770872333895
Lemmatized words correlation: 0.517276212426234
Lesk correlation: 0.6056964784272112
Lesk variant with hypernyms correlation: 0.5101560894527944
CoreNLP dependency parser correlation: -0.06178309306886861


The result is awful compared to the previous strategies. We tried to calculate the synsets of the words and the result improved a little bit. That might be because it is more difficult to find similarities between the pairs since they are less common than the simple words. Using the same reasoning, we don't think that using NEs would help the result.

'id1\tThe bird is bathing in the sink.\tBirdie is washing itself in the water basin.\n'