In [2]:
import pprint

pp = pprint.PrettyPrinter(indent=4)

def parse(parser, grammar, sent):
    parser = parser(grammar)
    parse = parser.parse(sent)
    ts = []
    for t in parse:
        ts.append(t)
        print(t)
        t.pretty_print()
    print('number of trees:', len(ts))
    parse = parser.chart_parse(['small', 'cats', 'and', 'mice'])
    print("TD num edges = ", parse.num_edges())
    pp.pprint(parse.edges())

In [3]:
import nltk
from nltk import CFG, ChartParser, BottomUpChartParser, BottomUpLeftCornerChartParser, LeftCornerChartParser

grammar = CFG.fromstring('''
S -> NP VP
NP -> NNS | JJ NNS | NP CC NP 
VP -> VB NNS
NNS -> "cats" | "dogs" | "mice" | NNS CC NNS | IN NNS
JJ -> "big" | "small" | "lazy"
CC -> "and" | "or"
VB -> "play"
IN -> "with"
''')

sent = ['lazy', 'cats', 'play', 'with', 'mice']
parsers = [ChartParser, BottomUpChartParser, BottomUpLeftCornerChartParser, LeftCornerChartParser]

for parser in parsers:
    print('Parser: ', parser)
    parse(parser, grammar, sent)
    print('')


Parser:  <class 'nltk.parse.chart.ChartParser'>
(S
  (NP (JJ lazy) (NNS cats))
  (VP (VB play) (NNS (IN with) (NNS mice))))
               S                
       ________|____             
      |             VP          
      |         ____|____        
      NP       |        NNS     
  ____|___     |     ____|___    
 JJ      NNS   VB   IN      NNS 
 |        |    |    |        |   
lazy     cats play with     mice

number of trees: 1
TD num edges =  33
[   [Edge: [0:1] 'small'],
    [Edge: [1:2] 'cats'],
    [Edge: [2:3] 'and'],
    [Edge: [3:4] 'mice'],
    [Edge: [0:1] JJ -> 'small' *],
    [Edge: [0:1] NP -> JJ * NNS],
    [Edge: [1:2] NNS -> 'cats' *],
    [Edge: [1:2] NP -> NNS *],
    [Edge: [1:2] NNS -> NNS * CC NNS],
    [Edge: [0:2] NP -> JJ NNS *],
    [Edge: [0:2] S  -> NP * VP],
    [Edge: [0:2] NP -> NP * CC NP],
    [Edge: [1:2] S  -> NP * VP],
    [Edge: [1:2] NP -> NP * CC NP],
    [Edge: [2:3] CC -> 'and' *],
    [Edge: [1:3] NNS -> NNS CC * NNS],
    [Edge: [0:

In [70]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.wsd import lesk
from nltk.stem import WordNetLemmatizer
from nltk.metrics import jaccard_distance
from nltk.corpus import wordnet as wn

def lemmatize(p):
    if p[1][0] in {'N', 'V'}:
        return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
    return p[0]

def penn2morphy(penntag, returnNone=False):
    morphy_tag = {'NN':wn.NOUN, 'JJ':wn.ADJ,
                  'VB':wn.VERB, 'RB':wn.ADV}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return None if returnNone else ''
    
def words_and_ne(nerc):
    if(isinstance(nerc,nltk.Tree)):
        return ' '.join([r[0] for r in nerc.leaves()])
    return nerc[0]
        
def getvalue(triple):
    value = triple[0][0], triple[1], triple[2][0]
    return value
    
wnl = WordNetLemmatizer()

input_file = 'trial/STS.input.txt'
with open(input_file) as f:
    input_data = f.readlines()
    
document_distances = []
lesks_distances = []
morphology_distances = []
wordsne_distances = []
corenlp_distances = []
for i in input_data:
    sentences = nltk.sent_tokenize(i[4:])
    words = [nltk.word_tokenize(sent) for sent in sentences]
    pairs = [pos_tag(w) for w in words]
    l_words = [[lemmatize(p) for p in pair] for pair in pairs]
    res = [ne_chunk(p, binary=True) for p in pairs]
    wordsne = [[words_and_ne(r) for r in ress] for ress in res]
    synsets = [[[lesk(w, p[0], pos=penn2morphy(p[1][0])) for p in pair] for pair in pairs] for w in words]
    
    #CoreNLP
    parsed = [parser.raw_parse(sent) for sent in sentences]
    triples = [[t for t in next(par).triples()] for par in parsed]
    corenlp = [[getvalue(tri) for tri in triple ]for triple in triples]
    
    lesks_distances.append(jaccard_distance(set(synsets[0][0]), set(synsets[1][1])))
    morphology_distances.append(jaccard_distance(set(l_words[0]),set(l_words[1])))
    document_distances.append(jaccard_distance(set(words[0]),set(words[1])))
    wordsne_distances.append(jaccard_distance(set(wordsne[0]), set(wordsne[1])))
    corenlp_distances.append(jaccard_distance(set(corenlp[0]), set(corenlp[1])))
    
    
    
print("Lab2 distances: " + str(document_distances))
print("Lab3 distances: " + str(morphology_distances))
print("Lesk distance: " + str(lesks_distances))
print('Words and NE distance:' + str(wordsne_distances))
print('CoreNLP parser distance:' + str(corenlp_distances))

Lab2 distances: [0.6923076923076923, 0.7368421052631579, 0.6666666666666666, 0.5454545454545454, 0.7692307692307693, 0.8620689655172413]
Lab3 distances: [0.6923076923076923, 0.6666666666666666, 0.6666666666666666, 0.5454545454545454, 0.7692307692307693, 0.8620689655172413]
Lesk distance: [0.7, 0.7857142857142857, 0.5, 0.8888888888888888, 0.9, 0.92]
Words and NE distance:[0.6923076923076923, 0.7368421052631579, 0.6666666666666666, 0.5454545454545454, 0.7692307692307693, 0.8620689655172413]
CoreNLP parser distance:[1.0, 1.0, 0.9444444444444444, 0.6, 1.0, 0.9666666666666667]


In [71]:
from scipy.stats import pearsonr

gold_file = 'trial/STS.gs.txt'
with open(gold_file) as f:
    gold_data = f.readlines()
gold = [int(g[4:5]) for g in gold_data]

doc_pearson = pearsonr(document_distances, gold)[0]
mor_pearson = pearsonr(morphology_distances, gold)[0]
lesk_pearson = pearsonr(lesks_distances, gold)[0]
wordsne_pearson = pearsonr(wordsne_distances, gold)[0]
corenlp_pearson = pearsonr(corenlp_distances, gold)[0]

print('Lab2 pearson correlation: ' + str(doc_pearson))
print('Lab3 pearson correlation: ' + str(mor_pearson))
print('Lesk pearson correlation: ' + str(lesk_pearson))
print('Words and NE pearson correlation: ' + str(wordsne_pearson))
print('CoreNLP parser pearson correlation: ' + str(corenlp_pearson))

Lab2 pearson correlation: 0.4143770872333895
Lab3 pearson correlation: 0.517276212426234
Lesk pearson correlation: 0.6056964784272112
Words and NE pearson correlation: 0.4143770872333895
CoreNLP parser pearson correlation: -0.17322964246636668


In [7]:
from nltk.parse.corenlp import CoreNLPDependencyParser

parser = CoreNLPDependencyParser(url='http://localhost:9000')
parse = parser.raw_parse('Smith jumps over the lazy dog')

tree = next(parse)
for t in tree.triples():
    print(t)

(('jumps', 'VBZ'), 'nsubj', ('Smith', 'NNP'))
(('jumps', 'VBZ'), 'nmod', ('dog', 'NN'))
(('dog', 'NN'), 'case', ('over', 'IN'))
(('dog', 'NN'), 'det', ('the', 'DT'))
(('dog', 'NN'), 'amod', ('lazy', 'JJ'))


In [48]:

input_file = 'trial/STS.input.txt'
with open(input_file) as f:
    input_data = f.readlines()
    
for i in input_data:
    sentences = nltk.sent_tokenize(i[4:])
    parsed = [parser.raw_parse(sent) for sent in sentences]
    triples = [[t for t in next(par).triples()] for par in parsed]
    

print(triples)

[[(('went', 'VBD'), 'nsubj', ('John', 'NNP')), (('went', 'VBD'), 'dobj', ('horse', 'NN')), (('went', 'VBD'), 'xcomp', ('riding', 'VBG')), (('riding', 'VBG'), 'advmod', ('back', 'RB')), (('riding', 'VBG'), 'nmod', ('dawn', 'NN')), (('dawn', 'NN'), 'case', ('at', 'IN')), (('riding', 'VBG'), 'nmod', ('group', 'NN')), (('group', 'NN'), 'case', ('with', 'IN')), (('group', 'NN'), 'det', ('a', 'DT')), (('group', 'NN'), 'amod', ('whole', 'JJ')), (('group', 'NN'), 'nmod', ('friends', 'NNS')), (('friends', 'NNS'), 'case', ('of', 'IN')), (('went', 'VBD'), 'punct', ('.', '.'))], [(('view', 'NN'), 'nsubj', ('Sunrise', 'NNP')), (('Sunrise', 'NNP'), 'nmod', ('dawn', 'NN')), (('dawn', 'NN'), 'case', ('at', 'IN')), (('view', 'NN'), 'cop', ('is', 'VBZ')), (('view', 'NN'), 'det', ('a', 'DT')), (('view', 'NN'), 'amod', ('magnificent', 'JJ')), (('view', 'NN'), 'acl', ('take', 'VB')), (('take', 'VB'), 'mark', ('to', 'TO')), (('take', 'VB'), 'compound:prt', ('in', 'RP')), (('take', 'VB'), 'advcl', ('wake', '

In [72]:
def getvalue(triple):
    value = triple[0][0], triple[1], triple[2][0]
    return value

cons = [[getvalue(tri) for tri in triple ]for triple in triples]

cons
#jaccard_distance(set(cons[0]), set(cons[1]))

[[('went', 'nsubj', 'John'),
  ('went', 'dobj', 'horse'),
  ('went', 'xcomp', 'riding'),
  ('riding', 'advmod', 'back'),
  ('riding', 'nmod', 'dawn'),
  ('dawn', 'case', 'at'),
  ('riding', 'nmod', 'group'),
  ('group', 'case', 'with'),
  ('group', 'det', 'a'),
  ('group', 'amod', 'whole'),
  ('group', 'nmod', 'friends'),
  ('friends', 'case', 'of'),
  ('went', 'punct', '.')],
 [('view', 'nsubj', 'Sunrise'),
  ('Sunrise', 'nmod', 'dawn'),
  ('dawn', 'case', 'at'),
  ('view', 'cop', 'is'),
  ('view', 'det', 'a'),
  ('view', 'amod', 'magnificent'),
  ('view', 'acl', 'take'),
  ('take', 'mark', 'to'),
  ('take', 'compound:prt', 'in'),
  ('take', 'advcl', 'wake'),
  ('wake', 'mark', 'if'),
  ('wake', 'nsubj', 'you'),
  ('wake', 'advmod', 'up'),
  ('wake', 'advmod', 'enough'),
  ('enough', 'advmod', 'early'),
  ('enough', 'nmod', 'it'),
  ('it', 'case', 'for'),
  ('view', 'punct', '.')]]