In [2]:
from nltk.parse.stanford import StanfordDependencyParser

import argparse
import sys
import re

In [4]:
path_to_jar = '../stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar'
path_to_models_jar = '../stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0-models.jar'
dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
r = re.compile(r'\d{3,100}[ -]+\d{3,100}[ -]+\d{3,100}')

In [5]:
def dependency_parse(sentence):
    try:
        result = dependency_parser.raw_parse(sentence)
        dep = result.__next__()
        return list(dep.triples())
    except:
        print("EXCEPTION: while parsing sentence:")
        sys.exit(0)

In [6]:
dependency_parse("ARFTS specially binds to a distinct domain in XIAP-BIR3")

[(('ARFTS', 'NNS'), 'amod', ('binds', 'JJ')),
 (('binds', 'JJ'), 'advmod', ('specially', 'RB')),
 (('ARFTS', 'NNS'), 'nmod', ('domain', 'NN')),
 (('domain', 'NN'), 'case', ('to', 'TO')),
 (('domain', 'NN'), 'det', ('a', 'DT')),
 (('domain', 'NN'), 'amod', ('distinct', 'JJ')),
 (('domain', 'NN'), 'nmod', ('XIAP-BIR3', 'NN')),
 (('XIAP-BIR3', 'NN'), 'case', ('in', 'IN'))]

The above output is wrong. You need to fix it! Compare the output from the Stanford CoreNLP demo

__Caution:__ Do not use the parser from NLTK!

In [25]:
# adopted from https://github.com/smilli/py-corenlp
import json
import os
from pprint import pprint
from pycorenlp import StanfordCoreNLP

In [11]:
nlp = StanfordCoreNLP('http://localhost:9000')
properties={'annotators': 'depparse', 'outputFormat': 'json'}

def parse(fname):
    save_output = ''
    with open(fname, 'r') as fhandle, open(fname + '.deps.json', 'w') as whandle:
        for line in fhandle:
            output = nlp.annotate(line.strip(), properties)
            whandle.write(json.dumps(output['sentences'][0]) + "\n")

Let us test the dependencies on a sample file:

In [12]:
%cat ../sampleFile.txt

Royal likes Mangoes
Royal wants to go back to Mangalore
Royal wants to spend the rest of his life reading books and learning new languages.
at times , the suspense is palpable , but by the end there 's a sense that the crux of the mystery hinges on a technicality that strains credulity and leaves the viewer haunted by the waste of potential .

In [13]:
parse('../sampleFile.txt')

### Dependency Details
The above way of parsing gives three levels of dependency parsing:
- basicDependencies,
- enhancedDepencies, and
- enhancedPlusDepencies

For now, I am saving all the information that I get from dependency parsing. I will later exploe what each of these levels mean.

In [277]:
import numpy as np

basic_dep_tags = ['csubj', 'aux', 'acl:relcl', 'mark', 'expl', 'amod', 'acl', 'parataxis', 'compound',
            'advmod', 'nmod:poss', 'cc:preconj', 'det', 'case', 'ROOT', 'punct', 'nmod:npmod', 
            'nsubjpass', 'det:predet', 'advcl', 'root', 'dep', 'mwe', 'xcomp', 'nmod', 'cop', 
            'cc', 'nsubj', 'csubjpass', 'appos', 'conj', 'nummod', 'discourse', 'auxpass', 'ccomp',
            'nmod:tmod', 'iobj', 'compound:prt', 'dobj', 'neg', 'NO_DEP']

pos_tags = ['RBS', "''", 'VB', '#', '.', 'WP$', 'SYM', 'LS', 'WDT', 'NNP', 'TO', 'CD', 'NNPS', 
            'NN', 'MD', 'RBR', 'JJS', 'VBN', 'VBP', '``', 'WRB', 'JJR', 'VBD', 'FW', 'RB', 'NNS',
            'POS', ',', 'PDT', 'UH', 'VBG', '$', 'PRP$', 'VBZ', 'PRP', ':', 'WP', 'IN', 'CC', 'DT',
            'JJ', 'RP', 'EX', 'NO_POS']

def get_dep_pos(string):
    this_pos = []
    json_dict = json.loads(string)

    for token in json_dict['tokens']:
        this_pos.append(pos_tags.index(token['pos']))

    len_dep = len(this_pos)
    this_dep = [None] * len_dep
    headwords = [None] * len_dep
    this_head_index = [None] * len_dep

    for dep in json_dict['basicDependencies']:
        index, dependency = dep['dependent'], dep['dep']
        this_dep[index - 1] = dep_tags.index(dependency)
        headwords[index - 1] = dep['governorGloss']
        this_head_index[index - 1] = dep['governor']

    this_head_dep = []
    this_head_pos = []

    for head_index in this_head_index:
        if head_index - 1 < 0:
            this_head_dep.append(basic_dep_tags.index('NO_DEP'))
            this_head_pos.append(pos_tags.index('NO_POS'))
        else:
            this_head_dep.append(this_dep[head_index - 1])
            this_head_pos.append(this_pos[head_index - 1])

    tags = this_dep + this_pos
    headtags = this_head_dep + this_head_pos
    return headwords, headtags, tags

def one_hot(dep_tag, pos_tag):
    basic_dependency_size = 40
    pos_size = 43
    dep_narray = np.array(dep_tag)
    pos_narray = np.array(pos_tag)
    
    dep_one_hot = np.zeros((dep_narray.size, basic_dependency_size + 1))
    pos_one_hot = np.zeros((pos_narray.size, pos_size + 1))
    
    dep_one_hot[np.arange(basic_dependency_size), dep_narray] = 1
    pos_one_hot[np.arange(len(pos_tag)), pos_narray] = 1

    for x, y in zip(dep_one_hot, pos_one_hot):
        print(x.tolist() + y.tolist())
    



#     dep_one_hot[np.arange(basic_dependency_size), dep_narray] = 1
#     pos_one_hot[np.arange(pos_size), pos_narray] = 1

#     concatenated_one_hot = []
#     for x, y in zip(dep_one_hot, pos_one_hot):
#         concatenated_one_hot.append(x.tolist() + y.tolist())

#     return concatenated_one_hot
              
def unfold_tags(tags):
    mid = len(tags) // 2 
    dep_tag = tags[:mid]
    pos_tag = tags[mid:]
    return(one_hot(dep_tag, pos_tag))
    
with open('../sampleFile.txt.deps.json') as json_file:
    for line in json_file:
        headword, headtag, wordtag = get_dep_pos(line)
        unfold_tags(headtag)
        break
            

            

IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (40,) (3,) 

In [255]:
import glob

all_deps_list = set()
all_pos_list = set()

for file in glob.glob('../data/*.json'):
#     print(file)
    with open(file) as json_file:
        for line in json_file:
            json_dict = json.loads(line)

            for tok in json_dict['basicDependencies']:
                if tok['dep'] not in all_deps_list:
                    all_deps_list.add(tok['dep'])

            for tok in json_dict['tokens']:
                if tok['pos'] not in all_deps_list:
                    all_pos_list.add(tok['pos'])

print(len(all_deps_list))
print(len(all_pos_list))

40
43


In [192]:
%cat sampleFile.txt.deps.json

cat: sampleFile.txt.deps.json: No such file or directory


### To Do:
- readme instruction to run the server first
- nsubj:xcomp, experiments retaining it and separating it
