In [1]:
import argparse
import json
import os
import re
import sys

from allennlp.predictors.predictor import Predictor
from lxml import etree
from nltk.tokenize import TreebankWordTokenizer
from tqdm import tqdm

In [2]:
# Locations of the raw sentence file and of the pretrained parser archive.
model_path = "biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
file_path = "data/semeval14/Laptop_Train_v2_text.txt"

# Each line of the file is treated as one sentence and is kept verbatim,
# trailing newline included.
with open(file_path, 'r') as f:
    sentences = list(f)

sentence_count = len(sentences)
print("Number of sentences: {}".format(sentence_count))

Number of sentences: 1488


In [3]:
# Collected parser outputs, one dict per parsed sentence.
docs = []
predictor = Predictor.from_path(model_path)

print('Predicting dependency information...')
# Only the first five sentences are parsed here.
for sentence in sentences[:5]:
    parse_predict = predictor.predict(sentence=sentence)
    # Keep the raw text alongside the prediction so later cells can read it.
    parse_predict["sentence"] = sentence
    docs.append(parse_predict)

#print("Output of Biaffine parsing: {}".format(docs[:2]))

error loading _jsonnet (this is expected on Windows), treating C:\Users\harsh\AppData\Local\Temp\tmpxvol387_\config.json as plain json
Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


Predicting dependency information...


## Annotation format: keys of each sentence record

- sentence
- tokens
- tags
- predicted_dependencies
- predicted_heads
- dependencies

In [4]:
def dependencies2format(doc):
    """Repackage one parser prediction into a sentence record.

    Args:
        doc: mapping with the keys "sentence", "words", "pos",
            "predicted_dependencies" and "predicted_heads".

    Returns:
        A new dict with the keys "sentence", "tokens", "tags",
        "predicted_dependencies", "predicted_heads" and "dependencies".
        The first five values are the very objects found in ``doc`` (no
        copies). "dependencies" is a fresh list holding one
        ``[label, head, position]`` triple per dependency label, where
        ``position`` counts tokens starting from 1.
    """
    record = {
        "sentence": doc["sentence"],
        "tokens": doc["words"],
        "tags": doc["pos"],
        "predicted_dependencies": doc["predicted_dependencies"],
        "predicted_heads": doc["predicted_heads"],
    }

    labels = record["predicted_dependencies"]
    heads = record["predicted_heads"]
    # Heads are looked up by index so a too-short head list still raises.
    record["dependencies"] = [
        [label, heads[position], position + 1]
        for position, label in enumerate(labels)
    ]
    return record

In [5]:
def deparse(sentences, limit=5):
    """Attach noun aspects and their offset pairs to formatted sentences.

    Args:
        sentences: sequence of dicts carrying the keys "sentence", "tokens",
            "tags", "predicted_dependencies", "predicted_heads" and
            "dependencies".
        limit: maximum number of leading sentences to process. Defaults to
            5; pass ``None`` to process every sentence.

    Returns:
        A list with one new dict per processed sentence. Each dict repeats
        the six input keys (sharing the same value objects) and adds:

        * "aspect_sentiment": the tokens tagged "NOUN" or "PROPN";
        * "from_to": one ``(from_index, to_index)`` tuple per such token.

    ``from_index`` is the token's 0-based position. ``to_index`` is the
    token's own predicted head, unless the token shares its head with the
    previous token while the next token has a different head, in which case
    the next token's head is used. When there is no previous or no next
    token to compare against, the token's own head is used.
    """
    noun_tags = ("NOUN", "PROPN")
    selected = sentences if limit is None else sentences[:limit]

    biaffine_deparse = []
    for parsed in selected:
        example = {
            'sentence': parsed['sentence'],
            'tokens': parsed['tokens'],
            'tags': parsed['tags'],
            'predicted_dependencies': parsed['predicted_dependencies'],
            'predicted_heads': parsed['predicted_heads'],
            'dependencies': parsed['dependencies'],
        }
        heads = example['predicted_heads']

        aspects = []
        offsets = []  # left and right offset of the target word
        for from_index, tag in enumerate(example['tags']):
            if tag not in noun_tags:
                continue
            aspects.append(example['tokens'][from_index])

            # Default to the token's own head. This also covers a noun at
            # the first or last position, where a neighbour is missing.
            # NOTE(review): from_index is 0-based, while head values look
            # 1-based (0 seems to mark the root) - confirm the intended
            # offset convention.
            to_index = heads[from_index]
            has_previous = from_index > 0
            has_next = from_index + 1 < len(heads)
            if has_previous and has_next:
                previous_head = heads[from_index - 1]
                next_head = heads[from_index + 1]
                if heads[from_index] == previous_head and next_head != previous_head:
                    to_index = next_head

            offsets.append((from_index, to_index))

        example['aspect_sentiment'] = aspects
        example['from_to'] = offsets
        biaffine_deparse.append(example)

    return biaffine_deparse

### `docs` holds the biaffine parser's prediction for each sentence

In [6]:
# First reshape every parser prediction into a sentence record, then add the
# noun aspects and their offset pairs.
doc_sentences = list(map(dependencies2format, docs))
deparsed_sentences = deparse(doc_sentences)

In [7]:
# Show the records returned by deparse as this cell's output.
deparsed_sentences

[{'sentence': 'I charge it at night and skip taking the cord with me because of the good battery life.\n',
  'tokens': ['I',
   'charge',
   'it',
   'at',
   'night',
   'and',
   'skip',
   'taking',
   'the',
   'cord',
   'with',
   'me',
   'because',
   'of',
   'the',
   'good',
   'battery',
   'life',
   '.'],
  'tags': ['PRON',
   'VERB',
   'PRON',
   'ADP',
   'NOUN',
   'CCONJ',
   'VERB',
   'VERB',
   'DET',
   'NOUN',
   'ADP',
   'PRON',
   'SCONJ',
   'ADP',
   'DET',
   'ADJ',
   'NOUN',
   'NOUN',
   'PUNCT'],
  'predicted_dependencies': ['nsubj',
   'root',
   'dep',
   'advmod',
   'dep',
   'nsubj',
   'dep',
   'dep',
   'det',
   'dobj',
   'prep',
   'dep',
   'advmod',
   'prep',
   'dep',
   'amod',
   'nn',
   'pobj',
   'punct'],
  'predicted_heads': [2,
   0,
   2,
   2,
   4,
   7,
   2,
   7,
   10,
   8,
   8,
   11,
   14,
   12,
   17,
   17,
   18,
   14,
   2],
  'dependencies': [['nsubj', 2, 1],
   ['root', 0, 2],
   ['dep', 2, 3],
   ['advmod', 2