In [1]:
# SpacyWordSplitter
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
# WordTokenizer: Wraps around SpacyWordSplitter
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer

# pos_tags=True makes the splitter attach part-of-speech tags; a later cell
# reads `pos_` on these tokens to locate the verbs.
splitter = SpacyWordSplitter(pos_tags=True)
tokenizer = WordTokenizer(word_splitter=splitter)

EXAMPLE_SENTENCE = "This sentence is short."

# Tokenize through both entry points and keep both results, so the claim that
# the wrapper and the bare splitter agree is actually checked instead of the
# first result being silently overwritten.
tokenizer_tokens = tokenizer.tokenize(EXAMPLE_SENTENCE)  # via the WordTokenizer wrapper
sent_tokens = splitter.split_words(EXAMPLE_SENTENCE)     # straight from the spaCy splitter
assert [tok.text for tok in tokenizer_tokens] == [tok.text for tok in sent_tokens], \
    "WordTokenizer and SpacyWordSplitter produced different tokens"
print(sent_tokens)

[This, sentence, is, short, .]


In [2]:
import re

def parse_results(tagged_sentence):
    """ Given raw tagged sentence, outputs all relations in whole sentences

    Every entry of ``tagged_sentence["verbs"]`` is expected to carry a
    ``"description"`` string in which labelled spans are written as
    ``[LABEL: text]``. For each entry the spans are split into the relation
    and the arguments before/after it (see ``retrieve_tuples``), and one
    tuple is emitted for every (pre-argument, post-argument) pair.

    Args:
        tagged_sentence (json): Tagged from raw sentence; must contain a
            "verbs" list whose items have a "description" string
    Returns:
        rel_tuples: Tuples of form <arg1, rel, arg2>. An entry that lacks
            arguments on either side of the relation contributes nothing.
    """
    def span_text(phrase):
        # Split on the FIRST ": " only, so text that itself contains ": "
        # (e.g. "ARG1: written as a: b") is kept whole instead of truncated.
        return phrase.split(": ", 1)[1]

    rel_tuples = []
    for entry in tagged_sentence["verbs"]:
        # Brackets that are part of the sentence itself (e.g. "[ 1 ]") also
        # match the pattern but have no "LABEL: " prefix; drop them so they
        # cannot raise IndexError when the label is stripped below.
        phrases = [phrase
                   for phrase in re.findall(r"\[(.*?)\]", entry["description"])
                   if ": " in phrase]
        rel, pre_args, post_args = retrieve_tuples(phrases)
        relation = " ".join(span_text(word) for word in rel)
        rel_tuples.extend((span_text(pre), relation, span_text(post))
                          for pre in pre_args for post in post_args)
    return rel_tuples


def retrieve_tuples(phrases):
    """ Given BIO tagged phrases in a sentence, split into args and relationship

    Each phrase is expected to look like ``"LABEL: text"``. A phrase is the
    relation when its LABEL is ``V`` (or ends in ``-V``). Only the label is
    inspected: a substring test for ``"V:"`` on the whole phrase would also
    match adverbial arguments such as ``"ARGM-ADV: however"`` and any
    argument whose text happens to contain ``"V:"``.

    Args:
        phrases (list): BIO tagged phrases
    Returns:
        rel, pre_args, post_args (list): separated phrases. Phrases seen
            before the first relation phrase go to pre_args, the remaining
            non-relation phrases go to post_args.
    """
    rel, pre_args, post_args = [], [], []
    rel_found = False
    for phrase in phrases:
        # Everything before the first colon is the label; without a colon the
        # phrase has no label and is never treated as the relation.
        label, colon, _ = phrase.partition(":")
        label = label.strip()
        is_relation = bool(colon) and (label == "V" or label.endswith("-V"))
        if is_relation:
            rel.append(phrase)
            rel_found = True
        elif not rel_found:
            pre_args.append(phrase)
        else:
            post_args.append(phrase)
    return rel, pre_args, post_args


In [3]:
# Positions of the tokens whose part-of-speech tag is VERB
pred_ids = [idx for idx, tok in enumerate(sent_tokens) if tok.pos_ == "VERB"]

# One request per verb: the full token list plus the index of that verb
json_list = [dict(sentence=sent_tokens, predicate_index=idx) for idx in pred_ids]
print(json_list)

[{'sentence': [This, sentence, is, short, .], 'predicate_index': 2}]


In [4]:
from allennlp.data.dataset_readers.semantic_role_labeling import SrlReader

# Default-configured SRL dataset reader; json_to_instance below calls its
# text_to_instance(tokens, verb_labels) to build one instance per predicate.
dataset_reader = SrlReader()

def json_to_instance(json_dict):
    """Convert one sentence/predicate request into a dataset-reader instance.

    Args:
        json_dict (dict): Holds "sentence" (the token list) and
            "predicate_index" (position of the verb; anything ``int()`` accepts)
    Returns:
        Whatever ``dataset_reader.text_to_instance`` returns for the tokens
        and their one-hot verb indicator.
    """
    sentence_tokens = json_dict["sentence"]
    verb_position = int(json_dict["predicate_index"])
    # One-hot indicator over the tokens: 1 at the predicate, 0 elsewhere
    indicator = [0] * len(sentence_tokens)
    indicator[verb_position] = 1
    return dataset_reader.text_to_instance(sentence_tokens, indicator)

# One instance per (sentence, predicate) request built in the previous cell
instances = list(map(json_to_instance, json_list))

# Show every field stored on the first instance: its name, then its contents
first_instance = instances[0]
for field_name in first_instance.fields:
    print(field_name)
    field = first_instance[field_name]
    print(field)


tokens
TextField of length 5 with text: 
 		[This, sentence, is, short, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
verb_indicator
SequenceLabelField of length 5 with labels:
 		[0, 0, 1, 0, 0]
 		in namespace: 'labels'.
metadata
MetadataField (print field.metadata to see specific information).


In [24]:
import spacy
# from spacy import displacy
nlp = spacy.load("en_core_web_sm")

# This is temporary to facilitate exploration
from allennlp.models.archival import Archive, load_archive
from allennlp.predictors.predictor import Predictor

# Each pretrained archive is loaded and then wrapped in its predictor.

# Open information extraction (predictor type given explicitly)
archived_oie = load_archive(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/openie-model.2018-08-20.tar.gz"
)
oie = Predictor.from_archive(archived_oie, "open-information-extraction")

# Semantic role labeling
archived_srl = load_archive(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
)
srl = Predictor.from_archive(archived_srl)

# Named entity recognition
archived_ner = load_archive(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz"
)
ner = Predictor.from_archive(archived_ner)

# Dependency parsing
archived_dp = load_archive(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz"
)
dp = Predictor.from_archive(archived_dp)



In [6]:
# archived_custom = load_archive("./out_scratch/")
# custom = Predictor.from_archive(archived_custom, "open-information-extraction")

In [7]:
# Models wrapped by the predictors (type notes below are the author's observations)
oie_model, srl_model = oie._model, srl._model  # oie_model: Semantic_Role_Labeler

# Walk down the OIE model's encoder stack
encoder = oie_model.encoder    # Seq2SeqWrapper
module = encoder._module       # StackedAlternatingLstm
layers = module.lstm_layers    # List of AugmentedLstms
# Handy while exploring:
# print(len(layers))
# print(layers[0])
# print(module.input_size)
# print(module.hidden_size)

# Raw model output for the first instance built earlier
forward_out = oie_model.forward_on_instance(instances[0])
# Author's note: logits and class probabilities have shape (5, 62), mask (5, )
keys = forward_out.keys()

In [38]:
def print_results(predicted):
    """Print the tag sequence of every verb, then the extracted relation tuples.

    Args:
        predicted (dict): Prediction holding a "verbs" list whose items each
            have a "tags" entry; it is also passed whole to ``parse_results``.
    """
    for frame in predicted["verbs"]:
        print(frame["tags"])
    relations = parse_results(predicted)
    print(relations)

# Example sentences to run through the predictors (picked by index in a later cell)
sentences = [
    "Courtalds' spinoff reflects pressure on British industry to boost share prices beyond the reach of corporate raiders.",
    "The stock began trading at $14 apiece.",
    "Mercury filling, particularly prevalent in the USA, was banned in the EU, partly because it causes antibiotic resistance",
    "In 1971 , the FDA banned the use of Amphetamines after studies linked it to cancer and other problems in daughters of women who took the drug.",
    "Instead there was a funeral, at st. francis de sales roman catholic church, in Belle_harbor, Queens, the parish of his birth.",
    "FBI examined the relationship between Bin Laden and the Taliban",
    "Deva has moved to Mumbai and is residing at Boney Kapoor's old place called Green Acres.",
    "Did Uriah honestly think he could beat The Legend of Zelda in under three hours.",
    "Mr. Agnew was vice president of the U.S. from 1969 until he resigned in 1973.",
]

In [43]:
sentence = sentences[8]
print(sentence)

# --- Open information extraction: per-verb tags, then <arg1, rel, arg2> tuples
print("================= OIE ===================")
predicted = oie.predict_json({"sentence": sentence})
print_results(predicted)

# --- Named entities: AllenNLP tagger output first, spaCy entities for comparison
print("================= NER ===================")
predicted = ner.predict_json({"sentence": sentence})
print(predicted["tags"])
# print_results(predicted)
doc = nlp(sentence)
print("+++++ SPACY +++++")
for ent in doc.ents:
    print(ent, ent.label_)

# --- Dependency parse: root word, its attributes, then each direct child node
print("================= DP ===================")
predicted = dp.predict_json({"sentence": sentence})
tree = predicted["hierplane_tree"]
root = tree["root"]
print(root["word"])
print(root["attributes"])
for child in root["children"]:
    print(child)


# Disabled: custom model (requires the `custom` predictor, currently commented out above)
# print("================= CUSTOM ================")
# predicted = custom.predict_json({"sentence": sentence})
# print_results(predicted)

# Disabled: plain SRL model
# print("================= SRL ===================")
# predicted = srl.predict_json({"sentence": sentence})
# for verb in predicted["verbs"]:
#     print(verb["tags"])
# print(parse_results(predicted))

Mr. Agnew was vice president of the U.S. from 1969 until he resigned in 1973.
['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'B-ARG2', 'I-ARG2', 'B-ARG3', 'I-ARG3', 'I-ARG3', 'I-ARG3', 'I-ARG3', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'O']
[('Mr. Agnew', 'was', 'vice president of the U.S.'), ('Mr. Agnew', 'was', 'from 1969'), ('Mr. Agnew', 'was', 'until he resigned in 1973'), ('he', 'resigned', 'in 1973')]
['O', 'U-PER', 'O', 'O', 'O', 'O', 'O', 'U-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+++++ SPACY +++++
Agnew PERSON
U.S. GPE
1969 DATE
1973 DATE
president
['NN']
{'word': 'Agnew', 'nodeType': 'nsubj', 'attributes': ['NNP'], 'link': 'nsubj', 'spans': [{'start': 4, 'end': 10}], 'children': [{'word': 'Mr.', 'nodeType': 'nn', 'attributes': ['NNP'], 'link': 'nn', 'spans': [{'start': 0, 'end': 4}]}]}
{'word': 'was', 'nodeType': 'cop', 'attributes': ['VBD'], 'link': 'cop', 'spans': [{'start': 10, 'e

False