In [2]:
# This is temporary to facilitate exploration
from allennlp.models.archival import Archive, load_archive
from allennlp.predictors.predictor import Predictor

archived_oie = load_archive("https://s3-us-west-2.amazonaws.com/allennlp/models/openie-model.2018-08-20.tar.gz")
predictor = Predictor.from_archive(archived_oie, "open-information-extraction")



In [5]:
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load("en_core_web_sm")

from typing import Dict

def word_tokenize(sentence: str):
    return nlp(sentence)
    
def json_to_instance(json_dict: Dict) -> Instance:
    """
    Expects JSON that looks like ``{"sentence": "...", "predicate_index": "..."}``.
    Assumes sentence is tokenized, and that predicate_index points to a specific
    predicate (word index) within the sentence, for which to produce Open IE extractions.
    """
    tokens = json_dict["sentence"]
    predicate_index = int(json_dict["predicate_index"])
    verb_labels = [0 for _ in tokens]
    verb_labels[predicate_index] = 1
    return text_to_instance(tokens, verb_labels)

def text_to_instance(tokens, verb_labels: List[int], tags=None):
    fields: Dict[str, Field] = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields['tokens'] = text_field
    fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)

    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text
    metadata_dict = {"words": [x.text for x in tokens], "verb": verb}

NameError: name 'Instance' is not defined

In [6]:
import torch
import torch.nn.functional as F
from allennlp.data.dataset import Batch
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits


tokenizer = predictor._tokenizer # WordTokenizer
splitter = tokenizer._word_splitter # SpacyWordSplitter
word_filter = tokenizer._word_filter # PassThroughWordFilter
stemmer = tokenizer._word_stemmer # PassThroughWordStemmer

sentence = "Harry is married to Mary." # Tokenizes to Token
tokens = word_tokenize(sentence)

pred_ids = [i for (i, t) in enumerate(tokens) if t.pos_ == "VERB"] # List of indexes of predicates
sent_pred_pairs = [{"sentence": tokens, "predicate_index": pred_id} for pred_id in pred_ids] # Paired

instances = [predictor._json_to_instance(sent_pred_pair) for sent_pred_pair in sent_pred_pairs]
# Each Instance: TextField, SequenceLableField, MetadataField

model = predictor._model
vocab = model.vocab

dataset = Batch(instances)
dataset.index_instances(vocab) # Index fields within Instances to pass into vocabulary embedding - Applies only to textfield
tensor_dict = dataset.as_tensor_dict()

tokens = tensor_dict["tokens"] # Dict of 'tokens' tensor[indexes of words in vocab]
verb_ind = tensor_dict["verb_indicator"] # tensor[1 for verb 0 otherwise]
meta = tensor_dict["metadata"] # words and verb in text form

embedder = model.text_field_embedder
print("Token Embedder: %s" % str(embedder.token_embedder_tokens.num_embeddings))
print("Token Embedder: %s" % str(embedder.token_embedder_tokens.output_dim))
embedded_text_input = embedder(tokens) # BS - TS - 100 (Embedder shape) ------- Embedder [no. of words in vocab + 1 UNK]
mask = get_text_field_mask(tokens) # Basically 0 for padding, 1 otherwise
bf_embedding = model.binary_feature_embedding
print("Verb Indicator Embedder: %s" % str(bf_embedding.num_embeddings))
print("Verb Indicator Embedder: %s" % str(bf_embedding.output_dim))
embedded_verb_indicator = bf_embedding(verb_ind)

concatenated = torch.cat([embedded_text_input, embedded_verb_indicator], -1)
print("Concatenated %s" % str(concatenated.shape))

encoder = model.encoder
stacked_bdlstm = encoder._module
encoded = encoder(concatenated, mask)
print("Encoded %s" % str(encoded.shape))

tag_projection_layer = model.tag_projection_layer
logits = tag_projection_layer(encoded)
print("Logits %s" % str(logits.shape))

num_labels = model.num_classes
reshaped_log_probs = logits.view(-1, num_labels)
print("Reshaped log probabilities %s" % str(reshaped_log_probs.shape))

batch_size, sequence_length, _ = concatenated.size()
class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size, sequence_length, num_labels])
print("Class probs %s" % str(class_probabilities.shape))

output_dict = {"logits": logits, "class_probabilities": class_probabilities, "mask":mask}

words, verbs = zip(*[(x["words"], x["verb"]) for x in meta])
output_dict["words"] = list(words)
output_dict["verb"] = list(verbs)

tags = model.decode(output_dict)["tags"] # Viterbi decoding, dump from Allen
print(tags)



Token Embedder: 97872
Token Embedder: 100
Verb Indicator Embedder: 2
Verb Indicator Embedder: 100
Concatenated torch.Size([1, 6, 200])
Encoded torch.Size([1, 6, 300])
Logits torch.Size([1, 6, 62])
Reshaped log probabilities torch.Size([6, 62])
Class probs torch.Size([1, 6, 62])
[['B-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'O']]


In [12]:
print(vocab.get_token_from_index(0))
print(vocab.get_token_from_index(1))
vocab.save_to_files("./test_implementation/vocab") # Retrieve this and use

@@PADDING@@
@@UNKNOWN@@


In [113]:
forward = model.forward_on_instance(instances[0]) # Only the tags are important (Look into this)
print(forward["tags"])

sentence = "Harry is married to Mary and has two children."
predictor.predict_json({"sentence": sentence})
print()

['B-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'O']



In [154]:
oie_model = oie._model
for parameter in oie_model.parameters():
    print(parameter.shape)

torch.Size([97872, 100])
torch.Size([1800, 200])
torch.Size([1800])
torch.Size([1500, 300])
torch.Size([1500])
torch.Size([1800, 300])
torch.Size([1800])
torch.Size([1500, 300])
torch.Size([1500])
torch.Size([1800, 300])
torch.Size([1800])
torch.Size([1500, 300])
torch.Size([1500])
torch.Size([1800, 300])
torch.Size([1800])
torch.Size([1500, 300])
torch.Size([1500])
torch.Size([1800, 300])
torch.Size([1800])
torch.Size([1500, 300])
torch.Size([1500])
torch.Size([1800, 300])
torch.Size([1800])
torch.Size([1500, 300])
torch.Size([1500])
torch.Size([1800, 300])
torch.Size([1800])
torch.Size([1500, 300])
torch.Size([1500])
torch.Size([1800, 300])
torch.Size([1800])
torch.Size([1500, 300])
torch.Size([1500])
torch.Size([2, 100])
torch.Size([62, 300])
torch.Size([62])
