In [37]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from attention_graph_util import *
import seaborn as sns
import itertools 
import matplotlib as mpl
import networkx as nx
import os
from util import constants

from absl import app
from absl import flags
import pandas as pd

from util.models import MODELS
from util.tasks import TASKS
#from dnotebook_utils import *
from attention_graph_util import *
%matplotlib inline
from util.config_util import get_task_params
from notebooks.notebook_utils import *
from util import inflect

from tqdm import tqdm

rc={'font.size': 10, 'axes.labelsize': 10, 'legend.fontsize': 10.0, 
    'axes.titlesize': 32, 'xtick.labelsize': 20, 'ytick.labelsize': 16}
plt.rcParams.update(**rc)
mpl.rcParams['axes.linewidth'] = .5 #set the value globally

import torch
from transformers import *
from transformers import BertConfig, BertForMaskedLM, BertTokenizer

In [20]:
task_name = 'word_sv_agreement_lm'
task_params = get_task_params(batch_size=10)
task = TASKS[task_name](task_params, data_dir='../InDist/data')
cl_token = task.sentence_encoder().encode(constants.bos)
task_tokenizer = task.sentence_encoder()._tokenizer

INFO:absl:Load dataset info from ../InDist/data/word_sv_agreement/0.1.0
INFO:absl:Constructing tf.data.Dataset for split validation, from ../InDist/data/word_sv_agreement/0.1.0
INFO:absl:Constructing tf.data.Dataset for split test, from ../InDist/data/word_sv_agreement/0.1.0
INFO:absl:Constructing tf.data.Dataset for split train, from ../InDist/data/word_sv_agreement/0.1.0


Vocab len:  10032


In [21]:
# Transformers has a unified API
# for 8 transformer architectures and 30 pretrained weights.
#          Model          | Tokenizer          | Pretrained weights shortcut
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
          (CTRLModel,       CTRLTokenizer,       'ctrl'),
          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base')]

# Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
                      BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]

# All the classes for an architecture can be initiated from pretrained weights for this architecture
# Note that additional weights added for fine-tuning are only initialized
# and need to be trained on the down-stream task
pretrained_weights = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

In [22]:
model = BertForMaskedLM.from_pretrained(pretrained_weights,
                                  output_hidden_states=True,
                                  output_attentions=True)

In [25]:
def offset_convertor(encoded_input_task, task_offset, task_encoder, tokenizer):
    string_part1 = task_encoder.decode(encoded_input_task[:task_offset])
    tokens_part1 = tokenizer.tokenize(string_part1)
    
    return len(tokens_part1)


In [41]:
for x,y in task.test_dataset:
    sentence = task.sentence_encoder().decode(x[0][1:])
    print(sentence)
    break

tokens = ['cls']+tokenizer.tokenize(sentence)+['sep']
print(len(tokens), tokens)
tf_input_ids = tokenizer.encode(sentence)
input_ids = torch.tensor([tf_input_ids])
all_hidden_states, all_attentions = model(input_ids)[-2:]

_attentions = [att.detach().numpy() for att in all_attentions]
attentions_mat = np.asarray(_attentions)[:,0]
print(attentions_mat.shape)

many NNS of woodland remain and support a JJ sector in the southern portion of the state .
21 ['cls', 'many', 'n', '##ns', 'of', 'woodland', 'remain', 'and', 'support', 'a', 'jj', 'sector', 'in', 'the', 'southern', 'portion', 'of', 'the', 'state', '.', 'sep']
(24, 16, 21, 21)


In [45]:
infl_eng = inflect.engine()
verb_infl, noun_infl = gen_inflect_from_vocab(infl_eng, '../InDist/notebooks/wiki.vocab')

test_data = task.databuilder.as_dataset(split='validation', batch_size=1)
e = 0
for examples in tqdm(test_data):
    print(examples)
    sentence = task.sentence_encoder().decode(examples['sentence'][0])
    print(sentence)
    

    sentence = ['cls']+tokenizer.tokenize(sentence)+['sep']
    tf_input_ids = tokenizer.encode(sentence)
    input_ids = torch.tensor([tf_input_ids])

    verb_position = examples['verb_position']+1  #+1 because of adding cls.
    # The verb it self is also masked
    mask = tf.cast(tf.sequence_mask(verb_position,maxlen=tf.shape(sentences)[1]), dtype=tf.int64)
    max_length = tf.reduce_max(verb_position + 1)

    last_index_mask = tf.gather(tf.eye(tf.shape(sentences)[1], dtype=tf.int64),verb_position)
    last_index_mask = last_index_mask * eos[0]

    inputs = (sentences * mask + last_index_mask)[:,:max_length]


    s_shape = tf.shape(inputs)
    batch_size, length = s_shape[0], s_shape[1]
    verb_classes = examples['verb_class']
    actual_verbs = examples['verb']
    inflected_verbs = [verb_infl[v.decode("utf-8")] for v in actual_verbs.numpy()]

    distances = examples['distance'].numpy()
    nz = examples['n_intervening'].numpy()
    n_diffs = examples['n_diff_intervening'].numpy()

    actual_verb_indexes = [task.databuilder.sentence_encoder().encode(v)[0] for v in actual_verbs.numpy()]
    inflected_verb_indexes = [task.databuilder.sentence_encoder().encode(v)[0] for v in inflected_verbs]


    predictions = model(inputs)
    predictions = np.argmax(predictions, axis=-1)
    corrects = predictions == verb_classes
    break




INFO:absl:Constructing tf.data.Dataset for split validation, from ../InDist/data/word_sv_agreement/0.1.0
0it [00:00, ?it/s]

{'distance': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, 'n_diff_intervening': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, 'n_intervening': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, 'sentence': <tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[  84, 2547,   40, 1386,   76,  490, 3509,   34, 1118, 2459,  707,
          82,  105, 3526, 8490,   66,   92, 3849, 3007,   13]])>, 'verb': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'have'], dtype=object)>, 'verb_class': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([1])>, 'verb_position': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([15], dtype=int32)>}
be ready to serve at every opportunity , yet making sure that your fellow servers have an equal chance .





AttributeError: 'str' object has no attribute 'shape'