In [1]:
import torch

from pytorch_pretrained_bert.modeling import BertConfig, BertModel

from allennlp.common.testing import ModelTestCase
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers.wordpiece_indexer import PretrainedBertIndexer
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.token_embedders.bert_token_embedder import BertEmbedder




In [7]:
# Build the BERT model, the AllenNLP embedder wrapping it, and the indexer
# used by the cells below.

# Seed before constructing the model so the randomly initialised weights (and
# therefore any printed embedding values) are the same after a kernel restart.
torch.manual_seed(0)

# NOTE: `BertModel(config)` is built from the config alone, so its weights are
# random -- no pretrained checkpoint is loaded here.  Output shapes are
# meaningful; output values are not.
# 30522 is the vocabulary size handed to the config; presumably chosen to match
# the `bert-base-uncased` WordPiece vocabulary used by the indexer -- confirm.
config = BertConfig(vocab_size_or_config_json_file=30522)
bert_model = BertModel(config)
# Inference only: switch dropout off so repeated forward passes agree.
bert_model.eval()
token_embedder = BertEmbedder(bert_model)

# The indexer is configured by pretrained-model name and asked to report
# starting offsets, which the embedder cell passes on as `offsets=`.
bert_name = 'bert-base-uncased'
token_indexer = PretrainedBertIndexer(pretrained_model=bert_name, use_starting_offsets=True)

In [43]:
# Tokenise two ten-word sentences, index them with the BERT indexer, batch
# them, and check that each embedder returns one vector per original word.
tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

sentence1 = "the quickest quick brown fox jumped over the lazy dog"
tokens1 = tokenizer.tokenize(sentence1)

sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
tokens2 = tokenizer.tokenize(sentence2)

# Empty AllenNLP vocabulary; presumably sufficient because the BERT indexer
# carries its own WordPiece vocabulary -- TODO confirm.
vocab = Vocabulary()

instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

batch = Batch([instance1, instance2])
batch.index_instances(vocab)

padding_lengths = batch.get_padding_lengths()
tensor_dict = batch.as_tensor_dict(padding_lengths)
tokens = tensor_dict["tokens"]

# Two instances, ten vectors each (one per word, thanks to the offsets), and a
# last dimension of 768.
expected_shape = (2, 10, 768)

# Pure inference: skip building the autograd graph.
with torch.no_grad():
    # Default embedder.
    mixed_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert tuple(mixed_vectors.shape) == expected_shape, tuple(mixed_vectors.shape)

    # Now try top_layer_only = True.  `bert_vectors` is what later cells inspect.
    tlo_embedder = BertEmbedder(bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert tuple(bert_vectors.shape) == expected_shape, tuple(bert_vectors.shape)

In [47]:
# Inspect the Python type of the embedder output (`bert_vectors` comes from
# the embedding cell above, which must have been run first).
type(bert_vectors)

torch.Tensor

In [54]:
# Convert the embeddings to a NumPy array for inspection.  `.detach()` is used
# instead of the legacy `.data` attribute: both give a tensor that does not
# require grad, but `.detach()` stays visible to autograd's correctness checks.
bert_vectors.detach().numpy()

array([[[-1.93796647e+00, -2.21370563e-01, -4.27188054e-02, ...,
         -2.00201765e-01, -2.62679043e-03,  1.89772642e+00],
        [-2.84264302e+00,  2.51756072e-01, -1.14900100e+00, ...,
          2.18602911e-01,  2.27466643e-01,  1.60831720e-01],
        [-2.28935361e+00,  1.27255058e+00, -9.51204360e-01, ...,
          8.86204302e-01,  1.15967286e+00, -5.60903192e-01],
        ...,
        [-1.89784360e+00, -8.26657474e-01, -1.54118264e+00, ...,
         -1.32414654e-01,  2.39476383e-01,  1.58546090e-01],
        [-1.79235995e+00,  1.04425693e+00,  1.96723536e-01, ...,
          8.53676200e-01,  1.08933592e+00,  1.38736570e+00],
        [-9.26102638e-01, -8.25094283e-02, -1.17659962e+00, ...,
          2.74015963e-01, -2.32073471e-01, -8.63033310e-02]],

       [[-1.51391673e+00,  6.94965363e-01, -1.46800399e+00, ...,
         -1.05817728e-01, -3.58425975e-01,  9.83054340e-01],
        [-3.24107230e-01,  4.46848214e-01, -1.27278852e+00, ...,
         -5.27161777e-01,  1.14167440e

In [42]:
# Inspect the type of a single element produced by the tokenizer.  Depends on
# `tokens1` from the tokenisation cell, so that cell must be run first.
type(tokens1[0])

allennlp.data.tokenizers.token.Token