In [1]:
from datasets import load_multitask_data
import bert
from config import PretrainedConfig
import torch
from datasets import SentencePairDataset
from tokenizer import BertTokenizer


In [2]:
#This library gives a better idea of how to compute the hidden states
# https://github.com/codertimo
# https://pypi.org/project/bert-pytorch/

In [3]:
#Install with pip install bert-pytorch
import bert_pytorch.model.embedding.bert as bert_module

In [4]:
# Load project's datasets
sentiment_data, num_labels, paraphrased_data, similarity_data = load_multitask_data("data/ids-sst-train.csv", "data/quora-train.csv", 
                    "data/sts-train.csv")
#Show example 
sentiment_data[0]

Loaded 8544 train examples from data/ids-sst-train.csv
Loaded 141498 train examples from data/quora-train.csv
Loaded 6040 train examples from data/sts-train.csv


("the rock is destined to be the 21st century 's new `` conan '' and that he 's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .",
 3,
 '32a4f146782cbde1b7fa65799')

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# Get initialization parameters to validate the methods we create
class BertConfig(PretrainedConfig):
  model_type = "bert"

  def __init__(
    self,
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    pad_token_id=0,
    gradient_checkpointing=False,
    position_embedding_type="absolute",
    use_cache=True,
    name_or_path = "checkpoint",
    **kwargs
  ):
    super().__init__(pad_token_id=pad_token_id, **kwargs)

    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range
    self.layer_norm_eps = layer_norm_eps
    self.gradient_checkpointing = gradient_checkpointing
    self.position_embedding_type = position_embedding_type
    self.use_cache = use_cache
    self.name_or_path = name_or_path

In [7]:
config = BertConfig()

In [59]:
import importlib
importlib.reload(importlib.import_module("bert"))

bert_mod = bert.BertModel(config)

In [9]:
# Tokenizing and passing data

In [35]:
padded_length = 15

dummy_data = ["the rock is destined to be the 21st century 's new `` conan "]
dummy_data = dummy_data*128

In [36]:
tokenized_data = tokenizer(dummy_data, return_tensors='pt', padding='max_length', truncation=True,max_length=50)

In [62]:
result_dict = bert_mod.forward(tokenized_data['input_ids'],tokenized_data['attention_mask'])

hidden state from key layer size torch.Size([128, 50, 768])
all head size 768
concatenated tensor shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 3072])
normalized_score_layer shape torch.Size([128, 50, 768])
hidden state from key layer size torch.Size([128, 50, 768])
all head size 768
concatenated tensor shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 3072])
normalized_score_layer shape torch.Size([128, 50, 768])
hidden state from key layer size torch.Size([128, 50, 768])
all head size 768
concatenated tensor shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 3072])
normalized_score_l

In [63]:
result_dict.keys()

dict_keys(['last_hidden_state', 'pooler_output'])

In [67]:
result_dict['last_hidden_state'].shape

torch.Size([128, 50, 768])

In [65]:
result_dict['pooler_output'].shape

torch.Size([128, 768])

In [61]:
bert_mod.encode(hidden_states,tokenized_data['attention_mask'])

hidden state from key layer size torch.Size([128, 50, 768])
all head size 768
concatenated tensor shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 3072])
normalized_score_layer shape torch.Size([128, 50, 768])
hidden state from key layer size torch.Size([128, 50, 768])
all head size 768
concatenated tensor shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 3072])
normalized_score_layer shape torch.Size([128, 50, 768])
hidden state from key layer size torch.Size([128, 50, 768])
all head size 768
concatenated tensor shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 768])
input shape torch.Size([128, 50, 768])
output shape torch.Size([128, 50, 3072])
normalized_score_l

tensor([[[ 2.4397, -0.2588,  0.6161,  ..., -2.2646,  0.6325,  0.6096],
         [-0.2055, -1.0703,  0.4219,  ..., -0.4757, -0.0072,  0.0887],
         [-0.4720,  0.0000,  0.1476,  ..., -0.0000,  3.2542, -0.8519],
         ...,
         [ 0.0682, -0.5010,  0.3616,  ..., -0.2935,  0.0000, -0.5955],
         [ 0.0877,  0.3808, -3.2701,  ..., -0.0000, -0.2870, -0.0835],
         [ 0.1736, -0.0000, -0.0000,  ..., -0.0000, -0.0076, -0.7290]],

        [[ 1.4783, -0.5213,  0.5138,  ...,  0.4407,  0.0385,  0.3364],
         [-0.2955, -1.7612,  0.1537,  ...,  0.5004,  0.4838, -3.4897],
         [-0.0482, -1.6392, -0.3932,  ...,  0.7169,  1.7805,  0.3056],
         ...,
         [-0.3663, -0.0000,  0.0000,  ..., -1.2152, -0.6961,  0.5336],
         [ 0.4940, -0.0000, -0.0615,  ..., -0.0000,  1.6459,  1.1692],
         [ 1.3144,  1.3468, -0.0000,  ..., -1.7047,  1.5514, -0.0000]],

        [[ 0.5243, -1.3725,  1.3530,  ...,  0.4718,  0.2918, -0.5091],
         [ 0.1038, -0.8666,  0.2350,  ...,  1