In [136]:
from datasets import load_multitask_data
from bert import BertSelfAttention
from bert import BertModel
from config import PretrainedConfig
import torch
from datasets import SentencePairDataset
from tokenizer import BertTokenizer


In [137]:
#This library gives a better idea of how to compute the hidden states
# https://github.com/codertimo
# https://pypi.org/project/bert-pytorch/
from bert_pytorch.model.embedding.bert import *

In [138]:
# Load project's datasets
sentiment_data, num_labels, paraphrased_data, similarity_data = load_multitask_data("data/ids-sst-train.csv", "data/quora-train.csv", 
                    "data/sts-train.csv")
#Show example 
sentiment_data[0]

Loaded 8544 train examples from data/ids-sst-train.csv
Loaded 141498 train examples from data/quora-train.csv
Loaded 6040 train examples from data/sts-train.csv


("the rock is destined to be the 21st century 's new `` conan '' and that he 's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .",
 3,
 '32a4f146782cbde1b7fa65799')

In [139]:
# Get initialization parameters to validate the methods we create
class BertConfig(PretrainedConfig):
  model_type = "bert"

  def __init__(
    self,
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    pad_token_id=0,
    gradient_checkpointing=False,
    position_embedding_type="absolute",
    use_cache=True,
    name_or_path = "checkpoint",
    **kwargs
  ):
    super().__init__(pad_token_id=pad_token_id, **kwargs)

    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range
    self.layer_norm_eps = layer_norm_eps
    self.gradient_checkpointing = gradient_checkpointing
    self.position_embedding_type = position_embedding_type
    self.use_cache = use_cache
    self.name_or_path = name_or_path

In [140]:
config = BertConfig()
bert = BertModel(config)

In [32]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [132]:
encoding1 = tokenizer(sentence1, return_tensors='pt', padding='max_length', truncation=True,max_length=512)

In [135]:
encoding1['input_ids'].shape

torch.Size([1024, 512])

In [141]:
#TODO explore how the sentencePairDataset tokenizes and enumerates the raw data
data = SentencePairDataset(paraphrased_data,args=[])
BS = 1024 #batch size
padded_data = data.collate_fn(paraphrased_data[0:BS])
padded_data.keys()
for k,v in padded_data.items():
    if isinstance(v,list):
        print(k,len(k))
    else:
        print(k,v.size())

token_ids_1 torch.Size([1024, 49])
token_type_ids_1 torch.Size([1024, 49])
attention_mask_1 torch.Size([1024, 49])
token_ids_2 torch.Size([1024, 76])
token_type_ids_2 torch.Size([1024, 76])
attention_mask_2 torch.Size([1024, 76])
labels torch.Size([1024])
sent_ids 8


In [154]:

padded_data['attention_mask_1'][2]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])

In [156]:
padded_data['token_ids_1'][2]

tensor([ 101, 2040, 2003, 1996, 2190, 3539, 2704, 2634, 2038, 2412, 2018, 1029,
         102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0])

In [166]:
padded_data['token_type_ids_1'][2]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])

In [163]:
#Initialize the bert embedding implementation found on github
encoder = BERTEmbedding(vocab_size=config.vocab_size,embed_size=config.hidden_size)
token_ids_encoding = padded_data['token_ids_1']
attention_mask = padded_data['attention_mask_1']

mistery_number = token_ids_encoding.size()[1]
#The mistery number appears to be seq_len which reflects the size of the largest tokenized and encoded input sequence
# That is why when we increase the batch size, it is going to change depending on the new largest
# tokenized input sequence from that batch.
# segment label corresponds to token_types_ids
segment_label = padded_data['token_type_ids_1']
# We randomized the segment label, but it depends on the one we picked
hidden_states = encoder.forward(token_ids_encoding,segment_label)

In [164]:
hidden_states.shape

torch.Size([1024, 49, 768])

In [8]:
attention_mask[100]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [165]:
bert.bert_layers[0].self_attention.forward(hidden_states = hidden_states,attention_mask = attention_mask)

NotImplementedError: 