In [3]:
from datasets import load_multitask_data
from bert import BertSelfAttention
from bert import BertModel
from config import PretrainedConfig

In [4]:
# Load the three training sets (SST sentiment, Quora paraphrase, STS similarity) in one call.
(sentiment_data,
 num_labels,
 paraphrased_data,
 similarity_data) = load_multitask_data(
    "data/ids-sst-train.csv",
    "data/quora-train.csv",
    "data/sts-train.csv",
)

Loaded 8544 train examples from data/ids-sst-train.csv
Loaded 141498 train examples from data/quora-train.csv
Loaded 6040 train examples from data/sts-train.csv


In [5]:
# Peek at the first sentiment example; the displayed value is a 3-tuple that looks like
# (sentence text, integer label, id string).
sentiment_data[0]

("the rock is destined to be the 21st century 's new `` conan '' and that he 's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .",
 3,
 '32a4f146782cbde1b7fa65799')

In [6]:

class BertConfig(PretrainedConfig):
  """Hyperparameter container for the BERT model built in this notebook.

  ``pad_token_id`` and any extra keyword arguments are forwarded to
  ``PretrainedConfig.__init__``; every other constructor argument is stored
  on the instance under its own name.
  """

  model_type = "bert"

  def __init__(
    self,
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    pad_token_id=0,
    gradient_checkpointing=False,
    position_embedding_type="absolute",
    use_cache=True,
    name_or_path = "checkpoint",
    **kwargs
  ):
    # The parent constructor receives pad_token_id and all unrecognised options.
    super().__init__(pad_token_id=pad_token_id, **kwargs)

    # Remaining hyperparameters, listed in the order they are attached to the instance.
    own_settings = {
      "vocab_size": vocab_size,
      "hidden_size": hidden_size,
      "num_hidden_layers": num_hidden_layers,
      "num_attention_heads": num_attention_heads,
      "hidden_act": hidden_act,
      "intermediate_size": intermediate_size,
      "hidden_dropout_prob": hidden_dropout_prob,
      "attention_probs_dropout_prob": attention_probs_dropout_prob,
      "max_position_embeddings": max_position_embeddings,
      "type_vocab_size": type_vocab_size,
      "initializer_range": initializer_range,
      "layer_norm_eps": layer_norm_eps,
      "gradient_checkpointing": gradient_checkpointing,
      "position_embedding_type": position_embedding_type,
      "use_cache": use_cache,
      "name_or_path": name_or_path,
    }
    for attr_name, attr_value in own_settings.items():
      setattr(self, attr_name, attr_value)

In [7]:
# Build a BERT model from the default BertConfig (hidden_size=768, 12 layers, 12 heads).
# NOTE(review): no checkpoint is loaded in this cell, so the weights are presumably just
# whatever BertModel's constructor initialises — confirm in bert.py.
config = BertConfig()
bert = BertModel(config)

In [8]:
# Inspect the self-attention sub-module of the first encoder layer; its repr lists the
# query/key/value Linear projections and the dropout layer.
bert.bert_layers[0].self_attention

BertSelfAttention(
  (query): Linear(in_features=768, out_features=768, bias=True)
  (key): Linear(in_features=768, out_features=768, bias=True)
  (value): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [9]:
# Next step: tokenize the input sentences and find their vector representations.
# Key, value and query are obtained from the transform method of BertSelfAttention.
# These three projections are linear layers of the same size, equal to the hidden size.
# We should be able to test the attention method with the initialized values.
# Where do we get the attention mask from? Where do we process the data?
# It seems that the attention masks come from the datasets script: the tokenizer produces them.
# So we could instantiate, for instance, class SentencePairDataset(Dataset) to get the masks.

In [10]:
from datasets import SentencePairDataset

In [13]:
# Wrap the paraphrase pairs in a SentencePairDataset so its collate_fn can be used below.
# NOTE(review): `args` is passed as an empty list — presumably unused on this code path; confirm.
data = SentencePairDataset(paraphrased_data,args=[])

In [20]:
# Collate the entire paraphrase set in a single call (not a mini-batch); the result is indexed
# by string keys such as 'token_ids_1' and 'attention_mask_1' in the cells that follow.
padded_data = data.collate_fn(paraphrased_data)

In [28]:
# Dimensions of the first-sentence attention mask.
padded_data['attention_mask_1'].shape

torch.Size([141498, 118])

In [30]:
# Dimensions of the first-sentence token-id tensor.
padded_data['token_ids_1'].shape

torch.Size([141498, 118])

In [29]:
# List every field produced by collate_fn for a sentence-pair batch.
padded_data.keys()

dict_keys(['token_ids_1', 'token_type_ids_1', 'attention_mask_1', 'token_ids_2', 'token_type_ids_2', 'attention_mask_2', 'labels', 'sent_ids'])

In [32]:
# The query/key/value projections are Linear(768 -> 768), so self-attention needs *embedded*
# hidden states of shape [batch, seq_len, hidden_size]. Feeding the integer token ids
# ([141498, 118]) directly is what raised
# "mat1 and mat2 shapes cannot be multiplied (141498x118 and 768x768)".

# A handful of sentences is enough for a smoke test; embedding all 141498 rows at
# 118 x 768 floats each would need tens of GB of memory.
SMOKE_BATCH_SIZE = 4

smoke_token_ids = padded_data['token_ids_1'][:SMOKE_BATCH_SIZE]
smoke_pad_mask = padded_data['attention_mask_1'][:SMOKE_BATCH_SIZE]

# Look the ids up in the token embedding table -> [SMOKE_BATCH_SIZE, seq_len, hidden_size].
# NOTE(review): assumes BertModel exposes its token embedding as `word_embedding` — confirm in bert.py.
smoke_hidden_states = bert.word_embedding(smoke_token_ids)

# Turn the 1/0 padding mask into a broadcastable additive mask of shape [batch, 1, 1, seq_len]:
# 0 where there is a real token, a large negative value where there is padding.
# NOTE(review): assumes the attention implementation adds the mask to the raw scores — confirm.
smoke_extended_mask = (
    1.0 - smoke_pad_mask[:, None, None, :].to(smoke_hidden_states.dtype)
) * -10000.0

# Call the module itself (not .forward) so that nn.Module hooks are honoured.
smoke_attn_output = bert.bert_layers[0].self_attention(
    hidden_states=smoke_hidden_states,
    attention_mask=smoke_extended_mask,
)

# Show only the shape; the full activation tensor is too large to display usefully.
smoke_attn_output.shape