In [46]:
from datasets import load_multitask_data, SentenceClassificationDataset, SentencePairDataset
from config import PretrainedConfig
import torch
from base_bert import BertPreTrainedModel

In [2]:
sentiment_data, num_labels, paraphrase_data, similarity_data = load_multitask_data(sentiment_filename='data/ids-sst-train.csv',paraphrase_filename='data/quora-train.csv',similarity_filename='data/sts-train.csv')

Loaded 8544 train examples from data/ids-sst-train.csv
Loaded 141498 train examples from data/quora-train.csv
Loaded 6040 train examples from data/sts-train.csv


In [3]:
class tempArgs():
    def __init__(self,):
        self.local_files_only=True


In [4]:
tempargs = tempArgs()

In [5]:
para_train_data = SentencePairDataset(paraphrase_data,tempargs,isRegression=False)

In [6]:
batched_para_data = para_train_data.collate_fn(para_train_data.dataset)

In [8]:
import torch

sep_token_id = para_train_data.tokenizer.vocab['[SEP]']
# Determine the length needed for padding
max_length = 600
def concatenate_with_padding_and_mask(tensor1, tensor2,mask1,mask2):
    concatenated_tensors = []

    
    

    for i in range(len(tensor1)):
        non_zero_indices1 = torch.nonzero(mask1[i], as_tuple=False).squeeze()
        non_zero_indices2 = torch.nonzero(mask2[i], as_tuple=False).squeeze()
        
        # Get the maximum non-zero index for each tensor separately
        max_index1 = torch.max(non_zero_indices1).item() + 1
        max_index2 = torch.max(non_zero_indices2).item() + 1

        # Concatenate the tensors and update attention masks
        # the second tensor concatenatesfrom position 1 because the separator 
        concatenated_tensor = torch.cat((tensor1[i][:max_index1], tensor2[i][1:max_index2]), dim=0)

        # Pad the concatenated tensor and mask to the maximum length
        padding_length = max_length - len(concatenated_tensor)
        padding = torch.zeros(padding_length)

        concatenated_tensor = torch.cat((concatenated_tensor, padding), dim=0)

        concatenated_tensors.append(concatenated_tensor)

    return torch.stack(concatenated_tensors)

# Example usage with batch_size=2 and sequence_length=6
tensor1 = torch.tensor([[1, 2, 3, 0, 0, 0],
                        [7, 8, 0, 0, 0, 0]])
tensor2 = torch.tensor([[0, 0, 0, 4, 5, 6],
                        [9, 0, 0, 0, 0, 0]])


mask1 = torch.tensor([[1, 1, 1, 0, 0, 0],
                      [1, 1, 0, 0, 0, 0]])
mask2 = torch.tensor([[0, 0, 0, 1, 1, 1],
                      [1, 0, 0, 0, 0, 0]])


result= concatenate_with_padding_and_mask(tensor1, tensor2, mask1, mask2)
print(result.shape)
print("Concatenated tensor:", result)



torch.Size([2, 600])
Concatenated tensor: tensor([[1., 2., 3.,  ..., 0., 0., 0.],
        [7., 8., 0.,  ..., 0., 0., 0.]])


In [9]:
concatenation = concatenate_with_padding_and_mask(batched_para_data['token_ids_1'],batched_para_data['token_ids_2'],batched_para_data['attention_mask_1'],batched_para_data['attention_mask_2'])


In [50]:
concatenation[0]

tensor([  101.,  2129.,  2064.,  1045.,  3040.,  2870.,  1999., 10988.,  1029.,
          102.,  2129.,  2064.,  1045.,  3040., 10988.,  2005.,  1996.,  4937.,
         1011.,  2403.,  1029.,   102.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0., 

In [14]:
def find_first_occurrence(tensor):
    # Find the index of the first occurrence of value 102 (SEP token) for each row
    first_occurrence_indices = []
    for index, row in enumerate(tensor):
        index = torch.nonzero(row == 102, as_tuple=False)
        if len(index) > 0:
            first_occurrence_indices.append(index[0, 0].item())
        else:
            print(index)
            first_occurrence_indices.append(None)
    return first_occurrence_indices

In [15]:
def create_attention_mask(tensor):
    # Get the first occurrence of 102 for each row in the tensor
    first_occurrence_indices = find_first_occurrence(tensor)

    # Create an attention mask tensor filled with ones
    attention_mask = torch.ones_like(tensor)

    # Set the values before the first occurrence of 102 to zeros
    for i, idx in enumerate(first_occurrence_indices):
        if idx is not None:
            attention_mask[i, :idx + 1] = 0
        else:
            attention_mask[i] = 0

    return attention_mask

In [16]:
new_attention_mask = create_attention_mask(concatenation)

In [22]:
new_attention_mask[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 

In [42]:
print(new_attention_mask[0][:25])
print(concatenation[0][:25])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1.])
tensor([  101.,  2129.,  2064.,  1045.,  3040.,  2870.,  1999., 10988.,  1029.,
          102.,  2129.,  2064.,  1045.,  3040., 10988.,  2005.,  1996.,  4937.,
         1011.,  2403.,  1029.,   102.,     0.,     0.,     0.])


In [44]:
import torch.nn as nn
from datasets import load_multitask_data
import bert
from config import PretrainedConfig
import torch
from datasets import SentencePairDataset
from tokenizer import BertTokenizer

In [61]:
class BertModel(BertPreTrainedModel):
  """
  the bert model returns the final embeddings for each token in a sentence
  it consists
  1. embedding (used in self.embed)
  2. a stack of n bert layers (used in self.encode)
  3. a linear transformation layer for [CLS] token (used in self.forward, as given)
  """
  def __init__(self, config):
    super().__init__(config)
    self.config = config

    # embedding
    self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
    self.pos_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
    self.tk_type_embedding = nn.Embedding(config.type_vocab_size, config.hidden_size)
    self.embed_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.embed_dropout = nn.Dropout(config.hidden_dropout_prob)
    # position_ids (1, len position emb) is a constant, register to buffer
    position_ids = torch.arange(config.max_position_embeddings).unsqueeze(0)
    self.register_buffer('position_ids', position_ids)

    # bert encoder
    self.bert_layers = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    # for [CLS] token
    self.pooler_dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.pooler_af = nn.Tanh()

    self.init_weights()

  def embed(self, input_ids):
    input_shape = input_ids.size()
    seq_length = input_shape[1]

    # Get word embedding from self.word_embedding into input_embeds.
    inputs_embeds = self.word_embedding(input_ids)

    # Get position index and position embedding from self.pos_embedding into pos_embeds.
    pos_ids = self.position_ids[:, :seq_length] #subsets a list of positions 0:512 to 0:seq_length. 

    pos_embeds = self.pos_embedding(pos_ids)


    # Get token type ids, since we are not consider token type, just a placeholder.
    #tk_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device)
    tk_type_ids = concatenate_with_padding_and_mask(inputs_embeds['token_ids_1'],inputs_embeds['token_ids_2'],inputs_embeds['attention_mask_1'],inputs_embeds['attention_mask_2'])
    tk_type_ids = create_attention_mask(tk_type_ids)
    tk_type_embeds = self.tk_type_embedding(tk_type_ids)

    # Add three embeddings together; then apply embed_layer_norm and dropout and return.
    hidden_states = inputs_embeds+pos_embeds+tk_type_embeds
    hidden_states = self.embed_layer_norm(hidden_states)
    hidden_states = self.embed_dropout(hidden_states)

    return hidden_states


  def encode(self, hidden_states, attention_mask):
    """
    hidden_states: the output from the embedding layer [batch_size, seq_len, hidden_size]
    attention_mask: [batch_size, seq_len]
    """
    # get the extended attention mask for self attention
    # returns extended_attention_mask of [batch_size, 1, 1, seq_len]
    # non-padding tokens with 0 and padding tokens with a large negative number 
    extended_attention_mask: torch.Tensor = get_extended_attention_mask(attention_mask, self.dtype)

    # pass the hidden states through the encoder layers
    for i, layer_module in enumerate(self.bert_layers):
      # feed the encoding from the last bert_layer to the next
      hidden_states = layer_module(hidden_states, extended_attention_mask)

    return hidden_states

  def forward(self, input_ids, attention_mask):
    """
    input_ids: [batch_size, seq_len], seq_len is the max length of the batch
    attention_mask: same size as input_ids, 1 represents non-padding tokens, 0 represents padding tokens
    """
    # get the embedding for each input token
    embedding_output = self.embed(input_ids=input_ids)

    # feed to a transformer (a stack of BertLayers)
    sequence_output = self.encode(embedding_output, attention_mask=attention_mask)

    # get cls token hidden state
    first_tk = sequence_output[:, 0]
    first_tk = self.pooler_dense(first_tk)
    first_tk = self.pooler_af(first_tk)

    return {'last_hidden_state': sequence_output, 'pooler_output': first_tk}

In [None]:
# 1 
para_train_data = SentencePairDataset(paraphrase_data,tempargs,isRegression=False)
# 2 
batched_para_data = para_train_data.collate_fn(para_train_data.dataset)
# 3 
concatenation = concatenate_with_padding_and_mask(batched_para_data['token_ids_1'],batched_para_data['token_ids_2'],batched_para_data['attention_mask_1'],batched_para_data['attention_mask_2'])
# 4 
new_attention_mask = create_attention_mask(concatenation)



In [48]:
import torch
import torch.nn as nn

class BertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.segment_embedding = nn.Embedding(config.type_vocab_size, config.hidden_size)
        
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        token_embeddings = self.token_embedding(input_ids)
        position_ids = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        position_embeddings = self.position_embedding(position_ids.unsqueeze(0))
        
        embeddings = token_embeddings + position_embeddings
        
        if token_type_ids is not None:
            segment_embeddings = self.segment_embedding(token_type_ids)
            embeddings = embeddings + segment_embeddings
        
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

# Example usage
class BERTConfig:
    def __init__(self):
        self.vocab_size = 30522  # Example vocabulary size
        self.hidden_size = 768
        self.max_position_embeddings = 512
        self.type_vocab_size = 2  # Two token types (0 and 1)
        self.layer_norm_eps = 1e-12
        self.hidden_dropout_prob = 0.1

config = BERTConfig()
bert_embeddings = BertEmbeddings(config)

input_ids = torch.tensor([[101, 2058, 2022, 102], [101, 2003, 2016, 102]], dtype=torch.long)
token_type_ids = torch.tensor([[0, 0, 0, 0], [0, 1, 1, 0]], dtype=torch.long)

embeddings = bert_embeddings(input_ids, token_type_ids)
print(embeddings)

tensor([[[-2.1205,  0.3894,  1.2717,  ...,  0.4214,  0.5916,  0.3167],
         [-0.2933, -0.8539, -0.4161,  ...,  0.5703,  1.6586, -0.0000],
         [-0.0000,  0.0000,  1.0828,  ...,  0.8246,  2.2696, -0.3733],
         [-0.4785,  0.1608,  1.5891,  ...,  0.0158, -0.0000,  0.0948]],

        [[-2.1205,  0.3894,  1.2717,  ...,  0.4214,  0.5916,  0.0000],
         [ 1.8705,  1.0076,  0.0000,  ..., -0.4253,  0.4604,  0.8472],
         [-0.7460,  1.3952,  0.8652,  ..., -0.4192, -0.4473, -0.7033],
         [-0.4785,  0.1608,  1.5891,  ...,  0.0158, -0.0089,  0.0948]]],
       grad_fn=<MulBackward0>)


In [57]:
word_embed = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=1)

Embedding(30522, 768, padding_idx=1)

In [60]:
tk_type_ids = concatenate_with_padding_and_mask(input_ids)
tk_type_ids = create_attention_mask(tk_type_ids)
tk_type_embeds = self.tk_type_embedding(tk_type_ids)

TypeError: concatenate_with_padding_and_mask() missing 3 required positional arguments: 'tensor2', 'mask1', and 'mask2'