# Pre-Process

- NSP (Next Sentence Prediction)
- MLM (Masked Language Modeling)
- BERTDataset
- `get_pair()`
- `mask_sentence()`
- `__getitem__()`

In [None]:
import random

import torch
from torch.utils.data import DataLoader, Dataset

class BERTDataset(Dataset):
  """Dataset that turns sentence pairs into BERT pre-training examples.

  Every item combines the two pre-training tasks:

  * NSP: ``get_pair`` keeps the real second sentence or swaps in a random
    one and reports which happened through ``is_next``.
  * MLM: ``mask_sentence`` corrupts roughly 15% of the words and returns a
    per-token flag list marking the corrupted positions.
  """

  def __init__(self, conv_pairs, toknizer, seq_len):
    """Store the corpus, the tokenizer and the fixed output length.

    Args:
      conv_pairs: indexable collection of ``(s1, s2)`` sentence pairs, where
        both sentences are whitespace-separated strings.
      toknizer: tokenizer object. It must expose a ``vocab`` mapping with the
        ``[CLS]``, ``[SEP]``, ``[PAD]`` and ``[MASK]`` entries, and calling it
        on a word must return a mapping whose ``'input_ids'`` list is wrapped
        in one leading and one trailing special token. The misspelled
        parameter name is kept so existing keyword callers keep working.
      seq_len: length every returned tensor is truncated or padded to.
    """
    self.conv_pairs = conv_pairs
    self.tokenizer = toknizer
    # Legacy alias: the attribute was originally stored under this spelling.
    self.toknizer = toknizer
    self.seq_len = seq_len
    self.num_pairs = len(conv_pairs)

  def __len__(self):
    """Return the number of sentence pairs in the corpus."""
    return self.num_pairs

  def __getitem__(self, idx):
    """Build one training example.

    Returns:
      dict with the tensors ``bert_input`` (token ids), ``bert_label``
      (MLM flags), ``segment_ids`` (1 for the first sentence, 2 for the
      second, 0 for padding), each of length ``seq_len``, plus the scalar
      ``is_next`` (1 when the second sentence really follows the first).
    """
    s1, s2, is_next = self.get_pair(idx)
    s1_ids, s1_mask = self.mask_sentence(s1)
    s2_ids, s2_mask = self.mask_sentence(s2)

    vocab = self.tokenizer.vocab
    cls_id, sep_id, pad_id = vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']

    # Layout: [CLS] s1 [SEP] s2 [SEP]. The label lists carry [PAD] at the
    # special-token positions so they stay aligned with the inputs.
    t1 = [cls_id] + s1_ids + [sep_id]
    t2 = s2_ids + [sep_id]
    t1_mask = [pad_id] + s1_mask + [pad_id]
    t2_mask = s2_mask + [pad_id]

    # Slice the concatenation, not just the second half, so all three
    # sequences are truncated identically.
    segment_ids = ([1] * len(t1) + [2] * len(t2))[:self.seq_len]
    bert_input = (t1 + t2)[:self.seq_len]
    bert_label = (t1_mask + t2_mask)[:self.seq_len]

    pad_len = self.seq_len - len(bert_input)  # <= 0 yields empty padding
    bert_input.extend([pad_id] * pad_len)
    bert_label.extend([pad_id] * pad_len)
    # Segment ids index a 3-row embedding table, so pad with 0 rather than
    # with the vocabulary's [PAD] id (identical when that id is 0).
    segment_ids.extend([0] * pad_len)

    # NOTE(review): bert_label holds 0/1 "was corrupted" flags, not the
    # original token ids; an MLM loss needs the ids -- confirm the intent.
    return {
        "bert_input": torch.tensor(bert_input),
        "bert_label": torch.tensor(bert_label),
        "segment_ids": torch.tensor(segment_ids),
        "is_next": torch.tensor(is_next),
    }

  def get_pair(self, index):
    """Return ``(s1, s2, is_next)`` for the NSP task.

    With probability 0.5 the true second sentence is replaced by the second
    sentence of a randomly chosen pair and ``is_next`` is 0; otherwise the
    original pair is returned with ``is_next`` equal to 1.
    """
    s1, s2 = self.conv_pairs[index]
    is_next = 1
    if random.random() > 0.5:
      random_index = random.randrange(len(self.conv_pairs))
      s2 = self.conv_pairs[random_index][1]
      is_next = 0
    return s1, s2, is_next

  def mask_sentence(self, s):
    """Tokenize ``s`` word by word and apply BERT-style corruption.

    Each word is selected with probability 0.15. All sub-tokens of a selected
    word are then replaced by ``[MASK]`` (80%), replaced by a random
    vocabulary id (10%) or left unchanged (10%).

    Returns:
      ``(token_ids, mask)``: two equally long lists, where ``mask`` is 1 at
      every sub-token of a selected word and 0 elsewhere.
    """
    vocab = self.tokenizer.vocab
    masked_numericalized_s = []
    mask = []
    for word in s.split():
      prob = random.random()
      # Drop the special tokens the tokenizer wraps around every input.
      token_ids = self.tokenizer(word)['input_ids'][1:-1]
      if prob < 0.15:
        prob /= 0.15  # rescale to [0, 1) to choose the corruption type
        for token_id in token_ids:
          if prob < 0.8:
            masked_numericalized_s.append(vocab['[MASK]'])
          elif prob < 0.9:
            masked_numericalized_s.append(random.randrange(len(vocab)))
          else:
            masked_numericalized_s.append(token_id)
          mask.append(1)
      else:
        masked_numericalized_s.extend(token_ids)
        mask.extend([0] * len(token_ids))

    assert len(masked_numericalized_s) == len(mask)
    return masked_numericalized_s, mask


## Bert Input Embeddings

- token embedding
- segment embedding
- positional embedding

embeddings = token_embeddings + segment_embeddings + positional_embeddings

In [None]:
import torch
import math

class PositionalEncoding(torch.nn.Module):
  """Fixed sinusoidal positional encoding added to a batch of embeddings.

  The table is computed once and stored as the non-trainable buffer ``pe``
  of shape ``(1, seq_len, d_model)``:

      pe[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
      pe[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))
  """

  def __init__(self, d_model, seq_len=128):
    """
    Args:
      d_model: embedding width (odd values are supported).
      seq_len: number of positions to precompute.
    """
    super().__init__()
    position = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)
    even_index = torch.arange(0, d_model, 2, dtype=torch.float)
    # exp(-ln(10000) * i / d_model) == 1 / 10000 ** (i / d_model); the even
    # column index i already equals 2k, so it must not be doubled again.
    div_term = torch.exp(even_index * (-math.log(10000.0) / d_model))
    angles = position * div_term

    pe = torch.zeros(seq_len, d_model)
    pe[:, 0::2] = torch.sin(angles)
    # For odd d_model there is one cosine column fewer than sine columns.
    pe[:, 1::2] = torch.cos(angles)[:, :d_model // 2]

    # A buffer follows the module across devices and is saved in the state
    # dict, but it is not a parameter and receives no gradient.
    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    """Add the encoding of the first ``x.size(1)`` positions to ``x``."""
    return self.pe[:, :x.size(1)] + x

class BERTEmbeddings(torch.nn.Module):
  """BERT input embedding: token + segment + positional, then dropout."""

  def __init__(self, vocab_size, d_model, seq_len=64):
    """
    Args:
      vocab_size: number of rows in the token embedding table.
      d_model: embedding width shared by all three components.
      seq_len: number of positions the positional encoding precomputes.
    """
    super().__init__()
    self.d_model = d_model
    self.seq_len = seq_len
    self.token_embeddings = torch.nn.Embedding(vocab_size, d_model)
    # Three segment ids: 0 is padding (its vector stays zero and is not
    # trained), 1 and 2 mark the first and the second sentence.
    self.segment_embedding = torch.nn.Embedding(3, d_model, padding_idx=0)
    self.position_embedding = PositionalEncoding(d_model, seq_len)
    self.dropout = torch.nn.Dropout(p=0.1)

  def forward(self, input_ids, segment_ids):
    """Embed a batch of token ids with their matching segment ids.

    Args:
      input_ids: integer tensor of token ids.
      segment_ids: integer tensor of segment ids, same shape as ``input_ids``.

    Returns:
      Tensor with one ``d_model``-wide vector per input position.
    """
    token_embeddings = self.token_embeddings(input_ids)
    segment_embeddings = self.segment_embedding(segment_ids)
    # PositionalEncoding.forward adds the positional table to its input, so
    # this single call yields token + segment + positional.
    embeddings = self.position_embedding(token_embeddings + segment_embeddings)
    return self.dropout(embeddings)

## Multi-Head Attention


In [None]:
import torch.nn as nn

class MultiHeadAttentionBlock(nn.Module):
  """Multi-head scaled dot-product attention.

  Queries, keys and values are linearly projected, split into ``num_heads``
  heads of width ``d_k = d_model // num_heads``, attended per head, then
  concatenated and projected back to ``d_model``.
  """

  def __init__(self, d_model, num_heads, dropout):
    """
    Args:
      d_model: model width; must be divisible by ``num_heads``.
      num_heads: number of attention heads.
      dropout: dropout probability applied to the attention weights.
    """
    super().__init__()
    self.embed_dim = d_model
    self.num_heads = num_heads
    assert d_model % num_heads == 0
    self.d_k = d_model // num_heads

    self.q_linear = nn.Linear(d_model, d_model)
    self.k_linear = nn.Linear(d_model, d_model)
    self.v_linear = nn.Linear(d_model, d_model)

    self.o_linear = nn.Linear(d_model, d_model)
    self.dropout = nn.Dropout(dropout)

  def _split_heads(self, x):
    """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
    return x.view(x.shape[0], x.shape[1], self.num_heads, self.d_k).transpose(1, 2)

  def forward(self, q, k, v, mask=None):
    """Attend from ``q`` over ``k``/``v``.

    Args:
      q, k, v: tensors of shape (batch, seq, d_model).
      mask: optional tensor broadcastable to (batch, heads, seq_q, seq_k);
        positions where it equals 0 receive no attention. ``None`` disables
        masking.

    Returns:
      Tensor of shape (batch, seq_q, d_model).
    """
    q_heads = self._split_heads(self.q_linear(q))
    k_heads = self._split_heads(self.k_linear(k))
    v_heads = self._split_heads(self.v_linear(v))

    scaled_attn_scores = torch.matmul(q_heads, k_heads.transpose(-2, -1)) / math.sqrt(self.d_k)
    if mask is not None:
      # A large negative score becomes a zero weight after the softmax.
      scaled_attn_scores = scaled_attn_scores.masked_fill(mask == 0, -1e9)
    attn_weights = torch.softmax(scaled_attn_scores, dim=-1)
    attn_weights = self.dropout(attn_weights)
    z = torch.matmul(attn_weights, v_heads)

    # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
    z = z.transpose(1, 2).contiguous().view(z.shape[0], -1, self.num_heads * self.d_k)
    return self.o_linear(z)

## FFNN BLOCK

In [None]:
class FeedForwardBlock(nn.Module):
  """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

  def __init__(self, d_model, dim_feedforward, drop_out_p):
    """
    Args:
      d_model: input and output width.
      dim_feedforward: width of the hidden layer.
      drop_out_p: dropout probability applied after the activation.
    """
    super().__init__()
    self.embed_dim = d_model
    self.dim_feedforward = dim_feedforward

    self.fc1 = nn.Linear(d_model, dim_feedforward)
    self.fc2 = nn.Linear(dim_feedforward, d_model)
    self.dropout = nn.Dropout(drop_out_p)

  def forward(self, x):
    """Apply the network to the last dimension of ``x``; the shape is kept."""
    return self.fc2(self.dropout(torch.relu(self.fc1(x))))

## Residual Connection & Add & Norm

In [None]:
class ResidualConnection(nn.Module):
  """Skip connection around a sub-layer: ``x + dropout(layernorm(sublayer(x)))``."""

  def __init__(self, d_model, drop_out_p):
    """
    Args:
      d_model: size of the last dimension normalised by the layer norm.
      drop_out_p: dropout probability applied to the normalised output.
    """
    super().__init__()
    self.dropout = nn.Dropout(drop_out_p)
    self.layernorm = nn.LayerNorm(d_model)

  def forward(self, x, sublayer):
    """Run ``sublayer`` on ``x`` and add the result back onto ``x``.

    Args:
      x: input tensor whose last dimension is ``d_model``.
      sublayer: callable mapping ``x`` to a tensor of the same shape.
    """
    return x + self.dropout(self.layernorm(sublayer(x)))

## Encoder Block

In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention then feed-forward, each with a residual."""

  def __init__(self, d_model, num_heads, dim_feedforward, dropout):
    """
    Args:
      d_model: model width.
      num_heads: number of attention heads.
      dim_feedforward: hidden width of the feed-forward block.
      dropout: dropout probability shared by every sub-module.
    """
    super().__init__()
    self.self_attention_block = MultiHeadAttentionBlock(d_model, num_heads, dropout)
    self.feed_forward_block = FeedForwardBlock(d_model, dim_feedforward, dropout)
    # ModuleList (not a plain list) so both wrappers are registered as
    # sub-modules: index 0 wraps attention, index 1 wraps feed-forward.
    self.residual_connection = nn.ModuleList([ResidualConnection(d_model, dropout) for _ in range(2)])

  def forward(self, x, mask):
    """Apply the layer to ``x``; ``mask`` is handed to the attention block."""
    x = self.residual_connection[0](x, lambda t: self.self_attention_block(t, t, t, mask))
    x = self.residual_connection[1](x, self.feed_forward_block)
    return x

## Transformer Block


In [None]:
class Transformer(nn.Module):
  """BERT encoder: input embeddings followed by a stack of encoder blocks."""

  def __init__(self, vocab_size, d_model, num_heads, dim_feedforward, dropout, num_encoder_blocks=6, seq_len=64):
    """
    Args:
      vocab_size: number of token ids the embedding table covers.
      d_model: model width.
      num_heads: attention heads per encoder block.
      dim_feedforward: hidden width of each feed-forward block.
      dropout: dropout probability used inside the encoder blocks.
      num_encoder_blocks: number of stacked encoder blocks.
      seq_len: maximum sequence length handed to the embedding layer. It
        needs a default because it follows a defaulted parameter.
    """
    super().__init__()
    self.src_embed = BERTEmbeddings(vocab_size, d_model, seq_len=seq_len)
    # Stored in a ModuleList so the blocks are registered (parameters are
    # visible to optimizers / .to(device)) and reachable from encode().
    self.encoder_blocks = nn.ModuleList(
        [EncoderBlock(d_model, num_heads, dim_feedforward, dropout) for _ in range(num_encoder_blocks)]
    )

  def encode(self, x, segment_ids):
    """Encode a batch of token ids.

    Args:
      x: integer tensor of token ids -- assumes shape (batch, seq) and that
        id 0 is padding; TODO confirm against the tokenizer.
      segment_ids: integer tensor of segment ids, same shape as ``x``.

    Returns:
      The output of the last encoder block.
    """
    # (batch, seq) -> (batch, 1, seq, seq): every query position gets the
    # same row saying which key positions hold a non-zero token id.
    mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
    x = self.src_embed(x, segment_ids)
    for encoder_block in self.encoder_blocks:
      x = encoder_block(x, mask)
    return x

  def forward(self, x, segment_ids):
    """Alias for ``encode`` so the module can be called directly."""
    return self.encode(x, segment_ids)

In [None]:
# Smoke test: push one sample batch through the encoder and print the shape.
# NOTE(review): `tokenizer` and `sample_data` are expected from earlier cells,
# and the dictionary keys below must match what `sample_data` really holds
# (BERTDataset emits "bert_input" and "segment_ids") -- confirm before running.
sample_input = sample_data['whatever']
sample_segments = sample_data['segment_label']

# Illustrative BERT-base-sized hyperparameters; adjust to the experiment.
transformer = Transformer(
    vocab_size=len(tokenizer.vocab),
    d_model=768,
    num_heads=12,
    dim_feedforward=768 * 4,
    dropout=0.1,
    seq_len=sample_input.size(1),  # positional table must cover the batch
)
bert_result = transformer.encode(sample_input, sample_segments)
print(bert_result.size())