In [3]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import sys
%load_ext autoreload
%autoreload 2
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
sys.path.append("../")
from models.bert import BERT
from models.modules.sublayer_connection import LayerNorm
from models.base_model import BaseModel
from models.embedding.bert import BERTEmbedding
from models.modules.transformer_block import TransformerBlock
from torch.optim import Adam
from models.bert import ScheduledOptim
from models.base_model import BaseModule
from models.embedding.position import PositionalEmbedding
from models.embedding.segment import SegmentEmbedding
from models.embedding.token import TokenEmbedding

In [5]:
class BERTEmbedding(BaseModule):
    """
    BERT Embedding which is consisted with under features
        1. TokenEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        2. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)

        sum of all these features are output of BERTEmbedding
    """

    def __init__(self, vocab_size, embed_size, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """
        super().__init__()
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
        self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = LayerNorm(embed_size, eps=1e-12)
        self.embed_size = embed_size

    def forward(self, sequence, segment_label):
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        x = self.layer_norm(x)
        return self.dropout(x)

In [6]:
class BERT(BaseModel):
    def __init__(self, config, vocab_size: int, child: bool = True):
        super(BERT, self).__init__(config)
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """
        self.hidden = config.hidden_features
        self.n_layers = config.layers
        self.attn_heads = config.heads
        self.device = config.device

        # paper noted they used 4*hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = self.hidden * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=self.hidden).to(self.conf.device)

        # multi-layers transformer blocks, deep network
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(self.hidden, self.attn_heads, self.hidden * 4, config.dropout).to(self.conf.device)
            for _ in range(config.layers)
        ])

        self.optimizer = Adam(
            self.parameters(),
            lr=self.conf.lr,
            betas=(self.conf.adam_beta1, self.conf.adam_beta2),
            weight_decay=self.conf.adam_weight_decay
        ) if config.train and not child else None
        self.optim_schedule = ScheduledOptim(
            self.optimizer,
            self.hidden,
            n_warmup_steps=self.conf.warmup_steps
        ) if config.train and not child else None

    def forward(self, x, segment_info):
        # attention masking for padded token
        # torch.ByteTensor([batch_size, 1, seq_len, seq_len)
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(x, segment_info)

        # running over multiple transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask)

        return x

In [7]:
class MaskedLanguageModel(nn.Module):
    """
    predicting origin token from masked input sequence
    n-class classification problem, n-class = vocab_size
    """

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: output size of BERT model
        :param vocab_size: total vocab size
        """
        super().__init__()
        self.linear = nn.Linear(hidden, hidden)
        self.act = nn.GELU()
        self.layer_norm = LayerNorm(hidden)
        self.decoder = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        x = self.linear(x)
        x = self.act(x)
        x = self.layer_norm(x)
        return self.decoder(x)

In [8]:
class BERTLM(BaseModel):
    def __init__(self, config, vocab_size: int):
        super(BERTLM, self).__init__(config)
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """
        self.hidden = config.hidden_features
        self.n_layers = config.layers
        self.attn_heads = config.heads
        self.device = config.device

        self.bert = BERT(config, vocab_size, True)

        self.mask_lm = MaskedLanguageModel(self.hidden, vocab_size).to(self.conf.device)


        self.optimizer = Adam(
            self.parameters(),
            lr=self.conf.lr,
            betas=(self.conf.adam_beta1, self.conf.adam_beta2),
            weight_decay=self.conf.adam_weight_decay
        ) if config.train else None
        self.optim_schedule = ScheduledOptim(
            self.optimizer,
            self.hidden,
            n_warmup_steps=self.conf.warmup_steps
        ) if config.train else None

    def forward(self, x, segment_info):
        x = self.bert(x, segment_info)
        print(x.size())
        x = self.mask_lm(x)
        print(x.size())
        return x

In [9]:
class config():
    def __init__(self):
        self.vocab = "bert-google"
        self.vocab_path = "../data/wikitext2/all.txt"
        self.bert_google_vocab = "../data/uncased_L-12_H-768_A-12/vocab.txt"
        self.vocab_max_size = None
        self.vocab_min_frequency = 1
        self.dataset = "wikitext2"
        self.seq_len = 40
        self.on_memory = True
        self.corpus_lines = None
        self.train_dataset = "../data/wikitext2/test_data_single_sentence.txt"
        self.encoding = "utf-8"
        self.batch_size = 1
        self.num_workers = 1
        self.hidden_features = 768
        self.layers = 12
        self.heads = 12
        self.device = "cpu"
        self.dropout = 0.1
        self.train = True
        self.lr = 1e-3
        self.adam_beta1=0.999
        self.adam_beta2=0.999
        self.adam_weight_decay = 0.01
        self.warmup_steps =1000

In [10]:
conf = config()

In [11]:
bert_ml = BERTLM(conf, 30522)

### Load Pretrained Weights

In [12]:
pt_model = torch.load("../../../../torch_dump_model")

In [13]:
mlm_rel_params = {}
for name, param in pt_model.items():
    if "pooler" in name or "seq_relationship" in name:
        continue
    else:
        mlm_rel_params[name] = param

In [14]:
for name in mlm_rel_params:
    if 'embedding' in name:
        print(name)

bert.embeddings.position_ids
bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias


In [15]:
from copy import deepcopy

### Set Embeddings

In [16]:
dic = deepcopy(bert_ml.state_dict())

In [17]:
dic['bert.embedding.position.pe'][0] = deepcopy(pt_model['bert.embeddings.position_embeddings.weight'])

In [18]:
dic['bert.embedding.token.weight'] = deepcopy(pt_model['bert.embeddings.word_embeddings.weight'])

In [19]:
dic['bert.embedding.segment.weight'] = deepcopy(pt_model['bert.embeddings.token_type_embeddings.weight'])

In [20]:
dic['bert.embedding.layer_norm.a_2'] = deepcopy(pt_model['bert.embeddings.LayerNorm.weight'])

In [21]:
dic['bert.embedding.layer_norm.b_2'] = deepcopy(pt_model['bert.embeddings.LayerNorm.bias'])

In [22]:
mapping = {
    'attention.self.query.weight':'attention.linear_layers.0.weight',
    'attention.self.query.bias':'attention.linear_layers.0.bias',
    'attention.self.key.weight':'attention.linear_layers.1.weight',
    'attention.self.key.bias':'attention.linear_layers.1.bias',
    'attention.self.value.weight':'attention.linear_layers.2.weight',
    'attention.self.value.bias':'attention.linear_layers.2.bias',
    'attention.output.dense.weight':'attention.output_linear.weight',
    'attention.output.dense.bias':'attention.output_linear.bias',
    'attention.output.LayerNorm.weight':'input_sublayer.norm.a_2',
    'attention.output.LayerNorm.bias': 'input_sublayer.norm.b_2',
    'intermediate.dense.weight':'feed_forward.w_1.weight',
    'intermediate.dense.bias':'feed_forward.w_1.bias',
    'output.dense.weight':'feed_forward.w_2.weight',
    'output.dense.bias':'feed_forward.w_2.bias',
    'output.LayerNorm.weight':'output_sublayer.norm.a_2',
    'output.LayerNorm.bias':'output_sublayer.norm.b_2',
}

In [23]:
inv_mapping = {}
for key, value in mapping.items():
    inv_mapping[value] = key

In [24]:
len(bert_ml.state_dict())

203

In [25]:
cnt = 0
for layer in range(12):
    # We have 12 transformer layers, iterate through them one by one
    for name, p_val in bert_ml.bert.transformer_blocks[layer].named_parameters():
        to_copy = f'bert.encoder.layer.{layer}.' + inv_mapping[name]
        param_to_copy = deepcopy(pt_model[to_copy])
        dic[f'bert.transformer_blocks.{layer}.' + name] = param_to_copy
        assert p_val.shape == param_to_copy.shape
        cnt+=1

In [26]:
cnt

192

### Set Last Layers

In [27]:
dic['mask_lm.linear.weight'] = deepcopy(pt_model['cls.predictions.transform.dense.weight'])
dic['mask_lm.linear.bias'] = deepcopy(pt_model['cls.predictions.transform.dense.bias'])
dic['mask_lm.decoder.weight'] = deepcopy(pt_model['cls.predictions.decoder.weight'])
dic['mask_lm.decoder.bias'] = deepcopy(pt_model['cls.predictions.decoder.bias'])
dic['mask_lm.layer_norm.a_2'] = deepcopy(pt_model['cls.predictions.transform.LayerNorm.weight'])
dic['mask_lm.layer_norm.b_2'] = deepcopy(pt_model['cls.predictions.transform.LayerNorm.bias'])

In [28]:
bert_ml.load_state_dict(dic)

<All keys matched successfully>

In [29]:
bert_ml.eval()

BERTLM(
  (bert): BERT(
    (embedding): BERTEmbedding(
      (token): TokenEmbedding(30522, 768, padding_idx=0)
      (position): PositionalEmbedding()
      (segment): SegmentEmbedding(2, 768, padding_idx=0)
      (dropout): Dropout(p=0.1, inplace=False)
      (layer_norm): LayerNorm()
    )
    (transformer_blocks): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=768, out_features=768, bias=True)
            (1): Linear(in_features=768, out_features=768, bias=True)
            (2): Linear(in_features=768, out_features=768, bias=True)
          )
          (output_linear): Linear(in_features=768, out_features=768, bias=True)
          (attention): Attention()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=768, out_features=3072, bias=True)
          (w_2): Linear(in_features=30

### Comparison Time

In [30]:
from transformers import BertForMaskedLM
ml_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

2023-01-01 20:49:22.847689: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
ml_model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [32]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text = "Today is a cold and [MASK] day."
encoded_input = tokenizer(text, return_tensors='pt')

In [33]:
op = ml_model(**encoded_input)[0][0]

In [34]:
for i in range(10):
    print(tokenizer.ids_to_tokens[op[i].argmax().item()])

.
today
is
a
cold
and
rainy
day
.
.


In [35]:
encoded_input['token_type_ids']+=1

In [36]:
op = ml_model(**encoded_input)[0][0]

In [37]:
for i in range(10):
    print(tokenizer.ids_to_tokens[op[i].argmax().item()])

.
today
is
a
cold
and
rainy
day
.
.


In [38]:
from datasets.vocabulary import BertVocab
import datasets

vocab = BertVocab(conf)

# vocab.pad_index

# load the dataset specified with --dataset_name & get data loaders
train_dataset = datasets.get(dataset_name="wikitext2")(config=conf, vocab=vocab)

train_loader = train_dataset.get_data_loader()

Using Bert Vocab


98856it [00:01, 94939.50it/s] 
30522it [00:00, 473826.88it/s]
Loading Dataset: 9961it [00:00, 601593.45it/s]


In [39]:
data = next(iter(train_loader))

In [40]:
data

{'bert_input': tensor([[ 101,  103,  100, 2003, 2019, 2394, 2143, 1010, 2547, 1998, 3004, 3364,
          1012,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]]),
 'bert_label': tensor([[   0, 2728,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]]),
 'segment_label': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'mask_index': tensor([1])}

In [41]:
encoded_input

{'input_ids': tensor([[ 101, 2651, 2003, 1037, 3147, 1998,  103, 2154, 1012,  102]]), 'token_type_ids': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [42]:
new_data = {}
bert_input = [[0 for i in range(10)]]
for i in range(10):
    bert_input[0][i] = encoded_input['input_ids'][0][i].item()
segment_label = [[0 for i in range(10)]]
for i in range(10):
    segment_label[0][i] = encoded_input['token_type_ids'][0][i].item()

In [43]:
new_data['bert_input'] = torch.tensor(bert_input).int()
new_data['segment_label'] = torch.tensor(segment_label).int()

In [44]:
encoded_input

{'input_ids': tensor([[ 101, 2651, 2003, 1037, 3147, 1998,  103, 2154, 1012,  102]]), 'token_type_ids': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [45]:
new_data

{'bert_input': tensor([[ 101, 2651, 2003, 1037, 3147, 1998,  103, 2154, 1012,  102]],
        dtype=torch.int32),
 'segment_label': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int32)}

In [46]:
op_our = bert_ml(new_data['bert_input'], new_data['segment_label'])[0][:10]

torch.Size([1, 10, 768])
torch.Size([1, 10, 30522])


In [47]:
op_our = bert_ml(data['bert_input'], data['segment_label'])[0][:10]

torch.Size([1, 40, 768])
torch.Size([1, 40, 30522])


In [48]:
data

{'bert_input': tensor([[ 101,  103,  100, 2003, 2019, 2394, 2143, 1010, 2547, 1998, 3004, 3364,
          1012,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]]),
 'bert_label': tensor([[   0, 2728,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]]),
 'segment_label': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'mask_index': tensor([1])}

0

In [62]:
np.where(data['bert_input'][0].numpy()==0)[0]

array([38, 39])

In [67]:
    ixs = np.where(data['bert_input'][0].numpy()==0)[0]


In [66]:
ixs[0]

array([38, 39])

In [74]:
" ".join(str_vals)

'in 2004 [UNK] landed a role as " craig " in the [MASK] " teddy [UNK] story " of the television series the long firm ; he starred alongside actors mark strong and derek [UNK] . [SEP] [PAD]'

In [None]:
vocab.itos[op_our[mask_index]]

In [81]:
ix = 0
for ix, data in enumerate(train_loader):
    ix = ix + 1
    op_our = bert_ml(data['bert_input'], data['segment_label'])[0]
    mask_index = data['mask_index'][0].item()
    ixs = np.where(data['bert_input'][0].numpy()==0)[0]
    print(n, mask_index, )
    print("Mask Prediction: ", vocab.itos[op_our[mask_index].argmax().item()])
    str_vals = []
    for elem in data['bert_input'][0][1:-1]:
        if elem.item() == 0:
            break
        str_vals.append(vocab.itos[elem.item()])
    print(" ".join(str_vals))
    if ix == 5:
        break

torch.Size([1, 40, 768])
torch.Size([1, 40, 30522])
38 1
Mask Prediction:  james
[MASK] [UNK] is an english film , television and theatre actor . [SEP]
torch.Size([1, 40, 768])
torch.Size([1, 40, 30522])
38 12
Mask Prediction:  the
he had a guest [UNK] starring role on the television series [MASK] bill in 2000 . [SEP]
torch.Size([1, 40, 768])
torch.Size([1, 40, 30522])
38 10
Mask Prediction:  play
this was followed by a starring role in the [MASK] [UNK] written by simon stephens , which was performed in 2001 at the royal court theatre . [SEP]
torch.Size([1, 40, 768])
torch.Size([1, 40, 30522])
38 8
Mask Prediction:  television
he had a guest role in the [MASK] series judge john [UNK] in 2002 . [SEP]
torch.Size([1, 40, 768])
torch.Size([1, 40, 30522])
38 28
Mask Prediction:  starred
in 2004 [UNK] landed a role as " craig " in the episode " teddy [UNK] story " of the television series the long firm ; he [MASK] alongside actors mark strong and derek [UNK] . [SEP]


In [None]:
for i in range(10):
    print(vocab.itos[op_our[i].argmax().item()])

## Clearly They Don't Match but WHY?

In [None]:
ml_emb_op = ml_model.bert.embeddings.forward(encoded_input['input_ids'], encoded_input['token_type_ids'])

In [None]:
ml_emb_op

In [None]:
our_emb_op = bert_ml.bert.embedding.forward(new_data['bert_input'], new_data['segment_label'])

In [None]:
our_emb_op

### Disagreement Here --> First Attn Block

In [None]:
x = new_data['bert_input']
mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
bert_ml.bert.transformer_blocks[0](our_emb_op, mask)

In [None]:
ml_model.bert.encoder.layer[0].forward(ml_emb_op)

### Investigate SA

In [None]:
ml_model.bert.encoder.layer[0].attention.self

In [None]:
ml_model.bert.encoder.layer[0].attention.output

## Key

In [None]:
sa_cls.key.weight

### 0 - Q, 1 - Key, 2 - Val

In [None]:
bert_ml.bert.transformer_blocks[0].attention.linear_layers[1].weight

In [None]:
import math

In [None]:
for n, p in sa_cls.named_parameters():
    print(n)

In [None]:
type(ml_model.config)

In [None]:
ml_

In [None]:
ml_q = sa_cls.transpose_for_scores(sa_cls.query(ml_emb_op))
ml_k = sa_cls.transpose_for_scores(sa_cls.key(ml_emb_op))
ml_v = sa_cls.transpose_for_scores(sa_cls.value(ml_emb_op))

att_score = torch.matmul(ml_q, ml_k.transpose(-1, -2))
att_score = att_score / math.sqrt(64)
att_prob = nn.functional.softmax(att_score, dim=-1)

context_layer = torch.matmul(att_prob, ml_v)

context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (sa_cls.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)

In [None]:
context_layer

## Our Network SA

In [None]:
s = bert_ml.bert.transformer_blocks[0].attention

In [None]:
query, key, value = [l(x).view(1, -1, 12, 64).transpose(1, 2)
                     for l, x in zip(s.linear_layers, (our_emb_op, our_emb_op, our_emb_op))]

In [None]:
op1 = s.attention.forward(query, key, value, mask = mask)

In [None]:
op1 = op1[0].transpose(1, 2).contiguous().view(batch_size, -1, 12 * 64)

## Attention is the same on both

In [None]:
ml_model.bert.encoder.layer[0].attention.output.forward()

### Below individual verificaiton of Embedding

### Standard Encoding

In [None]:
token_ml = ml_model.bert.embeddings.word_embeddings(encoded_input['input_ids'])

In [None]:
token_ours = bert_ml.bert.embedding.token(new_data['bert_input'])

In [None]:
assert (token_ml == token_ours).all()

### Segment Encoding

In [None]:
seg_ml = ml_model.bert.embeddings.token_type_embeddings(encoded_input['token_type_ids'])

In [None]:
seg_ours = bert_ml.bert.embedding.segment(new_data['segment_label'])

In [None]:
assert (seg_ml == seg_ours).all()

### Position

In [None]:
pos_ids = torch.arange(512).expand((1, -1))
position_ids = pos_ids[:, 0 : 10 + 0]
pos_ml = ml_model.bert.embeddings.position_embeddings(position_ids)

In [None]:
pos_ours = bert_ml.bert.embedding.position(new_data['bert_input'])

In [None]:
assert (pos_ml == pos_ours).all()

In [None]:
final_embed_ml = token_ml + seg_ml + pos_ml

In [None]:
final_embed_ours = token_ours + seg_ours + pos_ours

### Add Layer Norm and DropOut

In [None]:
assert (bert_ml.bert.embedding.layer_norm.a_2 == ml_model.bert.embeddings.LayerNorm.weight).all()

In [None]:
assert (bert_ml.bert.embedding.layer_norm.b_2 == ml_model.bert.embeddings.LayerNorm.bias).all()

In [None]:
ml_model.bert.embeddings.LayerNorm(final_embed_ml)

In [None]:
bert_ml.bert.embedding.layer_norm(final_embed_ours)

In [None]:
encoded_input

In [None]:
ml_model.bert.embeddings.forward(encoded_input['input_ids'], encoded_input['token_type_ids'])

In [None]:
bert_ml.bert.embedding.forward(new_data['bert_input'], new_data['segment_label'])