# prepare the dataset
Simple input: a short conversation between two people.

In [1]:
'''
  code by wmathor, modify by qingjiu
  Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch
         https://github.com/JayParks/transformer, https://github.com/dhlee347/pytorchic-bert
'''
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

text = (
    'Hello, how are you? I am Romeo.\n' # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
    'Nice meet you too. How are you today?\n' # R
    'Great. My baseball team won the competition.\n' # J
    'Oh Congratulations, Juliet\n' # R
    'Thank you Romeo\n' # J
    'Where are you going today?\n' # R
    'I am going shopping. What about you?\n' # J
    'I am going to visit my grandmother. she is not very well' # R
)

# Lower-case the corpus, strip '.', ',', '!', '?' and '-', then split it into
# one sentence per line.
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')
# Unique words of the corpus, e.g. ['hello', 'how', 'are', 'you', ...].
# The list comes from a set, so its order is not guaranteed between runs.
word_list = list(set(" ".join(sentences).split()))

# Vocabulary: the four special tokens take ids 0-3, corpus words follow from 4.
word2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}
word2idx.update((word, word_id) for word_id, word in enumerate(word_list, start=4))
# Ids were handed out in insertion order, so enumerating the keys inverts the map.
idx2word = dict(enumerate(word2idx))
vocab_size = len(word2idx)

# Every sentence encoded as a list of token ids.
token_list = [[word2idx[word] for word in sentence.split()] for sentence in sentences]

# Have a look at token_list (notebook cell output).
token_list

[[35, 12, 24, 27, 16, 21, 9],
 [35, 9, 31, 15, 36, 20, 5, 13, 37, 27],
 [5, 37, 27, 10, 12, 24, 27, 30],
 [7, 31, 11, 22, 6, 17, 23],
 [14, 32, 20],
 [33, 27, 9],
 [18, 24, 27, 39, 30],
 [16, 21, 39, 29, 25, 34, 27],
 [16, 21, 39, 13, 28, 31, 8, 38, 36, 19, 4, 26]]

# hyper parameter
`maxlen`: indicates that all sentences in the same batch are composed of 30 tokens, and PAD is used to make up for the insufficient tokens (the implementation here is relatively crude, and all sentences in all batches are directly fixed to 30)

`max_pred`: indicates the maximum number of words that need to be predicted, that is, the cloze task in BERT

`n_layers`: indicates the number of Encoder Layers

`d_model`: indicates the dimensions of Token Embeddings, Segment Embeddings, and Position Embeddings

`d_ff`: indicates the dimensions of the fully connected layer in the Encoder Layer

`n_segments`: indicates how many sentences the model (encoder) input consists of

`maxlen`: 表示同一个 batch 中的所有句子都由 30 个 token 组成，不够的补 PAD（这里我实现的方式比较粗暴，直接固定所有 batch 中的所有句子都为 30）

`max_pred`: 表示最多需要预测多少个单词，即 BERT 中的完形填空任务

`n_layers`: 表示 Encoder Layer 的数量

`d_model`: 表示 Token Embeddings、Segment Embeddings、Position Embeddings 的维度

`d_ff`: 表示 Encoder Layer 中全连接层的维度

`n_segments`: 表示模型（Encoder）的输入由几句话组成

In [2]:
# BERT hyperparameters
maxlen = 30            # every input sequence is padded to this many tokens
batch_size = 6
max_pred = 5           # maximum number of masked tokens predicted per sequence
n_layers = 6           # number of encoder layers
n_heads = 12           # attention heads per layer
d_model = 768          # width of the token / segment / position embeddings
d_ff = 4 * d_model     # width of the feed-forward layer
d_k = d_v = 64         # per-head width of K (= Q) and V
n_segments = 2

# Data preprocessing
A random 15% of the `tokens` in each input must be masked or replaced (hereinafter collectively referred to as `mask`), and any two sentences are spliced together to form one input

需要按照概率随机 mask 或替换（以下统称为`mask`）一个句子中15%的`tokens`，并且拼接任意两个句子

In [3]:
# Sample IsNext and NotNext pairs in equal numbers for one small batch.
def make_data():
    """Build one batch of BERT pre-training samples from the toy corpus.

    Each sample joins two randomly chosen sentences as
    ``[CLS] a [SEP] b [SEP]``, corrupts about 15% of the ordinary tokens for
    the masked-LM task and labels the pair IsNext when ``b`` directly follows
    ``a`` in the corpus. Half of the batch (rounded down) is IsNext, the
    remainder NotNext.

    Reads the module-level ``sentences``, ``token_list``, ``word2idx``,
    ``vocab_size``, ``batch_size``, ``max_pred`` and ``maxlen``.

    Returns:
        list: ``batch_size`` samples of the form
        ``[input_ids, segment_ids, masked_tokens, masked_pos, is_next]``.
        ``input_ids`` and ``segment_ids`` are zero-padded to ``maxlen``;
        ``masked_tokens`` and ``masked_pos`` are zero-padded to ``max_pred``.

    Raises:
        ValueError: if IsNext pairs are required but the corpus holds fewer
            than two sentences, or if a sentence pair does not fit in
            ``maxlen`` tokens.
    """
    # Integer quotas: the loop also terminates for an odd batch_size.
    n_positive = batch_size // 2
    n_negative = batch_size - n_positive
    if n_positive and len(sentences) < 2:
        raise ValueError('need at least two sentences to sample IsNext pairs')

    cls_id, sep_id, mask_id = word2idx['[CLS]'], word2idx['[SEP]'], word2idx['[MASK]']
    first_word_id = 4  # ids 0-3 are [PAD], [CLS], [SEP] and [MASK]

    batch = []
    positive = negative = 0
    while positive < n_positive or negative < n_negative:
        # Pick two random sentences; they are consecutive when the indices differ by one.
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        is_next = tokens_a_index + 1 == tokens_b_index

        # Skip the pair early when the quota for its label is already full.
        if is_next and positive >= n_positive:
            continue
        if not is_next and negative >= n_negative:
            continue

        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        input_ids = [cls_id] + tokens_a + [sep_id] + tokens_b + [sep_id]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        if len(input_ids) > maxlen:
            raise ValueError(
                'sentence pair needs %d tokens but maxlen is %d' % (len(input_ids), maxlen))

        # Masked LM: select 15% of the tokens (at least 1, at most max_pred),
        # never the [CLS] / [SEP] markers.
        n_pred = min(max_pred, max(1, int(len(input_ids) * 0.15)))
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != cls_id and token != sep_id]
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            # A single draw decides the outcome, so the split really is 80/10/10.
            roll = random()
            if roll < 0.8:
                # 80%: replace with [MASK]
                input_ids[pos] = mask_id
            elif roll < 0.9:
                # 10%: replace with a random ordinary word (never a special token)
                input_ids[pos] = randint(first_word_id, vocab_size - 1)
            # remaining 10%: keep the token unchanged

        # Zero-pad the inputs up to maxlen.
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Sentences of different length yield a different number of masked
        # tokens; pad both lists with 0 so every sample has max_pred slots.
        n_pad = max_pred - len(masked_pos)
        masked_tokens.extend([0] * n_pad)
        masked_pos.extend([0] * n_pad)

        batch.append([input_ids, segment_ids, masked_tokens, masked_pos, is_next])
        if is_next:
            positive += 1
        else:
            negative += 1
    return batch
# Preprocessing finished

# build Model
Use PyTorch to build the BERT model.

prepare the dataset

In [4]:
# Draw one batch and turn each of its five columns into a LongTensor.
batch = make_data()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = (
    torch.LongTensor(column) for column in zip(*batch)
)
print("input_ids.shape is {}, masked_pos.shape is {}".format(input_ids.shape, masked_pos.shape))

class MyDataSet(Data.Dataset):
    """Dataset over five parallel, index-aligned sequences of sample fields."""

    def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):
        # The sequences are stored as given; item i of each belongs to sample i.
        self.input_ids, self.segment_ids = input_ids, segment_ids
        self.masked_tokens, self.masked_pos = masked_tokens, masked_pos
        self.isNext = isNext

    def __len__(self):
        # The number of samples is taken from the input_ids sequence.
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Return the five fields of sample `idx` as one tuple.
        columns = (self.input_ids, self.segment_ids, self.masked_tokens,
                   self.masked_pos, self.isNext)
        return tuple(column[idx] for column in columns)

# Wrap the batch tensors in a dataset and serve it, shuffled, through the
# stock PyTorch DataLoader.
mydataset = MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext)
loader = Data.DataLoader(mydataset, batch_size=batch_size, shuffle=True)

input_ids.shape is torch.Size([6, 30]), masked_pos.shape is torch.Size([6, 5])


Model implementation

在神经网络的建模过程中，模型很重要的性质就是非线性，同时为了模型泛化能力，需要加入随机正则，例如`dropout`(随机置一些输出为0,其实也是一种变相的随机非线性激活)， 而随机正则与非线性激活是分开的两个事情， 而其实模型的输入是由非线性激活与随机正则两者共同决定的。

`GELUs`(高斯误差线性单元(Gaussian Error Linear Unit))正是在激活中引入了随机正则的思想，是一种对神经元输入的概率描述，直观上更符合自然的认识，同时实验效果要比`Relus`与`ELUs`都要好.`GELUs`其实是 `dropout`、`zoneout`、`Relus`的综合

而其中torch.erf(x / math.sqrt(2.0)) 的范围是 -1 到 1

get_attn_pad_mask 这个函数用于生成自注意力机制（Self-Attention Mechanism）中的注意力掩码（Attention Mask）。具体来说，这个函数会生成一个用于遮蔽填充位置（padding positions）的掩码，确保在计算注意力分数时这些填充位置不会对结果产生影响

In [5]:
def get_attn_pad_mask(seq_q, seq_k):
    """Build the attention mask that hides padded key positions.

    Args:
        seq_q: token ids of the queries, shape [batch_size, len_q].
        seq_k: token ids of the keys, shape [batch_size, len_k]; id 0 is [PAD].

    Returns:
        Bool tensor of shape [batch_size, len_q, len_k] that is True wherever
        the key token is padding, i.e. where attention must be suppressed.
    """
    n_batch, len_q = seq_q.size()
    len_k = seq_k.size(1)
    # The padding to hide lives on the key side: every query row gets the same
    # per-key flags.
    pad_attn_mask = seq_k.eq(0).unsqueeze(1)  # [batch_size, 1, len_k]
    return pad_attn_mask.expand(n_batch, len_q, len_k)  # [batch_size, len_q, len_k]

# Activation function
def gelu(x):
    """
      Gaussian Error Linear Unit, computed exactly through the error function.
      For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
      0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
      Also see https://arxiv.org/abs/1606.08415
    """
    # 0.5 * (1 + erf(x / sqrt(2))) is the gate that scales x.
    gate = 1.0 + torch.erf(x / math.sqrt(2.0))
    return 0.5 * x * gate

In [6]:
# BERT input embedding: the sum of the three embeddings applied to every
# token (tok_embed, pos_embed and seg_embed), followed by LayerNorm.
class Embedding(nn.Module):
    """Token + position + segment embedding, normalised with LayerNorm.

    Sizes come from the module-level ``vocab_size``, ``maxlen``,
    ``n_segments`` and ``d_model``.
    """

    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        """Embed a batch of token ids.

        Args:
            x: token ids, shape [batch_size, seq_len].
            seg: segment ids, same shape as ``x``.

        Returns:
            Normalised embeddings of shape [batch_size, seq_len, d_model].
        """
        seq_len = x.size(1)
        # Create the position ids on the same device as the input so the
        # module also works when model and data are moved off the CPU.
        pos = torch.arange(seq_len, dtype=torch.long, device=x.device)
        pos = pos.unsqueeze(0).expand_as(x)  # [seq_len] -> [batch_size, seq_len]
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

In [7]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V with masking."""

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, q_s, k_s, v_s, attn_mask):
        """Attend from the queries to the keys and mix the values.

        Args:
            q_s: queries, shape [batch_size, n_heads, len_q, d_k].
            k_s: keys, shape [batch_size, n_heads, len_k, d_k].
            v_s: values, shape [batch_size, n_heads, len_k, d_v].
            attn_mask: bool tensor [batch_size, n_heads, len_q, len_k];
                True marks positions that must not be attended to.

        Returns:
            Context tensor of shape [batch_size, n_heads, len_q, d_v].
        """
        # Scale by the actual per-head key width instead of a global constant,
        # so the layer stays correct for any head size it is given.
        scale = math.sqrt(k_s.size(-1))
        scores = q_s @ k_s.transpose(-1, -2) / scale  # [batch_size, n_heads, len_q, len_k]
        # A large negative score drives the softmax weight of masked positions to ~0.
        scores = scores.masked_fill(attn_mask, -1e9)
        attn = torch.softmax(scores, dim=-1)  # [batch_size, n_heads, len_q, len_k]
        return attn @ v_s

In [8]:
# Multi-head attention
class MultiHeadAttention(nn.Module):
    """Multi-head attention sub-layer with a residual connection and LayerNorm.

    Sizes come from the module-level ``d_model``, ``d_k``, ``d_v`` and ``n_heads``.
    """

    def __init__(self):
        super().__init__()
        self.W_Q = nn.Linear(d_model, n_heads * d_k)
        self.W_K = nn.Linear(d_model, n_heads * d_k)
        self.W_V = nn.Linear(d_model, n_heads * d_v)
        self.l1 = nn.Linear(n_heads * d_v, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.attention_cal_result = ScaledDotProductAttention()

    def forward(self, Q, K, V, attn_mask):
        # Q, K, V: [batch_size, seq_len, d_model]; attn_mask: [batch_size, seq_len, seq_len]
        n_batch = Q.size(0)

        def split_heads(projected, head_width):
            # (B, S, H * W) -> (B, S, H, W) -> (B, H, S, W)
            return projected.view(n_batch, -1, n_heads, head_width).transpose(1, 2)

        # Project the inputs and split the result into per-head slices.
        queries = split_heads(self.W_Q(Q), d_k)
        keys = split_heads(self.W_K(K), d_k)
        values = split_heads(self.W_V(V), d_v)

        # Give every head its own copy of the mask: [batch_size, n_heads, seq_len, seq_len]
        head_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)

        # Per-head context: [batch_size, n_heads, seq_len, d_v]
        per_head = self.attention_cal_result(queries, keys, values, head_mask)
        # Merge the heads back: [batch_size, seq_len, n_heads * d_v]
        merged = per_head.transpose(1, 2).contiguous().view(n_batch, -1, n_heads * d_v)
        # Output projection, residual connection with the input Q, then LayerNorm.
        return self.norm(self.l1(merged) + Q)  # [batch_size, seq_len, d_model]

In [9]:
# Feed-forward network
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Linear."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # [batch_size, seq_len, d_model] -> [batch_size, seq_len, d_ff]
        hidden = gelu(self.fc1(x))
        # ... and back to [batch_size, seq_len, d_model]
        return self.fc2(hidden)

In [10]:
# Encoder layer
class EncoderLayer(nn.Module):
    """One encoder layer: multi-head self-attention followed by the feed-forward net."""

    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        # Self-attention: the same tensor serves as Q, K and V.
        attended = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        # Result: [batch_size, seq_len, d_model]
        return self.pos_ffn(attended)

### torch.gather函数 
out = torch.gather(input, dim, index)

gather返回的out的形状和index一样

#out[i][j][k] = input[index[i][j][k]][j][k] # dim=0

#out[i][j][k] = input[i][index[i][j][k]][k] # dim=1

#out[i][j][k] = input[i][j][index[i][j][k]] # dim=2

`optim.Adadelta` 是 PyTorch 中的一种优化器，它是 Adadelta 算法的实现。Adadelta 是一种自适应学习率方法，旨在解决 Adagrad 中学习率衰减过快的问题。Adadelta 通过限制累积窗口中的梯度更新来避免学习率迅速下降

In [11]:
# BERT model
class BERT(nn.Module):
    """Encoder stack with a next-sentence classifier and a masked-LM head."""

    def __init__(self):
        super().__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        # Pooling head applied to the [CLS] position.
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(d_model, 2)
        # Masked-LM head: Linear -> gelu -> decoder.
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        # The decoder fc2 shares its weight matrix with the token embedding.
        self.fc2 = nn.Linear(d_model, vocab_size, bias=False)
        self.fc2.weight = self.embedding.tok_embed.weight

    def forward(self, input_ids, segment_ids, masked_pos):
        # True in the mask marks positions whose token id is 0, i.e. [PAD].
        pad_mask = get_attn_pad_mask(input_ids, input_ids)  # [batch_size, maxlen, maxlen]

        # Embed, then run the hidden states through every encoder layer in turn.
        hidden = self.embedding(input_ids, segment_ids)  # [batch_size, seq_len, d_model]
        for encoder in self.layers:
            hidden = encoder(hidden, pad_mask)

        # isNext is decided from the first token ([CLS]).
        cls_state = hidden[:, 0]  # [batch_size, d_model]
        logits_clsf = self.classifier(self.fc(cls_state))  # [batch_size, 2]

        # Pick the hidden states at the masked positions:
        # index [batch_size, max_pred] -> [batch_size, max_pred, d_model]
        gather_index = masked_pos.unsqueeze(-1).expand(-1, -1, d_model)
        masked_states = hidden.gather(1, gather_index)
        masked_states = self.activ2(self.linear(masked_states))
        # Vocabulary logits for every masked slot: [batch_size, max_pred, vocab_size]
        logits_lm = self.fc2(masked_states)
        return logits_lm, logits_clsf

# Instantiate the network, the cross-entropy loss used by both tasks and the
# Adadelta optimizer.
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(params=model.parameters(), lr=0.001)

In [16]:
# Train for 50 epochs; the loss is reported on every 10th epoch.
for epoch in range(50):
    report = (epoch + 1) % 10 == 0
    for input_ids, segment_ids, masked_tokens, masked_pos, isNext in loader:
        optimizer.zero_grad()
        logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
        # Masked-LM loss over all prediction slots, flattened to one axis.
        loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1))
        loss_lm = loss_lm.float().mean()
        # Sentence-pair (isNext) classification loss.
        loss_clsf = criterion(logits_clsf, isNext)
        loss = loss_lm + loss_clsf
        if report:
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
        loss.backward()
        optimizer.step()

Epoch: 0010 loss = 0.674609
Epoch: 0020 loss = 0.628631
Epoch: 0030 loss = 0.577879
Epoch: 0040 loss = 0.552309
Epoch: 0050 loss = 0.440841


In [17]:
# Predict masked tokens and isNext for one sample of the batch
input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[1]
print(text)
print('================================')
print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])

# Run inference in eval mode: otherwise the Dropout(0.5) in the pooling head
# stays active and randomly perturbs the isNext prediction. Gradients are not
# needed here, and the previous train/eval mode is restored afterwards so a
# later training run is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    logits_lm, logits_clsf = model(torch.LongTensor([input_ids]),
                                   torch.LongTensor([segment_ids]),
                                   torch.LongTensor([masked_pos]))
model.train(was_training)

# Most likely vocabulary id for every prediction slot of the single sample.
logits_lm = logits_lm.argmax(dim=2)[0].numpy()
# Id 0 is [PAD]: it marks unused prediction slots and is filtered out.
print('masked tokens list : ',[pos for pos in masked_tokens if pos != 0])
print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

# Predicted class index for the sentence pair (non-zero means IsNext).
logits_clsf = logits_clsf.argmax(dim=1).numpy()[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_clsf else False)

Hello, how are you? I am Romeo.
Hello, Romeo My name is Juliet. Nice to meet you.
Nice meet you too. How are you today?
Great. My baseball team won the competition.
Oh Congratulations, Juliet
Thank you Romeo
Where are you going today?
I am going shopping. What about you?
I am going to visit my grandmother. she is not very well
['[CLS]', 'nice', 'meet', '[MASK]', 'too', 'how', 'are', 'you', 'today', '[SEP]', 'i', 'am', 'going', 'to', '[MASK]', 'my', 'grandmother', 'she', 'is', 'not', 'very', 'well', '[SEP]']
masked tokens list :  [16, 28, 27]
predict masked tokens list :  [16, 28, 27]
isNext :  False
predict isNext :  False
