#  seq2seq + Luong 注意力 进行中英文翻译

In [1]:
import numpy as np
import jieba
from collections import Counter  #计数器
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import MultiheadAttention
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torchtext.data.utils import get_tokenizer  #分词器

使用Luong 注意力进行中英翻译训练，[数据来源](https://github.com/cuicaihao/Annotated-Transformer-English-to-Chinese-Translator/tree/master/data/nmt/en-cn)，包含成对的中英文翻译句子，包含7个文件，cmn.txt是全体数据集；train.txt, dev.txt, test.txt为对全体数据集的80%、10%、
10%划分。train_mini.txt, dev_mini.txt, test_mini.txt为小样本数据，分别包含1000、200、200条数据。

In [2]:
UNK_IDX = 0  #未知
PAD_IDX = 1  #
BATCH_SIZE = 64
EPOCHS = 30
DROPOUT = 0.2
ENC_HIDDEN_SIZE = DEC_HIDDEN_SIZE = 100
EMBED_SIZE = 100
DEBUG = True

if DEBUG:

    train_file = 'attn_data/train_mini.txt'
    dev_file = 'attn_data/dev_mini.txt'
    test_file = 'attn_data/test_mini.txt'
    save_file = 'attn_data/model.pt'
else:

    train_file = 'attn_data/train.txt'
    dev_file = 'attn_data/dev.txt'
    test_file = 'attn_data/test.txt'
    save_file = 'attn_data/large_model.pt'

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
#分词器
tokenizer_en = get_tokenizer('basic_english')  #按空格进行分割
tokenizer_cn = get_tokenizer(jieba.lcut)  #进行结巴分词


#加载文件
def load_data(path):
    en = []
    cn = []
    with open(path, 'r', encoding = 'utf-8') as f:
        for line in f:
            line = line.strip().split('\t')
            en.append(["BOS"] + tokenizer_en(line[0].lower()) + ["EOS"])  #小写
            cn.append(["BOS"] + tokenizer_cn(line[1]) + ["EOS"])
    return en, cn


train_en, train_zh = load_data(train_file)
dev_en, dev_zh = load_data(dev_file)
test_en, test_zh = load_data(test_file)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\NIEYUZ~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.684 seconds.
Prefix dict has been built successfully.


In [4]:
print(train_en[0], train_zh[0])

['BOS', 'anyone', 'can', 'do', 'that', '.', 'EOS'] ['BOS', '任何人', '都', '可以', '做到', '。', 'EOS']


In [5]:
#构建词汇表
def build_dict(sentences, max_words = 50000):
    vocab = Counter(np.concatenate(sentences)).most_common(max_words)  #最大单词数是50000
    word_to_id = {w[0]: index + 2 for index, w in enumerate(vocab)}
    word_to_id['UNK'] = UNK_IDX  #0
    word_to_id['PAD'] = PAD_IDX  #1
    id_to_word = {v: k for k, v in word_to_id.items()}
    return word_to_id, id_to_word


en_wtoi, en_itow = build_dict(train_en)
zh_wtoi, zh_itow = build_dict(train_zh)

In [6]:
en_itow

{2: 'BOS',
 3: 'EOS',
 4: '.',
 5: 'i',
 6: 'the',
 7: "'",
 8: 'you',
 9: 'to',
 10: 'a',
 11: '?',
 12: 'is',
 13: 'he',
 14: 't',
 15: 'in',
 16: 'it',
 17: 'of',
 18: 'she',
 19: 's',
 20: 'have',
 21: 'me',
 22: 'tom',
 23: ',',
 24: 'do',
 25: 'that',
 26: 'for',
 27: 'my',
 28: 'don',
 29: 'are',
 30: 'her',
 31: 'what',
 32: 'this',
 33: 'at',
 34: 'with',
 35: 'your',
 36: 'we',
 37: 'was',
 38: 'not',
 39: 'like',
 40: 'on',
 41: 'his',
 42: 'can',
 43: 'has',
 44: 'm',
 45: 'be',
 46: 'go',
 47: 'him',
 48: 'please',
 49: 'they',
 50: 'will',
 51: 'there',
 52: 'from',
 53: 'want',
 54: 'and',
 55: 'know',
 56: 'how',
 57: 'very',
 58: 'll',
 59: 'here',
 60: 'an',
 61: 'didn',
 62: 'about',
 63: 'should',
 64: 'did',
 65: 'get',
 66: 'had',
 67: 'up',
 68: 've',
 69: 'by',
 70: 'out',
 71: 'all',
 72: 'need',
 73: 'time',
 74: 'why',
 75: 'many',
 76: 'no',
 77: 'could',
 78: 'some',
 79: 'good',
 80: 'one',
 81: 'day',
 82: 'give',
 83: 'made',
 84: 'would',
 85: 'school',

In [7]:
# 利用词典对原始句子编码 单词->数字
def encode(en_sentences, ch_sentences, en_wtoi, zh_wtoi, sort_by_len = True):
    out_en_sentences = [[en_wtoi.get(w, UNK_IDX) for w in sent] for sent in en_sentences]
    out_ch_sentences = [[zh_wtoi.get(w, UNK_IDX) for w in sent] for sent in ch_sentences]

    #返回w对应的值，否则返回UNK_IDX
    def len_argsort(seq):  #按照长度进行排序
        return sorted(range(len(seq)), key = lambda x: len(seq[x]))

    # 把中文和英文按照同样的顺序排序
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_ch_sentences = [out_ch_sentences[i] for i in sorted_index]

    return out_en_sentences, out_ch_sentences


train_en_encode, train_zh_encode = encode(train_en, train_zh, en_wtoi, zh_wtoi)
dev_en_encode, dev_zh_encode = encode(dev_en, dev_zh, en_wtoi, zh_wtoi)
test_en_encode, test_zh_encode = encode(test_en, test_zh, en_wtoi, zh_wtoi)


In [8]:
test_en_encode

[[2, 423, 16, 4, 3],
 [2, 18, 435, 1162, 4, 3],
 [2, 56, 29, 8, 11, 3],
 [2, 1272, 35, 135, 4, 3],
 [2, 215, 21, 95, 4, 3],
 [2, 6, 339, 0, 4, 3],
 [2, 42, 182, 269, 11, 3],
 [2, 0, 313, 8, 4, 3],
 [2, 22, 0, 0, 4, 3],
 [2, 31, 63, 5, 601, 11, 3],
 [2, 13, 12, 76, 0, 4, 3],
 [2, 18, 132, 51, 170, 4, 3],
 [2, 13, 0, 6, 0, 4, 3],
 [2, 5, 115, 192, 103, 4, 3],
 [2, 5, 92, 52, 0, 4, 3],
 [2, 8, 558, 6, 0, 4, 3],
 [2, 5, 458, 30, 67, 4, 3],
 [2, 13, 0, 41, 0, 4, 3],
 [2, 198, 29, 353, 421, 4, 3],
 [2, 0, 100, 12, 738, 4, 3],
 [2, 144, 29, 27, 421, 11, 3],
 [2, 13, 0, 6, 0, 4, 3],
 [2, 13, 140, 137, 287, 4, 3],
 [2, 22, 0, 214, 170, 4, 3],
 [2, 0, 17, 6, 230, 317, 3],
 [2, 24, 8, 137, 0, 11, 3],
 [2, 6, 672, 1420, 134, 4, 3],
 [2, 6, 309, 0, 0, 4, 3],
 [2, 411, 12, 35, 121, 11, 3],
 [2, 43, 103, 79, 300, 11, 3],
 [2, 18, 247, 15, 107, 1344, 4, 3],
 [2, 28, 7, 14, 45, 0, 4, 3],
 [2, 5, 92, 10, 0, 59, 4, 3],
 [2, 22, 585, 67, 6, 1013, 4, 3],
 [2, 13, 93, 45, 62, 958, 4, 3],
 [2, 5, 37, 0, 69, 

In [9]:
#返回每个batch的id
def get_minibatches(n, minibatch_size, shuffle = True):
    idx_list = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches

In [10]:
get_minibatches(50, 10, shuffle = True)  #得到每一个batch对应的id

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
 array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39]),
 array([40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
 array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29])]

In [11]:
#将句子对划分到batch
def get_batches(en_encode, ch_encode):
    batch_indexs = get_minibatches(len(en_encode), BATCH_SIZE)

    batches = []
    for batch_index in batch_indexs:
        batch_en = [torch.tensor(en_encode[index]).long() for index in batch_index]  #每一个idx对应的句子，转为tensor格式
        batch_zh = [torch.tensor(ch_encode[index]).long() for index in batch_index]
        length_en = torch.tensor([len(en) for en in batch_en]).long()  #每一个句子的长度
        length_zh = torch.tensor([len(zh) for zh in batch_zh]).long()

        batch_en = pad_sequence(batch_en, padding_value = PAD_IDX, batch_first = True)  #讲一个batch中的句子padding为相同长度
        batch_zh = pad_sequence(batch_zh, padding_value = PAD_IDX, batch_first = True)

        batches.append((batch_en, batch_zh, length_en, length_zh))
    return batches


train_data = get_batches(train_en_encode, train_zh_encode)
dev_data = get_batches(dev_en_encode, dev_zh_encode)

In [12]:
train_data[0][3]

tensor([ 8,  8,  8, 10,  9,  7,  9, 10,  8,  5,  8,  8,  7, 10,  9,  8,  7,  9,
        15,  9,  9,  9,  9,  8,  9, 11,  9,  8,  8,  6, 12,  8,  9,  8,  8,  9,
         9,  8, 10, 10,  9,  8,  8, 11, 10, 10,  7,  9,  8, 10,  8,  9, 11,  8,
         6,  8,  9,  7, 10, 11, 11,  8,  9,  7])

## 建立模型

![](https://cdn.mathpix.com/snip/images/8Es5WLO-mn8kY7GO-T7o1IxWUBPbZeLrz3JbqY5U2Vw.original.fullsize.png)

其中 $\bar{h}_{s}$ 表示encoder每个hidden_state的输出， $h_{t}$ 表示decoder每个hidden_state的输出。


$$
\begin{aligned}
a_{t}(s) 
&=\frac{\exp \left(\operatorname{score}\left(h_{t}, \bar{h}_{s}\right)\right)}{\sum_{s^{\prime}} \exp \left(\operatorname{score}\left(h_{t}, \bar{h}_{s^{\prime}}\right)\right)} \\
c_{t} &= \sum a_{t} \bar{h}_{s} \\
\tilde{h}_{t} &=\tanh \left(W_{c}\left[c_{t} ; h_{t}\right]\right)\\

\end{aligned}
$$

$\operatorname{score}\left(\boldsymbol{h}_{t}, \overline{\boldsymbol{h}}_{s}\right)= \begin{cases}\boldsymbol{h}_{t}^{\top} \overline{\boldsymbol{h}}_{s} & \text { dot } \\ \boldsymbol{h}_{t}^{\top} \boldsymbol{W}_{\boldsymbol{a}} \overline{\boldsymbol{h}}_{s} & \text { general } \\ \boldsymbol{v}_{a}^{\top} \tanh \left(\boldsymbol{W}_{\boldsymbol{a}}\left[\boldsymbol{h}_{t} ; \overline{\boldsymbol{h}}_{s}\right]\right) & \text { concat }\end{cases}$

In [13]:
class LuongEncoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout):
        super(LuongEncoder, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, bidirectional = True)  #双向GRU
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, x_lengths):
        """
        input_seqs : batch_size,max(x_lengths)
        input_lengths: batch_size
        """
        embedded = self.dropout(self.embedding(x))  #batch_size,max(x_lengths),embed_size
        packed = pack_padded_sequence(embedded, x_lengths.long().cpu().data.numpy(), batch_first = True,
                                      enforce_sorted = False)
        #压缩填充张量,压缩掉无效的填充值
        #enforce_sorted：如果是 True ，则输入应该是按长度降序排序的序列。如果是 False ，会在函数内部进行排序 
        outputs, hidden = self.rnn(packed)
        outputs, _ = pad_packed_sequence(outputs, padding_value = PAD_IDX, batch_first = True)  #还原

        #hidden (2, batch_size, enc_hidden_size)
        #outputs (batch_size,seq_len, 2 * enc_hidden_size)

        hidden = torch.cat([hidden[-2], hidden[-1]], dim = 1)
        hidden = torch.tanh(self.fc(hidden)).unsqueeze(0)
        return outputs, hidden

In [14]:
class Attn(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attn, self).__init__()
        #general attention
        self.linear_in = nn.Linear(enc_hidden_size * 2, dec_hidden_size, bias = False)
        self.linear_out = nn.Linear(enc_hidden_size * 2 + dec_hidden_size, dec_hidden_size)

    def forward(self, output, encoder_out, mask):
        """
        output:batch_size, max(y_lengths), dec_hidden_size  #(h_t)
        encoder_out:batch_size, max(x_lengths), 2 * enc_hidden_size  #(h_s)
        """
        batch_size = output.shape[0]
        output_len = output.shape[1]
        input_len = encoder_out.shape[1]

        encoder_out1 = self.linear_in(encoder_out.view(batch_size * input_len, -1)).view(batch_size, input_len, -1)
        #Wh_s 
        #batch_size,max(x_lengths),dec_hidden_size
        score = torch.bmm(output, encoder_out1.transpose(1, 2))  #实现三维数组的乘法，而不用拆成二维数组使用for循环解决
        #[batch_size,max(y_lengths),dec_hidden_size] * [batch_size,dec_hidden_size,max(x_lengths)]
        #batch_size,max(y_lengths),max(x_lengths)#score = h_t W h_s
        score.data.masked_fill(mask, -1e16)
        attn = F.softmax(score, dim = 2)  #attention系数矩阵

        ct = torch.bmm(attn, encoder_out)  #ct = aths
        #[batch_size,max(y_lengths),max(x_lengths)] * [batch_size, max(x_lengths), 2 * enc_hidden_size]
        #batch_size, max(y_lengths), enc_hidden_size*2
        output = torch.cat((ct, output), dim = 2)

        output = output.view(batch_size * output_len, -1)
        output = torch.tanh(self.linear_out(output))
        output = output.view(batch_size, output_len, -1)
        #batch_size, max(y_lengths), dec_hidden_size

        return output, attn


In [15]:
class LuongDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout):
        super(LuongDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = Attn(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first = True)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(dec_hidden_size, vocab_size)

    def creat_mask(self, x, y):
        x_mask = x.data != PAD_IDX  #batch_size,max(x_lengths)
        y_mask = y.data != PAD_IDX  #batch_size,max(y_lengths)
        mask = (1 - (x_mask.unsqueeze(2) * y_mask.unsqueeze(1)).float()).bool()
        #batch_size,max(x_lengths),max(y_lengths)
        return mask

    def forward(self, encoder_out, x, y, y_lengths, hid):
        mask = self.creat_mask(y, x)
        y = self.dropout(self.embedding(y))
        packed = pack_padded_sequence(y, y_lengths.long().cpu().data.numpy(), batch_first = True,
                                      enforce_sorted = False)
        out, hid = self.rnn(packed, hid)

        out, _ = pad_packed_sequence(out, padding_value = PAD_IDX, batch_first = True)

        output, attn = self.attention(out, encoder_out, mask)
        output = self.out(output)
        #batch_size, max(y_lengths), vocab_size
        return output, hid, attn

In [16]:
class seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(encoder_out,  #这里输出的hid是decoder_rnn的hid
                                         x = x,
                                         y = y,
                                         y_lengths = y_lengths,
                                         hid = hid)  #encoder的hid
        return output, attn

    def translate(self, x, x_lengths, y, max_length = 15):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for _ in range(max_length):
            output, hid, attn = self.decoder(encoder_out,
                                             x = x,
                                             y = y,
                                             y_lengths = torch.ones(batch_size).long().to(y.device),
                                             hid = hid)

            y = output.max(2)[1].view(batch_size, 1)

            preds.append(y)
            attns.append(attn)
        return torch.cat(preds, 1), torch.cat(attns, 1)


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Define Model
encoder = LuongEncoder(vocab_size = len(en_itow), embed_size = EMBED_SIZE, enc_hidden_size = ENC_HIDDEN_SIZE,
                       dec_hidden_size = DEC_HIDDEN_SIZE, dropout = DROPOUT)
decoder = LuongDecoder(vocab_size = len(zh_itow), embed_size = EMBED_SIZE, enc_hidden_size = ENC_HIDDEN_SIZE,
                       dec_hidden_size = DEC_HIDDEN_SIZE, dropout = DROPOUT)
model = seq2seq(encoder, decoder)
model = model.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index = PAD_IDX)  #忽略padding位置的损失
optimizer = torch.optim.Adam(model.parameters())

In [18]:
def train_epoch(model, optimizer, train_data):
    model.train()
    losses = 0
    for x, y, x_lengths, y_lengths in train_data:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        x_lengths = x_lengths.to(DEVICE)

        y_input = y[:, :-1]  #将前seq-1个单词作为输入
        y_output = y[:, 1:]  #将后seq-1个单词作为输出，相当于前一个单词预测后一个单词
        y_lengths = (y_lengths - 1).to(DEVICE)

        logits, _ = model(x, x_lengths, y_input, y_lengths)  #batch_size, max(y_lengths), vocab_size
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), y_output.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses += loss.item()

    return losses / len(train_data)


def evaluate(model, dev_data):
    model.train()
    losses = 0
    for x, y, x_lengths, y_lengths in train_data:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        x_lengths = x_lengths.to(DEVICE)

        y_input = y[:, :-1]
        y_output = y[:, 1:]
        y_lengths = (y_lengths - 1).to(DEVICE)
        logits, _ = model(x, x_lengths, y_input, y_lengths)
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), y_output.reshape(-1))

        losses += loss.item()

    return losses / len(train_data)

In [19]:
from timeit import default_timer as timer

for epoch in range(1, EPOCHS + 1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer, train_data)
    end_time = timer()
    val_loss = evaluate(model, dev_data)
    print((
        f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

Epoch: 1, Train loss: 7.124, Val loss: 6.198, Epoch time = 1.174s
Epoch: 2, Train loss: 5.706, Val loss: 5.289, Epoch time = 1.160s
Epoch: 3, Train loss: 5.157, Val loss: 5.006, Epoch time = 1.130s
Epoch: 4, Train loss: 4.969, Val loss: 4.882, Epoch time = 1.135s
Epoch: 5, Train loss: 4.865, Val loss: 4.786, Epoch time = 1.124s
Epoch: 6, Train loss: 4.779, Val loss: 4.710, Epoch time = 1.121s
Epoch: 7, Train loss: 4.697, Val loss: 4.653, Epoch time = 1.132s
Epoch: 8, Train loss: 4.619, Val loss: 4.564, Epoch time = 1.122s
Epoch: 9, Train loss: 4.537, Val loss: 4.466, Epoch time = 1.138s
Epoch: 10, Train loss: 4.440, Val loss: 4.361, Epoch time = 1.118s
Epoch: 11, Train loss: 4.344, Val loss: 4.282, Epoch time = 1.152s
Epoch: 12, Train loss: 4.262, Val loss: 4.194, Epoch time = 1.118s
Epoch: 13, Train loss: 4.182, Val loss: 4.114, Epoch time = 1.106s
Epoch: 14, Train loss: 4.107, Val loss: 4.031, Epoch time = 1.115s
Epoch: 15, Train loss: 4.028, Val loss: 3.969, Epoch time = 1.119s
Epoc

In [20]:
def translate_dev(i):
    model.eval()

    en_sent = " ".join([en_itow[word] for word in test_en_encode[i]])
    print('英文原句：', en_sent)
    print('标准中文翻译：', " ".join([zh_itow[word] for word in test_zh_encode[i]]))

    bos = torch.Tensor([[zh_wtoi["BOS"]]]).long().to(DEVICE)
    x = torch.Tensor(test_en_encode[i]).long().to(DEVICE).reshape(1, -1)
    x_len = torch.Tensor([len(test_en_encode[i])]).long().to(DEVICE)

    translation, _ = model.translate(x, x_len, bos)
    translation = [zh_itow[i] for i in translation.data.cpu().numpy().reshape(-1)]

    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print('模型翻译结果：', " ".join(trans))


for i in range(50, 100):
    translate_dev(i)
    print()

英文原句： BOS it ' s not UNK . EOS
标准中文翻译： BOS 这 不 UNK 。 EOS
模型翻译结果： 我们 在 這裡 的 朋友 。

英文原句： BOS we have a great team . EOS
标准中文翻译： BOS 我们 有个 UNK 的 UNK 。 EOS
模型翻译结果： 我們 我們 的 时候 。

英文原句： BOS this is a strange sentence . EOS
标准中文翻译： BOS 這是 一個 奇怪 的 句子 。 EOS
模型翻译结果： 这 是 个 。

英文原句： BOS do you study every day ? EOS
标准中文翻译： BOS 你 每天 都 学习 吗 ？ EOS
模型翻译结果： 你 要 多少 ？

英文原句： BOS that ' s the point . EOS
标准中文翻译： BOS 这 UNK 问题 的 UNK 。 EOS
模型翻译结果： 他 是 个 个 。

英文原句： BOS a UNK is a UNK . EOS
标准中文翻译： BOS UNK 是 UNK UNK 。 EOS
模型翻译结果： 这 是 个 的 故事 。

英文原句： BOS tom used to work here . EOS
标准中文翻译： BOS 汤姆 UNK 在 这里 工作 。 EOS
模型翻译结果： 汤姆 在 這裡 了 。

英文原句： BOS you must clear the table . EOS
标准中文翻译： BOS 你 必须 把 桌子 UNK UNK 。 EOS
模型翻译结果： 你 是 个 。

英文原句： BOS i ' m not sure . EOS
标准中文翻译： BOS 我 不 确定 。 EOS
模型翻译结果： 我 不 明白 的 。

英文原句： BOS i ' ve UNK better . EOS
标准中文翻译： BOS 我 已 UNK 得 UNK 了 。 EOS
模型翻译结果： 我 不 想 ， 我 不 在 這裡 做 。

英文原句： BOS the job is half done . EOS
标准中文翻译： BOS 這項 工作 已經 完成 了 一半 。 EOS
模型翻译结果： 這個 於 。

英文原句： BOS UNK is UNK in chi

# 练习一：Bi-LSTM + attention 用于情感分类

补全代码：我们使用构建一个Bi-LSTM + attention模型完成文本分类任务，数据使用IMDb电影评论数据集，检测一段文字的情感是正面还是负面。

[论文](https://aclanthology.org/P16-2034.pdf)

![](https://cdn.mathpix.com/snip/images/pvB4-X5G9OAFQ_A2wYqjCZxoOOMu_u1PpkkrJUlTbQ8.original.fullsize.png)

In [21]:
import torch
from torch import nn
from torch.utils.data import DataLoader

from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

## 准备数据

In [22]:
tokenizer = get_tokenizer('basic_english')
train_iter = IMDB(split = 'train')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials = ["<unk>"])
vocab.set_default_index(vocab["<unk>"])
vocab.insert_token("<pad>", 1)


In [23]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: 0 if x == 'neg' else 1

In [24]:
def collate_batch(batch):  #自定义的batch输出
    label_list, text_list, lengths = [], [], []
    batch.sort(key = lambda x: len(text_pipeline(x[1])), reverse = True)  #按照长度的大小进行排序
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text))
        text_list.append(processed_text)
        lengths.append(len(processed_text))
    text_list = pad_sequence(text_list, padding_value = vocab.get_stoi()["<pad>"],
                             batch_first = True)  #进行填充，每个batch中的句子需要有相同的长度
    return torch.tensor(label_list), text_list, lengths

In [25]:
train_iter, test_iter = IMDB(root = 'data', split = ('train', 'test'))
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size = 128,
                              shuffle = True, collate_fn = collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size = 128,
                              shuffle = True, collate_fn = collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size = 128,
                             shuffle = True, collate_fn = collate_batch)

In [26]:
data = next(iter(train_dataloader))

In [27]:
data[0]  #标签

tensor([0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
        0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 0, 1, 1, 1, 1, 0, 0])

In [28]:
data[1]  #batch_size,max_length

tensor([[  458,  3141,  1552,  ...,    17,    57,     3],
        [   20, 14053,     4,  ...,     1,     1,     1],
        [    6,  8268,    12,  ...,     1,     1,     1],
        ...,
        [ 1070,     7,     6,  ...,     1,     1,     1],
        [    2, 18679,   121,  ...,     1,     1,     1],
        [  905,    14,    21,  ...,     1,     1,     1]])

In [29]:
data[2]  #lengths

[1185,
 1162,
 1144,
 862,
 842,
 773,
 762,
 724,
 662,
 645,
 631,
 619,
 598,
 552,
 535,
 496,
 491,
 478,
 477,
 460,
 435,
 433,
 429,
 402,
 398,
 398,
 398,
 378,
 367,
 360,
 349,
 338,
 336,
 330,
 329,
 327,
 319,
 317,
 313,
 310,
 306,
 305,
 301,
 298,
 284,
 283,
 276,
 275,
 272,
 267,
 258,
 252,
 247,
 245,
 242,
 238,
 235,
 228,
 227,
 222,
 219,
 216,
 207,
 201,
 198,
 198,
 196,
 195,
 192,
 189,
 187,
 187,
 184,
 182,
 179,
 179,
 176,
 175,
 174,
 162,
 161,
 160,
 158,
 157,
 157,
 155,
 155,
 155,
 155,
 153,
 150,
 149,
 148,
 147,
 146,
 144,
 143,
 142,
 141,
 137,
 135,
 133,
 133,
 131,
 131,
 128,
 126,
 125,
 124,
 123,
 123,
 122,
 121,
 117,
 117,
 111,
 105,
 97,
 97,
 85,
 84,
 76,
 65,
 61,
 58,
 54,
 37,
 24]

## 模型

模型分为五个部分

![](https://cdn.mathpix.com/snip/images/pvB4-X5G9OAFQ_A2wYqjCZxoOOMu_u1PpkkrJUlTbQ8.original.fullsize.png)

- 输入层（Input layer）：将句子输入模型
- 嵌入层（Embedding layer）：将每个词映射到一个低维向量
- LSTM层（LSTM layer）：利用BiLSTM从词向量中获得特征
- Attention层（Attention layer）：生成权重向量，将每个时间步长的单词级特征与权重向量相乘，合并成句子级特征向量（补全代码）
- 输出层（Output layer）： 对句子进行分类

In [30]:
class bilstm_attn(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 dropout_rate, pad_id):
        super(bilstm_attn, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_id)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = True,
                            dropout = dropout_rate, batch_first = True)
        self.dropout = nn.Dropout(dropout_rate)
        self.attn = MultiheadAttention(hidden_dim, 8, batch_first = True)
        self.Q = nn.Linear(hidden_dim, hidden_dim)
        self.K = nn.Linear(hidden_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        embedded = self.dropout(self.embedding(x))  # [batch size,seq len] -> [batch size,seq len,embedding_dim]

        packed_embedded = pack_padded_sequence(embedded, lengths, batch_first = True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [n layers *2, batch size, hidden dim]最后一个step的hidden
        # cell = [n layers * 2, batch size, hidden dim]最终一个step的cell
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first = True)
        # output = [batch size, seq len, hidden dim * 2]#每一个step下的最后一层的output
        output = output.reshape(output.shape[0], output.shape[1], 2, -1)
        # output = [batch size, seq len, 2,hidden dim]
        output = torch.sum(output, dim = 2)
        # output = [batch size, seq len, hidden dim]
        output = self.dropout(output)
        q_vector = self.Q(output)
        k_vector = self.K(output)
        v_vector = self.V(output)
        attn_output, attn_output_weights = self.attn(q_vector, k_vector, v_vector)
        prediction = self.output(attn_output.sum(dim = 1))
        return prediction

In [31]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 2
n_layers = 1
dropout_rate = 0.5
pad_id = vocab.get_stoi()["<pad>"]

In [32]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = bilstm_attn(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout_rate, pad_id).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())



In [33]:
from tqdm import tqdm


def train(model, train_loader, optimizer, loss_fn):
    epoch_loss = 0
    corrects = 0
    total_len = 0
    model.train()  #model.train()代表了训练模式
    for label, text, lengths in tqdm(train_loader):
        label = label.to(device)
        text = text.to(device)

        out = model(text, lengths)
        loss = loss_fn(out, label)

        _, pred = torch.max(out.data, 1)
        corrects += (pred == label).sum().item()

        optimizer.zero_grad()  #加这步防止梯度叠加
        loss.backward()  #反向传播
        optimizer.step()  #梯度下降

        epoch_loss += loss.item() * len(label)
        #loss.item()已经本身除以了len(batch.label)
        #所以得再乘一次，得到一个batch的损失，累加得到所有样本损失。

        total_len += len(label)
        #计算train_iterator所有样本的数量，不出意外应该是17500

    return epoch_loss / total_len, corrects / total_len

In [34]:
def evaluate(model, valid_loader):
    epoch_loss = 0
    corrects = 0
    total_len = 0

    model.eval()
    #转换成测试模式，冻结dropout层或其他层。

    with torch.no_grad():
        for label, text, lengths in tqdm(valid_loader):
            #iterator为valid_iterator
            label = label.to(device)
            text = text.to(device)

            out = model(text, lengths)
            loss = loss_fn(out, label)

            _, pred = torch.max(out.data, 1)
            corrects += (pred == label).sum().item()

            epoch_loss += loss.item() * len(label)
            total_len += len(label)
    model.train()  #调回训练模式

    return epoch_loss / total_len, corrects / total_len

In [None]:
for epoch in range(10):
    train_loss, train_acc = train(model, train_dataloader, optimizer, loss_fn)
    print("epoch:", epoch, "train_loss:", train_loss, "train_acc", train_acc)
    valid_loss, valid_acc = evaluate(model, valid_dataloader)
    print("epoch:", epoch, "valid_loss:", valid_loss, "valid_acc", valid_acc)


  1%|          | 1/186 [02:08<6:36:36, 128.63s/it]

In [None]:
torch.save(model.state_dict(), 'attn_data/bilstm_attn.pt')

In [None]:
def predict_sentiment(text):
    text = text_pipeline(text)

    length = torch.LongTensor([len(text)])
    tensor = torch.LongTensor(text).unsqueeze(0).to(device)

    out = model(tensor, length)
    _, pred = torch.max(out.data, 1)
    return pred.item()

In [None]:
predict_sentiment("This film is terrible")

In [None]:
predict_sentiment("This film is great")