#  seq2seq + Luong 注意力 进行中英文翻译

In [1]:
import numpy as np
import jieba
from collections import Counter  #计数器
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torchtext.data.utils import get_tokenizer  #分词器

使用Luong 注意力进行中英翻译训练，[数据来源](https://github.com/cuicaihao/Annotated-Transformer-English-to-Chinese-Translator/tree/master/data/nmt/en-cn)，包含成对的中英文翻译句子，包含7个文件，cmn.txt是全体数据集；train.txt, dev.txt, test.txt为对全体数据集的80%、10%、
10%划分。train_mini.txt, dev_mini.txt, test_mini.txt为小样本数据，分别包含1000、200、200条数据。

In [None]:
UNK_IDX = 0  #未知
PAD_IDX = 1  #
BATCH_SIZE = 64
EPOCHS = 30
DROPOUT = 0.2
ENC_HIDDEN_SIZE = DEC_HIDDEN_SIZE = 100
EMBED_SIZE = 100
DEBUG = True

if DEBUG:

    train_file = 'attn_data/train_mini.txt'
    dev_file = 'attn_data/dev_mini.txt'
    test_file = 'attn_data/test_mini.txt'
    save_file = 'model.pt'
else:

    train_file = 'attn_data/train.txt'
    dev_file = 'attn_data/dev.txt'
    test_file = 'attn_data/test.txt'
    save_file = 'large_model.pt'

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#分词器
tokenizer_en = get_tokenizer('basic_english')  #按空格进行分割
tokenizer_cn = get_tokenizer(jieba.lcut)  #进行结巴分词


#加载文件
def load_data(path):
    en = []
    cn = []
    with open(path, 'r', encoding = 'utf-8') as f:
        for line in f:
            line = line.strip().split('\t')
            en.append(["BOS"] + tokenizer_en(line[0].lower()) + ["EOS"])  #小写
            cn.append(["BOS"] + tokenizer_cn(line[1]) + ["EOS"])
    return en, cn


train_en, train_zh = load_data(train_file)
dev_en, dev_zh = load_data(dev_file)
test_en, test_zh = load_data(test_file)

In [None]:
print(train_en[0], train_zh[0])

In [None]:
#构建词汇表
def build_dict(sentences, max_words = 50000):
    vocab = Counter(np.concatenate(sentences)).most_common(max_words)  #最大单词数是50000
    word_to_id = {w[0]: index + 2 for index, w in enumerate(vocab)}
    word_to_id['UNK'] = UNK_IDX  #0
    word_to_id['PAD'] = PAD_IDX  #1
    id_to_word = {v: k for k, v in word_to_id.items()}
    return word_to_id, id_to_word


en_wtoi, en_itow = build_dict(train_en)
zh_wtoi, zh_itow = build_dict(train_zh)

In [None]:
en_itow[2], en_itow[3]

In [None]:
# 利用词典对原始句子编码 单词->数字
def encode(en_sentences, ch_sentences, en_wtoi, zh_wtoi, sort_by_len = True):
    out_en_sentences = [[en_wtoi.get(w, UNK_IDX) for w in sent] for sent in en_sentences]
    out_ch_sentences = [[zh_wtoi.get(w, UNK_IDX) for w in sent] for sent in ch_sentences]

    #返回w对应的值，否则返回UNK_IDX
    def len_argsort(seq):  #按照长度进行排序
        return sorted(range(len(seq)), key = lambda x: len(seq[x]))

    # 把中文和英文按照同样的顺序排序
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_ch_sentences = [out_ch_sentences[i] for i in sorted_index]

    return out_en_sentences, out_ch_sentences


train_en_encode, train_zh_encode = encode(train_en, train_zh, en_wtoi, zh_wtoi)
dev_en_encode, dev_zh_encode = encode(dev_en, dev_zh, en_wtoi, zh_wtoi)
test_en_encode, test_zh_encode = encode(test_en, test_zh, en_wtoi, zh_wtoi)


In [None]:
test_en_encode[:10]

In [None]:
#返回每个batch的id
def get_minibatches(n, minibatch_size, shuffle = True):
    idx_list = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches

In [None]:
get_minibatches(50, 10, shuffle = True)  #得到每一个batch对应的id

In [None]:
#将句子对划分到batch
def get_batches(en_encode, ch_encode):
    batch_indexs = get_minibatches(len(en_encode), BATCH_SIZE)

    batches = []
    for batch_index in batch_indexs:
        batch_en = [torch.tensor(en_encode[index]).long() for index in batch_index]  #每一个idx对应的句子，转为tensor格式
        batch_zh = [torch.tensor(ch_encode[index]).long() for index in batch_index]
        length_en = torch.tensor([len(en) for en in batch_en]).long()  #每一个句子的长度
        length_zh = torch.tensor([len(zh) for zh in batch_zh]).long()

        batch_en = pad_sequence(batch_en, padding_value = PAD_IDX, batch_first = True)  #讲一个batch中的句子padding为相同长度
        batch_zh = pad_sequence(batch_zh, padding_value = PAD_IDX, batch_first = True)

        batches.append((batch_en, batch_zh, length_en, length_zh))
    return batches


train_data = get_batches(train_en_encode, train_zh_encode)
dev_data = get_batches(dev_en_encode, dev_zh_encode)

In [None]:
train_data[0][3], len(train_data[0][3])

In [None]:
train_data[0][0]

## 建立模型

<img src="https://cdn.mathpix.com/snip/images/8Es5WLO-mn8kY7GO-T7o1IxWUBPbZeLrz3JbqY5U2Vw.original.fullsize.png" width="40%"> 

其中 $\bar{h}_{s}$ 表示encoder每个hidden_state的输出， $h_{t}$ 表示decoder每个hidden_state的输出。

$$
a_{t}(s) =\frac{\exp \left(\operatorname{score}\left(h_{t}, \bar{h}_{s}\right)\right)}{\sum_{s^{\prime}} \exp \left(\operatorname{score}\left(h_{t}, \bar{h}_{s^{\prime}}\right)\right)}\\
c_{t} = \sum a_{t} \bar{h}_{s}\\
\tilde{h}_{t} = \tanh \left(W_{c}\left[c_{t} ; h_{t}\right]\right)
$$

$$
\operatorname{score}\left(\boldsymbol{h}_{t}, \overline{\boldsymbol{h}}_{s}\right)= \begin{cases}\boldsymbol{h}_{t}^{\top} \overline{\boldsymbol{h}}_{s} & \text { dot } \\ \boldsymbol{h}_{t}^{\top} \boldsymbol{W}_{\boldsymbol{a}} \overline{\boldsymbol{h}}_{s} & \text { general } \\ \boldsymbol{v}_{a}^{\top} \tanh \left(\boldsymbol{W}_{\boldsymbol{a}}\left[\boldsymbol{h}_{t} ; \overline{\boldsymbol{h}}_{s}\right]\right) & \text { concat }\end{cases}
$$

In [None]:
class LuongEncoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout):
        super(LuongEncoder, self).__init__()
        
        # 随机初始化词向量，词向量值在正态分布N(0,1)中随机取值: vocab_size 词典的大小尺寸, embed_size 嵌入向量的维度
        self.embedding = nn.Embedding(vocab_size, embed_size) 
        self.rnn = nn.GRU(embed_size, enc_hidden_size, bidirectional = True)  #双向GRU（embed_size输入特征维度，enc_hidden_size输出特征维度）
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, x_lengths):  # x_lengths: 输入句子长度
        """
        input_seqs : batch_size,max(x_lengths)
        input_lengths: batch_size
        """
        embedded = self.dropout(self.embedding(x))  #batch_size,max(x_lengths),embed_size
        packed = pack_padded_sequence(embedded, x_lengths.long().cpu().data.numpy(), batch_first = True,
                                      enforce_sorted = False)
        # batch_first = False (seq, batch, feature)  batch_first = True (batch, seq, feature)
        
        #压缩填充张量,压缩掉无效的填充值
        #enforce_sorted：如果是 True ，则输入应该是按长度降序排序的序列。如果是 False ，会在函数内部进行排序 
        outputs, hidden = self.rnn(packed)
        outputs, _ = pad_packed_sequence(outputs, padding_value = PAD_IDX, batch_first = True)  #还原

        #hidden (2, batch_size, enc_hidden_size)  # 2:双向
        #outputs (batch_size,seq_len, 2 * enc_hidden_size)  h_s

        hidden = torch.cat([hidden[-2], hidden[-1]], dim = 1)  # 变成一维
        hidden = torch.tanh(self.fc(hidden)).unsqueeze(0)  # 修改成decoder可接受hidden size维度
        return outputs, hidden  # outputs为每一个time stamp的输出，hidden为最后一个time stamp的输出

In [None]:
class Attn(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attn, self).__init__()
        #general attention
        self.linear_in = nn.Linear(enc_hidden_size * 2, dec_hidden_size, bias = False)
        self.linear_out = nn.Linear(enc_hidden_size * 2 + dec_hidden_size, dec_hidden_size)

    def forward(self, output, encoder_out, mask):
        """
        output:batch_size, max(y_lengths), dec_hidden_size  #(h_t)
        encoder_out:batch_size, max(x_lengths), 2 * enc_hidden_size  #(h_s)
        """
        batch_size = output.shape[0]
        output_len = output.shape[1]
        input_len = encoder_out.shape[1]

        encoder_out1 = self.linear_in(encoder_out.view(batch_size * input_len, -1)).view(batch_size, input_len, -1)
        #Wh_s 
        #batch_size,max(x_lengths),dec_hidden_size
        score = torch.bmm(output, encoder_out1.transpose(1, 2))  #实现三维数组的乘法，而不用拆成二维数组使用for循环解决
        #[batch_size,max(y_lengths),dec_hidden_size] * [batch_size,dec_hidden_size,max(x_lengths)]
        #batch_size,max(y_lengths),max(x_lengths)  #score = h_t W h_s
        score.data.masked_fill(mask, -1e16)
        attn = F.softmax(score, dim = 2)  #attention系数矩阵, mask均为0
        # mask的size是(batch_size,n,m)
        # mask中，若该位置为True，就表明score中该位置要被mask掉，用-1e6来代替。
        # PS mask中，若某个位置K_sub(b,i,j)为True，表明这个batch中的第b句话的中文的第i个字是padding or 英文的第j个单词是padding or 两个都是padding
        # 若某个位置K_sub(b,i,j)为False，表明这个batch中的第b句话的中文的第i个字不是padding且英文的第j个单词也不是padding

        ct = torch.bmm(attn, encoder_out)  #ct = aths
        #[batch_size,max(y_lengths),max(x_lengths)] * [batch_size, max(x_lengths), 2 * enc_hidden_size]
        #batch_size, max(y_lengths), enc_hidden_size*2
        output = torch.cat((ct, output), dim = 2)  #batch_size, max(y_lengths), enc_hidden_size*2 + dec_hidden_size

        output = output.view(batch_size * output_len, -1)  #batch_size * max(y_lengths), enc_hidden_size*2 + dec_hidden_size
        output = torch.tanh(self.linear_out(output))  #batch_size * max(y_lengths), dec_hidden_size
        output = output.view(batch_size, output_len, -1)
        #batch_size, max(y_lengths), dec_hidden_size

        return output, attn


In [None]:
class LuongDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout):
        super(LuongDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = Attn(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first = True)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(dec_hidden_size, vocab_size)

    def creat_mask(self, x, y):
        x_mask = x.data != PAD_IDX  #batch_size,max(x_lengths)  # 不等于为true，不是padding
        y_mask = y.data != PAD_IDX  #batch_size,max(y_lengths)
        mask = (1 - (x_mask.unsqueeze(2) * y_mask.unsqueeze(1)).float()).bool()  # true为padding
        # unsqueeze增加维度
        #batch_size,max(x_lengths),max(y_lengths)
        #attn为batch_size,max(y_lengths),max(x_lengths)，因此y与x对调
        return mask

    def forward(self, encoder_out, x, y, y_lengths, hid):  ## (encoder_out. hid)对应LuongEncoder的输出(outputs, hidden)
        mask = self.creat_mask(y, x)
        y = self.dropout(self.embedding(y))
        packed = pack_padded_sequence(y, y_lengths.long().cpu().data.numpy(), batch_first = True,
                                      enforce_sorted = False)
        out, hid = self.rnn(packed, hid)  # x的 hid和y同时输入到decoder

        out, _ = pad_packed_sequence(out, padding_value = PAD_IDX, batch_first = True)

        output, attn = self.attention(out, encoder_out, mask)  # 输入enc和dec的hidden
        output = self.out(output)
        #batch_size, max(y_lengths), dec_hidden_size --> batch_size, max(y_lengths), vocab_size
        return output, hid, attn  # output为每一个time stamp的输出，hid为最后一个time stamp的输出(attention前)

In [None]:
a = torch.Tensor([1, 2, 3])

In [None]:
a.data,PAD_IDX

In [None]:
class seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(encoder_out,  #这里输出的hid是decoder_rnn的hid
                                         x = x,
                                         y = y,
                                         y_lengths = y_lengths,
                                         hid = hid)  #encoder的hid
        return output, attn

    def translate(self, x, x_lengths, y, max_length = 15):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for _ in range(max_length):
            output, hid, attn = self.decoder(encoder_out,
                                             x = x,
                                             y = y,
                                             y_lengths = torch.ones(batch_size).long().to(y.device),  # 逐字翻译！
                                             hid = hid)

            y = output.max(2)[1].view(batch_size, 1)

            preds.append(y)
            attns.append(attn)
        return torch.cat(preds, 1), torch.cat(attns, 1)


In [16]:
a.data,PAD_IDX

(tensor([1., 2., 3.]), 1)

In [11]:
class seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(encoder_out,  #这里输出的hid是decoder_rnn的hid
                                         x = x,
                                         y = y,
                                         y_lengths = y_lengths,
                                         hid = hid)  #encoder的hid
        return output, attn

    def translate(self, x, x_lengths, y, max_length = 15):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for _ in range(max_length):
            output, hid, attn = self.decoder(encoder_out,
                                             x = x,
                                             y = y,
                                             y_lengths = torch.ones(batch_size).long().to(y.device),  # 逐字翻译！
                                             hid = hid)

            y = output.max(2)[1].view(batch_size, 1)

            preds.append(y)
            attns.append(attn)
        return torch.cat(preds, 1), torch.cat(attns, 1)


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Define Model
encoder = LuongEncoder(vocab_size = len(en_itow), embed_size = EMBED_SIZE, enc_hidden_size = ENC_HIDDEN_SIZE,
                       dec_hidden_size = DEC_HIDDEN_SIZE, dropout = DROPOUT)
decoder = LuongDecoder(vocab_size = len(zh_itow), embed_size = EMBED_SIZE, enc_hidden_size = ENC_HIDDEN_SIZE,
                       dec_hidden_size = DEC_HIDDEN_SIZE, dropout = DROPOUT)
model = seq2seq(encoder, decoder)
model = model.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index = PAD_IDX)  #忽略padding位置的损失
optimizer = torch.optim.Adam(model.parameters())

In [13]:
def train_epoch(model, optimizer, train_data):
    model.train()
    losses = 0
    for x, y, x_lengths, y_lengths in train_data:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        x_lengths = x_lengths.to(DEVICE)

        y_input = y[:, :-1]  #将前seq-1个单词作为输入
        y_output = y[:, 1:]  #将后seq-1个单词作为输出，相当于前一个单词预测后一个单词
        y_lengths = (y_lengths - 1).to(DEVICE)

        logits, _ = model(x, x_lengths, y_input, y_lengths)  #batch_size, max(y_lengths), vocab_size
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), y_output.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses += loss.item()

    return losses / len(train_data)


def evaluate(model, dev_data):
    model.train()
    losses = 0
    for x, y, x_lengths, y_lengths in train_data:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        x_lengths = x_lengths.to(DEVICE)

        y_input = y[:, :-1]
        y_output = y[:, 1:]
        y_lengths = (y_lengths - 1).to(DEVICE)
        logits, _ = model(x, x_lengths, y_input, y_lengths)
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), y_output.reshape(-1))

        losses += loss.item()

    return losses / len(train_data)

In [14]:
from timeit import default_timer as timer

for epoch in range(1, EPOCHS + 1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer, train_data)
    end_time = timer()
    val_loss = evaluate(model, dev_data)
    print((
              f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

Epoch: 1, Train loss: 5.773, Val loss: 5.144, Epoch time = 4.815s
Epoch: 2, Train loss: 5.007, Val loss: 4.760, Epoch time = 4.802s
Epoch: 3, Train loss: 4.663, Val loss: 4.447, Epoch time = 4.790s
Epoch: 4, Train loss: 4.389, Val loss: 4.218, Epoch time = 4.812s
Epoch: 5, Train loss: 4.168, Val loss: 4.007, Epoch time = 5.031s
Epoch: 6, Train loss: 3.967, Val loss: 3.824, Epoch time = 4.705s
Epoch: 7, Train loss: 3.784, Val loss: 3.655, Epoch time = 4.880s
Epoch: 8, Train loss: 3.616, Val loss: 3.505, Epoch time = 5.031s
Epoch: 9, Train loss: 3.462, Val loss: 3.369, Epoch time = 4.778s
Epoch: 10, Train loss: 3.316, Val loss: 3.241, Epoch time = 4.814s
Epoch: 11, Train loss: 3.179, Val loss: 3.110, Epoch time = 4.890s
Epoch: 12, Train loss: 3.051, Val loss: 2.978, Epoch time = 4.578s
Epoch: 13, Train loss: 2.928, Val loss: 2.866, Epoch time = 4.710s
Epoch: 14, Train loss: 2.812, Val loss: 2.754, Epoch time = 4.702s
Epoch: 15, Train loss: 2.705, Val loss: 2.654, Epoch time = 4.913s
Epoc

In [20]:
def translate_dev(i):
    model.eval()

    en_sent = " ".join([en_itow[word] for word in test_en_encode[i]])
    print('英文原句：', en_sent)
    print('标准中文翻译：', " ".join([zh_itow[word] for word in test_zh_encode[i]]))

    bos = torch.Tensor([[zh_wtoi["BOS"]]]).long().to(DEVICE)
    x = torch.Tensor(test_en_encode[i]).long().to(DEVICE).reshape(1, -1)
    x_len = torch.Tensor([len(test_en_encode[i])]).long().to(DEVICE)

    translation, _ = model.translate(x, x_len, bos)
    translation = [zh_itow[i] for i in translation.data.cpu().numpy().reshape(-1)]

    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print('模型翻译结果：', " ".join(trans))


for i in range(50, 100):
    translate_dev(i)
    print()

英文原句： BOS they want more . EOS
标准中文翻译： BOS 他們 想要 更 多 。 EOS
模型翻译结果： 汤姆 想 吃 了 。

英文原句： BOS tom was bullied . EOS
标准中文翻译： BOS 汤姆 被 UNK 了 。 EOS
模型翻译结果： 汤姆 是 蓝色 的 。

英文原句： BOS see you around . EOS
标准中文翻译： BOS 再见 ！ EOS
模型翻译结果： 再见 ！

英文原句： BOS now i remember . EOS
标准中文翻译： BOS 现在 我 想 起来 了 。 EOS
模型翻译结果： 但願 我 吃 飽 。

英文原句： BOS she hated him . EOS
标准中文翻译： BOS 她 恨 他 。 EOS
模型翻译结果： 她 愛 他 。

英文原句： BOS tell me again . EOS
标准中文翻译： BOS 重新 告訴 我 。 EOS
模型翻译结果： 再 再 再 再 再 再 再 再 來 了 。

英文原句： BOS tom follows orders . EOS
标准中文翻译： BOS 汤姆 UNK 。 EOS
模型翻译结果： 汤姆 走得 很慢 。

英文原句： BOS do it now . EOS
标准中文翻译： BOS 現在 就 做 。 EOS
模型翻译结果： 现在 在 做 。

英文原句： BOS foxes eat UNK . EOS
标准中文翻译： BOS 狐狸 吃 UNK 。 EOS
模型翻译结果： 下午 都 来 。

英文原句： BOS i almost won . EOS
标准中文翻译： BOS 我 几乎 赢 了 。 EOS
模型翻译结果： 我 轉動 58 了 。

英文原句： BOS shake my hand . EOS
标准中文翻译： BOS 和 我 握手 。 EOS
模型翻译结果： 别进 我 的 手 。

英文原句： BOS they hated tom . EOS
标准中文翻译： BOS 他們 恨 湯姆 。 EOS
模型翻译结果： 他们 原谅 了 。

英文原句： BOS prices went up . EOS
标准中文翻译： BOS UNK 上 UNK 。 EOS
模型翻译结果： 下 点 。

英文原句： BO

# 练习一：Bi-LSTM + attention 用于情感分类

补全代码：我们使用构建一个Bi-LSTM + attention模型完成文本分类任务，数据使用IMDb电影评论数据集，检测一段文字的情感是正面还是负面。

[论文](https://aclanthology.org/P16-2034.pdf)

![](https://cdn.mathpix.com/snip/images/pvB4-X5G9OAFQ_A2wYqjCZxoOOMu_u1PpkkrJUlTbQ8.original.fullsize.png)

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

## 准备数据

In [2]:
tokenizer = get_tokenizer('basic_english')
train_iter = IMDB(split = 'train')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials = ["<unk>"])
vocab.set_default_index(vocab["<unk>"])
vocab.insert_token("<pad>", 1)




In [3]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: 0 if x == 'neg' else 1

In [4]:
def collate_batch(batch):  #自定义的batch输出
    label_list, text_list, lengths = [], [], []
    batch.sort(key = lambda x: len(text_pipeline(x[1])), reverse = True)  #按照长度的大小进行排序
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text))
        text_list.append(processed_text)
        lengths.append(len(processed_text))
    text_list = pad_sequence(text_list, padding_value = vocab.get_stoi()["<pad>"],
                             batch_first = True)  #进行填充，每个batch中的句子需要有相同的长度
    return torch.tensor(label_list), text_list, lengths

In [43]:
train_iter, test_iter = IMDB(root = 'data', split = ('train', 'test'))
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size = 128,
                              shuffle = True, collate_fn = collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size = 128,
                              shuffle = True, collate_fn = collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size = 128,
                             shuffle = True, collate_fn = collate_batch)



In [44]:
data = next(iter(train_dataloader))

In [45]:
data[0]  #标签

tensor([0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
        0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
        0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
        0, 1, 1, 1, 0, 1, 1, 1])

In [46]:
data[1]  #batch_size,max_length

tensor([[ 13, 163,   9,  ..., 131, 117,   3],
        [822,  15,  12,  ...,   1,   1,   1],
        [ 87, 637,  18,  ...,   1,   1,   1],
        ...,
        [ 13, 438,   8,  ...,   1,   1,   1],
        [115, 361,   4,  ...,   1,   1,   1],
        [ 14,  21,  52,  ...,   1,   1,   1]])

In [47]:
data[2]  #lengths

[1112,
 941,
 910,
 899,
 897,
 800,
 759,
 759,
 738,
 729,
 728,
 718,
 655,
 645,
 583,
 575,
 569,
 568,
 512,
 496,
 486,
 469,
 464,
 454,
 422,
 408,
 405,
 404,
 388,
 386,
 383,
 383,
 380,
 378,
 365,
 362,
 339,
 332,
 328,
 319,
 317,
 314,
 310,
 287,
 282,
 278,
 264,
 258,
 254,
 252,
 252,
 246,
 245,
 242,
 237,
 236,
 227,
 227,
 222,
 222,
 220,
 218,
 216,
 216,
 215,
 214,
 209,
 209,
 206,
 203,
 201,
 201,
 198,
 196,
 192,
 190,
 189,
 186,
 185,
 183,
 173,
 172,
 169,
 168,
 167,
 167,
 165,
 165,
 165,
 158,
 158,
 158,
 157,
 153,
 152,
 151,
 150,
 148,
 147,
 146,
 146,
 144,
 144,
 143,
 142,
 141,
 141,
 140,
 135,
 133,
 131,
 128,
 128,
 126,
 116,
 114,
 110,
 108,
 105,
 103,
 95,
 88,
 77,
 76,
 75,
 71,
 61,
 56]

## 模型

模型分为五个部分

![](https://cdn.mathpix.com/snip/images/pvB4-X5G9OAFQ_A2wYqjCZxoOOMu_u1PpkkrJUlTbQ8.original.fullsize.png)

- 输入层（Input layer）：将句子输入模型
- 嵌入层（Embedding layer）：将每个词映射到一个低维向量
- LSTM层（LSTM layer）：利用BiLSTM从词向量中获得特征
- Attention层（Attention layer）：生成权重向量，将每个时间步长的单词级特征与权重向量相乘，合并成句子级特征向量（补全代码）
- 输出层（Output layer）： 对句子进行分类

In [79]:
class bilstm_attn(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 dropout_rate, pad_id):
        super(bilstm_attn, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_id)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = True,
                            dropout = dropout_rate, batch_first = True)
        self.dropout = nn.Dropout(dropout_rate)
        """
            Write your code here.
        """

    def forward(self, x, lengths):
        embedded = self.dropout(self.embedding(x))  # [batch size,seq len] -> [batch size,seq len,embedding_dim]

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first = True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [n layers *2, batch size, hidden dim]最后一个step的hidden
        # cell = [n layers * 2, batch size, hidden dim]最终一个step的cell
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first = True)
        # output = [batch size, seq len, hidden dim * 2]#每一个step下的最后一层的output
        output = output.reshape(output.shape[0], output.shape[1], 2, -1)
        # output = [batch size, seq len, 2,hidden dim]
        output = torch.sum(output, dim = 2)
        # output = [batch size, seq len, hidden dim]
        output = self.dropout(output)

        """
            Write your code here.
        """
        return prediction

In [80]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 2
n_layers = 1
dropout_rate = 0.5
pad_id = vocab.get_stoi()["<pad>"]

In [85]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = bilstm_attn(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout_rate, pad_id).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [86]:
def train(model, train_loader, optimizer, loss_fn):
    epoch_loss = 0
    corrects = 0
    total_len = 0
    model.train()  #model.train()代表了训练模式
    for label, text, lengths in train_loader:
        label = label.to(device)
        text = text.to(device)

        out = model(text, lengths)
        loss = loss_fn(out, label)

        _, pred = torch.max(out.data, 1)
        corrects += (pred == label).sum().item()

        optimizer.zero_grad()  #加这步防止梯度叠加
        loss.backward()  #反向传播
        optimizer.step()  #梯度下降

        epoch_loss += loss.item() * len(label)
        #loss.item()已经本身除以了len(batch.label)
        #所以得再乘一次，得到一个batch的损失，累加得到所有样本损失。

        total_len += len(label)
        #计算train_iterator所有样本的数量，不出意外应该是17500

    return epoch_loss / total_len, corrects / total_len

In [87]:
def evaluate(model, valid_loader):
    epoch_loss = 0
    corrects = 0
    total_len = 0

    model.eval()
    #转换成测试模式，冻结dropout层或其他层。

    with torch.no_grad():
        for label, text, lengths in valid_loader:
            #iterator为valid_iterator
            label = label.to(device)
            text = text.to(device)

            out = model(text, lengths)
            loss = loss_fn(out, label)

            _, pred = torch.max(out.data, 1)
            corrects += (pred == label).sum().item()

            epoch_loss += loss.item() * len(label)
            total_len += len(label)
    model.train()  #调回训练模式

    return epoch_loss / total_len, corrects / total_len

In [88]:
for epoch in range(10):
    train_loss, train_acc = train(model, train_dataloader, optimizer, loss_fn)
    print("epoch:", epoch, "train_loss:", train_loss, "train_acc", train_acc)
    valid_loss, valid_acc = evaluate(model, valid_dataloader)
    print("epoch:", epoch, "valid_loss:", valid_loss, "valid_acc", valid_acc)


epoch: 0 train_loss: 0.6832094741921676 train_acc 0.5885052631578948
epoch: 0 valid_loss: 0.5922406481742859 valid_acc 0.6976
epoch: 1 train_loss: 0.5618076030480235 train_acc 0.709978947368421
epoch: 1 valid_loss: 0.47752310166358947 valid_acc 0.772
epoch: 2 train_loss: 0.46739850202108685 train_acc 0.7832842105263158
epoch: 2 valid_loss: 0.45280455932617186 valid_acc 0.8024
epoch: 3 train_loss: 0.405346336911854 train_acc 0.8201263157894737
epoch: 3 valid_loss: 0.41220421171188354 valid_acc 0.8248
epoch: 4 train_loss: 0.3586794477236898 train_acc 0.844378947368421
epoch: 4 valid_loss: 0.35476437602043154 valid_acc 0.8536
epoch: 5 train_loss: 0.3227162823275516 train_acc 0.8635368421052632
epoch: 5 valid_loss: 0.4248126239776611 valid_acc 0.8312
epoch: 6 train_loss: 0.2943850700114903 train_acc 0.8784421052631579
epoch: 6 valid_loss: 0.3186738593578339 valid_acc 0.8664
epoch: 7 train_loss: 0.27103296057676013 train_acc 0.8895157894736843
epoch: 7 valid_loss: 0.33145447697639463 valid_

In [92]:
torch.save(model.state_dict(), 'bilstm_attn.pt')

In [89]:
def predict_sentiment(text):
    text = text_pipeline(text)

    length = torch.LongTensor([len(text)])
    tensor = torch.LongTensor(text).unsqueeze(0).to(device)

    out = model(tensor, length)
    _, pred = torch.max(out.data, 1)
    return pred.item()

In [90]:
predict_sentiment("This film is terrible")

0

In [91]:
predict_sentiment("This film is great")

1