#  seq2seq + Luong 注意力 进行中英文翻译

In [1]:
import numpy as np
import jieba
from collections import Counter  #计数器
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import MultiheadAttention
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torchtext.data.utils import get_tokenizer  #分词器

# 练习一：Bi-LSTM + attention 用于情感分类

补全代码：我们使用构建一个Bi-LSTM + attention模型完成文本分类任务，数据使用IMDb电影评论数据集，检测一段文字的情感是正面还是负面。

[论文](https://aclanthology.org/P16-2034.pdf)

![](https://cdn.mathpix.com/snip/images/pvB4-X5G9OAFQ_A2wYqjCZxoOOMu_u1PpkkrJUlTbQ8.original.fullsize.png)

In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader

from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

## 准备数据

In [3]:
tokenizer = get_tokenizer('basic_english')
train_iter = IMDB(split = 'train')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials = ["<unk>"])
vocab.set_default_index(vocab["<unk>"])
vocab.insert_token("<pad>", 1)




In [4]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: 0 if x == 'neg' else 1

In [5]:
def collate_batch(batch):  #自定义的batch输出
    label_list, text_list, lengths = [], [], []
    batch.sort(key = lambda x: len(text_pipeline(x[1])), reverse = True)  #按照长度的大小进行排序
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text))
        text_list.append(processed_text)
        lengths.append(len(processed_text))
    text_list = pad_sequence(text_list, padding_value = vocab.get_stoi()["<pad>"],
                             batch_first = True)  #进行填充，每个batch中的句子需要有相同的长度
    return torch.tensor(label_list), text_list, lengths

In [6]:
train_iter, test_iter = IMDB(root = 'data', split = ('train', 'test'))
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size = 128,
                              shuffle = True, collate_fn = collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size = 128,
                              shuffle = True, collate_fn = collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size = 128,
                             shuffle = True, collate_fn = collate_batch)

In [7]:
data = next(iter(train_dataloader))

In [8]:
data[0]  #标签

tensor([0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
        1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
        1, 0, 1, 1, 0, 0, 1, 1])

In [9]:
data[1]  #batch_size,max_length

tensor([[11133,   512,    18,  ...,  4904,    24,     3],
        [   94,   139,     8,  ...,     1,     1,     1],
        [  288,   725,     6,  ...,     1,     1,     1],
        ...,
        [    6,  1126,    21,  ...,     1,     1,     1],
        [   19,   154,     7,  ...,     1,     1,     1],
        [ 1196,    80,   100,  ...,     1,     1,     1]])

In [10]:
data[2]  #lengths

[944,
 864,
 777,
 695,
 653,
 544,
 531,
 522,
 499,
 497,
 496,
 491,
 485,
 484,
 479,
 457,
 453,
 440,
 439,
 433,
 429,
 422,
 419,
 407,
 404,
 397,
 395,
 383,
 372,
 369,
 352,
 342,
 331,
 323,
 320,
 312,
 312,
 312,
 311,
 311,
 295,
 295,
 294,
 294,
 294,
 280,
 275,
 272,
 263,
 260,
 259,
 253,
 253,
 243,
 234,
 224,
 220,
 220,
 216,
 211,
 210,
 209,
 204,
 203,
 201,
 200,
 193,
 191,
 191,
 188,
 187,
 186,
 186,
 185,
 184,
 183,
 181,
 179,
 173,
 171,
 168,
 164,
 163,
 163,
 162,
 156,
 156,
 156,
 153,
 150,
 150,
 148,
 141,
 141,
 138,
 134,
 133,
 133,
 133,
 132,
 132,
 130,
 128,
 128,
 127,
 126,
 120,
 120,
 119,
 118,
 111,
 110,
 108,
 106,
 93,
 93,
 91,
 83,
 81,
 79,
 77,
 67,
 67,
 63,
 57,
 50,
 47,
 46]

## 模型

模型分为五个部分

![](https://cdn.mathpix.com/snip/images/pvB4-X5G9OAFQ_A2wYqjCZxoOOMu_u1PpkkrJUlTbQ8.original.fullsize.png)

- 输入层（Input layer）：将句子输入模型
- 嵌入层（Embedding layer）：将每个词映射到一个低维向量
- LSTM层（LSTM layer）：利用BiLSTM从词向量中获得特征
- Attention层（Attention layer）：生成权重向量，将每个时间步长的单词级特征与权重向量相乘，合并成句子级特征向量（补全代码）
- 输出层（Output layer）： 对句子进行分类

In [11]:
class bilstm_attn(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 dropout_rate, pad_id):
        super(bilstm_attn, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_id)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = n_layers, bidirectional = True,
                            dropout = dropout_rate, batch_first = True)
        self.dropout = nn.Dropout(dropout_rate)
        self.attn = MultiheadAttention(hidden_dim, 1, batch_first = True)
        self.Q = nn.Linear(hidden_dim, hidden_dim)
        self.K = nn.Linear(hidden_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        embedded = self.dropout(self.embedding(x))  # [batch size,seq len] -> [batch size,seq len,embedding_dim]

        packed_embedded = pack_padded_sequence(embedded, lengths, batch_first = True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [n layers *2, batch size, hidden dim]最后一个step的hidden
        # cell = [n layers * 2, batch size, hidden dim]最终一个step的cell
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first = True)
        # output = [batch size, seq len, hidden dim * 2]#每一个step下的最后一层的output
        output = output.reshape(output.shape[0], output.shape[1], 2, -1)
        # output = [batch size, seq len, 2,hidden dim]
        output = torch.sum(output, dim = 2)
        # output = [batch size, seq len, hidden dim]
        output = self.dropout(output)
        q_vector = self.Q(output)
        k_vector = self.K(output)
        v_vector = self.V(output)
        attn_output, attn_output_weights = self.attn(q_vector, k_vector, v_vector)
        prediction = self.output(attn_output.sum(dim = 1))
        return prediction

In [12]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 2
n_layers = 1
dropout_rate = 0.5
pad_id = vocab.get_stoi()["<pad>"]

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = bilstm_attn(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout_rate, pad_id).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())



In [14]:
from tqdm import tqdm


def train(model, train_loader, optimizer, loss_fn):
    epoch_loss = 0
    corrects = 0
    total_len = 0
    model.train()  #model.train()代表了训练模式
    for label, text, lengths in tqdm(train_loader):
        label = label.to(device)
        text = text.to(device)

        out = model(text, lengths)
        loss = loss_fn(out, label)

        _, pred = torch.max(out.data, 1)
        corrects += (pred == label).sum().item()

        optimizer.zero_grad()  #加这步防止梯度叠加
        loss.backward()  #反向传播
        optimizer.step()  #梯度下降

        epoch_loss += loss.item() * len(label)
        #loss.item()已经本身除以了len(batch.label)
        #所以得再乘一次，得到一个batch的损失，累加得到所有样本损失。

        total_len += len(label)
        #计算train_iterator所有样本的数量，不出意外应该是17500

    return epoch_loss / total_len, corrects / total_len

In [15]:
def evaluate(model, valid_loader):
    epoch_loss = 0
    corrects = 0
    total_len = 0

    model.eval()
    #转换成测试模式，冻结dropout层或其他层。

    with torch.no_grad():
        for label, text, lengths in tqdm(valid_loader):
            #iterator为valid_iterator
            label = label.to(device)
            text = text.to(device)

            out = model(text, lengths)
            loss = loss_fn(out, label)

            _, pred = torch.max(out.data, 1)
            corrects += (pred == label).sum().item()

            epoch_loss += loss.item() * len(label)
            total_len += len(label)
    model.train()  #调回训练模式

    return epoch_loss / total_len, corrects / total_len

In [16]:
for epoch in range(10):
    train_loss, train_acc = train(model, train_dataloader, optimizer, loss_fn)
    print("epoch:", epoch, "train_loss:", train_loss, "train_acc", train_acc)
    valid_loss, valid_acc = evaluate(model, valid_dataloader)
    print("epoch:", epoch, "valid_loss:", valid_loss, "valid_acc", valid_acc)


100%|██████████| 186/186 [01:32<00:00,  2.00it/s]


epoch: 0 train_loss: 3.335827901147541 train_acc 0.5699368421052632


100%|██████████| 10/10 [00:03<00:00,  2.80it/s]


epoch: 0 valid_loss: 0.6690802865028381 valid_acc 0.62


100%|██████████| 186/186 [01:33<00:00,  2.00it/s]


epoch: 1 train_loss: 0.5631066594475194 train_acc 0.7155789473684211


100%|██████████| 10/10 [00:03<00:00,  2.93it/s]


epoch: 1 valid_loss: 0.5004681632995606 valid_acc 0.7728


100%|██████████| 186/186 [01:32<00:00,  2.00it/s]


epoch: 2 train_loss: 0.45125067443345723 train_acc 0.7912


100%|██████████| 10/10 [00:03<00:00,  2.81it/s]


epoch: 2 valid_loss: 0.3808725685119629 valid_acc 0.8264


100%|██████████| 186/186 [01:32<00:00,  2.01it/s]


epoch: 3 train_loss: 0.4035148923246484 train_acc 0.8285052631578947


100%|██████████| 10/10 [00:03<00:00,  2.89it/s]


epoch: 3 valid_loss: 0.38572630796432494 valid_acc 0.8448


100%|██████████| 186/186 [01:32<00:00,  2.00it/s]


epoch: 4 train_loss: 0.34146378761341695 train_acc 0.857178947368421


100%|██████████| 10/10 [00:03<00:00,  2.87it/s]


epoch: 4 valid_loss: 0.4886480527877808 valid_acc 0.8088


100%|██████████| 186/186 [01:32<00:00,  2.00it/s]


epoch: 5 train_loss: 0.28406720405628805 train_acc 0.8827368421052632


100%|██████████| 10/10 [00:03<00:00,  2.96it/s]


epoch: 5 valid_loss: 0.3373931920051575 valid_acc 0.868


100%|██████████| 186/186 [01:32<00:00,  2.01it/s]


epoch: 6 train_loss: 0.24964638517279375 train_acc 0.8992


100%|██████████| 10/10 [00:03<00:00,  3.06it/s]


epoch: 6 valid_loss: 0.3452787638664246 valid_acc 0.8712


100%|██████████| 186/186 [01:33<00:00,  2.00it/s]


epoch: 7 train_loss: 0.229890008506022 train_acc 0.9080842105263158


100%|██████████| 10/10 [00:03<00:00,  2.95it/s]


epoch: 7 valid_loss: 0.31069177660942077 valid_acc 0.8816


100%|██████████| 186/186 [01:32<00:00,  2.01it/s]


epoch: 8 train_loss: 0.19923359266958737 train_acc 0.9213473684210526


100%|██████████| 10/10 [00:03<00:00,  3.02it/s]


epoch: 8 valid_loss: 0.3836130741596222 valid_acc 0.8736


100%|██████████| 186/186 [01:32<00:00,  2.02it/s]


epoch: 9 train_loss: 0.18384257307429064 train_acc 0.926778947368421


100%|██████████| 10/10 [00:03<00:00,  2.89it/s]

epoch: 9 valid_loss: 0.34496309649944307 valid_acc 0.8992





In [17]:
torch.save(model.state_dict(), 'attn_data/bilstm_attn.pt')

In [18]:
def predict_sentiment(text):
    text = text_pipeline(text)

    length = torch.LongTensor([len(text)])
    tensor = torch.LongTensor(text).unsqueeze(0).to(device)

    out = model(tensor, length)
    _, pred = torch.max(out.data, 1)
    return pred.item()

In [19]:
predict_sentiment("This film is terrible")

0

In [20]:
predict_sentiment("This film is great")

1