# LSTM

In [1]:
import torch
from torch import nn, optim
from torchtext.datasets import IMDB

print("GPU:", torch.backends.mps.is_available())
print("GPU:", torch.cuda.is_available())
device = torch.device("mps")
torch.manual_seed(123)


GPU: True
GPU: False


<torch._C.Generator at 0x10e115b70>

In [2]:
# TEXT = data.Field(tokenize="spacy")
# LABEL = data.LabelField(dtype=torch.float)
# train_data, test_data = IMDB.splits(TEXT, LABEL)
#
# print("len of train data:", len(train_data))
# print("len of test data:", len(test_data))
#
# print(train_data.examples[15].text)
# print(train_data.examples[15].label)
#
# # word2vec, glove
# TEXT.build_vocab(train_data, max_size=10000, vectors="glove.6B.100d")
# LABEL.build_vocab(train_data)


需要注意的是, 在新版当中, tokenlizer的TEXT和FIELD已经移除了, 可以使用utils中的get_tokenizer和vacob包来进行分词处理. 而输出的结果label也变成了1和2. 原本的输出应该是pos,neg

这里我们简单的创建一个vocab, 后面我们会使用Glove来进行训练

In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe, vocab

train_data_iter, test_data_iter = IMDB(split=("train", "test"))
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


# 这里指定一个最大单词数量. 这里使用的build_vocab_from_iterator本质就是一个string to index. 简写为stoi.
def get_vocab(train_data_pipe):
    vocab = build_vocab_from_iterator(
        yield_tokens(train_data_pipe), specials=["<UNK>", "<PAD>"], max_tokens=10000
    )
    vocab.set_default_index(vocab["<UNK>"])
    return vocab


# 在新版当中我们使用Vocab包对数据进行编码, 获得所有的词汇量
# build_vocab_from_iterator：根据给定的迭代器yield_tokens(train_iter)，来构造一个Vocab对象。具体的Vocab类的介绍Vocab类。
# 得到一个库,就是将文本转为编码
train_vocab = get_vocab(train_data_iter)


In [4]:
print(len(train_vocab))
sentence = "i have a apple"
sentence_token = tokenizer(sentence)
print(sentence_token)
print(train_vocab(sentence_token))


10000
['i', 'have', 'a', 'apple']
[13, 33, 6, 7316]


In [5]:
# 建立rnn网络
# 注意, 这里建立的是LSTM网络, 而不是LSTMCell, 所以需要固定长度
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNN, self).__init__()
        # 在构造函数中构建embedding. 这里直接将输入的vocabulary数量转换为一个宽度为embedding_dim的矩阵
        # 这里我们可以设置一个embedding为10000, 剩下的单词为不知道, 只使用一万个单词. 每一个单词维度是100
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 构建lstm层, 这里的我们使用了两层lstm, 100个词维度, 256个memory. 也就是c和h都是memory.
        # 这里设置了bidirection
        self.rnn = nn.LSTM(
            embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=0.5
        )
        # 将h层作为输出层, 输出为1
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # [seq,b,1] => [seq,b,100]
        # 每一个单词生成一个向量, 一个用b句话
        embedding = self.dropout(self.embedding(x))

        # output: [seq, b, hid_dim*2]
        # hidden/h: [num_layers*2, b, hid_dim]
        # cell/c: [num_layers*2, b, hid_di]
        # 需要注意的是, 由于我们使用的是双向的神经网络, 所以所有的weight都是两倍的
        output, (h, c) = self.rnn(embedding)

        # 这里我们将h的两个维度拿过来做一个concat, 由于是双向的, 会有两个维度, h1和h2.
        # 这里我们使用的是h也就是最后的一个时间序列来进行计算
        # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2]
        h = torch.cat([h[-2], h[-1]], dim=1)

        # 将上面concat以后得向量送入全连接层
        h = self.dropout(h)
        out = self.fc(h)  # 输出大小为[b]的向量
        return out


在新版本中, TEXT和FIELD都不存在了, torchtext中增加了两个类 Vocab和Vectors. 同时GloVe embedding也独立成为一个可下载工具.

这里我们直接引用, 引用会下载在cache中

In [6]:
from torchtext.vocab import GloVe, vocab

# 由于我们的单词只用100维, 这里就用100维, 默认300
# 需要注意的是, 只有6b才有100维向量, 默认的向量不存在100维
glove_vectors = GloVe(dim=100, name="6B")
glove_vocab = vocab(glove_vectors.stoi, 0)
glove_vocab.insert_token("<unk>", 0)
# this is necessary otherwise it will throw runtime error if OOV token is queried
glove_vocab.set_default_index(0)
pretrained_embeddings = glove_vectors.vectors
pretrained_embeddings = torch.cat(
    (torch.zeros(1, pretrained_embeddings.shape[1]), pretrained_embeddings)
)


In [7]:
# 通过词, 创建LSTM
# 注意: 这里我们直接用了embedding的长度来进行训练, 但是如果可以的话, 可以使用数据源的长度来进行训练.
rnn = RNN(len(glove_vocab), 100, 256)
# 将embedding的weight放入到我们自定义的embedding里面
rnn.embedding.weight.data.copy_(pretrained_embeddings)


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        [-0.1077,  0.1105,  0.5981,  ..., -0.8316,  0.4529,  0.0826],
        ...,
        [ 0.3609, -0.1692, -0.3270,  ...,  0.2714, -0.2919,  0.1611],
        [-0.1046, -0.5047, -0.4933,  ...,  0.4253, -0.5125, -0.1705],
        [ 0.2837, -0.6263, -0.4435,  ...,  0.4368, -0.8261, -0.1570]])

在使用之前, 我们重新定义一下dataloader. 同样,在原先的版本中,dataloader的定义比较简单. 在新版本中我们需要手动定义数据格式.

In [8]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


def text_transform(text):
    vocab = glove_vocab(tokenizer(text))
    return vocab


def label_transform(target):
    label = torch.tensor([target - 1], dtype=torch.float32)
    return label


def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        text_list.append(processed_text)
    return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0)


train_iter = IMDB(split="train")
train_dataloader = DataLoader(
    list(train_iter), batch_size=8, shuffle=True, collate_fn=collate_batch
)
test_iter = IMDB(split="test")
test_dataloader = DataLoader(
    list(test_iter), batch_size=8, shuffle=True, collate_fn=collate_batch
)


In [9]:
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
criteon = nn.BCEWithLogitsLoss().to(device)
rnn.to(device)


RNN(
  (embedding): Embedding(400001, 100)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [10]:
import numpy as np


def binary_acc(preds, y):
    """
    get accuracy
    """
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return acc


## 训练环节
def train(epoch, rnn, iterator, optimizer, criteon):

    avg_acc = []
    rnn.train()

    for i, (l, t) in enumerate(iterator):
        label, text = l.to(device), t.to(device)
        pred = rnn(text).squeeze(1)

        loss = criteon(pred, label)  # 计算一下loss
        # 我们计算一下train的准确率
        acc = binary_acc(pred, label).item()
        avg_acc.append(acc)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(
                "Epoch: {} with [{}] step, current accuracy: {}".format(epoch, i, acc)
            )

    avg_acc = np.array(avg_acc).mean()
    print("avg acc:", avg_acc)


## 测试环节
def eval(epoch, rnn, iterator, criteon):
    avg_acc = []
    rnn.eval()
    with torch.no_grad():  # 关闭梯度信息
        for (l, t) in iterator:
            label, text = l.to(device), t.to(device)
            # [b, 1] => [b]
            pred = rnn(text).squeeze(1)
            loss = criteon(pred, label)
            acc = binary_acc(pred, label).item()  # 计算一个准确度
            avg_acc.append(acc)

    avg_acc = np.array(avg_acc).mean()
    print("with epoch {} we have accuracy in test: {}".format(epoch, avg_acc))


In [11]:
for epoch in range(10):
    train(epoch, rnn, train_dataloader, optimizer, criteon)
    eval(epoch, rnn, test_dataloader, criteon)


Epoch: 0 with [0] step, current accuracy: 0.5
Epoch: 0 with [10] step, current accuracy: 0.625
Epoch: 0 with [20] step, current accuracy: 0.5
Epoch: 0 with [30] step, current accuracy: 0.25
Epoch: 0 with [40] step, current accuracy: 0.5
Epoch: 0 with [50] step, current accuracy: 0.0
Epoch: 0 with [60] step, current accuracy: 0.0
Epoch: 0 with [70] step, current accuracy: 0.0
Epoch: 0 with [80] step, current accuracy: 0.0
Epoch: 0 with [90] step, current accuracy: 0.0
Epoch: 0 with [100] step, current accuracy: 0.0
Epoch: 0 with [110] step, current accuracy: 0.0
Epoch: 0 with [120] step, current accuracy: 0.0
Epoch: 0 with [130] step, current accuracy: 0.0
Epoch: 0 with [140] step, current accuracy: 0.0
Epoch: 0 with [150] step, current accuracy: 0.0
Epoch: 0 with [160] step, current accuracy: 0.0
Epoch: 0 with [170] step, current accuracy: 0.0
Epoch: 0 with [180] step, current accuracy: 0.0
Epoch: 0 with [190] step, current accuracy: 0.0
Epoch: 0 with [200] step, current accuracy: 0.0


KeyboardInterrupt: 