该笔记本可在 Kaggle 环境下成功运行，本地环境暂未测试。

模型实现参考了如下教程：

https://github.com/bentrevett/pytorch-sentiment-analysis/

In [None]:
%%capture

import os
import time
import string

import numpy as np
import pandas as pd

# PyTorch 相关
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import Vectors
from torchtext import data

# 安装特定版本库
!pip install -v pytorch-ignite==0.4rc.0.post1
!pip install --upgrade scikit-learn

# Ignite 相关
from ignite.engine import Events, Engine
from ignite.metrics import Precision, Recall, Accuracy, Loss
from ignite.handlers import ModelCheckpoint
from ignite.contrib.metrics import RocCurve, ROC_AUC
from ignite.contrib.handlers.tqdm_logger import ProgressBar

from sklearn.metrics import RocCurveDisplay

In [None]:
# List every file that Kaggle mounted under the read-only input directory.

for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## 准备数据

对于词嵌入层以及数据集的预处理，我们使用 `torchtext` 包来进行处理。`torchtext` 包提供了从 csv 数据集文件直接构建 PyTorch 所需数据集格式的相关函数。

同时，`torchtext` 还提供 `build_vocab` 函数，加载词向量文件（word2vec 或 GloVe），同时根据数据集中存在的词语，去除掉预训练向量中多余的词汇，以提高内存利用效率。

In [None]:
# Path of the pretrained word-vector file that is passed to load_file() below.
EMBEDDING_FILE = '/kaggle/input/imdb-word2vec/word2vec.txt'
# Alternative vector file (GloVe, judging by the file name); uncomment to switch.
# EMBEDDING_FILE = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'

def load_file(file_path, device, embedding_file, batch_size=64, max_vocab_size=25000,
              cache_dir='/kaggle/working/.vector_cache'):
    """Build torchtext fields, the vocabulary and bucketed iterators.

    Reads ``Train_clean.csv``, ``Valid_clean.csv`` and ``Test_clean.csv`` from
    ``file_path`` (header row skipped), maps their two columns to the
    ``clean_text`` and ``label`` fields, and builds the text vocabulary from
    the training split only, attaching the pretrained vectors.

    Args:
        file_path: Directory containing the three CSV files.
        device: Device on which the iterators place their batches.
        embedding_file: Path of the pretrained word-vector file.
        batch_size: Number of examples per batch for all three iterators.
        max_vocab_size: Upper bound passed to ``build_vocab`` as ``max_size``.
        cache_dir: Writable directory used by ``Vectors`` as its cache.

    Returns:
        Tuple ``(TEXT, LABEL, train_iter, valid_iter, test_iter)``.
    """
    # include_lengths=True makes batch.clean_text a (tokens, lengths) pair.
    text_field = data.Field(sequential=True, lower=True, include_lengths=True)
    # Labels are already numeric, so no vocabulary is built for them.
    label_field = data.Field(sequential=False, use_vocab=False)

    datafields = [('clean_text', text_field), ('label', label_field)]
    # Step two: construct the three dataset splits from the CSV files.
    train, valid, test = data.TabularDataset.splits(
        path=file_path,
        train="Train_clean.csv", validation="Valid_clean.csv", test="Test_clean.csv",
        format="csv", skip_header=True, fields=datafields,
    )

    # The Kaggle input directory is read-only, so the vector cache must live
    # elsewhere.  makedirs(exist_ok=True) also creates missing parents and
    # avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(cache_dir, exist_ok=True)

    # Attach the pretrained word embeddings; tokens without a pretrained
    # vector are initialised from a normal distribution.
    vectors = Vectors(name=embedding_file, cache=cache_dir)
    text_field.build_vocab(train, vectors=vectors, max_size=max_vocab_size,
                           unk_init=torch.Tensor.normal_)

    # Sorting inside each batch by text length keeps padding small.
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train, valid, test), device=device, batch_size=batch_size,
        sort_key=lambda x: len(x.clean_text), sort_within_batch=True,
    )

    return text_field, label_field, train_iter, valid_iter, test_iter


# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Build the fields and the three iterators from the cleaned IMDB CSV files.
TEXT, LABEL, train_iter, valid_iter, test_iter = load_file(
    '/kaggle/input/cleaned-imdb-data', device, EMBEDDING_FILE
)

In [None]:
# Notebook display of the vocabulary object built inside load_file().
TEXT.vocab

## 建立模型

分别实现 RNN 模型及 LSTM 模型（普通 RNN 存在梯度消失的问题）。

我们使用 PyTorch 框架提供的 API 来构建模型。

PyTorch 的 nn 模块提供了神经网络中常见的网络层，如 `nn.Embedding` 和 `nn.RNN`，以及 `nn.Linear`等。我们按照前一节设计的网络结构，在一个继承 `nn.Module` 的类中定义相关层的实例，并在 `forward` 函数中定义网络的前向传播即可。

In [None]:
class SentimentModelRNN(nn.Module):
    """Single-layer vanilla RNN sentiment classifier.

    Pipeline: token indices -> embedding -> ``nn.RNN`` -> linear layer applied
    to the final hidden state.  The output is one raw logit vector per example
    (no sigmoid is applied here).

    Args:
        input_dim: Number of rows of the embedding table (vocabulary size).
        embedding_dim: Width of each word vector.
        hidden_dim: Width of the recurrent hidden state.
        output_dim: Number of logits produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            text: Token indices laid out as ``(seq_len, batch)``.
            text_lengths: Number of real (non-padding) tokens in each column
                of ``text``; may live on any device and be in any order.
        """
        embedded = self.embedding(text)

        # Pack the batch so the recurrence stops at each sequence's true end;
        # otherwise the final hidden state is computed after the pad tokens.
        # pack_padded_sequence requires the lengths on the CPU.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)

        _, hidden = self.rnn(packed)

        # hidden is (1, batch, hidden_dim) for this single-layer,
        # unidirectional RNN; drop the leading layer axis only.
        return self.fc(hidden.squeeze(0))

In [None]:
class SentimentModelLSTM(nn.Module):
    """Multi-layer, optionally bidirectional LSTM sentiment classifier.

    Pipeline: token indices -> embedding (+dropout) -> packed ``nn.LSTM`` ->
    dropout -> linear layer on the final hidden state of the top layer.
    The output is one raw logit vector per example (no sigmoid applied).

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Width of each word vector.
        hidden_dim: Width of the LSTM hidden state per direction.
        output_dim: Number of logits produced per example.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability for embeddings, between LSTM layers and
            on the final hidden state.
        pad_idx: Index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # nn.LSTM applies its dropout only between stacked layers; with a
        # single layer a non-zero value has no effect and only emits a warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)

        # A bidirectional LSTM yields one final state per direction.
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            text: Token indices laid out as ``(seq_len, batch)``.
            text_lengths: Number of real (non-padding) tokens in each column
                of ``text``; may live on any device and be in any order.
        """
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence requires the lengths on the CPU, even when the
        # batch itself is on the GPU.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)

        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Last two entries are the top layer's forward and backward states.
            final_state = torch.cat([hidden[-2, :, :], hidden[-1, :, :]], dim=1)
        else:
            final_state = hidden[-1, :, :]

        # No bare squeeze(): the batch axis must survive for batch size 1.
        return self.fc(self.dropout(final_state))

## 网络参数

设置 RNN 和 LSTM 的超参数如下（二者共用其中一部分参数）。

参数的选取上，由于我们的词向量均为 100 维，所以嵌入层维度也为 100 维；同时，我们选取隐藏层维度为 256 维，输出层维度为 1 维（二分类任务）。

In [None]:
# Sizes shared by both networks.  EMBEDDING_DIM has to agree with the width of
# the pretrained vectors that are copied into the embedding layers later on.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 100, 256, 1

# Settings consumed only by the LSTM variant.
N_LAYERS, BIDIRECTIONAL, DROPOUT = 2, True, 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model_rnn = SentimentModelRNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)
model_lstm = SentimentModelLSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

统计模型参数数量：

In [None]:
def count_parameters(model):
    """Return how many scalar values in ``model`` are trainable.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the trainable parameter count of each network.
for _label, _net in (('model_rnn', model_rnn), ('model_lstm', model_lstm)):
    print(_label, count_parameters(_net))

导入嵌入层数据：

In [None]:
# Vectors attached to the vocabulary by TEXT.build_vocab(..., vectors=...).
# Presumably one row per vocabulary entry; copy_ below needs the shape to be
# compatible with the (INPUT_DIM, EMBEDDING_DIM) embedding weights.
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

# Overwrite the freshly initialised embedding weights of both models in place.
model_rnn.embedding.weight.data.copy_(pretrained_embeddings)
model_lstm.embedding.weight.data.copy_(pretrained_embeddings)

## 训练模型

使用 Adam 优化算法，损失函数为二元交叉熵（`nn.BCEWithLogitsLoss`，直接作用于未经 Sigmoid 的模型输出）。


In [None]:
# One Adam optimizer with default settings per network.
optimizer_rnn = optim.Adam(params=model_rnn.parameters())
optimizer_lstm = optim.Adam(params=model_lstm.parameters())

# Binary cross-entropy that takes raw logits; one criterion per network,
# created and moved to the target device in a single expression.
loss_rnn = nn.BCEWithLogitsLoss().to(device)
loss_lstm = nn.BCEWithLogitsLoss().to(device)

# Move both networks to the same device.
model_rnn, model_lstm = model_rnn.to(device), model_lstm.to(device)

对于训练过程，在手动编写网络前向、后向传播的前提下，我们利用了 PyTorch 提供的 ignite 包。这个包从高层面提供了对训练过程监控的能力，我们可以根据不同的事件（每个 epoch、每个 iteration 等）在 ignite engine 分别注册事件处理函数。这样，我们可以把训练代码与验证代码分开，同时获得灵活的监控验证功能。

Ignite Engine 接受一个两参数的回调函数，这个函数会在每个 batch 调用，是我们的训练过程。为了在不同的模型中复用代码，我们使用 Python 的高阶函数（闭包）功能来动态创建这个回调：

In [None]:
def get_trainer_callback(model, optimizer, loss_fn):
    """Create the per-batch training step handed to an Ignite ``Engine``.

    Args:
        model: Network called as ``model(text, text_lengths)``; its output is
            squeezed along dimension 1 before the loss is computed.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Criterion called as ``loss_fn(predictions, labels)``.

    Returns:
        A function ``train(engine, batch)`` that performs one optimisation
        step and returns the batch loss as a Python float.
    """
    def train(engine, batch):
        model.train()

        # The text field was built with include_lengths, hence the pair.
        text, text_lengths = batch.clean_text

        predictions = model(text, text_lengths).squeeze(1)

        loss = loss_fn(predictions, batch.label.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Return the scalar loss so the engine exposes it as
        # engine.state.output instead of None.
        return loss.item()

    return train

def get_evaluator_callback(model):
    """Create the per-batch inference step handed to an Ignite ``Engine``.

    The returned function switches ``model`` to eval mode, runs it without
    gradient tracking and returns ``(y_pred, y)``: the sigmoid of the model
    output squeezed along dimension 1, and the batch labels cast to float.
    """
    def evaluate(engine, batch):
        model.eval()
        with torch.no_grad():
            tokens, lengths = batch.clean_text
            logits = model(tokens, lengths).squeeze(1)
            return torch.sigmoid(logits), batch.label.float()

    return evaluate

我们通过 `ignite.engine.Engine` 创建训练器。然后，我们为其添加几个事件处理器：

- ProgressBar，用来图形化显示训练进度；
- ModelCheckpoint，用来保存模型文件；
- log_training_results，log_validation_results，用来自动在每个 epoch 后进行评估。

同时，我们也挂载了几个监控指标：

```python
accuracy = Accuracy(output_transform=output_transform)
precision = Precision(output_transform=output_transform, average=False)
recall = Recall(output_transform=output_transform, average=False)
F1 = (precision * recall * 2 / (precision + recall)).mean()
roc_curve = RocCurve()
roc_auc = ROC_AUC()
```

In [None]:
def output_transform(output):
    """Turn evaluator output into hard predictions for the metrics.

    ``output[0]`` is rounded with ``torch.round``; ``output[1]`` is passed
    through unchanged.  Returns the pair ``(rounded, target)``.
    """
    scores, target = output[0], output[1]
    return torch.round(scores), target

def create_trainer_evaluator(model, optimizer, loss_fn, model_name):
    """Build the Ignite training and evaluation engines for one model.

    The trainer gets a progress bar, a checkpoint handler and two
    epoch-end handlers that run the evaluator on the module-level
    ``train_iter`` and ``valid_iter`` and print the accuracy.  The evaluator
    gets the metrics ``accuracy``, ``precision``, ``recall``, ``F1``,
    ``roc_curve`` and ``roc_auc`` attached, in that order.

    Args:
        model: Network to train and evaluate.
        optimizer: Optimizer used by the training step.
        loss_fn: Criterion used by the training step.
        model_name: Key under which the model is handed to the checkpoint
            handler.

    Returns:
        Tuple ``(trainer, evaluator)``.
    """
    trainer = Engine(get_trainer_callback(model, optimizer, loss_fn))

    evaluator = Engine(get_evaluator_callback(model))

    pbar = ProgressBar(persist=True)
    pbar.attach(trainer)

    # Keep the two most recent checkpoints in the working directory;
    # require_empty=False allows the directory to already contain files.
    saver = ModelCheckpoint('./', 'checkpoint', n_saved=2, require_empty=False)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, saver, {model_name: model})

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        # Relies on the module-level train_iter.
        evaluator.run(train_iter)
        metrics = evaluator.state.metrics
        print("Training Results - Epoch: {}  Avg accuracy: {:.2f}"
              .format(engine.state.epoch, metrics['accuracy']))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        # Relies on the module-level valid_iter.
        evaluator.run(valid_iter)
        metrics = evaluator.state.metrics
        print("Validation Results - Epoch: {}  Avg accuracy: {:.2f}"
              .format(engine.state.epoch, metrics['accuracy']))

    # Threshold-based metrics see rounded predictions via output_transform;
    # the ROC metrics receive the evaluator output unchanged.
    accuracy = Accuracy(output_transform=output_transform)
    precision = Precision(output_transform=output_transform, average=False)
    recall = Recall(output_transform=output_transform, average=False)
    # The small epsilon keeps F1 at 0 instead of NaN when precision and
    # recall are both zero (e.g. the model predicts a single class).
    F1 = (precision * recall * 2 / (precision + recall + 1e-20)).mean()
    roc_curve = RocCurve()
    roc_auc = ROC_AUC()

    # Attach order fixes the order of evaluator.state.metrics.
    accuracy.attach(evaluator, "accuracy")
    precision.attach(evaluator, "precision")
    recall.attach(evaluator, "recall")
    F1.attach(evaluator, "F1")
    roc_curve.attach(evaluator, "roc_curve")
    roc_auc.attach(evaluator, "roc_auc")

    return trainer, evaluator

创建训练器实例，并训练 5 个 epoch：

In [None]:
# Build the engines for the vanilla RNN, train for five epochs, then store
# the final weights next to the notebook.
trainer_rnn, evaluator_rnn = create_trainer_evaluator(
    model=model_rnn,
    optimizer=optimizer_rnn,
    loss_fn=loss_rnn,
    model_name='model_rnn',
)
trainer_rnn.run(train_iter, max_epochs=5)
torch.save(model_rnn.state_dict(), 'model_rnn')

In [None]:
# Build the engines for the LSTM, train for five epochs, then store the final
# weights next to the notebook.
trainer_lstm, evaluator_lstm = create_trainer_evaluator(
    model=model_lstm,
    optimizer=optimizer_lstm,
    loss_fn=loss_lstm,
    model_name='model_lstm',
)
trainer_lstm.run(train_iter, max_epochs=5)
torch.save(model_lstm.state_dict(), 'model_lstm')

## 模型评估

用 Sklearn 中的 `RocCurveDisplay` 绘制 ROC 曲线：

In [None]:
def print_plot_scores(model, evaluator, name):
    """Evaluate on the module-level ``test_iter``, print metrics, plot the ROC.

    Every metric except ``roc_curve`` is printed with three decimals; the ROC
    curve is drawn with ``RocCurveDisplay`` and labelled ``name``.  The
    ``model`` argument is accepted but not used here; the evaluator already
    wraps the model it was built for.
    """
    evaluator.run(test_iter)
    scores = evaluator.state.metrics

    for metric_name in scores:
        if metric_name == 'roc_curve':
            continue
        print('{}: {:.3f}'.format(metric_name, scores[metric_name]))

    false_pos_rate, true_pos_rate, _ = scores['roc_curve']

    display = RocCurveDisplay(
        fpr=false_pos_rate,
        tpr=true_pos_rate,
        roc_auc=scores['roc_auc'],
        estimator_name=name,
    )
    display.plot(name=name)

In [None]:
# Restore the saved RNN weights, then score the model on the test split.
rnn_weights = torch.load('model_rnn')
model_rnn.load_state_dict(rnn_weights)
print_plot_scores(model_rnn, evaluator_rnn, 'rnn')

In [None]:
# Restore the saved LSTM weights, then score the model on the test split.
lstm_weights = torch.load('model_lstm')
model_lstm.load_state_dict(lstm_weights)
print_plot_scores(model_lstm, evaluator_lstm, 'lstm')

## 用户输入测试

In [None]:
def predict_sentiment(model, sentence):
    """Return the sigmoid score ``model`` assigns to a single sentence.

    The sentence is stripped of ASCII punctuation, lower-cased, split on
    whitespace and mapped to indices with the module-level ``TEXT``
    vocabulary.  The token list is printed before inference.

    Args:
        model: Network called as ``model(text, text_lengths)``.
        sentence: Raw input text (converted with ``str()``).

    Returns:
        The sigmoid of the model output as a Python float.

    Raises:
        ValueError: If no tokens remain after cleaning.
    """
    model.eval()

    # TEXT was built with lower=True, so the vocabulary holds lower-case
    # tokens only; without lower-casing here, capitalised words would all
    # fall back to the unknown-token index.
    cleaned = str(sentence).translate(str.maketrans('', '', string.punctuation)).lower()
    tokenized = cleaned.split()
    print(tokenized)

    if not tokenized:
        raise ValueError('sentence contains no tokens after removing punctuation')

    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # Shape the input as (seq_len, batch=1), matching the training batches.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    length_tensor = torch.LongTensor([len(indexed)]).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))

    return prediction.item()

In [None]:
# Try the trained LSTM on a short sentence; the notebook displays the score.
predict_sentiment(model_lstm, "i love it")

In [None]:
# Try the trained LSTM on a second sentence; the notebook displays the score.
predict_sentiment(model_lstm, "This movie sucks")