### 本文件展示学习 RNN 的全流程

In [22]:
# 读取数据集 IMDb 电影评论情感分类
import pandas as pd

data = pd.read_csv('./data/IMDB Dataset.csv')
print(data.head)

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>


In [23]:
# 定义简单的RNN网络
import torch
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vs, embedding_dim, hidden_dim, output_dim):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vs, embedding_dim)
        self.hidden_dim = hidden_dim
        self.rnn = nn.RNNCell(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):  # x: (batch_size, seq_len)
        batch_size, seq_len = x.size()
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)

        h_t = torch.zeros(batch_size, self.hidden_dim, device=x.device)
        for t in range(seq_len):
            x_t = embedded[:, t, :]
            h_t = self.rnn(x_t, h_t)

        output = self.fc(h_t)
        return output

In [24]:
model = RNN(vs=90, embedding_dim=50, hidden_dim=64, output_dim=1)
model

RNN(
  (embedding): Embedding(90, 50)
  (rnn): RNNCell(50, 64)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

### 数据处理函数

In [25]:
import pandas as pd
import re
from collections import Counter
import torch
from sklearn.model_selection import train_test_split

def tokenize(text):
    text = text.lower()
    return re.findall(r'\b\w+\b', text)

def build_vocab(sentences, min_freq=2):
    counter = Counter()
    for sent in sentences:
        tokens = tokenize(sent)
        counter.update(tokens)
    vocab = {'<pad>': 0, '<unk>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

def encode_sentence(sentence, vocab, max_len):
    tokens = tokenize(sentence)
    ids = [vocab.get(token, vocab['<unk>']) for token in tokens[:max_len]]
    ids += [vocab['<pad>']] * (max_len - len(ids))
    return ids

def load_data(path, vocab=None, max_len=100, test_size=0.2, seed=42):
    df = pd.read_csv(path)
    sentences = df['review'].tolist()
    raw_labels = df['sentiment'].tolist()
    label_map = {'positive': 1.0, 'negative': 0.0}
    labels = [label_map[label.strip().lower()] for label in raw_labels]

    # 划分训练/测试
    s_train, s_test, y_train, y_test = train_test_split(
        sentences, labels, test_size=test_size, random_state=seed
    )

    if vocab is None:
        vocab = build_vocab(s_train)  # 只用训练集构建词表

    # 编码文本
    x_train = [encode_sentence(s, vocab, max_len) for s in s_train]
    x_test = [encode_sentence(s, vocab, max_len) for s in s_test]

    return (
        torch.tensor(x_train), torch.tensor(y_train).float(),
        torch.tensor(x_test), torch.tensor(y_test).float(),
        vocab
    )


In [55]:
from torch.utils.data import DataLoader, TensorDataset

# 加载数据
x_train, y_train, x_test, y_test, vocab = load_data('./data/IMDB Dataset.csv', max_len=50)
vocab["<pad>"] = 0
vocab["<unk>"] = 1
print("训练集:", x_train.shape, y_train.shape)
print("测试集:", x_test.shape, y_test.shape)
print("词表大小:", len(vocab))

train_loader = DataLoader(TensorDataset(x_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(x_test, y_test), batch_size=32)

训练集: torch.Size([40000, 50]) torch.Size([40000])
测试集: torch.Size([10000, 50]) torch.Size([10000])
词表大小: 57766


### 模型训练

In [56]:
from tqdm import tqdm
import torch.nn.functional as F

# 初始化模型
model = RNN(vs=len(vocab), embedding_dim=50, hidden_dim=64, output_dim=1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 损失函数和优化器
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
epochs = 20

# 训练过程
best_acc = 0
for epoch in range(epochs):
    model.train()
    total_loss = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    train_tot = 0
    train_correct = 0
    for batch_x, batch_y in loop:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # 前向传播
        logits = model(batch_x).squeeze(1)  # shape: (batch,)
        loss = criterion(logits, batch_y)
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).float()
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_correct += (preds == batch_y).sum().item()
        train_tot += batch_y.size(0)
        loop.set_postfix({
                "loss": loss.item(),
                "acc": 100*train_correct / train_tot if train_tot else 0
            })

    print(f"[Epoch {epoch+1}] Train Loss: {total_loss/len(train_loader):.4f} Train Accuracy: {train_correct/train_tot*100:.2f}")

    # --- 测试评估 ---
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x_test_batch, y_test_batch in test_loader:
            x_test_batch, y_test_batch = x_test_batch.to(device), y_test_batch.to(device)
            logits = model(x_test_batch).squeeze(1)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()
            correct += (preds == y_test_batch).sum().item()
            total += y_test_batch.size(0)

    acc = 100 * correct / total
    if best_acc < acc:
        best_acc = acc
        torch.save(model.state_dict(), 'RNN_IMDB.pth')
    print(f"[Epoch {epoch+1}] Test Accuracy: {acc:.2f}")


Epoch 1/20: 100%|██████████| 1250/1250 [00:46<00:00, 27.16it/s, loss=0.673, acc=52.7]


[Epoch 1] Train Loss: 0.6928 Train Accuracy: 52.66
[Epoch 1] Test Accuracy: 52.20


Epoch 2/20: 100%|██████████| 1250/1250 [00:47<00:00, 26.52it/s, loss=0.693, acc=56.4]


[Epoch 2] Train Loss: 0.6803 Train Accuracy: 56.37
[Epoch 2] Test Accuracy: 54.58


Epoch 3/20: 100%|██████████| 1250/1250 [00:47<00:00, 26.40it/s, loss=0.746, acc=59.1]


[Epoch 3] Train Loss: 0.6682 Train Accuracy: 59.12
[Epoch 3] Test Accuracy: 57.34


Epoch 4/20: 100%|██████████| 1250/1250 [00:47<00:00, 26.38it/s, loss=0.569, acc=63.9]


[Epoch 4] Train Loss: 0.6386 Train Accuracy: 63.90
[Epoch 4] Test Accuracy: 62.14


Epoch 5/20: 100%|██████████| 1250/1250 [00:51<00:00, 24.24it/s, loss=0.586, acc=65.2]


[Epoch 5] Train Loss: 0.6251 Train Accuracy: 65.21
[Epoch 5] Test Accuracy: 55.37


Epoch 6/20: 100%|██████████| 1250/1250 [00:50<00:00, 24.51it/s, loss=0.677, acc=61.3]


[Epoch 6] Train Loss: 0.6503 Train Accuracy: 61.26
[Epoch 6] Test Accuracy: 60.85


Epoch 7/20: 100%|██████████| 1250/1250 [00:48<00:00, 25.97it/s, loss=0.602, acc=66.7]


[Epoch 7] Train Loss: 0.6140 Train Accuracy: 66.68
[Epoch 7] Test Accuracy: 65.37


Epoch 8/20: 100%|██████████| 1250/1250 [00:47<00:00, 26.33it/s, loss=0.699, acc=70.8]


[Epoch 8] Train Loss: 0.5726 Train Accuracy: 70.75
[Epoch 8] Test Accuracy: 69.01


Epoch 9/20: 100%|██████████| 1250/1250 [00:53<00:00, 23.33it/s, loss=0.56, acc=76.3] 


[Epoch 9] Train Loss: 0.5074 Train Accuracy: 76.34
[Epoch 9] Test Accuracy: 68.32


Epoch 10/20: 100%|██████████| 1250/1250 [00:57<00:00, 21.81it/s, loss=0.511, acc=79.2]


[Epoch 10] Train Loss: 0.4663 Train Accuracy: 79.15
[Epoch 10] Test Accuracy: 70.43


Epoch 11/20: 100%|██████████| 1250/1250 [01:07<00:00, 18.47it/s, loss=0.324, acc=81.2]


[Epoch 11] Train Loss: 0.4316 Train Accuracy: 81.23
[Epoch 11] Test Accuracy: 71.09


Epoch 12/20: 100%|██████████| 1250/1250 [00:53<00:00, 23.17it/s, loss=0.48, acc=82.7] 


[Epoch 12] Train Loss: 0.4108 Train Accuracy: 82.71
[Epoch 12] Test Accuracy: 71.61


Epoch 13/20: 100%|██████████| 1250/1250 [00:46<00:00, 27.07it/s, loss=0.558, acc=75.6]


[Epoch 13] Train Loss: 0.5036 Train Accuracy: 75.57
[Epoch 13] Test Accuracy: 59.28


Epoch 14/20: 100%|██████████| 1250/1250 [00:46<00:00, 26.93it/s, loss=0.469, acc=81.5]


[Epoch 14] Train Loss: 0.4265 Train Accuracy: 81.55
[Epoch 14] Test Accuracy: 71.17


Epoch 15/20: 100%|██████████| 1250/1250 [00:46<00:00, 27.05it/s, loss=0.396, acc=84.2]


[Epoch 15] Train Loss: 0.3840 Train Accuracy: 84.16
[Epoch 15] Test Accuracy: 66.04


Epoch 16/20: 100%|██████████| 1250/1250 [00:45<00:00, 27.17it/s, loss=0.524, acc=77.6]


[Epoch 16] Train Loss: 0.4722 Train Accuracy: 77.62
[Epoch 16] Test Accuracy: 62.02


Epoch 17/20: 100%|██████████| 1250/1250 [00:58<00:00, 21.55it/s, loss=0.332, acc=82.2]


[Epoch 17] Train Loss: 0.4116 Train Accuracy: 82.20
[Epoch 17] Test Accuracy: 70.57


Epoch 18/20: 100%|██████████| 1250/1250 [01:06<00:00, 18.93it/s, loss=0.265, acc=83.9]


[Epoch 18] Train Loss: 0.3841 Train Accuracy: 83.89
[Epoch 18] Test Accuracy: 72.83


Epoch 19/20: 100%|██████████| 1250/1250 [01:02<00:00, 20.03it/s, loss=0.375, acc=84.9]


[Epoch 19] Train Loss: 0.3642 Train Accuracy: 84.85
[Epoch 19] Test Accuracy: 71.95


Epoch 20/20: 100%|██████████| 1250/1250 [00:52<00:00, 23.85it/s, loss=0.304, acc=87.5]


[Epoch 20] Train Loss: 0.3241 Train Accuracy: 87.47
[Epoch 20] Test Accuracy: 71.54


### 测试 RNN 的情感分析性能

In [57]:
def predict_sentiments(texts, vocab, model, device, max_len=100):
    model.eval()
    for text in texts:
        tokens = text.lower().split()
        token_ids = [vocab.get(token, vocab["<unk>"]) for token in tokens]
        if len(token_ids) < max_len:
            token_ids += [vocab["<pad>"]] * (max_len - len(token_ids))
        else:
            token_ids = token_ids[:max_len]
        input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = model(input_ids).squeeze(1)
            prob = torch.sigmoid(logits)
            label = "positive" if prob.item() >= 0.5 else "negative"
        print(f"Text: {text}")
        print(f"Prediction: {label} (Confidence: {prob.item():.4f})")
        print("-" * 50)

# 示例测试文本
test_texts = [
    "I absolutely loved this movie!",
    "This film was a masterpiece, beautifully acted and directed.",
    "Great performance by the lead actor, I would watch it again.",
    "I hated every minute of this film.",
    "Terrible plot and wooden acting.",
    "One of the worst movies I’ve ever seen.",
    "The story made no sense and the pacing was awful.",
    "The visuals were great, but the plot was lacking.",
    "It was okay, not bad but not amazing either.",
    "I expected more, but it wasn't the worst.",
]
model = RNN(vs=len(vocab), embedding_dim=50, hidden_dim=64, output_dim=1).to(device)
model.load_state_dict(torch.load('RNN_IMDB.pth', map_location=device))
# 执行预测
predict_sentiments(test_texts, vocab, model, device, max_len=50)

Text: I absolutely loved this movie!
Prediction: negative (Confidence: 0.3479)
--------------------------------------------------
Text: This film was a masterpiece, beautifully acted and directed.
Prediction: positive (Confidence: 0.9609)
--------------------------------------------------
Text: Great performance by the lead actor, I would watch it again.
Prediction: positive (Confidence: 0.9609)
--------------------------------------------------
Text: I hated every minute of this film.
Prediction: negative (Confidence: 0.0545)
--------------------------------------------------
Text: Terrible plot and wooden acting.
Prediction: negative (Confidence: 0.0545)
--------------------------------------------------
Text: One of the worst movies I’ve ever seen.
Prediction: negative (Confidence: 0.0545)
--------------------------------------------------
Text: The story made no sense and the pacing was awful.
Prediction: negative (Confidence: 0.0545)
-----------------------------------------------