In [1]:
import pandas as pd
import re
import jieba
from tqdm import tqdm
import torch

In [2]:
# 检查是否有可用的 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# 读取数据集
data_path = '../datasets/chinese_movie_reviews/chinese_movie_reviews_datasets.jsonl'
df = pd.read_json(data_path, orient='records', lines=True)
print(f'数据数量{len(df)}')
print(df.head())

数据数量260386
                                             content  stars  label
0  “我相信真正纯正的爱情能产生一个纾解死亡的阶段，所有的懦弱都出自于没有爱或爱得不彻底，这两者...      4      1
1  太现实不是女人的错，不过年老色衰、中年危机了就不要自以为是，幻想重新寻找当年一口拒绝了的、虽...      4      1
2                               跑吧，我们无力对抗，但也不能让他们得逞。      5      1
3  我在同样变态的师傅手下呆了三年，祖宗十八代被骂了个遍，没空吃饭上厕所睡觉交朋友谈恋爱，脊椎侧...      5      1
4                    还可以，是比较好的电影，但是又觉得和吕克贝松的巅峰状态差了好多      4      1


## 数据预处理

1. 分词
2. 训练Word2Vec：生成一个包含语料库中的每个词的向量空间

In [4]:
# 读取停用词表
with open('../datasets/chinese_movie_reviews/stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.readlines()]

# 移除数字
def remove_digit(text):
     return re.sub(r'\d+', '', text)

# 分词处理
def tokenize(text):
    return " ".join(jieba.cut(text))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stopwords])

# 应用预处理
def process_row(text):
    text = remove_digit(text)
    text = re.sub(r"[^\u4e00-\u9fa5]", "", text)  # 只保留汉字字符
    text = tokenize(text)
    text = remove_stopwords(text)
    return text

df["content"] = df["content"].apply(process_row) # 作者在这里大概用了2分钟

# 计算每条文本的长度
sentence_lengths = df["content"].apply(lambda x: len(x.split()))  # 计算每条文本的词数（已经分词）
# 计算最大长度和平均长度
max_length = sentence_lengths.max()
avg_length = sentence_lengths.mean()
print(f"最大文本长度：{max_length}")
print(f"平均文本长度：{avg_length:.2f}")

df.sample(5)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\PC\AppData\Local\Temp\jieba.cache
Loading model cost 0.301 seconds.
Prefix dict has been built successfully.


最大文本长度：166
平均文本长度：21.79


Unnamed: 0,content,stars,label
118816,星星 青青 子 衿 悠悠 我心 但 为 君故 沉吟 至今 一日不见 如 三月 兮 求之不得 ...,4,1
250417,想 看 多年 片子 今天 看 完 之后 却 有些 失望 剧情 想象 美好 甚至 根本 谈不上...,3,0
181970,女 领导 说 郭达 像 院子 没 拴住 狗 还 到处 拉屎 乱搞 太 形象 哈哈 库珀 左拳...,0,0
246242,这片 居然 豆瓣 额 哈哈哈哈 哈哈哈哈 人间 真是 不可 推测 灰色 喜剧,1,0
186430,完全 没 剧情 画面 不错,3,0


#### Word2Vec

`Word2Vec训练很快，一分钟以内就能结束`

In [5]:
import numpy as np
from gensim.models import Word2Vec

def prepare_data(df, vector_size, max_length=100):
    # 将文本转换为词列表
    texts = df['content'].apply(lambda x: x.split())

    # 首先创建一个空的词汇表并添加 <PAD> 词索引为 0
    vocab = {"<PAD>": 0}
    
    # 训练Word2Vec模型
    w2v_model = Word2Vec(sentences=texts, vector_size=vector_size, window=8, min_count=1, workers=4)
    
    # 获取模型训练后生成的词汇表
    vocab.update({k: v+1 for k, v in w2v_model.wv.key_to_index.items()})
    
    # 将文本转换为序列，如果词不在词汇表中，则用0表示
    sequences = [[vocab.get(word, 0) for word in text] for text in texts]

    # 对每个序列进行填充或截断
    padded = [s[:max_length] + [0] * (max_length - len(s)) if len(s) < max_length 
              else s[:max_length] for s in sequences]
    
    embedding_matrix = np.zeros((len(vocab), vector_size))
    for word, i in vocab.items():
        if word != "<PAD>":  # 确保 <PAD> 不会被赋予任何词向量
            embedding_matrix[i] = w2v_model.wv[word]
    
    return np.array(padded), embedding_matrix, vocab

vector_size=256
padded, embedding_matrix, vocab = prepare_data(df, vector_size=vector_size)

In [6]:
# 查看词汇表的前几个词
print(f'词汇表大小： {len(vocab)}')
print("词汇表的一部分:")
for word, idx in list(vocab.items())[:5]:  # 查看前5个词
    print(f"词: {word}, 索引: {idx}")

# 查看嵌入矩阵中对应某个词的词向量
word_to_check = '狗'  
if word_to_check in vocab:
    word_index = vocab[word_to_check]
    word_vector = embedding_matrix[word_index]
    print(f"{word_to_check} 的词向量:")
    print(word_vector)
else:
    print(f"词汇表中没有 {word_to_check} 这个词。")

词汇表大小： 62221
词汇表的一部分:
词: <PAD>, 索引: 0
词: 看, 索引: 1
词: 人, 索引: 2
词: 但, 索引: 3
词: 好, 索引: 4
狗 的词向量:
[-2.36567736e-01 -1.81298125e+00  1.21999574e+00  5.98021567e-01
 -7.45063961e-01 -9.00016546e-01  1.39530659e+00  8.75249505e-02
  9.44862217e-02 -2.27183059e-01  8.02466094e-01  4.17786032e-01
 -1.23330700e+00 -1.26467597e+00 -1.03314006e+00  2.52983034e-01
 -1.06839933e-01 -1.45807362e+00 -2.17082933e-01  1.82238653e-01
  1.27316415e+00  1.82120705e+00  2.78290063e-01  8.53154004e-01
 -8.26691613e-02  1.88696980e-01 -1.55740094e+00 -4.38516945e-01
  1.26891887e+00 -7.75177002e-01 -2.30401799e-01  1.53721225e+00
  3.56354192e-02 -1.19395781e+00 -5.02536178e-01  6.80065900e-02
  1.05687715e-01  6.16707563e-01  7.03328133e-01  6.87454417e-02
 -1.48744419e-01  6.16866827e-01  3.77114773e-01 -3.70331675e-01
  1.41153842e-01  6.74931824e-01  2.14020729e-01  6.24394007e-02
 -1.70209959e-01  6.99220151e-02  5.40796697e-01 -8.27411283e-03
 -4.45683390e-01 -1.78815514e-01 -4.80320573e-01 -3.56745511e

#### 构建数据集

In [7]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.LongTensor(sequences)
        self.labels = torch.LongTensor(labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

X = padded
y = df['label'].values
# stratify=df["label"] 使得训练集和测试集中的标签分布是均匀
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=df["label"])  

train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [8]:
# 查看训练集和测试集的大小
print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")

# 查看训练集和测试集的标签分布
from collections import Counter

train_labels_counter = Counter(y_train)
test_labels_counter = Counter(y_test)
print(f"训练集标签分布: {train_labels_counter}")
print(f"测试集标签分布: {test_labels_counter}")

训练集大小: 234347
测试集大小: 26039
训练集标签分布: Counter({1: 117174, 0: 117173})
测试集标签分布: Counter({0: 13020, 1: 13019})


## 定义模型

In [9]:
import torch.nn as nn
import torch.nn.functional as F

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, embedding_matrix, hidden_dim=256):
        super().__init__()
        # 定义词嵌入层，使用 embedding_matrix 初始化
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix),
            padding_idx=0
        )
        self.embedding.weight.requires_grad = True # 确保嵌入层的参数可训练
        """
        双向 LSTM 层：输入维度为 embedding_dim，输出维度为 hidden_dim。
        batch_first=True : 输入张量的形状为 (batch_size, sequence_length)。
        bidirectional=True : LSTM 会在两个方向上（正向和反向）处理输入序列，以捕捉更多上下文信息
        (因为 LSTM 是双向的，它的输出将是两个隐藏层的连接, 所以实际输出维度为 hidden_dim * 2)
        """
        self.lstm = nn.LSTM(
            embedding_dim,    # 输入特征的维度
            hidden_dim,       # 隐藏状态的维度
            num_layers=2,     # LSTM的层数
            batch_first=True, # 输入和输出的张量的第一个维度是batch_size
            bidirectional=True, # 使用双向LSTM
            dropout=0.3
        )
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 2)  # 2 classes for binary classification
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded) # 第二个返回值_是LSTM的隐藏状态和单元状态；lstm_out形状： (batch_size, sequence_length, hidden_dim * 2)
        last_hidden = lstm_out[:, -1, :] # 选择每个批次中的最后一个时刻的输出，形状为 (batch_size, hidden_dim * 2)
        dropped = self.dropout(last_hidden)
        fc1_out = F.relu(self.fc1(dropped))
        # return self.fc2(fc1_out)
        return self.fc2(self.dropout(fc1_out))


model = TextClassifier(len(vocab) + 1, vector_size, embedding_matrix).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 初始化学习率调度器，每10个epoch将学习率衰减为原来的gamma倍
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

In [10]:
# 查看模型结构
# 打印模型参数总数和可训练参数总数
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())  # 所有参数数量
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)  # 需要训练的参数数量
    print(f"模型总参数数量: {total_params:,}")
    print(f"模型可训练参数数量: {trainable_params:,}")

print(model)
count_parameters(model)

TextClassifier(
  (embedding): Embedding(62221, 256, padding_idx=0)
  (lstm): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
模型总参数数量: 18,690,050
模型可训练参数数量: 18,690,050


## 模型训练与评估

In [11]:
from tqdm import tqdm

# 训练函数
def train(dataloader, model, loss_fn, optimizer):
    model.train()  # 设置模型为训练模式
    
    running_loss = 0.0
    correct = 0
    total = 0

    # 使用 tqdm 包裹数据加载器，显示进度条
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for texts, labels in progress_bar:
        # 将数据移动到设备
        texts, labels = texts.to(device), labels.to(device)

        # 前向传播
        outputs = model(texts)
        loss = loss_fn(outputs, labels)

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  # 更新模型参数

        # 统计指标
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # 更新进度条描述
        progress_bar.set_postfix(lr=optimizer.param_groups[0]['lr'], loss=loss.item())
        
    scheduler.step()  # 更新学习率       

    accuracy = 100 * correct / total
    avg_loss = running_loss / len(dataloader)
    return avg_loss, accuracy

In [12]:
# 测试函数
def evaluate(dataloader, model, loss_fn):
    model.eval()  # 设置模型为评估模式
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # 关闭梯度计算
        progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
        for texts, labels in progress_bar:
            # 将数据移动到设备
            texts, labels = texts.to(device), labels.to(device)

            # 前向传播
            outputs = model(texts)
            loss = loss_fn(outputs, labels)

            # 统计指标
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # 更新进度条描述
            progress_bar.set_postfix(loss=loss.item())

    accuracy = 100 * correct / total
    avg_loss = running_loss / len(dataloader)
    return avg_loss, accuracy

In [13]:
# 开始训练

num_epochs = 10

train_loss = []
train_acc  = []
test_loss  = []
test_acc   = []

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    
    epoch_train_loss, epoch_train_acc = train(train_loader, model, loss_fn, optimizer)

    # 在测试集上评估
    epoch_test_loss, epoch_test_acc = evaluate(test_loader, model, loss_fn)

    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)
    test_acc.append(epoch_test_acc)
    test_loss.append(epoch_test_loss)
    # 打印训练和测试结果
    template = ('Epoch:{:2d}, Train_acc:{:.1f}%, Train_loss:{:.3f}, Test_acc:{:.1f}%，Test_loss:{:.3f}')
    print(template.format(epoch+1, epoch_train_acc, epoch_train_loss, epoch_test_acc, epoch_test_loss))

print("训练完成!")

Epoch 1/10


                                                                                                                       

Epoch: 1, Train_acc:51.1%, Train_loss:0.693, Test_acc:50.3%，Test_loss:0.692
Epoch 2/10


                                                                                                                       

Epoch: 2, Train_acc:59.6%, Train_loss:0.651, Test_acc:67.4%，Test_loss:0.604
Epoch 3/10


                                                                                                                       

Epoch: 3, Train_acc:71.2%, Train_loss:0.554, Test_acc:68.4%，Test_loss:0.590
Epoch 4/10


                                                                                                                       

Epoch: 4, Train_acc:75.1%, Train_loss:0.495, Test_acc:68.0%，Test_loss:0.595
Epoch 5/10


                                                                                                                       

Epoch: 5, Train_acc:80.4%, Train_loss:0.405, Test_acc:67.1%，Test_loss:0.695
Epoch 6/10


                                                                                                                       

Epoch: 6, Train_acc:83.5%, Train_loss:0.344, Test_acc:66.1%，Test_loss:0.837
Epoch 7/10


                                                                                                                       
KeyboardInterrupt



## 结果可视化

In [None]:
import matplotlib.pyplot as plt

epochs_range = range(num_epochs)

plt.figure(figsize=(12, 3))
plt.subplot(1, 2, 1)

plt.plot(epochs_range, train_acc, label='Training Accuracy')
plt.plot(epochs_range, test_acc, label='Test Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_loss, label='Training Loss')
plt.plot(epochs_range, test_loss, label='Test Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()