In [1]:
import pandas as pd
import re
import jieba
from tqdm import tqdm
import torch

In [2]:
# 检查是否有可用的 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# 读取数据集
data_path = '../datasets/chinese_movie_reviews/chinese_movie_reviews_datasets.jsonl'
df = pd.read_json(data_path, orient='records', lines=True)
print(f'数据数量{len(df)}')
print(df.head())

数据数量260386
                                             content  stars  label
0  “我相信真正纯正的爱情能产生一个纾解死亡的阶段，所有的懦弱都出自于没有爱或爱得不彻底，这两者...      4      1
1  太现实不是女人的错，不过年老色衰、中年危机了就不要自以为是，幻想重新寻找当年一口拒绝了的、虽...      4      1
2                               跑吧，我们无力对抗，但也不能让他们得逞。      5      1
3  我在同样变态的师傅手下呆了三年，祖宗十八代被骂了个遍，没空吃饭上厕所睡觉交朋友谈恋爱，脊椎侧...      5      1
4                    还可以，是比较好的电影，但是又觉得和吕克贝松的巅峰状态差了好多      4      1


### 数据预处理

1. 分词
2. 构建词汇表 ： 将每个词映射到一个唯一的数字索引
3. 构造 Dataset 和 DataLoader

In [4]:
# 读取停用词表
with open('../datasets/chinese_movie_reviews/stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.readlines()]

# 移除数字
def remove_digit(text):
     return re.sub(r'\d+', '', text)

# 分词处理
def tokenize(text):
    return " ".join(jieba.cut(text))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stopwords])

# 应用预处理
def process_row(text):
    text = remove_digit(text)
    text = re.sub(r"[^\u4e00-\u9fa5]", "", text)  # 只保留汉字字符
    text = tokenize(text)
    text = remove_stopwords(text)
    return text

df["content"] = df["content"].apply(process_row) # 作者在这里大概用了2分钟

# 计算每条文本的长度
sentence_lengths = df["content"].apply(lambda x: len(x.split()))  # 计算每条文本的词数（已经分词）
# 计算最大长度和平均长度
max_length = sentence_lengths.max()
avg_length = sentence_lengths.mean()
print(f"最大文本长度：{max_length}")
print(f"平均文本长度：{avg_length:.2f}")

df.head()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\PC\AppData\Local\Temp\jieba.cache
Loading model cost 0.294 seconds.
Prefix dict has been built successfully.


最大文本长度：166
平均文本长度：21.79


Unnamed: 0,content,stars,label
0,相信 真正 纯正 爱情 能 产生 纾解 死亡 阶段 所有 懦弱 出自于 爱 或 爱 得 彻底...,4,1
1,太 现实 不是 女人 错 年老色衰 中年 危机 不要 自以为是 幻想 重新 寻找 当年 一口...,4,1
2,跑 我们 无力 对抗 但 不能 得逞,5,1
3,同样 变态 师傅 手下 呆 三年 祖宗 十八代 骂 个 遍 没空 吃饭 厕所 睡觉 交朋友 ...,5,1
4,还 比较 好 觉得 吕克贝 松 巅峰状态 差 好多,4,1


In [5]:
from torchtext.vocab import build_vocab_from_iterator

# 1. 分词
def yield_tokens(text_iter):
    for text in text_iter:
        yield text.split()  # 返回分词后的结果

# 2. 创建基础词汇表
vocab = build_vocab_from_iterator(yield_tokens(df["content"]))

# 输出词汇表的大小
print(f"词汇表大小: {len(vocab)}")

# 查看词汇表中的前 10 个词汇及其索引
for i, (word, idx) in enumerate(vocab.stoi.items()):
    if i >= 10:  # 只查看前 10 个
        break
    print(f"{word}: {idx}")

260386lines [00:01, 156910.16lines/s]


词汇表大小: 209859
<unk>: 0
<pad>: 1
看: 2
人: 3
但: 4
好: 5
这: 6
还是: 7
还: 8
啊: 9


In [6]:
MAX_LENGTH = 32  # 可以自定义最大长度

def pad_truncate(tokens):
    if len(tokens) > MAX_LENGTH:
        return tokens[:MAX_LENGTH]  # 如果超过最大长度，裁剪
    else:
        return tokens + [vocab["<pad>"]] * (MAX_LENGTH - len(tokens))  # 不足则填充

def text_pipeline(text):
    tokens = [vocab[token] for token in text]  # 通过词汇表将词语转换为索引
    return pad_truncate(tokens)

from torch.utils.data import Dataset, DataLoader

# 定义数据集类
class TextDataset(Dataset):
    def __init__(self, texts, labels, text_pipeline):
        self.texts = texts
        self.labels = labels
        self.text_pipeline = text_pipeline

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        return torch.tensor(self.text_pipeline(text)), torch.tensor(label)

# 将数据转换为训练集和测试集
from sklearn.model_selection import train_test_split

# 按 99% 训练集和 1% 测试集来划分(数据集比较大即使是1%也有2w6千条数据
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["content"], df["label"], test_size=0.10, random_state=42, stratify=df["label"]) # stratify=df["label"] 使得训练集和测试集中的标签分布是均匀

# 创建训练集和测试集的 Dataset
train_dataset = TextDataset(train_texts, train_labels, text_pipeline)
test_dataset = TextDataset(test_texts, test_labels, text_pipeline)

# 创建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

### 定义模型结构

In [7]:
import torch.nn as nn
import torch.optim as optim

# 定义模型架构，使用 Transformer 模型
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_encoder_layers, hidden_dim, output_dim, max_len=MAX_LENGTH):
        super(TransformerModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.position_encoding = nn.Parameter(torch.zeros(1, max_len, embed_size))
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads, dim_feedforward=hidden_dim),
            num_layers=num_encoder_layers
        )
        self.fc = nn.Linear(embed_size, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        seq_len = x.size(1)
        x = x + self.position_encoding[:, :seq_len, :]
        x = self.transformer(x)
        x = x.mean(dim=1)  # 聚合所有位置的输出
        x = self.fc(x)
        return x

# 模型初始化
vocab_size = len(vocab)
embed_size = 128
num_heads = 8
num_encoder_layers = 6
hidden_dim = 512
output_dim = 2

model = TransformerModel(vocab_size, embed_size, num_heads, num_encoder_layers, hidden_dim, output_dim)
model = model.to(device)

# 查看模型结构
# 打印模型参数总数和可训练参数总数
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())  # 所有参数数量
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)  # 需要训练的参数数量
    print(f"模型总参数数量: {total_params:,}")
    print(f"模型可训练参数数量: {trainable_params:,}")

print(model)
count_parameters(model)

TransformerModel(
  (embedding): Embedding(209859, 128)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=2, bias=True)
)
模型总参数数量: 28,055,938
模型可训练参数数量: 28,055,938




In [None]:
# 损失函数与优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# from torch.optim.lr_scheduler import StepLR
# # 设置学习率调度器， 每 5 个 epoch 调整一次学习率
# scheduler = StepLR(optimizer, step_size=2, gamma=0.2)

# 训练与验证
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        model.train()
        total_loss = 0
        total_acc = 0
        
        progress_bar = tqdm(train_loader, desc="Training", leave=False)
        for texts, labels in progress_bar:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            total_acc += (preds == labels).sum().item()
            
            # 更新进度条描述
            progress_bar.set_postfix(loss=loss.item())

        # # 更新学习率
        # scheduler.step()  # 每个 epoch 后更新一次学习率
        # print(f"当前学习率: {optimizer.param_groups[0]['lr']}")

        
        train_loss = total_loss / len(train_loader)
        train_acc = total_acc / len(train_loader.dataset)
        
        # 验证
        model.eval()
        val_loss, val_acc = evaluate_model(val_loader, model, criterion)
        
        print(f'Epoch {epoch+1}/{epochs} | '
      f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f} | '
      f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')

        
def evaluate_model(loader, model, criterion):
    total_loss = 0
    total_acc = 0
    with torch.no_grad():
        progress_bar = tqdm(loader, desc="Evaluating", leave=False)
        for texts, labels in progress_bar:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            total_acc += (preds == labels).sum().item()
            
            # 更新进度条描述
            progress_bar.set_postfix(loss=loss.item())
    
    return total_loss / len(loader), total_acc / len(loader.dataset)

# 开始训练
train_model(model, train_loader, test_loader, criterion, optimizer, epochs=10)


Epoch 1/10


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
                                                                                                                       

Epoch 1/10 | Train Loss: 0.6943, Train Accuracy: 0.5009 | Validation Loss: 0.6932, Validation Accuracy: 0.5000
Epoch 2/10


                                                                                                                       

Epoch 2/10 | Train Loss: 0.6933, Train Accuracy: 0.5020 | Validation Loss: 0.6932, Validation Accuracy: 0.5000
Epoch 3/10


                                                                                                                       

Epoch 3/10 | Train Loss: 0.6933, Train Accuracy: 0.5004 | Validation Loss: 0.6933, Validation Accuracy: 0.5000
Epoch 4/10


                                                                                                                       

Epoch 4/10 | Train Loss: 0.6933, Train Accuracy: 0.4992 | Validation Loss: 0.6932, Validation Accuracy: 0.5000
Epoch 5/10


                                                                                                                       

Epoch 5/10 | Train Loss: 0.6933, Train Accuracy: 0.5003 | Validation Loss: 0.6933, Validation Accuracy: 0.5000
Epoch 6/10


Training:  15%|████████▋                                               | 1133/7324 [00:33<03:00, 34.39it/s, loss=0.694]