In [7]:
import re

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

In [2]:
# Select the compute device: use CUDA when it is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

### 数据预处理

In [6]:
# Load the dataset from a JSON Lines file (one record per line)
data_path = '../datasets/chinese_movie_reviews/chinese_movie_reviews_datasets.jsonl'
df = pd.read_json(data_path, lines=True, orient='records')

# Report the row count, then preview the first five rows
print(f'数据数量{df.shape[0]}')
print(df.head(5))

数据数量260386
                                             content  stars  label
0  “我相信真正纯正的爱情能产生一个纾解死亡的阶段，所有的懦弱都出自于没有爱或爱得不彻底，这两者...      4      1
1  太现实不是女人的错，不过年老色衰、中年危机了就不要自以为是，幻想重新寻找当年一口拒绝了的、虽...      4      1
2                               跑吧，我们无力对抗，但也不能让他们得逞。      5      1
3  我在同样变态的师傅手下呆了三年，祖宗十八代被骂了个遍，没空吃饭上厕所睡觉交朋友谈恋爱，脊椎侧...      5      1
4                    还可以，是比较好的电影，但是又觉得和吕克贝松的巅峰状态差了好多      4      1


In [None]:
# Text dataset wrapper used to feed tokenized samples to a DataLoader
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sized collection of raw texts, indexable by integer position.
        labels: Sized collection of integer class labels, indexable by
            integer position; must have the same length as ``texts``.
        tokenizer: Callable tokenizer that accepts the keyword arguments
            passed in ``__getitem__`` and returns a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors (presumably a
            Hugging Face tokenizer such as ``BertTokenizer`` -- confirm
            against the caller).
        max_length: Length each encoded sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail fast: __len__ is driven by the labels only, so a mismatch would
        # otherwise surface later as an IndexError or as silently unused texts.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize the sample at position ``idx``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs flattened to 1-D) and ``'labels'`` (a
            ``torch.long`` scalar tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is documented as
        # deprecated in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # flatten() yields 1-D tensors; with return_tensors='pt' the
            # tokenizer presumably returns a leading batch axis of size 1.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Split into train / test sets: 10% held out, stratified on the label so both
# splits keep the same class ratio; fixed random_state for a repeatable split.
X = df['content'].to_numpy()
y = df['label'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Tokenizer loaded from the pretrained 'bert-base-chinese' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Wrap each split so samples are tokenized when they are fetched
train_dataset = TextDataset(X_train, y_train, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

# Shuffle only the training batches; the test loader keeps the split's order
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
from collections import Counter

# Report how many samples each split contains
print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")

# Tally the occurrences of every label in each split
train_labels_counter = Counter()
train_labels_counter.update(y_train)
test_labels_counter = Counter()
test_labels_counter.update(y_test)

# Show the label distribution of both splits
print(f"训练集标签分布: {train_labels_counter}")
print(f"测试集标签分布: {test_labels_counter}")