In [1]:
# Cell 1: 准备环境
import os
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

# Pick the best available backend: CUDA GPU first, then Apple-silicon MPS,
# then plain CPU. The MPS backend attribute may be missing on older torch
# builds, so it is looked up defensively instead of being dereferenced directly.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("当前设备:", device)

# Label <-> id mapping, kept identical to the 03 / 04 notebooks.
label_map = {'bad': 0, 'neutral': 1, 'good': 2}
# Inverse mapping (id -> label name) used to turn predictions back into names.
id2label = {v: k for k, v in label_map.items()}
print("标签映射:", label_map)


  Referenced from: <0B7EB158-53DC-3403-8A49-22178CAB4612> /opt/anaconda3/envs/gpt_senti/lib/python3.10/site-packages/torchvision/image.so
  warn(


当前设备: mps
标签映射: {'bad': 0, 'neutral': 1, 'good': 2}


In [2]:
# Cell 2: load processed_data.csv

filename = 'processed_data.csv'

# Probe both relative locations (../data first, then data) and keep the first
# one that exists; the search order matches the original if/elif chain.
data_path = next(
    (
        candidate
        for candidate in (
            os.path.join('..', 'data', filename),
            os.path.join('data', filename),
        )
        if os.path.exists(candidate)
    ),
    None,
)
if data_path is None:
    raise FileNotFoundError("❌ 找不到 processed_data.csv，请确认它在 data/ 或 ../data/ 之下。")

print("数据路径:", data_path)

df = pd.read_csv(data_path)
print("总样本数:", len(df))
print("列名:", df.columns.tolist())

# Same cleaning step as used for training: drop rows with a missing text or label.
df = df.dropna(subset=['cleaned_text', 'labels'])

# Build the integer label column from label_map when the CSV does not carry one.
if 'label_id' not in df.columns:
    df['label_id'] = df['labels'].map(label_map)

# Series.map leaves NaN for every label that is not a key of label_map. Such rows
# would otherwise only blow up later, when the dataset converts the label with
# int(); report them here with the offending label values instead.
unmapped_labels = df.loc[df['label_id'].isna(), 'labels'].unique().tolist()
if unmapped_labels:
    raise ValueError(f"❌ 以下标签无法映射为 label_id: {unmapped_labels}")

print("\n前 3 行预览：")
display(df[['cleaned_text', 'labels', 'label_id']].head(3))


数据路径: ../data/processed_data.csv
总样本数: 219294
列名: ['Unnamed: 0', 'tweets', 'labels', 'cleaned_text']

前 3 行预览：


Unnamed: 0,cleaned_text,labels,label_id
0,chatgpt optimizing language models for dialogue,neutral,1
1,try talking with chatgpt our new ai system whi...,good,2
2,chatgpt optimizing language models for dialogu...,neutral,1


In [3]:
# Cell 3: train/test split, kept identical to 04_BERT_Finetune.ipynb
# (same columns, same test fraction, same seed).
_split_inputs = (df['cleaned_text'], df['label_id'])
X_train, X_test, y_train, y_test = train_test_split(
    *_split_inputs, test_size=0.2, random_state=42
)

print("训练集:", len(X_train), "测试集:", len(X_test))

class SentimentDataset(Dataset):
    """Dataset that tokenizes a single text each time an item is requested.

    Args:
        texts: iterable of texts; copied into a list on construction.
        labels: iterable of labels, one per text; copied into a list.
        tokenizer: callable invoked as ``tokenizer(text, **options)`` whose
            result is indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_len: value forwarded to the tokenizer as ``max_length``.

    Each item is a dict with the keys ``'input_ids'``, ``'attention_mask'``
    (both flattened to 1-D) and ``'labels'`` (a scalar ``torch.long`` tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Copy into plain lists so positional indexing in __getitem__ does not
        # depend on whatever index the incoming containers carry.
        self.texts, self.labels = list(texts), list(labels)
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and pair it with its label."""
        raw_text = str(self.texts[idx])
        label_id = int(self.labels[idx])

        tokenizer_options = dict(
            max_length=self.max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # The tokenizer is asked for 'pt' tensors; collapse them to 1-D so the
        # DataLoader's default collation stacks items along a new batch axis.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label_id, dtype=torch.long)
        return sample

# Load the tokenizer & model from ./model_save (same location app.py uses).
model_path = "./model_save"

# Check the directory up front, mirroring the explicit FileNotFoundError used
# for the data file, so a missing directory is reported directly instead of
# surfacing as a less obvious from_pretrained loading error.
if not os.path.isdir(model_path):
    raise FileNotFoundError(f"❌ 找不到模型目录 {model_path}，请确认模型已保存到该目录。")

tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()  # inference mode for the layers that behave differently in training

# shuffle=False keeps the loader in dataset order, which the inference cell
# relies on to line predictions up with the rows of X_test.
test_dataset = SentimentDataset(X_test, y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print("✅ DataLoader 构建完成，测试批次数:", len(test_loader))


训练集: 175428 测试集: 43858
✅ DataLoader 构建完成，测试批次数: 1371


In [4]:
# Cell 4: run inference and collect the predictions
import torch.nn.functional as F

# Texts are paired with predictions purely by position, so the loader has to
# walk the dataset in order. Refuse to continue if it was built with shuffling,
# rather than silently producing misaligned rows.
if not isinstance(test_loader.sampler, torch.utils.data.SequentialSampler):
    raise RuntimeError("❌ test_loader 必须按顺序遍历（shuffle=False），否则文本与预测结果无法对齐。")

all_true = []
all_pred = []
all_probs = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        probs = F.softmax(outputs.logits, dim=-1)  # [batch_size, 3]
        preds = torch.argmax(probs, dim=-1)

        # The labels are only collected, never fed to the model, so they are
        # read straight from the batch without a round trip through the device.
        all_true.extend(batch['labels'].numpy())
        all_pred.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())

# Keep the raw texts for error analysis. The loader is sequential, so the i-th
# prediction belongs to the i-th row of X_test; no per-batch slicing is needed.
all_texts = X_test.tolist()

y_true = np.array(all_true)
y_pred = np.array(all_pred)
y_prob = np.array(all_probs)  # shape: [N, 3]

# Guard the positional pairing used when these arrays are put side by side.
assert len(all_texts) == len(y_true) == len(y_pred) == len(y_prob), (
    "texts and predictions have different lengths"
)

print("y_true shape:", y_true.shape)
print("y_pred shape:", y_pred.shape)
print("y_prob shape:", y_prob.shape)


y_true shape: (43858,)
y_pred shape: (43858,)
y_prob shape: (43858, 3)


In [5]:
# Cell 5: assemble eval_df, the table used for the later error analysis

eval_df = (
    pd.DataFrame({
        'text': all_texts,
        'true_id': y_true,
        'pred_id': y_pred
    })
    .assign(
        # Human-readable names for the true and predicted class ids.
        true_label=lambda frame: frame['true_id'].map(id2label),
        pred_label=lambda frame: frame['pred_id'].map(id2label),
        # Highest class probability per sample.
        max_prob=y_prob.max(axis=1),
        # Whitespace-token count of each text (used later for length buckets).
        text_len=lambda frame: frame['text'].map(lambda value: len(str(value).split())),
    )
)

print("eval_df 预览：")
display(eval_df.head())

# All later error analysis works from eval_df.


eval_df 预览：


Unnamed: 0,text,true_id,pred_id,true_label,pred_label,max_prob,text_len
0,i asked chatgpt to define productivity and to ...,2,2,good,good,0.978996,32
1,the ps5 is an elizabethan marvel,1,1,neutral,neutral,0.988305,6
2,just tried chatgpt to help write some blog pos...,1,1,neutral,neutral,0.897813,31
3,im onboard with the thought that is going to o...,1,1,neutral,neutral,0.99085,18
4,heres a question for the garbageingarbageout f...,0,0,bad,bad,0.979215,25
