In [3]:
!pip install torch transformers pandas scikit-learn

Collecting torch
  Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecti

In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import random
import time

# 设定随机种子，确保结果可重复
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

print("所有依赖导入完成，随机种子已设置。")


所有依赖导入完成，随机种子已设置。


In [5]:
# 请确保 Tweets.csv 文件已上传到当前工作目录
csv_path = "Tweets.csv"  # 根据实际情况修改文件路径
df = pd.read_csv(csv_path)

# 查看数据示例
print("数据示例：")
print(df.head())

# 删除缺失 tweet 文本的行（假设 tweet 文本存储在 'text' 列中）
df = df.dropna(subset=['text', 'airline_sentiment'])

# 将情感标签转换为数字
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['airline_sentiment'])
print("标签映射：", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

print("数据加载与预处理完成。")


数据示例：
             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN  

In [6]:
class TweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        tweet = str(self.data.iloc[index]['text'])
        label = int(self.data.iloc[index]['label'])
        
        # 使用BERT tokenizer进行编码
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'tweet_text': tweet,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

print("TweetDataset 定义完成。")


TweetDataset 定义完成。


In [7]:
# 初始化 BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 创建数据集
dataset = TweetDataset(df, tokenizer, max_len=128)

# 拆分数据集为训练集和验证集（80% - 20% 拆分）
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# 创建 DataLoader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

print(f"数据集拆分完成：训练集样本 {train_size}, 验证集样本 {val_size}")


Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

数据集拆分完成：训练集样本 11712, 验证集样本 2928


In [8]:
# 根据情感标签数初始化分类模型
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# 检查 GPU 是否可用
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# 设置优化器和学习率调度器
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
epochs = 3  # 根据需要调整 epoch 数
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # 根据需要调整 warmup 步数
    num_training_steps=total_steps
)

print("模型初始化，优化器和调度器设定完成。")


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

模型初始化，优化器和调度器设定完成。




In [10]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    losses = []
    correct_predictions = 0
    
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    
    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(model, data_loader, device):
    model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses), all_preds, all_labels

print("训练和验证函数定义完成。")


训练和验证函数定义完成。


In [11]:
history = {'train_acc': [], 'train_loss': [], 'val_acc': [], 'val_loss': []}
best_accuracy = 0

print("开始训练模型...")

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    start_time = time.time()
    
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_acc, val_loss, y_pred, y_true = eval_model(model, val_loader, device)
    
    end_time = time.time()
    
    history['train_acc'].append(train_acc.item())
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc.item())
    history['val_loss'].append(val_loss)
    
    print(f'Train loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}')
    print(f'Val loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}')
    print(f'Epoch 用时: {end_time - start_time:.0f} 秒\n')
    
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        # 保存模型状态
        torch.save(model.state_dict(), 'best_model_state.bin')
        print("最佳模型已保存。")

print("模型训练完成。")


开始训练模型...
Epoch 1/3
Train loss: 0.4883, Accuracy: 0.8101
Val loss: 0.3907, Accuracy: 0.8470
Epoch 用时: 2320 秒

最佳模型已保存。
Epoch 2/3
Train loss: 0.2691, Accuracy: 0.9057
Val loss: 0.4443, Accuracy: 0.8542
Epoch 用时: 2695 秒

最佳模型已保存。
Epoch 3/3
Train loss: 0.1610, Accuracy: 0.9517
Val loss: 0.5363, Accuracy: 0.8535
Epoch 用时: 13158 秒

模型训练完成。


In [12]:
print("分类报告：")
report = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print(report)


分类报告：
              precision    recall  f1-score   support

    negative       0.90      0.92      0.91      1825
     neutral       0.75      0.67      0.71       638
    positive       0.79      0.83      0.81       465

    accuracy                           0.85      2928
   macro avg       0.81      0.81      0.81      2928
weighted avg       0.85      0.85      0.85      2928

