<a href="https://colab.research.google.com/github/sawyersong2/playeverything/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第一步：安装/升级必要的依赖库

In [None]:
!pip install -q transformers datasets evaluate torch

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# 第二步：导入所需的核心库

In [None]:
import torch
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# 第三步：设置设备（自动检测 GPU/CPU）

In [None]:
# Use the CUDA device when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"当前使用设备：{device}")

# Only query the GPU name when the CUDA device was actually selected.
if device.type == "cuda":
    print(f"GPU 型号：{torch.cuda.get_device_name(0)}")

当前使用设备：cuda
GPU 型号：Tesla T4


# 第四步：加载数据集和评估指标

In [None]:
# Fetch the IMDB dataset and the accuracy metric by name from the Hugging Face Hub.
dataset = load_dataset(path="imdb")
accuracy_metric = evaluate.load(path="accuracy")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

# 第五步：加载 BERT 预训练模型和分词器

In [None]:
# Tokenizer and classifier both come from the same "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Only the number of labels is specified; `problem_type` is deliberately left
# unset so that it does not interfere with the loss computation.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 第六步：数据预处理函数

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are truncated to at most 512 tokens. No padding is requested here;
    padding is applied dynamically per batch by ``DataCollatorWithPadding``.

    Args:
        examples: A batch (mapping) that contains a ``"text"`` entry.

    Returns:
        The output of calling the module-level ``tokenizer`` on the texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

# Tokenize every split of the dataset in batches, dropping the raw "text" column,
# then have the result return PyTorch tensors.
# NOTE(review): this also tokenizes the "unsupervised" split, which the visible
# cells never use — confirm whether it is needed.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_datasets = tokenized_datasets.with_format("torch")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

# 第七步：创建数据整理器

In [None]:
# Collator that pads each batch to its longest sequence (`padding=True` is the
# collator's default, spelled out here to make the dynamic padding explicit).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# 第八步：定义评估函数

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max predictions
        against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(raw_scores, axis=-1)
    result = accuracy_metric.compute(predictions=predicted_ids, references=gold_labels)
    return result

# 第九步：设置训练参数

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./bert-imdb-sentiment",
    logging_dir="./logs",
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Mixed precision is enabled only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Log every 100 steps; no external reporting integration.
    logging_steps=100,
    report_to="none",
    # Evaluate and save once per epoch, then reload the checkpoint with the best
    # accuracy. `eval_strategy` is the current name of this option; it replaced
    # the older `evaluation_strategy` keyword.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# 第十步：创建 Trainer 实例

In [None]:
# Wire the model, training configuration, datasets and helpers into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# `Trainer.__init__` is deprecated (it emits a FutureWarning) and is slated for
# removal in transformers 5.0. `processing_class` requires transformers >= 4.46.
# NOTE(review): the test split is used as the per-epoch evaluation set, and
# `load_best_model_at_end` picks the checkpoint based on it, so accuracy later
# reported on that same split is optimistically biased — consider holding out a
# validation subset of the training split for checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


# 第十一步：开始训练模型

In [None]:
# Print the start banner (message between two 50-character rules), then fine-tune.
# The call stays as the cell's last expression so its return value is displayed.
print("=" * 50, "开始微调 BERT 模型...", "=" * 50, sep="\n")
trainer.train()

开始微调 BERT 模型...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2045,0.196244,0.92648
2,0.1356,0.208957,0.93904
3,0.0756,0.261818,0.94112


TrainOutput(global_step=4689, training_loss=0.15372916858418456, metrics={'train_runtime': 2468.3432, 'train_samples_per_second': 30.385, 'train_steps_per_second': 1.9, 'total_flos': 1.953368130541488e+16, 'train_loss': 0.15372916858418456, 'epoch': 3.0})

# 第十二步：在测试集上进行最终评估

In [None]:
# Print the banner, evaluate on the test split, and report accuracy and loss
# to four decimal places.
print("=" * 50, "训练完成，开始在测试集上评估最终模型...", "=" * 50, sep="\n")
final_eval_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(
    f"\n测试集最终准确率：{final_eval_results['eval_accuracy']:.4f}",
    f"测试集评估损失：{final_eval_results['eval_loss']:.4f}",
    sep="\n",
)

训练完成，开始在测试集上评估最终模型...



测试集最终准确率：0.9411
测试集评估损失：0.2618


# 第十三步：保存最终模型和分词器

In [None]:
# Print the banner, then write the model weights/config and the tokenizer files
# into the same directory so they can be reloaded together.
print("=" * 50, "保存最终模型和分词器...", "=" * 50, sep="\n")
model.save_pretrained(save_directory="./best-bert-imdb-sentiment")
tokenizer.save_pretrained(save_directory="./best-bert-imdb-sentiment")
print("模型保存完成！")

保存最终模型和分词器...
模型保存完成！


# 第十四步：示例推理

In [None]:
# Cache of (model, tokenizer) pairs keyed by checkpoint directory, so repeated
# predictions do not reload the weights from disk on every call.
# Re-running this cell resets the cache; do so after overwriting a checkpoint.
_SENTIMENT_MODEL_CACHE = {}


def _load_sentiment_model(model_dir):
    """Return the ``(model, tokenizer)`` pair saved in ``model_dir``, loading it at most once."""
    if model_dir not in _SENTIMENT_MODEL_CACHE:
        loaded_model = BertForSequenceClassification.from_pretrained(model_dir).to(device)
        loaded_model.eval()  # inference only: make sure dropout is disabled
        loaded_tokenizer = BertTokenizer.from_pretrained(model_dir)
        _SENTIMENT_MODEL_CACHE[model_dir] = (loaded_model, loaded_tokenizer)
    return _SENTIMENT_MODEL_CACHE[model_dir]


def predict_sentiment(text, model_dir="./best-bert-imdb-sentiment"):
    """Classify the sentiment of a single text with the saved fine-tuned model.

    Args:
        text: The text to classify (a single string).
        model_dir: Directory holding the saved model and tokenizer. Defaults to
            the directory this notebook saves to.

    Returns:
        A ``(sentiment, prediction)`` tuple, where ``prediction`` is the predicted
        class index and ``sentiment`` is "正面" when it is 1 and "负面" otherwise.
    """
    loaded_model, loaded_tokenizer = _load_sentiment_model(model_dir)

    # No padding is requested: the batch holds a single sequence, so padding it
    # up to 512 tokens would only add wasted computation.
    inputs = loaded_tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ).to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = loaded_model(**inputs).logits

    prediction = torch.argmax(logits, dim=-1).item()
    sentiment = "正面" if prediction == 1 else "负面"

    return sentiment, prediction

# 测试示例

In [None]:
# One clearly positive and one clearly negative review as smoke-test inputs.
test_text1 = "This movie is absolutely fantastic! The acting is brilliant and the plot is very engaging."
test_text2 = "I have never seen such a terrible movie. The story is boring and the actors are unconvincing."

# Run both texts through the classifier, in order.
(sentiment1, pred1), (sentiment2, pred2) = map(predict_sentiment, (test_text1, test_text2))

# Show each text followed by its predicted sentiment and label.
print(f"\n示例1文本：{test_text1}", f"预测情感：{sentiment1}，预测标签：{pred1}", sep="\n")
print(f"\n示例2文本：{test_text2}", f"预测情感：{sentiment2}，预测标签：{pred2}", sep="\n")


示例1文本：This movie is absolutely fantastic! The acting is brilliant and the plot is very engaging.
预测情感：正面，预测标签：1

示例2文本：I have never seen such a terrible movie. The story is boring and the actors are unconvincing.
预测情感：负面，预测标签：0
