<a href="https://colab.research.google.com/github/qiuyuzhai/LLM-/blob/main/bert_mrpc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

In [None]:
# Load the MRPC configuration of the GLUE benchmark.
dataset = load_dataset(path="glue", name="mrpc")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

mrpc/train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

mrpc/validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

mrpc/test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
# Load the GLUE evaluation metric for the MRPC configuration.
# Later cells read its "accuracy" and "f1" results.
metric = evaluate.load(path="glue", config_name="mrpc")


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Load the pretrained tokenizer and a sequence-classification model.
# Both come from the same checkpoint name.
model_name = "bert-base-uncased"

# Options shared by both from_pretrained calls.
hub_options = {"local_files_only": False}

tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_options)

# num_labels=2: the classification head has two output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, **hub_options
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Preprocessing function applied to the dataset via `map`.
def tokenize_function(examples):
    """Tokenize sentence pairs with the notebook-level `tokenizer`.

    Args:
        examples: Mapping that provides "sentence1" and "sentence2" entries.

    Returns:
        Whatever `tokenizer` returns for the pair, with truncation enabled
        and a maximum length of 128.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True, max_length=128)

In [None]:
# Turn the raw text into the format the model trains on:
#   1. tokenize in batches,
#   2. drop the raw text and index columns,
#   3. rename "label" to "labels".
tokenized_dataset = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)

# Return the listed columns as torch tensors.
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "labels"],
)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
# Padding collator: pads the samples of a batch so they all share one length.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Score model outputs with the notebook-level `metric`.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predictions are
            reduced with an argmax over axis 1.

    Returns:
        The result of ``metric.compute`` for the argmax predictions against
        the labels.
    """
    scores, references = eval_pred
    # For each row, take the index of its largest score as the predicted class.
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Report to Weights & Biases only when the name `wandb` is already bound in
# this namespace (for example after `import wandb`); otherwise disable reporting.
report_backend = "wandb" if "wandb" in locals() else "none"

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./bert-mrpc-results",
    logging_dir="./logs",
    logging_steps=100,
    report_to=report_backend,
    # Optimisation hyperparameters.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, then reload the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Build the Trainer from the model, the training arguments, the tokenized
# train/validation splits, the padding collator and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers a warning
    # when this cell runs); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5439,0.496722,0.789216,0.857616
2,0.395,0.456917,0.79902,0.862876
3,0.2645,0.509901,0.801471,0.864322


TrainOutput(global_step=690, training_loss=0.4214570446290832, metrics={'train_runtime': 216.3265, 'train_samples_per_second': 50.868, 'train_steps_per_second': 3.19, 'total_flos': 428577075854640.0, 'train_loss': 0.4214570446290832, 'epoch': 3.0})

In [None]:
# Run the trained model over the validation split.
# The result's fields are read by name instead of unpacking the unused third
# field into `_`, which would rebind IPython's last-output variable.
validation_output = trainer.predict(tokenized_dataset["validation"])
predictions = validation_output.predictions
labels = validation_output.label_ids

# Predicted class = index of the largest score in each row.
predicted_labels = np.argmax(predictions, axis=1)

In [None]:
# Print predicted vs. true labels for the first 20 validation samples
# (the MRPC validation split has 408 samples).
print("\n预测结果与真实标签对比（前20个样本）：")
for sample_no in range(1, 21):
    predicted = predicted_labels[sample_no - 1]
    actual = labels[sample_no - 1]
    print(f"样本 {sample_no}: 预测={predicted}, 真实={actual}")


预测结果与真实标签对比（前20个样本）：
样本 1: 预测=1, 真实=1
样本 2: 预测=0, 真实=0
样本 3: 预测=0, 真实=0
样本 4: 预测=1, 真实=1
样本 5: 预测=0, 真实=0
样本 6: 预测=1, 真实=1
样本 7: 预测=1, 真实=0
样本 8: 预测=1, 真实=1
样本 9: 预测=1, 真实=1
样本 10: 预测=1, 真实=1
样本 11: 预测=1, 真实=1
样本 12: 预测=1, 真实=0
样本 13: 预测=0, 真实=0
样本 14: 预测=1, 真实=1
样本 15: 预测=1, 真实=1
样本 16: 预测=1, 真实=1
样本 17: 预测=1, 真实=1
样本 18: 预测=0, 真实=0
样本 19: 预测=1, 真实=1
样本 20: 预测=0, 真实=0


In [None]:
# Compute the final metrics on the validation predictions and print them.
final_metrics = metric.compute(predictions=predicted_labels, references=labels)
accuracy = final_metrics['accuracy']
f1_score = final_metrics['f1']

print("\n最终评估指标：")
print(f"准确率: {accuracy:.4f}")
print(f"F1值: {f1_score:.4f}")


最终评估指标：
准确率: 0.7990
F1值: 0.8629


In [None]:
# Resume training from a saved checkpoint.
from transformers import TrainingArguments, Trainer, BertForSequenceClassification, BertTokenizer

# Checkpoint directory to continue from.
resume_checkpoint = "./bert-mrpc-results/checkpoint-230"

# Load the model and tokenizer. When train() resumes, the model, optimizer
# and scheduler states are restored from the checkpoint.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Training configuration. `resume_from_checkpoint` is intentionally NOT set
# here: Trainer does not consume that TrainingArguments field, so the path
# is handed to train() below instead.
training_args = TrainingArguments(
    output_dir="./bert-mrpc-results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
)

# Reuse the tokenized MRPC splits prepared earlier in the notebook, and pad
# each batch so samples of different lengths can be stacked.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Continue training from the checkpoint.
trainer.train(resume_from_checkpoint=resume_checkpoint)

In [None]:
# Model selection and evaluation: score a saved checkpoint on the validation split.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Load the model and tokenizer stored in the checkpoint.
checkpoint_dir = "./bert-mrpc-results/checkpoint-690"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# Evaluation-only configuration; report_to="none" disables external loggers,
# matching the first training run.
eval_args = TrainingArguments(
    output_dir="./bert-mrpc-eval",
    per_device_eval_batch_size=16,
    report_to="none",
)

# Evaluate on the tokenized MRPC validation split prepared earlier in the
# notebook. `processing_class=` replaces the deprecated `tokenizer=` argument.
trainer = Trainer(
    model=model,
    args=eval_args,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Evaluate on the validation set and print accuracy and F1.
eval_results = trainer.evaluate()
print(f"验证集准确率：{eval_results['eval_accuracy']:.4f}，F1值：{eval_results['eval_f1']:.4f}")


In [None]:
# Inference on a single sentence pair.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the model and tokenizer stored in the checkpoint.
checkpoint_dir = "./bert-mrpc-results/checkpoint-690"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# Input sentence pair.
text1 = "The cat sits on the mat."
text2 = "The cat is sitting on the mat."

# Tokenize and encode the pair as PyTorch tensors.
inputs = tokenizer(text1, text2, return_tensors="pt")

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
# Index of the largest logit: 0 = not similar, 1 = similar.
pred = logits.argmax(dim=1).item()
print(f"预测结果：{pred}（1表示语义相似）")

预测结果：1（1表示语义相似）


In [None]:
# Continue fine-tuning the trained checkpoint on a new dataset.
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer

# Load the model and tokenizer stored in the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("./bert-mrpc-results/checkpoint-690")
tokenizer = AutoTokenizer.from_pretrained("./bert-mrpc-results/checkpoint-690")

# NOTE: `new_train_dataset` and `new_eval_dataset` are placeholders that are
# not defined in the cells shown here. Create them (tokenized, with a
# "labels" column) before running this cell.
training_args = TrainingArguments(
    output_dir="./bert-mrpc-finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_train_dataset,
    eval_dataset=new_eval_dataset,
    # Pad each batch with the checkpoint's tokenizer so samples of different
    # lengths can be stacked into one tensor.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()