<a href="https://colab.research.google.com/github/qiuyuzhai/LLM-/blob/main/bert_mrpc_lora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import numpy as np
import torch
import evaluate
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

In [3]:
# Load the MRPC configuration of the GLUE benchmark (train / validation / test splits).
dataset = load_dataset(path="glue", name="mrpc")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

mrpc/train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

mrpc/validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

mrpc/test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [4]:
# Load the evaluation metric that belongs to the GLUE MRPC configuration.
metric = evaluate.load(path="glue", config_name="mrpc")


Downloading builder script: 0.00B [00:00, ?B/s]

In [6]:
# Load the pretrained checkpoint together with its matching tokenizer.
# The classification head is created for two labels.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    local_files_only=False,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, local_files_only=False, num_labels=2
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# LoRA fine-tuning hyper-parameters.
lora_config = LoraConfig(
    task_type="SEQ_CLS",              # task type: sequence classification
    target_modules=["query", "key"],  # Q/K projections of the BERT attention layers
    r=8,                              # rank of the low-rank update matrices
    lora_alpha=16,                    # scaling factor (commonly chosen as 2 * r)
    lora_dropout=0.05,                # dropout probability on the LoRA branch
    bias="none",                      # bias terms are not trained
)

In [9]:
# Freeze every parameter of the original model so that only the LoRA
# matrices added afterwards are meant to be trained.
for base_param in model.parameters():
    base_param.requires_grad_(False)

In [10]:
# Wrap the model with the LoRA configuration defined above.
# NOTE(review): this rebinds `model`, so re-running this cell alone would wrap
# an already-wrapped model; re-run from the model-loading cell instead.
model = get_peft_model(model=model, peft_config=lora_config)

In [11]:
# Print the number of trainable parameters, the total, and the trainable percentage.
model.print_trainable_parameters()

trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700


In [18]:
# Preprocessing function applied to the dataset.
def tokenize_function(examples):
    """Tokenize the sentence pairs of a batch.

    Args:
        examples: mapping that provides the "sentence1" and "sentence2" entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, with
        truncation enabled and a maximum length of 128.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    encoding_options = {"truncation": True, "max_length": 128}
    return tokenizer(first_sentences, second_sentences, **encoding_options)

In [19]:
# Turn the raw text into the tensors the model trains on: tokenize, drop the
# columns the model does not consume, and rename "label" to "labels".
tokenized_dataset = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# Expose only the model inputs and labels, formatted as torch tensors.
model_columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]
tokenized_dataset.set_format("torch", columns=model_columns)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [13]:
# Dynamic padding: every sample in a batch is padded to a common length.
# padding=True is the collator's default, spelled out here for clarity.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [14]:
def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one
            row of scores per sample.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row.
    """
    logits, references = eval_pred
    # For each row, the index of the largest score is the predicted class.
    predicted_classes = np.asarray(logits).argmax(axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [16]:
# Hyper-parameters for the first LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./bert-mrpc-lora-results",
    learning_rate=2e-4,  # LoRA fine-tuning uses a higher learning rate than full fine-tuning
    per_device_train_batch_size=32,  # low memory footprint allows a larger batch
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Opt-in WandB logging: active only when `wandb` has been imported into the
    # notebook namespace. globals() names that namespace explicitly (at cell
    # top level it is the same mapping the previous locals() call returned).
    report_to="wandb" if "wandb" in globals() else "none",
    logging_dir="./lora-logs",
    logging_steps=100,
    # Mixed precision saves memory but needs a CUDA device; fall back to full
    # precision on CPU-only runtimes instead of failing at construction time.
    fp16=torch.cuda.is_available(),
)

In [20]:
# Build the Trainer for the first LoRA fine-tuning run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated and emits a FutureWarning on this call;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [21]:
# Run training; per-epoch evaluation and checkpointing follow training_args.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6368,0.594706,0.683824,0.812227
2,0.5712,0.540824,0.740196,0.823333
3,0.5395,0.533028,0.75,0.831683


TrainOutput(global_step=345, training_loss=0.5736736187036486, metrics={'train_runtime': 67.0922, 'train_samples_per_second': 164.013, 'train_steps_per_second': 5.142, 'total_flos': 452228003940480.0, 'train_loss': 0.5736736187036486, 'epoch': 3.0})

In [22]:
# Run the trained model on the validation split; keep the raw scores and the
# reference labels, then take the arg-max class per sample.
predictions, labels = trainer.predict(tokenized_dataset["validation"])[:2]
predicted_labels = predictions.argmax(axis=1)

In [23]:
# Show predictions next to the true labels for the first 20 validation samples
# (the MRPC validation split has 408 samples).
print("\n预测结果与真实标签对比（前20个样本）：")
for sample_no in range(1, 21):
    guess = predicted_labels[sample_no - 1]
    truth = labels[sample_no - 1]
    print(f"样本 {sample_no}: 预测={guess}, 真实={truth}")


预测结果与真实标签对比（前20个样本）：
样本 1: 预测=1, 真实=1
样本 2: 预测=1, 真实=0
样本 3: 预测=0, 真实=0
样本 4: 预测=1, 真实=1
样本 5: 预测=1, 真实=0
样本 6: 预测=1, 真实=1
样本 7: 预测=1, 真实=0
样本 8: 预测=1, 真实=1
样本 9: 预测=1, 真实=1
样本 10: 预测=1, 真实=1
样本 11: 预测=1, 真实=1
样本 12: 预测=1, 真实=0
样本 13: 预测=0, 真实=0
样本 14: 预测=1, 真实=1
样本 15: 预测=1, 真实=1
样本 16: 预测=1, 真实=1
样本 17: 预测=1, 真实=1
样本 18: 预测=0, 真实=0
样本 19: 预测=1, 真实=1
样本 20: 预测=0, 真实=0


In [24]:
# Compute and print the final validation metrics.
final_metrics = metric.compute(references=labels, predictions=predicted_labels)
print("\n最终评估指标：")
for title, key in (("准确率", "accuracy"), ("F1值", "f1")):
    print(f"{title}: {final_metrics[key]:.4f}")


最终评估指标：
准确率: 0.7500
F1值: 0.8317


In [None]:
# Continue training from the LoRA adapter produced by the first run.
#
# The checkpoint path is taken from the finished run instead of a hard-coded
# step number: the first run ended at global_step=345, so a "checkpoint-690"
# directory is never written.
resume_checkpoint = trainer.state.best_model_checkpoint
if resume_checkpoint is None:
    raise RuntimeError(
        "No checkpoint recorded by the first training run; run trainer.train() first."
    )

# Rebuild the base model named in the adapter config, then attach the adapter.
peft_config = PeftConfig.from_pretrained(resume_checkpoint)
base_model = AutoModelForSequenceClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=2,
    device_map="auto"
)
# is_trainable=True is required here: by default the adapter is loaded frozen
# (inference only), which would leave nothing for the optimizer to update.
lora_model = PeftModel.from_pretrained(base_model, resume_checkpoint, is_trainable=True)

# Arguments for the additional epochs.
# `resume_from_checkpoint` is intentionally not set: as a TrainingArguments
# field it is not consumed by Trainer, and the adapter weights are already
# restored above. NOTE(review): a full state resume via
# train(resume_from_checkpoint=...) would also restore the old step counter,
# which presumably conflicts with "2 extra epochs" - confirm before using it.
resume_training_args = TrainingArguments(
    output_dir="./bert-mrpc-lora-resume",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,  # two additional epochs
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Same opt-in reporting rule as the first run.
    report_to="wandb" if "wandb" in globals() else "none",
    logging_dir="./lora-resume-logs",
    logging_steps=100,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

# Trainer for the continued run.
resume_trainer = Trainer(
    model=lora_model,
    args=resume_training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Continue training.
resume_trainer.train()

In [None]:
# Model selection and evaluation: load the best LoRA checkpoint of the
# continued run. The path comes from the trainer state rather than a guessed
# step number.
best_checkpoint = resume_trainer.state.best_model_checkpoint
if best_checkpoint is None:
    raise RuntimeError(
        "No best checkpoint recorded; run resume_trainer.train() first."
    )

# Load a fresh base model: the previous `base_model` object already had an
# adapter attached to it, and attaching a second one on top of it is avoided here.
base_model = AutoModelForSequenceClassification.from_pretrained(
    PeftConfig.from_pretrained(best_checkpoint).base_model_name_or_path,
    num_labels=2,
)
best_lora_model = PeftModel.from_pretrained(base_model, best_checkpoint)

# Evaluation-only Trainer with explicit arguments, so the output directory and
# the reporting backend do not depend on library defaults.
eval_trainer = Trainer(
    model=best_lora_model,
    args=TrainingArguments(
        output_dir="./bert-mrpc-lora-eval",
        per_device_eval_batch_size=32,
        report_to="none",
    ),
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Evaluate on the validation split.
eval_results = eval_trainer.evaluate()
print("\n最优模型验证集评估结果：")
print(f"准确率：{eval_results['eval_accuracy']:.4f}，F1值：{eval_results['eval_f1']:.4f}")

In [None]:
# Inference on a single sentence pair.

text1 = "The cat sits on the mat."
text2 = "The cat is sitting on the mat."

# Place the inputs on the device of the model that is actually called, and
# tokenize with the same truncation settings used for the training data.
model_device = next(best_lora_model.parameters()).device
inputs = tokenizer(
    text1,
    text2,
    truncation=True,
    max_length=128,
    return_tensors="pt",
).to(model_device)

# Forward pass in eval mode without gradient tracking.
best_lora_model.eval()
with torch.no_grad():
    outputs = best_lora_model(**inputs)
    logits = outputs.logits

pred = torch.argmax(logits, dim=1).item()  # 0 = not equivalent, 1 = equivalent
print("\n推理结果：")
print(f"预测标签：{pred}（1表示语义相似）")

In [None]:
# Further fine-tuning demonstration. The "new" data used here are simply the
# first 1000 training samples and the first 200 validation samples of MRPC.
new_train_dataset = tokenized_dataset["train"].select(range(1000))
new_eval_dataset = tokenized_dataset["validation"].select(range(200))

# `best_lora_model` was loaded for inference, which leaves the adapter frozen.
# Re-enable gradients on the adapter weights before training; this is a no-op
# if they are already trainable.
# NOTE(review): selects parameters by PEFT's naming ("lora_" for the LoRA
# matrices, "modules_to_save" for the classifier copy) - confirm against the
# installed PEFT version.
for param_name, param in best_lora_model.named_parameters():
    if "lora_" in param_name or "modules_to_save" in param_name:
        param.requires_grad = True

# Arguments for the additional fine-tuning run.
new_training_args = TrainingArguments(
    output_dir="./bert-mrpc-lora-new-finetune",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Same opt-in reporting rule as the first run.
    report_to="wandb" if "wandb" in globals() else "none",
    logging_dir="./lora-new-finetune-logs",
    logging_steps=50,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

# Trainer for the additional fine-tuning run.
new_finetune_trainer = Trainer(
    model=best_lora_model,
    args=new_training_args,
    train_dataset=new_train_dataset,
    eval_dataset=new_eval_dataset,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning on the subset.
new_finetune_trainer.train()

# Report the evaluation results after the additional fine-tuning.
new_eval_results = new_finetune_trainer.evaluate()
print("\n新数据集微调后评估结果：")
print(f"准确率：{new_eval_results['eval_accuracy']:.4f}，F1值：{new_eval_results['eval_f1']:.4f}")