In [1]:
from datasets import load_dataset
from peft import get_peft_model, LoraConfig
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer


In [2]:
# Hugging Face Hub id of the base checkpoint that is loaded and fine-tuned below.
# NOTE(review): the recorded `print(model)` output further down shows a hidden size
# of 896 and 24 decoder layers, which looks like a smaller Qwen2.5 checkpoint than
# 1.5B. The saved outputs are probably from an earlier run with a different
# model_name; restart the kernel and re-run to confirm.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# 加载模型和tokenizer

In [3]:
# Fetch the tokenizer and the causal-LM weights for the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [4]:
# Print the model to inspect its architecture (module tree and layer shapes).
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

# LoRA

In [4]:
# LoRA adapter settings: rank-8 adapters on the attention query and value
# projections only, with dropout 0.1 on the adapter path.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
)

# Wrap the base model exactly once. This cell rebinds `model` to its own wrapped
# version, so re-running it without the guard would wrap an already wrapped
# model again. To apply a changed `peft_config`, reload the base model first.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)

# 数据集

In [5]:
# Hub id of the preference dataset used for DPO.
dataset_id = "phimes/DPO-bad-boy-chinese-for-Qwen2.5-extended"
dataset = load_dataset(dataset_id)

# Print the dataset to show its splits, column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 1000
    })
})


In [6]:
# Pull out the two splits: one for training, one for evaluation during training.
# NOTE(review): the recorded training log further down ends at global_step=75
# after 3 epochs. That is far fewer steps than 4000 training rows would give at
# batch size 2 with 2 accumulation steps, so the saved outputs look like they
# come from a subsampled run. Confirm and re-run from a fresh kernel.
train_dataset, test_dataset = dataset["train"], dataset["test"]

# 训练配置  

In [10]:
dpo_args = DPOConfig(
    # Output locations.
    output_dir="./output",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,  # 2 x 2 = 4 samples per optimizer step per device
    # Evaluate and log every 10 steps.
    per_device_eval_batch_size=2,
    eval_strategy="steps",
    eval_steps=10,
    logging_steps=10,
)

In [11]:
# Build the DPO trainer around the LoRA-wrapped policy model.
# No separate reference model is passed (ref_model=None).
# NOTE(review): with a PEFT-wrapped model, TRL is documented to use the base
# weights with the adapter disabled as the reference. Confirm this for the
# installed TRL version.
trainer = DPOTrainer(
    args=dpo_args,
    model=model,
    ref_model=None,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
# Run DPO fine-tuning. The returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.5508,0.568415,0.168376,-0.103204,1.0,0.271579,-80.937843,-42.507504,-1.569431,-1.222084
20,0.5028,0.524791,0.237129,-0.142604,1.0,0.379734,-80.25029,-42.901512,-1.560557,-1.20734
30,0.4549,0.490688,0.292051,-0.178562,1.0,0.470613,-79.701088,-43.261089,-1.55717,-1.199275
40,0.4334,0.463423,0.33619,-0.211328,1.0,0.547518,-79.259689,-43.588749,-1.553078,-1.190691
50,0.377,0.444045,0.369415,-0.235567,1.0,0.604982,-78.927444,-43.831139,-1.550475,-1.184396
60,0.3799,0.430843,0.393244,-0.252267,1.0,0.645511,-78.689148,-43.998138,-1.549918,-1.181047
70,0.3495,0.424199,0.405532,-0.26079,1.0,0.666322,-78.566277,-44.083374,-1.54988,-1.17955


TrainOutput(global_step=75, training_loss=0.43000678380330404, metrics={'train_runtime': 26.2901, 'train_samples_per_second': 11.411, 'train_steps_per_second': 2.853, 'total_flos': 0.0, 'train_loss': 0.43000678380330404, 'epoch': 3.0})

# 测试结果

In [13]:
# Final evaluation on the eval dataset given to the trainer; print the metrics dict as-is.
eval_metrics = trainer.evaluate()
print(eval_metrics)

{'eval_loss': 0.4232632517814636, 'eval_runtime': 0.4148, 'eval_samples_per_second': 24.109, 'eval_steps_per_second': 12.054, 'eval_rewards/chosen': 0.40707287192344666, 'eval_rewards/rejected': -0.26221197843551636, 'eval_rewards/accuracies': 1.0, 'eval_rewards/margins': 0.6692848205566406, 'eval_logps/chosen': -78.55086517333984, 'eval_logps/rejected': -44.09758758544922, 'eval_logits/chosen': -1.5497031211853027, 'eval_logits/rejected': -1.1791411638259888, 'epoch': 3.0}
