In [1]:
!pip install -q transformers trl peft bitsandbytes datasets
!pip install -q rouge_score bert_score
!pip install -q evaluate nltk

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig
from peft import PeftModel, PeftConfig, LoraConfig

In [3]:
import warnings
# Suppress every warning whose message mentions "use_reentrant" so it does not
# flood the training output.
# NOTE(review): presumably this is the gradient-checkpointing warning emitted
# during training — confirm the source before relying on the filter.
warnings.filterwarnings("ignore", message=".*use_reentrant.*")

In [4]:
class Config:
    """Static settings shared by the notebook cells: dataset id and model locations."""

    # --- data ---
    # Dataset identifier handed to `load_dataset`.
    dataset_id = "truthfulqa/truthful_qa"

    # --- models ---
    # Checkpoint id used to load both the base causal LM and its tokenizer.
    base_model_id = "Qwen/Qwen2.5-3B"
    # Adapter checkpoint directory that is loaded on top of the base model.
    sft_model_id = "./qwen-2.5-3b-sft-truthfulqa/sft/checkpoint-288"
    # Output directory for the DPO run (passed as `output_dir`).
    dpo_model_id = "./qwen-2.5-3b-dpo-truthfulqa/dpo"


# Single shared instance; the other cells read settings through `config.<name>`.
config = Config()

In [5]:
# --- Tokenizer -------------------------------------------------------------
# Loaded from the base checkpoint; independent of the model weights below.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_id, trust_remote_code=True)
tokenizer.padding_side = "left"
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# --- Base model ------------------------------------------------------------
# Weights are loaded in float16; device placement is left to the library
# through device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# --- SFT adapter -----------------------------------------------------------
# Wrap the base model with the adapter stored at `config.sft_model_id`.
# is_trainable=True keeps the adapter weights trainable so the DPO stage can
# continue updating them (see the trainable-parameter summary printed below).
model = PeftModel.from_pretrained(model, config.sft_model_id, is_trainable=True)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 5,013,504 || all params: 3,090,952,192 || trainable%: 0.1622


In [6]:
def format_sample(sample):
    """Convert one dataset row into a DPO preference record.

    Args:
        sample: Mapping with the keys "question", "best_answer" and
            "incorrect_answers" (a sequence of strings).

    Returns:
        A dict with the keys "prompt" (the question), "chosen" (the best
        answer) and "rejected" (the first incorrect answer, or the
        placeholder "N/A" when the row lists none).
    """
    wrong_answers = sample["incorrect_answers"]
    if wrong_answers:
        # Only the first incorrect answer is used as the rejected completion.
        rejected_answer = wrong_answers[0]
    else:
        rejected_answer = "N/A"

    return dict(
        prompt=sample["question"],
        chosen=sample["best_answer"],
        rejected=rejected_answer,
    )

# Load the "validation" split of the dataset's "generation" configuration and
# convert every row to prompt/chosen/rejected. All original columns are dropped
# so only the three fields returned by `format_sample` remain.
raw_dataset = load_dataset(config.dataset_id, "generation")["validation"]
raw_dataset = raw_dataset.map(format_sample, remove_columns=raw_dataset.column_names)

# Two-step split with a fixed seed: first hold out 10% of the rows, then cut
# that held-out part in half. Result: 90% train / 5% validation / 5% test.
split_dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)
temp_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)

# Plain dict keyed by split name; the trainer cell reads "train" and "validation".
dataset_split = {
    "train": split_dataset["train"],
    "validation": temp_split["train"],
    "test": temp_split["test"],
}

In [7]:
# Training arguments for the DPO stage, grouped by concern.
dpo_config = DPOConfig(
    # -- output and logging --
    output_dir=config.dpo_model_id,
    report_to="tensorboard",
    logging_steps=1,
    # -- DPO objective --
    beta=0.05,
    # -- optimisation schedule --
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="adamw_8bit",
    # -- batching: 16 samples x 2 accumulation steps per optimizer update, per device --
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    # -- numeric precision: fp16 mixed precision on, bf16 off --
    fp16=True,
    bf16=False,
    # -- evaluation and checkpointing: evaluate each epoch, keep only the
    #    checkpoint with the best eval_loss and reload it when training ends --
    eval_strategy="epoch",
    save_strategy="best",
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    save_total_limit=1,
    # -- data handling --
    remove_unused_columns=False,
)

In [8]:
# Build the DPO trainer on the adapter-wrapped model, training on the "train"
# split and evaluating on the "validation" split.
# NOTE(review): no explicit reference model is passed. For a PEFT-wrapped model,
# TRL is documented to compute the reference log-probabilities with the adapter
# disabled. That would make the reference the base model rather than the SFT
# checkpoint loaded above — confirm this is the intended reference policy.
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=dpo_config,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["validation"],
    processing_class=tokenizer
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
# Run DPO training; the returned summary is kept for inspection in the next cell.
train_history = trainer.train()

Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,0.626,0.642645,0.198456,0.075464,0.75463,0.122992,-25.471392,-27.376974,-0.456204,-0.318629
2,0.624,0.615134,0.251773,0.060095,0.81713,0.191679,-24.405037,-27.684359,-0.482988,-0.31796
3,0.6187,0.592182,0.270354,0.017679,0.837963,0.252675,-24.03343,-28.532682,-0.53942,-0.343841
4,0.5286,0.580749,0.267535,-0.016671,0.858796,0.284206,-24.089804,-29.219664,-0.577102,-0.366243
5,0.5096,0.578678,0.266521,-0.023467,0.858796,0.289988,-24.110083,-29.355589,-0.582954,-0.369986


In [10]:
# Display the summary returned by `trainer.train()` (step count, loss, runtime metrics).
train_history

TrainOutput(global_step=115, training_loss=0.5960209452587625, metrics={'train_runtime': 178.0891, 'train_samples_per_second': 20.636, 'train_steps_per_second': 0.646, 'total_flos': 0.0, 'train_loss': 0.5960209452587625, 'epoch': 5.0})

In [None]:
import zipfile
import os

def zip_all_files(output_filename='stage-2.zip', directory='qwen-2.5-3b-dpo-truthfulqa'):
    """Recursively zip the contents of ``directory`` into ``output_filename``.

    Entries are stored with paths relative to ``directory`` and compressed
    with DEFLATE. The following are left out of the archive:

    * files whose name starts with ``.``;
    * anything inside a directory whose name starts with ``.`` or is
      ``__pycache__`` (such directories are not descended into);
    * any file whose path contains ``__pycache__``;
    * the archive itself, when ``output_filename`` lies inside ``directory``.

    Directories and files are visited in sorted order so repeated runs over
    the same tree list entries in the same order.

    Args:
        output_filename: Path of the zip file to create (overwritten if present).
        directory: Root directory whose contents are archived. If it does not
            exist, the walk yields nothing and an empty archive is written.

    Returns:
        None.
    """
    # Resolved once up front so the archive can be recognised if the walk reaches it.
    archive_path = os.path.realpath(output_filename)

    with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for foldername, subfolders, filenames in os.walk(directory):
            # Prune in place: os.walk only descends into what remains in this
            # list, so hidden and bytecode-cache directories are never visited.
            subfolders[:] = sorted(
                name for name in subfolders
                if not name.startswith('.') and name != '__pycache__'
            )
            for filename in sorted(filenames):
                file_path = os.path.join(foldername, filename)
                # Skip hidden files and anything under a __pycache__ path.
                if filename.startswith('.') or '__pycache__' in file_path:
                    continue
                # Never add the archive to itself; it is still being written.
                if os.path.realpath(file_path) == archive_path:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, directory))

# Archive with the defaults: writes 'stage-2.zip' from the 'qwen-2.5-3b-dpo-truthfulqa' directory.
zip_all_files()
