In [1]:
# Show the visible GPU(s), driver/CUDA versions and current memory usage
# before any model is loaded.
!nvidia-smi

Sat Aug  2 20:16:13 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:68:00.0 Off |                    0 |
| N/A   24C    P0             59W /  500W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
#!/usr/bin/env python
"""
train_dpo_lora.py

Fine-tune mistralai/Mistral-Nemo-Instruct-2407 on chargoddard/chai-dpo
using Direct Preference Optimization (DPO) + LoRA on an A100 80 GB.
"""

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
from trl import DPOConfig, DPOTrainer

def main():
    """Fine-tune Mistral-Nemo-Instruct with DPO + LoRA on chargoddard/chai-dpo.

    Loads the tokenizer, dataset and base model, builds the LoRA and DPO
    configurations, and runs training.

    Returns:
        tuple: ``(trainer, tokenizer)`` -- the ``DPOTrainer`` after training and
        the tokenizer it used, so that later notebook cells can merge and
        publish the adapter.
    """
    model_name = "mistralai/Mistral-Nemo-Instruct-2407"

    # 1. Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        # No dedicated pad token: reuse EOS so batches can be padded.
        tokenizer.pad_token = tokenizer.eos_token

    # 2. Load & preprocess DPO dataset
    ds = load_dataset("chargoddard/chai-dpo", split="train")

    def to_dpo_example(ex):
        """Convert one raw row into the prompt/chosen/rejected columns."""
        # 'history' is the conversational field in this dataset. Every turn is
        # terminated with EOS and the turns are concatenated in order.
        # NOTE(review): this bypasses the tokenizer's chat template -- confirm
        # that raw concatenation is what is wanted for an instruct model.
        prompt = "".join(msg["value"] + tokenizer.eos_token for msg in ex["history"])
        # Only the first rejected reply is used when a list is given.
        # TODO: later, loop over the whole rejected list and emit one pair each.
        # NOTE(review): an empty list would raise IndexError here -- confirm the
        # dataset never contains one.
        rejected = ex["rejected"][0] if isinstance(ex["rejected"], list) else ex["rejected"]
        return {"prompt": prompt, "chosen": ex["accepted"], "rejected": rejected}

    dpo_ds = ds.map(to_dpo_example, remove_columns=ds.column_names)

    # 3. Load base model in bf16
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.bfloat16, device_map="auto"
    )

    # 4. Define LoRA adapter config
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # 5. DPO training arguments
    training_args = DPOConfig(
        output_dir="./dpo_mistral_nemo_lora",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=1e-4,
        bf16=True,
        optim="adamw_torch",
        # Run length is controlled by max_steps alone. A positive max_steps
        # overrides num_train_epochs, so the epoch count is not set here.
        max_steps=5000,
        logging_steps=50,
        save_steps=500,
        save_total_limit=2,
        eval_strategy="no",  # no eval split here
    )

    # 6. Initialize DPOTrainer
    trainer = DPOTrainer(
        model=model,
        args=training_args,
        train_dataset=dpo_ds,
        processing_class=tokenizer,
        peft_config=lora_cfg,  # wrap with LoRA here
    )

    # 7. Start training
    trainer.train()

    return trainer, tokenizer


if __name__ == "__main__":
    # Bind the results at notebook scope: the merge/upload cell further down
    # reads `trainer` and `tokenizer` as globals.
    trainer, tokenizer = main()

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,0.6864
100,0.6859
150,0.6729


In [None]:
import os

from huggingface_hub import login

# Never hardcode the access token in the notebook. It is read from the
# HF_TOKEN environment variable; when that is unset, login() receives None
# and asks for the token interactively instead.
# SECURITY: the token that used to be written on this line has been exposed
# and must be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from peft import PeftModel

# This cell reads `trainer` and `tokenizer` from the notebook namespace; both
# must exist there before it is run.

# Local directory for the merged weights and tokenizer files.
MERGED_DIR = "dpo_mistral_nemo_lora_merged"
# Placeholder: replace with the real "<user-or-org>/<repo>" before uploading.
HUB_REPO_ID = "your-username/dpo-mistral-nemo-lora"

# Merge and save full model: fold the LoRA weights into the base model so the
# result can be saved and loaded without the adapter.
merged_model = trainer.model.merge_and_unload()
merged_model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

# Upload merged model. `token=True` uses the credentials stored by login();
# it replaces the deprecated `use_auth_token` argument.
merged_model.push_to_hub(HUB_REPO_ID, token=True)
tokenizer.push_to_hub(HUB_REPO_ID, token=True)
