In [1]:
# Show the GPU, driver and CUDA versions visible to this kernel before training.
!nvidia-smi

Sat Aug  2 20:16:13 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:68:00.0 Off |                    0 |
| N/A   24C    P0             59W /  500W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
#!/usr/bin/env python
"""
train_dpo_lora.py

Fine-tune mistralai/Mistral-Nemo-Instruct-2407 on chargoddard/chai-dpo
using Direct Preference Optimization (DPO) + LoRA on an A100 80 GB.
"""

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
from trl import DPOConfig, DPOTrainer

def main(
    model_name="mistralai/Mistral-Nemo-Instruct-2407",
    output_dir="./dpo_mistral_nemo_lora",
    max_steps=5000,
    learning_rate=1e-4,
):
    """Fine-tune a causal LM on chargoddard/chai-dpo with DPO + LoRA.

    Args:
        model_name: Hub id used to load both the tokenizer and the base model.
        output_dir: Directory the trainer writes its checkpoints to.
        max_steps: Total number of training steps to run.
        learning_rate: Learning rate forwarded to ``DPOConfig``.

    Returns:
        The ``DPOTrainer`` after ``train()`` has completed, so that later
        notebook cells can access ``trainer.model``.
    """
    # 1. Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        # Fall back to EOS for padding when the tokenizer defines no pad token.
        tokenizer.pad_token = tokenizer.eos_token

    # 2. Load & preprocess DPO dataset
    ds = load_dataset("chargoddard/chai-dpo", split="train")

    def to_dpo_example(ex):
        """Convert one raw row into the prompt / chosen / rejected fields."""
        # 'history' is the conversational field in this dataset; each turn's
        # text is concatenated with an EOS token appended after it.
        prompt = "".join(msg["value"] + tokenizer.eos_token for msg in ex["history"])
        # Only the first rejected reply is used when a list is given.
        # TODO: iterate over the whole rejected list later.
        rejected = ex["rejected"][0] if isinstance(ex["rejected"], list) else ex["rejected"]
        return {"prompt": prompt, "chosen": ex["accepted"], "rejected": rejected}

    dpo_ds = ds.map(to_dpo_example, remove_columns=ds.column_names)

    # 3. Load base model in bf16 (matches bf16=True in the training arguments)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.bfloat16, device_map="auto"
    )

    # 4. Define LoRA adapter config
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # 5. DPO training arguments
    # The run length is controlled by max_steps alone: a positive max_steps
    # overrides num_train_epochs, so no epoch count is passed here.
    # NOTE(review): the loss logged in this notebook stays near 0.69 and drifts
    # upward over the first 500 steps — worth re-checking the learning rate and
    # the prompt formatting before a long run.
    training_args = DPOConfig(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=learning_rate,
        bf16=True,
        optim="adamw_torch",
        max_steps=max_steps,
        logging_steps=50,
        save_steps=500,
        save_total_limit=2,
        eval_strategy="no",  # no eval split here
    )

    # 6. Initialize DPOTrainer
    trainer = DPOTrainer(
        model=model,
        args=training_args,
        train_dataset=dpo_ds,
        processing_class=tokenizer,
        peft_config=lora_cfg,  # wrap with LoRA here
    )

    # 7. Start training
    trainer.train()

    return trainer

if __name__ == "__main__":
    # Keep the trainer at notebook scope so later cells can use trainer.model.
    trainer = main()

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,0.6864
100,0.6859
150,0.6729
200,0.654
250,0.7029
300,0.7324
350,0.6991
400,0.7081
450,0.7256
500,0.7502


In [3]:
import os

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook. Any token that was
# previously committed in this cell must be treated as leaked and revoked in
# the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# login() receives None and prompts for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# --- Merge the LoRA adapter into its base model and write the result to disk ---

# Input: adapter checkpoint directory. Output: directory for the merged model.
peft_model_path = "/workspace/dpo_mistral_nemo_lora/checkpoint-5000/"
save_path = "/workspace/merged_dpo_mistral_nemo"

# Read the adapter config from local files only; it names the base model.
config = PeftConfig.from_pretrained(peft_model_path, local_files_only=True)

# Base weights in bf16, placed automatically across available devices.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Attach the adapter, then merge it into the base weights and unload the wrapper.
model = PeftModel.from_pretrained(base_model, model_id=peft_model_path)
merged_model = model.merge_and_unload()

# Persist the merged weights together with the base model's tokenizer.
merged_model.save_pretrained(save_directory=save_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.save_pretrained(save_directory=save_path)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

('./merged_dpo_mistral_nemo/tokenizer_config.json',
 './merged_dpo_mistral_nemo/special_tokens_map.json',
 './merged_dpo_mistral_nemo/chat_template.jinja',
 './merged_dpo_mistral_nemo/tokenizer.json')

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# --- Merge the LoRA adapter, save the merged model locally and push it to the Hub ---

# Target Hub repository, local output directory and adapter checkpoint.
# NOTE(review): the repo id mentions "ga16-lr7e6-3ep", while the training cell
# in this notebook uses gradient_accumulation_steps=4, learning_rate=1e-4 and
# max_steps=5000 — confirm the name describes this run before publishing.
hf_repo_id = "pratt3000/Mistral-Nemo-2407-Role-Playing-LORA-4data-ga16-lr7e6-3ep-baseDPO"
save_path = "/workspace/merged_dpo_mistral_nemo"
peft_model_path = "/workspace/dpo_mistral_nemo_lora/checkpoint-5000/"

# The adapter config names the base model the adapter was trained on.
config = PeftConfig.from_pretrained(peft_model_path)

# Base weights in bf16, placed automatically across available devices.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Attach the adapter and merge it into the base weights.
model = PeftModel.from_pretrained(base_model, model_id=peft_model_path)
merged_model = model.merge_and_unload()

# Each save_pretrained call writes to save_path and also uploads to hf_repo_id.
merged_model.save_pretrained(save_path, repo_id=hf_repo_id, push_to_hub=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.save_pretrained(save_path, repo_id=hf_repo_id, push_to_hub=True)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.87G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

('/workspace/merged_dpo_mistral_nemo/tokenizer_config.json',
 '/workspace/merged_dpo_mistral_nemo/special_tokens_map.json',
 '/workspace/merged_dpo_mistral_nemo/chat_template.jinja',
 '/workspace/merged_dpo_mistral_nemo/tokenizer.json')

In [None]:
from peft import PeftModel

# Save the merged model under a local directory and publish it to the Hub.
#
# This cell reuses `merged_model` and `tokenizer` produced by the merge cell
# above instead of merging again from a trainer object, so it also works in a
# kernel where training was not re-run.
local_dir = "Mistral-Nemo-2407-Role-Playing-LORA-4data_ga16_lr7e6_3ep_baseDPO"
# NOTE(review): "your-username" is a placeholder namespace — replace it with
# the real account or organisation before running this cell.
hub_repo_id = "your-username/Mistral-Nemo-2407-Role-Playing-LORA-4data_ga16_lr7e6_3ep_baseDPO"

merged_model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)

# Upload merged model. `token=True` uses the credential stored by login();
# it replaces the deprecated `use_auth_token=True`.
merged_model.push_to_hub(hub_repo_id, token=True)
tokenizer.push_to_hub(hub_repo_id, token=True)