<a href="https://colab.research.google.com/github/ranathakur1/DPO-finetuning-with-LoRA/blob/main/DPO_finetuning_with_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets trl

In [None]:
# Download the "Anthropic/hh-rlhf" dataset with the `datasets` library
from datasets import load_dataset

dataset = load_dataset("Anthropic/hh-rlhf")


In [None]:
import os

# Write every split of the downloaded dataset to its own JSON Lines file
# (one record per line) inside the local "data" directory.
os.makedirs("data", exist_ok=True)

for split in dataset:
    dataset[split].to_json(
        f"data/Anthropic_{split}.json",
        orient="records",
        lines=True,
    )

In [None]:
import json
import random

import pandas as pd

# Upper bound on the number of records drawn for DPO preprocessing.
SAMPLE_SIZE = 3000

# Load the exported test split (JSON Lines) as a list of record dicts.
# The path is relative so it matches where the export cell wrote the file,
# regardless of the current working directory.
hh_data = pd.read_json("data/Anthropic_test.json", lines=True).to_dict(orient="records")

# Seeded subsample for reproducibility; capped at the number of available
# records because random.sample raises ValueError when asked for more.
random.seed(42)
hh_sample = random.sample(hh_data, min(SAMPLE_SIZE, len(hh_data)))

print(f"Sampled {len(hh_sample)} records for DPO training.")

In [None]:
# Preprocess the sampled records into DPO-ready (prompt, chosen, rejected) dicts.

def _split_final_turn(conversation):
    """Split a "Human:"/"Assistant:" transcript at its final assistant reply.

    Args:
        conversation: Full transcript text of one conversation.

    Returns:
        A ``(prompt, reply)`` tuple. ``prompt`` is every line before the first
        "Assistant:" line that follows the last "Human:" line; ``reply`` is the
        remainder with only that leading speaker tag removed. Returns ``None``
        when the transcript has no "Human:" line or no "Assistant:" line after
        the last one.
    """
    lines = conversation.strip().split("\n")
    human_rows = [idx for idx, line in enumerate(lines) if line.startswith("Human:")]
    if not human_rows:
        return None

    # Search for the assistant tag instead of assuming it directly follows the
    # "Human:" line, so a multi-line human message stays inside the prompt.
    reply_start = next(
        (
            idx
            for idx in range(human_rows[-1] + 1, len(lines))
            if lines[idx].startswith("Assistant:")
        ),
        None,
    )
    if reply_start is None:
        return None

    prompt = "\n".join(lines[:reply_start]).strip()
    # Drop only the leading speaker tag; any later "Assistant:" text inside
    # the reply is part of the reply and is kept.
    reply = "\n".join(lines[reply_start:])[len("Assistant:"):].strip()
    return prompt, reply


dpo_data = []

for example in hh_sample:
    chosen_split = _split_final_turn(example["chosen"])
    rejected_split = _split_final_turn(example["rejected"])

    # Skip transcripts that do not end with an assistant reply to a human turn
    if chosen_split is None or rejected_split is None:
        continue

    prompt, chosen_reply = chosen_split
    rejected_prompt, rejected_reply = rejected_split

    # Both replies must answer the same prompt to form a valid preference pair
    if rejected_prompt != prompt:
        continue

    # Skip examples where chosen or rejected is empty
    if not chosen_reply or not rejected_reply:
        continue

    # Append to DPO-ready list
    dpo_data.append({
        "prompt": prompt,
        "chosen": chosen_reply,
        "rejected": rejected_reply
    })

In [None]:
# Draw a small random subset of the preprocessed pairs for a quick DPO run.
# The size is capped at len(dpo_data) because random.sample raises ValueError
# when asked for more items than the population holds.
dpo_data_small = random.sample(dpo_data, min(200, len(dpo_data)))

# Preview a few records rather than dumping the whole subset into the output
dpo_data_small[:3]


In [None]:
from datasets import Dataset

# hh_sample is currently a plain Python list of record dicts;
# wrap it in a datasets.Dataset so it can be inspected by index below.
hh_dataset = Dataset.from_list(hh_sample)

In [None]:
# Display the first record of the datasets.Dataset by indexing into it.
# Each record carries (at least) the 'chosen' and 'rejected' transcripts.
hh_dataset[0]

# Alternatively, for a pandas DataFrame view, convert it first:
# hh_dataset.to_pandas().head(1)

In [None]:
#essential imports
import os
import pandas as pd
import torch
from trl import DPOTrainer
from torch import float32
from datasets import Dataset # Import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from trl import DPOTrainer
import json, random

In [None]:
# Precision-related environment flags: USE_BF16 is set to "0", USE_FP16 to "1".
# NOTE(review): nothing in this notebook reads these variables — confirm that
# a library actually consumes them, otherwise they have no effect.
os.environ.update({"USE_BF16": "0", "USE_FP16": "1"})


# Load the tokenizer and the base causal LM with float16 weights
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    device_map="auto",
    offload_folder="offload",
)

# Gradient checkpointing is switched off and the KV cache is left enabled
model.gradient_checkpointing_disable()
model.config.use_cache = True

In [None]:

# LoRA adapter settings handed to the DPO trainer as `peft_config`
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj","v_proj"],  # modules that receive adapters
    r=4,                                  # adapter rank
    lora_alpha=8,                         # adapter scaling factor
    lora_dropout=0.05,
    bias="none",
)


In [None]:
# Convert dpo_data_small (a list of prompt/chosen/rejected dicts) into a
# datasets.Dataset object; it is passed to the trainer as train_dataset.
dpo_dataset_m = Dataset.from_list(dpo_data_small)



In [None]:
# Fine-tune the model with TRL's DPOTrainer on the small preference subset

from trl import DPOConfig, DPOTrainer


# Training hyperparameters; everything not listed keeps the DPOConfig default
dpo_config = DPOConfig(
    beta=0.1,
    max_length=256,   # truncate sequences to reduce memory
    learning_rate=2e-5,
)

# No separate reference model is supplied (ref_model=None); the LoRA settings
# are passed through peft_config.
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    peft_config=lora_config,
    train_dataset=dpo_dataset_m,
    args=dpo_config,
)

trainer.train()


In [None]:
# Save the trained model held by the trainer, plus the tokenizer, into the
# "lora-adapter" directory; the inference cell loads the adapter from there.
trainer.model.save_pretrained("lora-adapter")
tokenizer.save_pretrained("lora-adapter")

In [None]:
# Inference setup: reload the base model and attach the newly trained LoRA adapter

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


# Note: `tokenizer` and `model` are rebound here, replacing the training-time objects
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Wrap the base model with the adapter weights stored in "lora-adapter"
model = PeftModel.from_pretrained(base_model, "lora-adapter")


In [None]:
# Tokenize a single prompt and move the tensors to the model's device
inputs = tokenizer(
    "Explain what LoRA fine-tuning is in simple terms for a product manager.",
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Generate without sampling; max_new_tokens bounds the length of the continuation
outputs = model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=50,
)

In [None]:
# Decode the first generated sequence back to text, skipping special tokens
print(tokenizer.decode(outputs[0], skip_special_tokens=True))