In [None]:
# Report the NVIDIA GPU(s) visible to this runtime before any model is loaded.
! nvidia-smi

In [None]:
! pip install -U --quiet datasets evaluate torch transformers accelerate trl peft

### **Load Dataset**

In [None]:
from datasets import load_dataset

# Preference dataset from the Hub; later cells read its "train" and "test" splits
# and the "prompt" / "chosen" / "rejected" columns.
combined_dpo = load_dataset(path="rasyosef/ultrafeedback-orca-math-dpo")
combined_dpo

In [None]:
# Print the prompt and both candidate answers for the first three training rows.
for row_index in range(3):
  example = combined_dpo["train"][row_index]
  for field in ("prompt", "chosen", "rejected"):
    print(example[field])
  print("\n\n")

### **Load Model**

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub id of the checkpoint that is preference-tuned further down.
model_id = "rasyosef/phi-2-sft-openhermes-128k-v2-merged"

# Load the weights in float16 and place them on the CUDA device.
load_options = dict(
    torch_dtype=torch.float16,
    device_map="cuda",
    # attn_implementation="flash_attention_2"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left side of each sequence.
tokenizer.padding_side = "left"
# Dump the module tree; the LoRA `target_modules` chosen later refer to layer names shown here.
print(model)

In [None]:
messages = [dict(role="user", content="Who was the last king of Germany?")]

def chat(messages, max_new_tokens=8):
  """Generate a reply to `messages` with the global `model` and print it.

  Args:
    messages: list of {"role": ..., "content": ...} dicts, rendered with the
      tokenizer's chat template (a generation prompt is appended).
    max_new_tokens: upper bound on the number of tokens to generate.

  Prints the decoded first output sequence (prompt included, special tokens kept);
  returns nothing.
  """
  prompt_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
  )
  generated = model.generate(prompt_ids.to("cuda"), max_new_tokens=max_new_tokens)
  first_sequence = generated[0]
  print(tokenizer.decode(first_sequence))

# Sanity-check the base model before any preference tuning.
chat(messages, max_new_tokens=128)

### **Inspect Dataset**

In [None]:
# Lengths Distribution
# Sort each length column of the train split, then print the values found at a few
# fixed ranks of the sorted list together with the maximum.
prompt_lengths = sorted(combined_dpo["train"]["prompt_length"])
chosen_lengths = sorted(combined_dpo["train"]["chosen_length"])
rejected_lengths = sorted(combined_dpo["train"]["rejected_length"])

probe_ranks = (1024, 4096, 8000, 12000)
for label, lengths in (
    ("prompt_lengths:", prompt_lengths),
    ("chosen_lengths:", chosen_lengths),
    ("rejected_lengths:", rejected_lengths),
):
  print(label, *(lengths[rank] for rank in probe_ranks), max(lengths))

In [None]:
MAX_LENGTH = 448

def _fits_length_budget(example):
  """True when prompt+chosen and prompt+rejected are both strictly below MAX_LENGTH."""
  return all(
      example['prompt_length'] + example[response_column] < MAX_LENGTH
      for response_column in ('chosen_length', 'rejected_length')
  )

# Drop every pair where either completion would exceed the length budget.
combined_dpo_filtered = combined_dpo.filter(_fits_length_budget)
combined_dpo_filtered

In [None]:
from collections import Counter
# Rows per source dataset after the length filter, as a (train, test) pair.
tuple(
    Counter(combined_dpo_filtered[split_name]["source_dataset"])
    for split_name in ("train", "test")
)

In [None]:
import random
random.seed(42)

def _keep_row(row):
  """Keep every non-ultrafeedback row; keep an ultrafeedback row when the random draw is <= 0.67."""
  # The random draw only happens for ultrafeedback rows (short-circuit `or`).
  return row['source_dataset'] != "ultrafeedback" or random.random() <= 0.67

# Randomly thin out the ultrafeedback portion of the filtered dataset.
combined_dpo_final = combined_dpo_filtered.filter(_keep_row)
#combined_dpo_final = combined_dpo_final.filter(lambda row: "translat" not in (row["prompt"]+row["chosen"]).lower())
combined_dpo_final

In [None]:
from collections import Counter
# Rows per source dataset after down-sampling, as a (train, test) pair.
tuple(
    Counter(combined_dpo_final[split_name]["source_dataset"])
    for split_name in ("train", "test")
)

In [None]:
# Draw five training examples for a manual spot check. The shuffle is seeded so the
# same five rows are shown on every run of the notebook.
sample = combined_dpo_final["train"].shuffle(seed=42).select(range(5))

# Print source, prompt and both candidate answers of each sampled row.
for row in sample:
  print(row["source_dataset"])
  print(row["prompt"])
  print(row["chosen"])
  print(row["rejected"])
  print("\n-----------------------------------------------------\n")

### **DPO with TRL**

In [None]:
from peft import LoraConfig, get_peft_model, cast_mixed_precision_params

# Target all linear layers
lora_target_modules = ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2", "lm_head"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the loaded model with LoRA adapters on the modules listed above.
dpo_model = get_peft_model(model, peft_config)
# NOTE(review): presumably keeps the frozen weights in float16 and the trainable
# adapter weights in full precision for fp16 training; confirm against the PEFT docs.
cast_mixed_precision_params(dpo_model, dtype=torch.float16)
dpo_model.print_trainable_parameters()

In [None]:
# `userdata` is only referenced by the commented-out `hub_token` line below, so a
# missing google.colab package (any non-Colab runtime) must not stop this cell.
try:
  from google.colab import userdata
except ImportError:
  userdata = None
from trl import DPOConfig, DPOTrainer

batch_size = 4 # On T4 or P100, set batch_size to 1 to avoid Cuda OOM
gradient_accum_steps = 4
epochs = 2

# Also used as the output directory for checkpoints.
new_model_id = "phi-2-dpo"

# Evaluate and log every `eval_steps` steps; save a checkpoint every second evaluation.
eval_steps = 100 #len(combined_dpo_final["train"]) // (batch_size * gradient_accum_steps * 8)
save_steps = eval_steps * 2
logging_steps = eval_steps

print("Eval Steps:", eval_steps)
print("Save Steps:", save_steps)

# NOTE(review): max_prompt_length equals max_length here; the dataset was pre-filtered
# to prompt+response lengths below 448, so truncation is presumably never triggered.
# Confirm, since equal limits leave no length reserved for the response if it is.
dpo_config = DPOConfig(
  output_dir=new_model_id,
  beta=0.1,
  max_length=512,
  max_prompt_length=512,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  gradient_accumulation_steps=gradient_accum_steps,
  num_train_epochs=epochs,
  learning_rate=2e-6,
  warmup_steps=250,
  lr_scheduler_type="cosine",
  remove_unused_columns=False,
  fp16=True,
  logging_strategy="steps",
  logging_steps=logging_steps,
  eval_strategy="steps",
  eval_steps=eval_steps,
  save_strategy="steps",
  save_steps=save_steps,
  seed=42,
  # push_to_hub=True,
  # hub_token=userdata.get("HF_TOKEN"),  # requires Colab: `userdata` is None elsewhere

  # gradient_checkpointing=True,
)

In [None]:
# Build the DPO trainer on the LoRA-wrapped model with the train/test splits of the
# down-sampled dataset. No ref_model is passed (left at its default).
# NOTE(review): newer TRL releases may expect `processing_class` instead of
# `tokenizer`; confirm against the installed version.
trainer = DPOTrainer(
    model=dpo_model,
    args=dpo_config,
    tokenizer=tokenizer,
    train_dataset=combined_dpo_final["train"],
    eval_dataset=combined_dpo_final["test"],
)

In [None]:
# Run DPO training with the settings in `dpo_config` (long-running cell).
trainer.train()

In [None]:
messages = [
    {"role":"system", "content": "You are an AI assistant that follows instruction extremely well. Help as much as you can."},
    {"role":"user", "content":"What is J. Robert Oppenheimer known for?"}
]

def chat(model, messages, max_new_tokens=8):
  """Generate a reply to `messages` with `model` and print it.

  Args:
    model: a torch module exposing `generate` (here the LoRA-wrapped model).
    messages: list of {"role": ..., "content": ...} dicts, rendered with the
      global tokenizer's chat template (a generation prompt is appended).
    max_new_tokens: upper bound on the number of tokens to generate.

  Prints the decoded first output sequence (prompt included, special tokens kept);
  returns nothing.
  """
  tokenized_messages = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
  # `generate` does not change the module's train/eval mode. After `trainer.train()`
  # the model may still be in training mode, which keeps dropout (including the
  # LoRA dropout) active, so generate in eval mode and then restore the old mode.
  was_training = model.training
  model.eval()
  try:
    outputs = model.generate(tokenized_messages, max_new_tokens=max_new_tokens)
  finally:
    model.train(was_training)
  print(tokenizer.decode(outputs[0]))

chat(dpo_model, messages, max_new_tokens=256)

In [None]:
# Ask the tuned model the same question that was put to the base model earlier.
messages = [
    dict(role="system", content="You are an AI assistant that follows instruction extremely well. Help as much as you can."),
    dict(role="user", content="Who was the last king of Germany?"),
]

chat(dpo_model, messages, max_new_tokens=256)

In [None]:
# Upload the trainer's model to the Hugging Face Hub.
# NOTE(review): presumably requires Hub credentials to be configured in this runtime
# (the `hub_token` line in the config cell is commented out); confirm before running.
trainer.push_to_hub()

In [None]:
# Show the Python classes of the base-model handle and of the PEFT-wrapped handle.
type(model), type(dpo_model)