# Fine-tune a Mistral-7b model with DPO (Direct Preference Optimization)

### Check out my [Twitter (@rohanpaul_ai)](https://twitter.com/rohanpaul_ai) for daily LLM bits

In [None]:
!pip install --upgrade trl peft accelerate bitsandbytes datasets auto-gptq optimum huggingface-hub sentencepiece wandb autoawq -q

In [None]:
import gc
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, NewType, Optional, Tuple

import bitsandbytes as bnb
import torch
import transformers
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import DPOTrainer

# Colab-only import. The original line was truncated ("from google.colab import"),
# which is a SyntaxError; `userdata` (Colab secrets) is assumed to be the
# intended name -- TODO confirm.
from google.colab import userdata


# Identifier of the base model; passed to `from_pretrained` for both the
# model and the tokenizer.
model_name = "teknium/OpenHermes-2.5-Mistral-7B"

# Name of the fine-tuned model; used as the training output directory and as
# the directory the merged model is saved to.
new_model = "OpenHermes-2.5-Mistral-7B-DPO-Math"

## Format DPO dataset

First take a note of our Dataset here

![](assets/2024-01-02-20-54-54.png)

📌 DPO (Direct Preference Optimization) datasets for LLM training typically consist of a collection of answers that are ranked by humans. This ranking is essential, as the RLHF process fine-tunes LLMs to output the preferred answer.

📌 The structure of the dataset is straightforward: for each row, there is one chosen (preferred) answer, and one rejected answer. The goal of RLHF is to guide the model to output the preferred answer.

📌 And Huggingface's `DPOTrainer` expects a very specific format for the dataset. 

📌 This is because the model is trained to directly optimize the preference for which of two given sentences is the most relevant. We provide an example from the Anthropic/hh-rlhf dataset below.

📌 To synthetically create DPO datasets for a set of prompts, you can create the answers with GPT-4/3.5 which will be your preferred answers, and with Llama-2-13b or similar class of models, create the rejected responses. 

It’s a smart way to bypass human feedback and only rely on models with different levels of size/performance.

In [None]:
def format_message(role, content, tokenizer, add_generation_prompt=False):
    """Render a single chat turn as text via the tokenizer's chat template.

    Args:
        role: Role of the turn (e.g. "system" or "user").
        content: Text of the turn. A falsy value (empty string, ``None``)
            produces an empty result instead of a rendered turn.
        tokenizer: Object exposing ``apply_chat_template``.
        add_generation_prompt: Forwarded unchanged to ``apply_chat_template``.

    Returns:
        The value returned by ``apply_chat_template`` (called with
        ``tokenize=False``), or ``""`` when ``content`` is falsy.
    """
    # Guard clause: nothing to render for an empty turn.
    if not content:
        return ""

    single_turn = [{"role": role, "content": content}]
    return tokenizer.apply_chat_template(
        single_turn,
        tokenize=False,
        add_generation_prompt=add_generation_prompt,
    )

def format_prompt_into_chatml(row_sample):
    """Convert one raw dataset row into ``prompt`` / ``chosen`` / ``rejected``.

    The system and user turns are rendered with ``format_message`` using the
    module-level ``tokenizer``; the user turn is rendered with
    ``add_generation_prompt=True``. A newline is appended to both answers.

    Args:
        row_sample: Mapping with the keys 'system', 'question', 'chosen'
            and 'rejected'.

    Returns:
        A dict with the keys "prompt", "chosen" and "rejected".

    Raises:
        ValueError: If any of the required keys is absent; the first missing
            key (in the order listed above) is named in the message.
    """
    # Fail fast on the first absent column, checked in a fixed order.
    for column in ('system', 'question', 'chosen', 'rejected'):
        if column not in row_sample:
            raise ValueError(f"Key '{column}' missing in row sample")

    system_turn = format_message("system", row_sample['system'], tokenizer)
    user_turn = format_message(
        "user",
        row_sample['question'],
        tokenizer,
        add_generation_prompt=True,
    )

    return {
        "prompt": system_turn + user_turn,
        "chosen": row_sample['chosen'] + "\n",
        "rejected": row_sample['rejected'] + "\n",
    }


# Tokenizer for the base model: reuse the EOS token for padding and pad on
# the left-hand side.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Raw preference pairs (train split)
dataset = load_dataset("Intel/orca_dpo_pairs")['train']

# Remember the raw column names so the mapping step can drop them
original_columns = dataset.column_names

# Rewrite every row into prompt / chosen / rejected
dataset = dataset.map(format_prompt_into_chatml, remove_columns=original_columns)

# Show one formatted sample
dataset[1]

## Train model with DPO


In [None]:

@dataclass
class DPOConfig(transformers.TrainingArguments):
    """
    ``transformers.TrainingArguments`` extended with options for DPO training.

    Everything not declared here is inherited from the base class; for the
    full list of inherited parameters, see:
    https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
    """

    # --- DPO loss ---
    beta: Optional[float] = field(
        default=0.1,
        metadata={
            "help": "The beta factor in DPO loss. Higher beta means less divergence from the initial policy."
        },
    )

    # --- Hub / logging ---
    hub_model_revision: Optional[str] = field(
        default="main",
        metadata={"help": "The Hub model branch to push the model to."},
    )
    logging_first_step: bool = field(
        default=True,
        metadata={"help": "Whether to log and evaluate the first global_step or not."},
    )

    # --- Sequence-length limits (unset by default) ---
    max_prompt_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "For DPO, the maximum length of the prompt to use for conditioning the model."
        },
    )
    max_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "Used by TRL for reward model training, which tries to read this parameter in init."
        },
    )

    # --- Defaults chosen for this config ---
    optim: Optional[str] = field(default="rmsprop")
    remove_unused_columns: bool = field(default=False)


## Note on the `DPOConfig` class

📌 When you instantiate `DPOConfig` with parameters such as `save_strategy="steps"`, which are not explicitly defined in `DPOConfig`, Python's inheritance mechanism comes into play. Since `DPOConfig` is a subclass of `transformers.TrainingArguments`, it inherits all attributes and methods of the base class. Therefore, even if certain parameters like `save_strategy` are not explicitly defined in `DPOConfig`, they are valid as long as they are part of the `transformers.TrainingArguments`.

📌 The `DPOConfig` class, derived from `transformers.TrainingArguments`, allows customization of training parameters specific to your fine-tuning task. 

📌 In the `DPOConfig` class, you've defined certain parameters like `beta`, `hub_model_revision`, `logging_first_step`, etc. These are additional or overridden parameters on top of the standard ones provided by `transformers.TrainingArguments`.

📌 This mechanism allows your instance `training_args` to use parameters from both `DPOConfig` and `transformers.TrainingArguments`. 

📌 However, it's important to ensure that the parameters you are using in `training_args` are indeed valid and recognized by `transformers.TrainingArguments`.

In [None]:
def train(model_name, dataset, tokenizer, new_model):
    """Fine-tune ``model_name`` with DPO, training LoRA adapters on a 4-bit model.

    Two 4-bit copies of the base model are loaded: one is trained (through the
    LoRA adapters described by ``peft_config``), the other is handed to
    ``DPOTrainer`` as the reference model.

    Args:
        model_name: Identifier or path of the base causal LM, passed to
            ``AutoModelForCausalLM.from_pretrained``.
        dataset: Training dataset handed to ``DPOTrainer`` (the notebook
            passes rows with "prompt", "chosen" and "rejected" fields).
        tokenizer: Tokenizer handed to ``DPOTrainer``.
        new_model: Output directory for the training run.

    Returns:
        A ``(dpo_trainer, model, ref_model)`` tuple, so the caller can save
        the trained adapter and later release all three objects.

    Raises:
        ValueError: If ``model_name`` is empty.
    """
    # Validate before doing any expensive work. A real exception is used
    # instead of `assert`, which is stripped when Python runs with -O.
    if not model_name:
        raise ValueError(
            "Please specify model_name, e.g. model_name='teknium/OpenHermes-2.5-Mistral-7B'"
        )

    # LoRA adapters on the attention and MLP projection layers.
    peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj'],
    )

    # Passing `load_in_4bit=True` directly to `from_pretrained` is deprecated;
    # the equivalent quantization config object is used instead.
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)

    # Base model (the one being trained)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
    )
    # The generation cache is not used during training.
    model.config.use_cache = False

    # Reference model
    ref_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
    )

    # Training arguments.
    # NOTE: a positive `max_steps` takes precedence over `num_train_epochs`
    # in the Hugging Face Trainer, so the run stops after 200 optimizer steps.
    training_args = DPOConfig(
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        learning_rate=5e-5,
        lr_scheduler_type="linear",
        max_steps=200,
        save_strategy="no",
        logging_steps=1,
        output_dir=new_model,
        optim="paged_adamw_32bit",
        warmup_steps=100,
        fp16=True,
        # report_to="wandb",
    )

    # NOTE(review): `beta`, `max_prompt_length`, `max_length` and `tokenizer`
    # are passed straight to DPOTrainer, as older TRL releases expect; newer
    # releases may want them elsewhere -- confirm against the installed TRL.
    dpo_trainer = DPOTrainer(
        model,
        ref_model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
        peft_config=peft_config,
        beta=0.1,
        max_prompt_length=1024,
        max_length=1536,
    )

    # Run the fine-tuning loop; without this call nothing is ever trained.
    dpo_trainer.train()

    return dpo_trainer, model, ref_model


# Run training. The three objects are kept as globals because the saving step
# below reads `dpo_trainer.model` and then deletes all three to free memory.
dpo_trainer, model, ref_model = train(model_name, dataset, tokenizer, new_model)

## Save the finetuned model

In [None]:
# Write the trained adapter and the tokenizer to disk
dpo_trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")

# Drop the training objects and empty the CUDA cache before reloading
del dpo_trainer, model, ref_model
gc.collect()
torch.cuda.empty_cache()

# Reload the base model in FP16, together with a fresh tokenizer
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, return_dict=True, torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attach the saved adapter to the base model, then merge it into the weights
model = PeftModel.from_pretrained(base_model, "final_checkpoint")
model = model.merge_and_unload()

# Store the merged model and its tokenizer under the new model name
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

# Optional: push both to the HF Hub
# model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)
# tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)

## Inference

In [None]:
def generate_chat_response(message, new_model, *, max_length=250, temperature=0.8, top_p=0.8):
    """Generate one sampled reply to a chat conversation with a saved model.

    The conversation is rendered to a prompt string with the tokenizer's chat
    template (generation prompt appended) and fed to a ``text-generation``
    pipeline built from ``new_model``.

    Args:
        message: Conversation as a list of ``{"role": ..., "content": ...}``
            dicts.
        new_model: Model identifier or path; used to load both the tokenizer
            and the pipeline's model.
        max_length: Forwarded to the pipeline call as ``max_length``.
        temperature: Sampling temperature forwarded to the pipeline call.
        top_p: Nucleus-sampling threshold forwarded to the pipeline call.

    Returns:
        The ``generated_text`` entry of the first returned sequence.
    """
    tokenizer = AutoTokenizer.from_pretrained(new_model)
    prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)

    # `pipeline` comes from `transformers`; it must be imported at file level.
    chat_pipeline = pipeline(
        "text-generation",
        model=new_model,
        tokenizer=tokenizer,
    )

    sequences = chat_pipeline(
        prompt,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        num_return_sequences=1,
        max_length=max_length,
    )

    return sequences[0]['generated_text']

# Usage: a system instruction followed by a single user turn
message = [
    dict(role="system", content="You are a friendly AI chatbot."),
    dict(role="user", content="Plan a holiday for the summer in Europe?"),
]

generated_text = generate_chat_response(message, new_model)
print(generated_text)
