# Fine-tune a Mistral-7b model with DPO

In [1]:
%pip install -q datasets trl peft bitsandbytes sentencepiece wandb python-dotenv

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import gc
import torch

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import DPOTrainer, DPOConfig
import bitsandbytes as bnb
import wandb
from dotenv import load_dotenv

load_dotenv()
# Defined in the secrets tab in Google Colab
hf_token = os.environ.get('HF_TOKEN')
wb_token = os.environ.get('WANDB')
wandb.login(key=wb_token)

model_name = "Qwen/Qwen2-1.5B-Instruct"
new_model = "DPO-Qwen2-1.5B-Instruct-Human-like"

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrayene[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Format dataset

In [3]:
def chatml_format(example):
    # Format instruction
    message = {"role": "user", "content": example['prompt']}
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)

    # Format chosen answer
    chosen = example['chosen'] + "<|im_end|>\n" # Qwen model uses the same pattern as a stop id.

    # Format rejected answer
    rejected = example['rejected'] + "<|im_end|>\n"

    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

# Load dataset
dataset = load_dataset("HumanLLMs/Human-Like-DPO-Dataset")['train'] # consider using just a subset of this

# Save columns
original_columns = dataset.column_names

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Format dataset
dataset = dataset.map(
    chatml_format,
    remove_columns=original_columns
)

# Print sample
dataset[1]

{'prompt': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nDo you have a go-to karaoke jam?<|im_end|>\n<|im_start|>assistant\n',
 'chosen': 'Oh, totally! 😄 I\'m a sucker for a good ol\' rock ballad. Give me some Bon Jovi any day of the week! "Livin\' on a Prayer" is my go-to karaoke jam. There\'s just something about belting out "Oh, we\'re halfway there!" at the top of my lungs that gets me pumped up! 🎤 What about you, do you have a favorite karaoke song? 🎶<|im_end|>\n',
 'rejected': "As a professional AI language model, I don't have personal experiences or emotions, nor do I engage in hobbies or leisure activities. My purpose is to provide accurate and informative responses to assist users with their queries, and I do not possess the capacity to experience personal preferences or enjoyment. I am solely focused on delivering high-quality information and maintaining a professional tone in my interactions.<|im_end|>\n"}

## Train model with DPO

In [None]:
# LoRA configuration
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

# Model to fine-tune
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    load_in_4bit=True
)
model.config.use_cache = False

# Training arguments
training_args = DPOConfig(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    max_steps=200,
    save_strategy="no",
    logging_steps=1,
    output_dir=new_model,
    optim="paged_adamw_32bit",
    warmup_steps=50,
    bf16=True,
    report_to="wandb",
    beta=10,
    max_prompt_length=1024,
    max_length=1536,
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
)

# Fine-tune model with DPO
dpo_trainer.train()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
  dpo_trainer = DPOTrainer(




Step,Training Loss
1,0.6931
2,0.6931
3,1.2742
4,2.0616
5,3.4088
6,1.9454
7,1.092
8,0.1857
9,1.0909
10,0.4346


## Upload model

In [6]:
# Save artifacts
dpo_trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")


('final_checkpoint/tokenizer_config.json',
 'final_checkpoint/special_tokens_map.json',
 'final_checkpoint/vocab.json',
 'final_checkpoint/merges.txt',
 'final_checkpoint/added_tokens.json',
 'final_checkpoint/tokenizer.json')

In [7]:

# Flush memory
del dpo_trainer, model
gc.collect()
torch.cuda.empty_cache()

# Reload model in FP16 (instead of NF4)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Merge base model with the adapter
model = PeftModel.from_pretrained(base_model, "final_checkpoint")
model = model.merge_and_unload()

# Save model and tokenizer
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)


('DPO-Qwen2-1.5B-Instruct-Human-like/tokenizer_config.json',
 'DPO-Qwen2-1.5B-Instruct-Human-like/special_tokens_map.json',
 'DPO-Qwen2-1.5B-Instruct-Human-like/vocab.json',
 'DPO-Qwen2-1.5B-Instruct-Human-like/merges.txt',
 'DPO-Qwen2-1.5B-Instruct-Human-like/added_tokens.json',
 'DPO-Qwen2-1.5B-Instruct-Human-like/tokenizer.json')

In [8]:
# Push them to the HF Hub
model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)
tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/brayene/DPO-Qwen2-1.5B-Instruct-Human-like/commit/198c1629cf29fb71ad7632f52c9035a852cc607b', commit_message='Upload tokenizer', commit_description='', oid='198c1629cf29fb71ad7632f52c9035a852cc607b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/brayene/DPO-Qwen2-1.5B-Instruct-Human-like', endpoint='https://huggingface.co', repo_type='model', repo_id='brayene/DPO-Qwen2-1.5B-Instruct-Human-like'), pr_revision=None, pr_num=None)

## Inference

In [11]:
# Format prompt
def generate(prompt):
    message = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    tokenizer = AutoTokenizer.from_pretrained(new_model)
    prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)
    
    # Create pipeline
    pipeline = transformers.pipeline(
        "text-generation",
        model=new_model,
        tokenizer=tokenizer
    )
    
    # Generate text
    sequences = pipeline(
        prompt,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        max_length=200,
        return_full_text = False,
        
    )
    return sequences[0]['generated_text']

test_samples = [
    "What is your favourite food?",
    "Do you like ice skating?",
    "Have you ever felt disappointed?"
]
for t in test_samples:
    o = generate(t)
    print("User: ", t)
    print("AI: ", o)

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


User:  What is your favourite food?
AI:  As an artificial intelligence language model, I don't have the ability to eat or taste food, so I don't have a favorite dish. However, I'm programmed to provide information and assist with inquiries related to various topics, including food recommendations! Let me know if you have any questions about different cuisines or recipes.


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


User:  Do you like ice skating?
AI:  As an artificial intelligence, I don't have the ability to experience emotions or preferences in the same way that humans do. However, I can provide information about ice skating and answer any questions you may have about it!


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


User:  Have you ever felt disappointed?
AI:  As an artificial intelligence language model, I don't have emotions or feelings like humans do. However, I can understand and empathize with the experiences of those who feel disappointed. It's important to recognize that disappointment is a natural emotion that everyone experiences at some point in life. Sometimes it's caused by unexpected events, while other times it may be due to personal disappointments such as unfulfilled expectations or unmet goals. It's important to remember that disappointment is a normal part of life and that it's okay to feel it from time to time. There are many ways to cope with disappointment, including talking things over with friends, seeking support from a therapist or counselor, or engaging in self-care activities such as exercise or spending time in nature.


In [None]:
generate("Do you have a go-to karaoke jam?")

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
