# Finetuning Mistral-7b with DPO

In [1]:
import os
import gc
import torch

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import DPOTrainer
import bitsandbytes as bnb
import wandb
from dotenv import load_dotenv

load_dotenv()

# hf_token = os.getenv('HUGGINGFACE_TOKEN')
wb_token = os.getenv('WB_TOKEN')

wandb.login(key=wb_token)

  from .autonotebook import tqdm as notebook_tqdm


bin c:\Users\matth\anaconda3\envs\finetune\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrealblankname1[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\matth\.netrc


True

## Choosing the Model

In [2]:
model_name = "microsoft/phi-2"
new_model = "ft-phi-2"

## Initializin the Tokenizer

In [3]:
# Initializing the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Formatting Dataset
Here we are formatting the dataset to look more like chat interface.

In [4]:
def chat_format(input):
    sys_msg = {'role': 'system', 
               'content': input['system']}
    if len(input) > 0:
        system = tokenizer.apply_chat_template([sys_msg], tokenize=False)
    else:
        system = ""
    
    user_msg = {'role': 'user',
                'content': input['question']}
    prompt = tokenizer.apply_chat_template([user_msg], tokenize=False, add_generation_prompt=True)
    
    return {
        'prompt': system + prompt,
        'chosen': input['chosen'] + '<|im_end|>\n',
        'rejected': input['rejected'] + '<|im_end|>\n',
    }
    

In [5]:
# Load Dataset from Intel
train_dataset = load_dataset("Intel/orca_dpo_pairs")['train']
og_columns = train_dataset.column_names # saving for later use

# Format the dataset
train_dataset = train_dataset.map(chat_format, remove_columns=og_columns)

train_dataset[1]

Map:   0%|          | 0/12859 [00:00<?, ? examples/s]
No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

Map: 100%|██████████| 12859/12859 [00:01<00:00, 6612.48 examples/s]


{'chosen': 'Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One.<|im_end|>\n',
 'rejected': ' Sure! Here\'s a sentence that describes all the data you provided:\n\n"Midsummer House is a moderately priced Chinese restaurant with a customer rating of 3 out of 5, located near All Bar One, offering a variety of delicious dishes."<|im_end|>\n',
 'prompt': '<|im_start|>system\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|im_end|>\n<|im_start|>user\nGenerate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One<|im_end|>\n<|im_start|>assistant\n'}

In [6]:
print(f'total       : {torch.cuda.get_device_properties(0).total_memory}')
print(f'reserved    : {torch.cuda.memory_reserved(0)}')
print(f'allocated   : {torch.cuda.memory_allocated(0)}')
print(f'free        : {torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)}')

total       : 12884377600
reserved    : 0
allocated   : 0
free        : 0


## Training the Model
We start by setting up the configs here

In [7]:
# LoRA Configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias='none',
    task_type='Causal_LM',
    # target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj'],
)

# The Model being trained
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", 
    torch_dtype=torch.float16,
    load_in_4bit=True
    trust_remote_code=True
)
model.config.use_cache=False

# The model used as a reference
reference = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", 
    torch_dtype=torch.float16,
    load_in_4bit=True
    trust_remote_code=True
)

# Setting up training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=200,
    save_strategy="no",
    logging_steps=1,
    output_dir=new_model,
    optim="paged_adamw_32bit",
    warmup_steps=100,
    bf16=True,
    report_to="wandb",
)

trainer = DPOTrainer(
    model,
    reference,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1,
    max_prompt_length=1024,
    max_length=1536,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.38s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:44<00:00, 22.31s/it]
Map:  11%|█         | 1441/12859 [00:07<00:52, 218.16 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2287 > 2048). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 12859/12859 [01:01<00:00, 210.21 examples/s]


In [8]:
print(f'total       : {torch.cuda.get_device_properties(0).total_memory}')
print(f'reserved    : {torch.cuda.memory_reserved(0)}')
print(f'allocated   : {torch.cuda.memory_allocated(0)}')
print(f'free        : {torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)}')

total       : 12884377600
reserved    : 11565793280
allocated   : 11161759744
free        : 404033536


In [9]:
trainer.train()

# Saving the model and tokenizer
trainer.model.save_pretrained("final_checkpoint")
tokenizer.save_pretrained("final_checkpoint")



OutOfMemoryError: CUDA out of memory. Tried to allocate 54.00 MiB. GPU 0 has a total capacty of 12.00 GiB of which 0 bytes is free. Of the allocated memory 10.83 GiB is allocated by PyTorch, and 443.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Flush memory to alleviate my crappy GPU
del trainer, model, reference
gc.collect()
torch.cuda.empty_cache()

In [None]:
# loading another model without NF4
base = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
model = PeftModel.from_pretrained(base, "final_checkpoint")
model = model.merge_and_unload()
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)