# Fine Tuning Generation Models

## Instruction Tuning with QLoRA

### Templating Instruction Data

Model - TinyLlama

Dataset - UltraChat Dataset

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load a tokenizer so that we can reuse its chat template.
# The Hub repo id must be spelled exactly ("1.1B-Chat", with the hyphen),
# otherwise from_pretrained cannot resolve the checkpoint.
template_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

def format_prompt(example):
    """Render one conversation as a single prompt string.

    The conversation is read from ``example["messages"]`` and passed to
    ``template_tokenizer.apply_chat_template`` with ``tokenize = False``,
    so the templated conversation comes back as text instead of token ids.

    Returns:
        A dict with one key, ``"text"``, holding the templated conversation.
    """
    rendered = template_tokenizer.apply_chat_template(
        example["messages"],
        tokenize = False
    )
    return {"text": rendered}


# Build the instruction-tuning set step by step:
# load the "test_sft" split of UltraChat, shuffle it with a fixed seed,
# keep 3,000 rows, and template every conversation with format_prompt.
dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split = "test_sft")
dataset = dataset.shuffle(seed = 42)        # Fixed seed -> same sample on every run
dataset = dataset.select(range(3_000))      # Keep the first 3,000 shuffled rows
dataset = dataset.map(format_prompt)        # Adds the "text" field returned by format_prompt


In [None]:
# Show one templated conversation (row 2576) as a sanity check
print(dataset[2576]["text"])

### Model Quantization

This is where we apply the Q in QLoRA, namely quantization. We use the bitsandbytes package to compress the pretrained model to a 4-bit representation.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Base (non-chat) TinyLlama checkpoint that will be instruction-tuned
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# 4-bit quantization configuration - Q in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,                     # Use 4-bit precision model loading
    bnb_4bit_quant_type = "nf4",             # Quantization type
    bnb_4bit_compute_dtype = "float16",      # Compute dtype
    bnb_4bit_use_double_quant = True         # Apply nested quantization
)

# Load the model to train on the GPU.
# The quantization config has to be passed explicitly; if it is omitted the
# checkpoint is loaded without 4-bit quantization.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    quantization_config = bnb_config
)

# Turn the generation cache off for training. This must be the plain bool
# False; a trailing comma would store a (truthy) one-element tuple instead.
model.config.use_cache = False
model.config.pretraining_tp = 1


# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code = True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "left"

### LoRA Configuration

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA hyperparameters for the adapters
peft_config = LoraConfig(
    r = 64,                 # Rank of the update matrices
    lora_alpha = 32,        # LoRA scaling
    lora_dropout = 0.1,     # Dropout for LoRA layers
    bias = "none",
    task_type = "CAUSAL_LM",
    # Layers to target
    target_modules = [
        "k_proj",
        "gate_proj",
        "v_proj",
        "up_proj",
        "q_proj",
        "o_proj",
        "down_proj",
    ]
)

# Get the quantized model ready for training, then attach the LoRA adapters
model = get_peft_model(
    prepare_model_for_kbit_training(model),
    peft_config
)

### Training Configuration


In [None]:
from transformers import TrainingArguments


# Directory where the trainer writes its outputs
output_dir = "./results"

# Training Arguments
training_arguments = TrainingArguments(
    output_dir = output_dir,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,     # 2 x 4 = 8 examples per optimizer step and device
    optim = "paged_adamw_32bit",         # Paged AdamW optimizer
    learning_rate = 2e-4,
    lr_scheduler_type = "cosine",
    num_train_epochs = 1,
    logging_steps = 10,
    fp16 = True,
    gradient_checkpointing = True
)

### Training


In [None]:
from trl import SFTTrainer

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",     # Field that holds the templated conversations
    tokenizer = tokenizer,
    args = training_arguments,
    max_seq_length = 512,

    # Leave this out for regular SFT
    peft_config = peft_config
)

# Train model
trainer.train()

# Save QLoRA weights
trainer.model.save_pretrained("TinyLlama-1.1B-qlora")

### Merge Weights

After we have trained our QLoRA weights, we still need to combine them with the original weights to use them. We reload the model in 16 bits, instead of the quantized 4 bits, to merge the weights.

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the adapter saved under "TinyLlama-1.1B-qlora"
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    device_map = "auto",
    low_cpu_mem_usage = True
)

# Fold the LoRA weights into the base model
merged_model = model.merge_and_unload()

After merging the adapter with the base model, we can use it with the prompt template that we defined earlier.

In [None]:
from transformers import pipeline

# Prompt written in the same <|user|> / <|assistant|> layout used for training;
# it ends right after the assistant header so the model writes the answer
prompt = (
    "<|user|>\n"
    "Tell me something about Large Language Models. </s>\n"
    "<|assistant|>\n"
)

# Generate with the merged, instruction-tuned model and print the text
pipe = pipeline(task = "text-generation", model = merged_model, tokenizer = tokenizer)
outputs = pipe(prompt)
print(outputs[0]["generated_text"])

## Preference Tuning with DPO

### Templating Alignment Data

Dataset - `argilla/distilabel-intel-orca-dpo-pairs`

Model - `TinyLlama/TinyLlama-1.1B-Chat-v1.0`

In [None]:
from datasets import load_dataset

def format_prompt(example):
    """Convert one preference record into the <|system|>/<|user|>/<|assistant|> layout.

    Reads the string fields ``"system"``, ``"input"``, ``"chosen"`` and
    ``"rejected"`` from ``example``.

    Returns:
        A dict with three keys:
        ``"prompt"``   - system turn + user turn + opening assistant header,
        ``"chosen"``   - the preferred answer followed by ``"</s>\\n"``,
        ``"rejected"`` - the rejected answer followed by ``"</s>\\n"``.
    """
    eos = "</s>\n"  # Marker that closes every turn

    # System and user turns, then the header that opens the assistant turn
    prompt_parts = (
        "<|system|>\n", example["system"], eos,
        "<|user|>\n", example["input"], eos,
        "<|assistant|>\n",
    )
    formatted = {"prompt": "".join(prompt_parts)}

    # Both candidate answers are terminated the same way
    for key in ("chosen", "rejected"):
        formatted[key] = example[key] + eos

    return formatted



def _keep_pair(row):
    """Return True for rows that are not ties, have chosen_score >= 8, and are not flagged in_gsm8k_train."""
    if row["status"] == "tie":
        return False
    if not (row["chosen_score"] >= 8):
        return False
    return not row["in_gsm8k_train"]


# Load the preference pairs
dpo_dataset = load_dataset(
    "argilla/distilabel-intel-orca-dpo-pairs", split = "train"
)

# Keep only the clear, highly scored preference pairs
dpo_dataset = dpo_dataset.filter(_keep_pair)

# Template every record; the original columns are dropped so that only the
# fields returned by format_prompt remain
dpo_dataset = dpo_dataset.map(
    format_prompt,
    remove_columns = dpo_dataset.column_names
)

dpo_dataset

### Model Quantization


In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer

# 4-bit quantization configuration - Q in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,                     # Use 4-bit precision model loading
    bnb_4bit_quant_type = "nf4",             # Quantization type
    bnb_4bit_compute_dtype = "float16",      # Compute type
    bnb_4bit_use_double_quant = True,        # Apply nested quantization
)

# Load the instruction-tuned adapter (saved as "TinyLlama-1.1B-qlora")
# with 4-bit quantization
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    low_cpu_mem_usage = True,
    device_map = "auto",
    quantization_config = bnb_config
)

# Merge LoRA and base model (the PEFT method is merge_and_unload)
merged_model = model.merge_and_unload()


# Load Llama tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code = True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "left"

Next we use the same LoRA configuration as before to perform DPO training.

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Prepare LoRA configuration (same values as in the instruction-tuning step)
peft_config = LoraConfig(
    lora_alpha = 32,          # LoRA scaling
    lora_dropout = 0.1,       # Dropout for LoRA layers
    r = 64,                   # Rank - matches the SFT configuration
    bias = "none",
    task_type = "CAUSAL_LM",
    # Layers to target
    target_modules = ["k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj"]
)

# Prepare the model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

### Training Configuration


In [None]:
from trl import DPOConfig

# Directory where the trainer writes its outputs
output_dir = "./results"

# Training Arguments
training_arguments = DPOConfig(
    output_dir = output_dir,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,     # 2 x 4 = 8 examples per optimizer step and device
    optim = "paged_adamw_32bit",         # Paged AdamW optimizer
    learning_rate = 1e-5,
    lr_scheduler_type = "cosine",
    max_steps = 200,                     # Train for a fixed number of optimizer steps
    logging_steps = 10,
    fp16 = True,
    gradient_checkpointing = True,
    warmup_ratio = 0.1
)

### Training

Now that we have prepared all our models and parameters, we can start fine-tuning our model.


In [None]:
from trl import DPOTrainer

# Set up preference tuning on the templated prompt / chosen / rejected data
# NOTE(review): newer TRL releases may expect beta and the length limits on
# DPOConfig instead of the trainer - confirm against the installed version
dpo_trainer = DPOTrainer(
    model,
    args = training_arguments,
    beta = 0.1,
    train_dataset = dpo_dataset,
    tokenizer = tokenizer,
    peft_config = peft_config,
    max_length = 512,
    max_prompt_length = 512
)

# Run DPO training
dpo_trainer.train()


# Persist the DPO adapter weights
dpo_trainer.model.save_pretrained("TinyLlama-1.1B-dpo-qlora")


In [None]:
from peft import PeftModel

# Step 1: reload the instruction-tuning adapter and fold it into the base
# model, which gives the SFT model
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    device_map = "auto",
    low_cpu_mem_usage = True
)
sft_model = model.merge_and_unload()

# Step 2: put the DPO adapter on top of the SFT model and merge it as well
dpo_model = PeftModel.from_pretrained(
    sft_model,
    "TinyLlama-1.1B-dpo-qlora",
    device_map = "auto"
).merge_and_unload()