<a href="https://colab.research.google.com/github/seungeun0911/Classification-Models/blob/main/Fine_tuning_Llama_v2_13B_HuggingFace_PEFT%2C_BitsandBytes%2C_TRL%2C%C2%A0WandB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>A Hands-on Guide to Fine-tuning Llama v2</h1>

In [None]:
# Create project folder and virtual environment
# mkdir finetuning-llama2 && cd finetuning-llama2
# python3 -m venv finetuning-llama2-env
# source finetuning-llama2-env/bin/activate

In [None]:
# pip3 install ipykernel jupyter
!pip3 install numpy scipy pandas matplotlib seaborn
!pip3 install torch
!pip3 install --upgrade git+https://github.com/huggingface/transformers
!pip3 install accelerate
!pip3 install bitsandbytes
!pip3 install datasets
!pip3 install peft
!pip3 install trl
!pip3 install auto-gptq
!pip3 install --upgrade huggingface_hub
!pip3 install text-generation
!pip3 install ctranslate2
!pip3 install hf-hub-ctranslate2
!pip3 install vllm
!pip3 install wandb
!pip3 install ipywidgets

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-gw5eld7w
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-gw5eld7w
  Resolved https://github.com/huggingface/transformers to commit 29e7a1e1834f331a4916853ecd58549ed78235d6
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import os
import re
import torch
import argparse
import bitsandbytes as bnb
from trl import SFTTrainer
from functools import partial
from datasets import load_dataset


from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    AutoPeftModelForCausalLM,
    PeftModel
)

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    set_seed,
    logging
)

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

from text_generation import Client

import ctranslate2

In [None]:
# Weights & Biases settings: the project this run is logged under and the
# notebook the run is associated with, exported as environment variables.
PROJECT_NAME = "llama-v2-fine-tuning"
NOTEBOOK_NAME = "full_path_to_your_notebook.ipynb"

os.environ.update(
    WANDB_PROJECT=PROJECT_NAME,
    WANDB_NOTEBOOK_NAME=NOTEBOOK_NAME,
)

In [None]:
# Authenticate this session with Weights & Biases.
import wandb, os
wandb.login() # log in and paste your API key when prompted.

[34m[1mwandb[0m: Currently logged in as: [33mcelsowm[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Log in to the Hugging Face Hub.
# NOTE(review): presumably required for the push_to_hub() calls at the end of the notebook — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#

In [None]:
# ---------------------------------------------------------------------------
# LoRA adapter
# ---------------------------------------------------------------------------
# LoRA attention dimension
lora_r = 64
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.1

# ---------------------------------------------------------------------------
# bitsandbytes quantization
# ---------------------------------------------------------------------------
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models (name of a torch dtype)
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"
# Number of training epochs
num_train_epochs = 1
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = bf16 = False
# Batch size per GPU, for training and for evaluation
per_device_train_batch_size = per_device_eval_batch_size = 4
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1
# Enable gradient checkpointing
gradient_checkpointing = True
# Group sequences into batches with same length. Saves memory and speeds up training considerably
group_by_length = True
# Save a checkpoint / write a log entry every X update steps
save_steps = logging_steps = 25

# ---------------------------------------------------------------------------
# Optimizer and schedule
# ---------------------------------------------------------------------------
# Optimizer to use
optim = "paged_adamw_32bit"
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# ---------------------------------------------------------------------------
# Supervised fine-tuning (SFT) and device placement
# ---------------------------------------------------------------------------
# Maximum sequence length to use
max_seq_length = None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = False
# Load the entire model on the GPU 0
device_map = {"": 0}


In [None]:
# Pull the mental-health chatbot dataset from the Hub, requesting its "train"
# split directly instead of loading every split and indexing afterwards.
dataset = load_dataset("heliosbrahma/mental_health_chatbot_dataset", split="train")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['text'],
    num_rows: 172
})

In [None]:
def format_conversation(conversation):
    """Rewrite one ``<HUMAN>:`` / ``<ASSISTANT>:`` record as a Llama-2 chat prompt.

    Args:
        conversation: Mapping with a ``'text'`` entry that holds a single
            exchange: the human turn introduced by ``<HUMAN>:`` followed by
            the assistant turn introduced by ``<ASSISTANT>:``.

    Returns:
        A dict ``{'text': '<s>[INST] human [/INST] assistant </s>'}`` with
        both turns stripped of surrounding whitespace, or ``None`` when either
        speaker marker is missing from the text.
    """
    text = conversation['text']
    if '<HUMAN>:' not in text or '<ASSISTANT>:' not in text:
        return None

    # Split at the assistant marker instead of at the first newline: this keeps
    # multi-line human turns intact and no longer raises IndexError when the
    # two turns are not separated by a newline.
    human_part, _, assistant_part = text.partition('<ASSISTANT>:')

    human_text = human_part.replace('<HUMAN>:', '').strip()
    assistant_text = assistant_part.replace('<ASSISTANT>:', '').strip()

    return {'text': f'<s>[INST] {human_text} [/INST] {assistant_text} </s>'}

In [None]:
# Spot-check the formatter on a single record before mapping the whole dataset.
format_conversation(dataset[12])

{'text': "<s>[INST] What do I do if I’m worried about my mental health? [/INST] I'm really glad you reached out and shared your concerns about your mental health. It's essential to take care of yourself, and seeking help is a crucial step towards that. Here are some suggestions for what you can do if you're worried about your mental health:\n\n1. Talk to someone you trust: Reach out to a friend, family member, or someone close to you who can lend a listening ear and provide support.\n\n2. Seek professional help: Consider talking to a mental health professional, such as a psychologist, counselor, or therapist. They can offer expert guidance and support tailored to your specific needs.\n\n3. Educate yourself: Take some time to learn about the mental health issues you might be experiencing. Understanding what you're going through can help you feel more in control and less isolated.\n\n4. Practice self-care: Engage in activities that bring you joy and relaxation. This can include exercise,

In [None]:
# Apply the transformation
# format_conversation() returns None for records that lack a speaker marker.
# Those records are dropped first so that map() only ever receives dict
# results and no unformatted text reaches the trainer.
transformed_dataset = (
    dataset
    .filter(lambda example: format_conversation(example) is not None)
    .map(format_conversation)
)

You can optionally push the dataset to your HuggingFace account

In [None]:
# Quantization settings passed to from_pretrained() when the base model is
# loaded; every value comes from the hyperparameter cell.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    # The compute dtype is configured as a string, so look up the torch dtype of that name.
    bnb_4bit_compute_dtype=getattr(torch, bnb_4bit_compute_dtype),
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
# Check GPU compatibility with bfloat16
# The hint is only relevant when 4-bit loading uses float16 compute, and the
# capability query is only attempted when a CUDA device is actually present
# (torch.cuda.get_device_capability() raises otherwise).
if (
    getattr(torch, bnb_4bit_compute_dtype) == torch.float16
    and use_4bit
    and torch.cuda.is_available()
):
    major, _ = torch.cuda.get_device_capability()
    # A compute-capability major version of 8 or higher is treated as bfloat16-capable.
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#### Load base model and tokenizer

In [None]:
# Hub identifier of the base checkpoint that is fine-tuned.
model_name = "TheBloke/Llama-2-7B-fp16"

# Load the base model with the quantization config built earlier and place it
# according to device_map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
# NOTE(review): the generation cache is presumably disabled because it is not needed during training — confirm.
model.config.use_cache = False
# NOTE(review): pretraining_tp = 1 presumably selects the non-tensor-parallel code path for the linear layers — confirm against the Llama config docs.
model.config.pretraining_tp = 1

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [None]:
# Load LoRA configuration
# Rank, alpha and dropout come from the hyperparameter cell; no bias terms are
# trained and the adapter is set up for causal language modelling.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Set training parameters
# Every value comes from the hyperparameter cell. Metrics are reported to
# Weights & Biases.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Defined in the hyperparameter cell but previously never passed on.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Previously the gradient_checkpointing switch was defined but not wired
    # in, so checkpointing stayed off and activation memory was not reduced.
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb"
)

In [None]:
# Set supervised fine-tuning parameters
# The trainer receives the quantized base model together with the LoRA config
# and reads its training examples from the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model=model,
    train_dataset=transformed_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/172 [00:00<?, ? examples/s]

In [None]:
# Train model
# Runs the fine-tuning loop configured by training_arguments (checkpoints go to output_dir).
trainer.train()

OutOfMemoryError: ignored

In [None]:
# Save trained model
# The output name is derived from model_name so that the label always matches
# the checkpoint that was actually fine-tuned; a hard-coded size label can
# drift from the real base model.
new_model = f"{model_name.split('/')[-1]}-mental-health-chatbot"
trainer.model.save_pretrained(new_model)

In [None]:
# Ignore warnings: only messages at CRITICAL level are still shown
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our new model
# The prompt is wrapped in the same <s>[INST] ... [/INST] template that
# format_conversation() produces for the training data.
prompt = "Is psychotherapy a substitute for medication?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1000)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] Is psychotherapy a substitute for medication? [/INST]
 nobody is saying that psychotherapy is a substitute for medication.
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication?
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychotherapy a substitute for medication? [/INST]
[INST] Is psychothera

In [None]:
# Reload model in FP16 and merge it with LoRA weights
# This cell only reloads the base checkpoint (without quantization_config);
# the adapter is attached and merged in the following cell.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

In [None]:
# Attach the LoRA adapter saved under `new_model` to the FP16 base model ...
model = PeftModel.from_pretrained(base_model, new_model)
# ... then merge the adapter into the base model; `model` is rebound to the merged result.
model = model.merge_and_unload()

In [None]:
# Reload tokenizer to save it
# Loaded with the same options as the first tokenizer load in this notebook.
# trust_remote_code is left at its default: this checkpoint's tokenizer already
# loaded without it, and enabling it would allow code from the Hub repository
# to execute locally.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Upload the merged model to the Hugging Face Hub under the name held in new_model.
model.push_to_hub(new_model, use_temp_dir=False)

In [None]:
# Upload the tokenizer to the same Hub repository name as the model.
tokenizer.push_to_hub(new_model, use_temp_dir=False)

If you want to deploy this model with Text Generation Inference (TGI) or CTranslate2, follow us on Medium as we will publish these guides very soon!