# **Fine-tuning with QLoRA and Flash Attention 2**
## **on the Yelp Reviews dataset**
Author: Skylar Jung (sj3394)

This notebook is intended to be run on an Ampere or Ada GPU, with CUDA 12.2 and Python 3.10.

### 1. Dependency installations and imports

In [None]:
# Install the quantization / fine-tuning stack used by this notebook.
# bitsandbytes, transformers, peft and datasets are pinned to exact versions;
# accelerate, trl and the torch packages are left unpinned here.
!pip install -U bitsandbytes==0.45.0
!pip install transformers==4.47.0
!pip install -U peft==0.10.0
!pip install -U accelerate
!pip install -U trl
!pip install datasets==2.21.0
!pip install torch torchvision torchaudio

In [None]:
# Print the CUDA compiler version; the notebook header states CUDA 12.2 is expected.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
# Install accelerate from the GitHub repository (quiet, upgrade). This runs after the
# `pip install -U accelerate` in the first install cell.
!pip install -q -U git+https://github.com/huggingface/accelerate.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Install flash-attn with pip's build isolation disabled; the recorded output below shows
# the wheel being built locally for cp310 / linux_x86_64.
!pip install flash-attn --no-build-isolation

Collecting flash-attn
  Downloading flash_attn-2.7.2.post1.tar.gz (3.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m169.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.7.2.post1-cp310-cp310-linux_x86_64.whl size=190160474 sha256=0b454d9e650bfc437cc71335080172a5d05f51eab355636c9d5b7321fec7318e
  Stored in directory: /root/.cache/pip/wheels/da/ec/5b/b2c37a8e4f755ad82492a822463bca0817f0e0e11de874b550
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-at

In [None]:
import pandas as pd
import re
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login

from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

### 2. Pre-trained Llama-2 7B setup

In [None]:
# Checkpoint identifier passed to AutoModelForCausalLM.from_pretrained in get_model.
MODEL_NAME = "TinyPixel/Llama-2-7B-bf16-sharded"

def get_bnb_config():
  """Build the bitsandbytes settings used when loading the model quantized.

  Returns:
    A `BitsAndBytesConfig` requesting 4-bit loading with double quantization,
    the "nf4" quantization type, and bfloat16 as the compute dtype.
  """
  quant_settings = dict(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16,
  )
  return BitsAndBytesConfig(**quant_settings)

def get_model(use_flash_attn=False, bnb_config=None):
  """Load the causal language model named by MODEL_NAME.

  Args:
    use_flash_attn: When true, request the "flash_attention_2" attention
      implementation; otherwise the argument is not passed at all.
    bnb_config: Optional quantization config forwarded as
      `quantization_config` (None is forwarded as-is).

  Returns:
    The model returned by `AutoModelForCausalLM.from_pretrained`, loaded with
    `trust_remote_code=True` and `device_map="auto"`.
  """
  # Arguments shared by both attention variants.
  load_kwargs = dict(
      trust_remote_code=True,
      quantization_config=bnb_config,
      device_map="auto",
  )
  # The attention implementation is only specified when Flash Attention is requested.
  if use_flash_attn:
    load_kwargs["attn_implementation"] = "flash_attention_2"
  return AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

def prepare_pretrained_model(model, MODEL_NAME="TinyPixel/Llama-2-7B-bf16-sharded"):
  """Create the tokenizer for `MODEL_NAME` and run k-bit training preparation on `model`.

  Args:
    model: The loaded model; it is passed to `prepare_model_for_kbit_training`.
    MODEL_NAME: Checkpoint identifier the tokenizer is loaded from. This
      parameter shadows the module-level constant of the same name.

  Returns:
    The tokenizer only, with its pad token set to its EOS token.
  """
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  # Reuse the EOS token for padding.
  tokenizer.pad_token = tokenizer.eos_token
  # Called for its effect on `model`: only the tokenizer is returned, so the return value
  # is deliberately not rebound to a local that would be discarded anyway.
  # NOTE(review): this relies on the helper modifying `model` in place — confirm against PEFT docs.
  prepare_model_for_kbit_training(model)
  return tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

model-00001-of-00014.safetensors:   0%|          | 0.00/981M [00:00<?, ?B/s]

model-00002-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00003-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00004-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00005-of-00014.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00006-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00007-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00008-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00009-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00010-of-00014.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00011-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00012-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00013-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00014-of-00014.safetensors:   0%|          | 0.00/847M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


### 3. Data-loading and fine-tuning setup

In [None]:
def prepare_model_for_peft(model):
  """Attach LoRA adapters to the Linear modules of the model's last indexed layer.

  Args:
    model: The loaded base model.

  Returns:
    A `(model, config)` tuple: the model wrapped by `get_peft_model`, and the
    `LoraConfig` that was used to wrap it.

  Raises:
    ValueError: If no parameter name contains a numeric path component, so no
      "last layer" can be determined.
  """
  def layer_indices(name):
    """Return the integer path components of a dotted module/parameter name."""
    # Only whole dotted components count (e.g. the "31" in "layers.31.q_proj"); digits
    # embedded in identifiers such as "fc2" are ignored so they cannot be mistaken
    # for a layer index.
    return [int(part) for part in name.split(".") if part.isdecimal()]

  def get_last_layer_linears(model):
    """Return names of torch.nn.Linear modules under the highest-indexed layer."""
    indices = {
        index
        for name, _ in model.named_parameters()
        for index in layer_indices(name)
    }
    if not indices:
      raise ValueError("No numerically indexed layers found in the model's parameter names.")
    last_index = max(indices)

    return [
        name
        for name, module in model.named_modules()
        if last_index in layer_indices(name)
        and "encoder" not in name
        and isinstance(module, torch.nn.Linear)
    ]

  config = LoraConfig(
      r=2,
      lora_alpha=32,
      target_modules=get_last_layer_linears(model),
      lora_dropout=0.05,
      bias="none",
      task_type="CAUSAL_LM"
  )

  # Record whether the base model carries a device map before it is wrapped.
  # NOTE(review): `hf_device_map` is presumably set when loading with `device_map=...` — confirm.
  already_dispatched = getattr(model, "hf_device_map", None) is not None

  model = get_peft_model(model, config)
  # A model that was placed via a device map must not be moved again with `.to()`;
  # only models without one are moved to the default device here.
  if not already_dispatched:
    model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
  return model, config

In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

dataset_name = "yelp_review_full" #Business reviews and ratings
# Load only the first 10% of the train split. The previous trailing `.cache()` call was
# removed: the object returned by `load_dataset` has no such method, so the cell raised
# AttributeError before `data` was ever bound.
data = load_dataset(dataset_name, split="train[:10%]")

def get_data_loader(dataset, batch_size, num_workers=4):
    """Build a DataLoader over `dataset`, sharded across ranks when running distributed.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes.

    Returns:
        A `DataLoader` with `pin_memory=True`. When a torch.distributed process
        group is initialised, a `DistributedSampler` is used; otherwise the
        loader shuffles on its own.
    """
    import torch.distributed as dist

    if dist.is_available() and dist.is_initialized():
        sampler = DistributedSampler(dataset)
        # `shuffle` and `sampler` are mutually exclusive in DataLoader.
        shuffle = False
    else:
        # DistributedSampler needs an initialised process group; in a plain
        # single-process run fall back to ordinary shuffling instead of raising.
        sampler = None
        shuffle = True

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=True,
    )

# Training batch size read by run_experiment.
train_batch_size = 8

### 4. Main fine-tuning process

In [1]:
from trl import SFTTrainer
# Maximum sequence length handed to SFTTrainer in run_experiment.
max_seq_length = 512

def run_experiment(use_quantization=False, use_flash_attn=False):
  """Load the model in the requested configuration and fine-tune it with SFTTrainer.

  Args:
    use_quantization: When true, load the model with the config from
      `get_bnb_config()`; otherwise no quantization config is passed.
    use_flash_attn: Forwarded to `get_model` to select Flash Attention 2.

  Returns:
    The `SFTTrainer` after `train()` has completed, so callers can save or
    inspect the trained model. (Callers that ignore the return value are
    unaffected.)
  """
  # Only build the quantization config when it is actually used.
  bnb_config = get_bnb_config() if use_quantization else None
  model = get_model(use_flash_attn, bnb_config)
  tokenizer = prepare_pretrained_model(model)
  model, peft_config = prepare_model_for_peft(model)

  # NOTE(review): fp16 mixed precision is requested here while get_bnb_config uses
  # bfloat16 as compute dtype — confirm this combination is intended.
  training_arguments = transformers.TrainingArguments(
      per_device_train_batch_size=train_batch_size,
      gradient_accumulation_steps=4,
      num_train_epochs=1,
      learning_rate=1e-4,
      fp16=True,
      output_dir="finetune_yelp",
      optim="paged_adamw_8bit",
      lr_scheduler_type="cosine",
      warmup_ratio=0.01,
      report_to="none",
      ddp_find_unused_parameters=False if torch.cuda.device_count() > 1 else None,
      local_rank=-1
  )

  # The trainer is given the dataset itself, not a DataLoader: it reads the "text"
  # column named below and does its own batching from `training_arguments`.
  trainer = SFTTrainer(
      model=model,
      train_dataset=data,
      peft_config=peft_config,
      dataset_text_field="text",
      max_seq_length=max_seq_length,
      tokenizer=tokenizer,
      args=training_arguments,
  )

  # Disable the generation KV cache on the model config before training.
  model.config.use_cache = False
  trainer.train()
  return trainer

In [None]:
# Run the three configurations in order: no quantization, quantization only,
# then quantization together with Flash Attention.
experiment_grid = [
    (False, False),
    (True, False),
    (True, True),
]
for quantize, flash in experiment_grid:
  run_experiment(use_quantization=quantize, use_flash_attn=flash)

### 5. Miscellaneous code: model saves and sample runs

In [None]:
# NOTE(review): in the visible notebook `trainer` is only bound as a local inside
# run_experiment, so on a fresh kernel this cell raises NameError.
# TODO: expose the trainer at notebook scope before running this cell.
# Unwrap `.module` when present (presumably a distributed/data-parallel wrapper — confirm).
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
# Save into the "outputs" directory, which the next cell reads back.
model_to_save.save_pretrained("outputs")

In [None]:
# Reload the adapter saved under "outputs" on top of a freshly loaded base model.
lora_config = LoraConfig.from_pretrained('outputs')
# No `model` exists at notebook scope (run_experiment keeps its model local), so the
# base model is loaded explicitly here, 4-bit quantized via get_bnb_config().
base_model = get_model(use_flash_attn=False, bnb_config=get_bnb_config())
# PeftModel.from_pretrained loads the saved adapter weights; get_peft_model would only
# create new, untrained adapters from the config and ignore what was saved.
model = PeftModel.from_pretrained(base_model, 'outputs', config=lora_config)

In [None]:
# Sample prompt used by the generation cell below.
prompt="I'd like to choose a restaurant for dinner with great reviews."

In [None]:
text = prompt
# The tokenizer built during training is local to run_experiment, so create one here
# with the same pad-token setup used by prepare_pretrained_model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
# Follow the model's own placement instead of assuming a fixed "cuda:0".
device = next(model.parameters()).device

inputs = tokenizer(text, return_tensors="pt").to(device)
# Generate at most 20 new tokens and print the decoded text without special tokens.
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))