<a href="https://colab.research.google.com/github/ringerH/QLoRA-ft/blob/main/4_bit_llama_2_7b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U bitsandbytes
!pip install transformers==4.31 #temporary fix required owing to breaking changes on Aug 9th 2023
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.31
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.0
    Uninstalling tokenizers-0.15.0:
      Successfully uninstalled tokenizers-0.15.0
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully u

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint identifier shared by the tokenizer and the model loader below.
model_id = "Trelis/Llama-2-7b-chat-hf-sharded-bf16"

# Quantization settings: load weights in 4-bit using the "nf4" quant type with
# double quantization enabled, and use bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map={"": 0} maps the root module (i.e. the whole model) to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [7]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the loaded model before handing it to PEFT.
model.gradient_checkpointing_enable()
# Let PEFT prepare the quantized model for training; the returned object
# replaces `model` for every later cell.
model = prepare_model_for_kbit_training(model)

In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()`` and sums ``numel()`` across all
    parameters (total) and across those whose ``requires_grad`` is true
    (trainable), then prints both counts and the trainable share in percent.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter has ``numel()``
            and ``requires_grad``.

    Returns:
        None. The summary line is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model with no parameters would otherwise raise ZeroDivisionError;
    # report 0.0% in that case instead of crashing.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainables: {trainable_params} || total: {all_param} || trainable%: {trainable_pct}"
    )

In [13]:
from peft import LoraConfig, get_peft_model
# LoRA adapter settings: rank 8, alpha 32, dropout 0.05, no bias parameters,
# targeting the q/k/v/o projection modules under `self_attn`.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[f"self_attn.{proj}_proj" for proj in ("q", "k", "v", "o")],
)

# Wrap the model with the adapters, then report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainables: 8388608 || total: 3508801536 || trainable%: 0.23907331075678143


In [19]:
from datasets import load_dataset

def _tokenize_instructions(samples):
    """Run the tokenizer over the "instruction" column of a batch of examples."""
    return tokenizer(samples["instruction"])


# Fetch the dataset, then add the tokenizer outputs to it batch by batch.
data = load_dataset("nisaar/LLAMA2_Legal_Dataset_4.4k_Instructions")
data = data.map(_tokenize_instructions, batched=True)

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4394 [00:00<?, ? examples/s]

In [47]:
import transformers
# Reuse the EOS token for padding; set before the collator is constructed.
tokenizer.pad_token = tokenizer.eos_token

# Short run: 20 optimizer steps, batch size 1 with 4 accumulation steps,
# logging the loss at every step.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=20,
    warmup_steps=5,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=1,
)

# mlm=False selects the non-masked language-modeling collation.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)

# Switch off `use_cache` on the model config before training starts.
model.config.use_cache = False
trainer.train()



Step,Training Loss
1,2.284
2,1.791
3,1.0777
4,1.1752
5,1.7028
6,1.3572
7,0.559
8,1.2462
9,1.8508
10,1.0177


TrainOutput(global_step=20, training_loss=1.413290736079216, metrics={'train_runtime': 108.0166, 'train_samples_per_second': 0.741, 'train_steps_per_second': 0.185, 'total_flos': 26711085170688.0, 'train_loss': 1.413290736079216, 'epoch': 0.02})

In [None]:
from transformers import TextStreamer
# Turn `use_cache` back on (the training cell sets it to False).
model.config.use_cache = True
# Put the model in evaluation mode; as the last expression of the cell, the
# return value of `eval()` is also what the cell displays.
model.eval()

In [49]:
def stream(user_prompt, system_prompt='You are quantized', max_new_tokens=500, device="cuda:0"):
    """
    Wrap a user prompt in [INST] / <<SYS>> tags and stream the model's reply.

    The prompt is tokenized with the notebook-level ``tokenizer``, moved to
    ``device``, and passed to ``model.generate`` together with a
    ``TextStreamer`` built from the same tokenizer.

    Args:
        user_prompt: The user message; surrounding whitespace is stripped.
        system_prompt: Text placed between the <<SYS>> tags; surrounding
            whitespace is stripped. Defaults to the value that was previously
            hard-coded.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.
        device: Device the tokenized inputs are moved to. Defaults to
            "cuda:0", matching the device the model is loaded on in this
            notebook.

    Returns:
        None. The output of ``model.generate`` is discarded; the generated
        text is emitted through the streamer.
    """
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"

    # Tokenize as a batch of one and place the tensors on the target device.
    inputs = tokenizer([prompt], return_tensors="pt").to(device)

    streamer = TextStreamer(tokenizer)

    # Return value is intentionally ignored; the streamer handles the output.
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

In [50]:
# Try the model on a short prompt using the default system prompt; the reply
# is streamed as the cell output.
stream('Define your architecture in 2 lines')

<s> [INST] <<SYS>>
You are quantized
<</SYS>>

Define your architecture in 2 lines [/INST]

As a quantized AI language model, my architecture is designed to optimize for low-bit precision while maintaining comparable performance to my unquantized counterpart. I achieve this through a combination of careful model pruning, quantization-aware training, and the use of efficient digital circuits for inference.</s>
