<a href="https://colab.research.google.com/github/rajkorde/aispikes/blob/main/training/quant_lora/QuantizeFintune_Llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate peft bitsandbytes datasets evaluate trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-non

In [None]:
# List the exact version of every installed package (a record of the environment used).
!pip freeze

## Load and quantize model with bitsandbytes

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [None]:
# Hugging Face Hub id of the checkpoint that is tokenized, quantized and LoRA-tuned below.
# NOTE(review): the notebook's file name mentions Llama, but this id is Adept's
# Persimmon-8B base model — confirm which model is intended.
model_name = "adept/persimmon-8b-base"

In [None]:
# Load the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:


# 4-bit NF4 quantization settings shared by both model copies: double quantization
# is enabled and float16 is used as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)


def _load_quantized_checkpoint():
    """Load `model_name` with the shared 4-bit config and automatic device placement."""
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )


# Two independently loaded copies of the same quantized checkpoint:
# `model_for_lora` is the one that gets wrapped with LoRA adapters further down.
base_model = _load_quantized_checkpoint()
model_for_lora = _load_quantized_checkpoint()

In [None]:
# Preview the first few parameters as (name, dtype, shape) rather than dumping every
# weight tensor of the model into the cell output.
[
    (name, param.dtype, tuple(param.shape))
    for name, param in list(model_for_lora.named_parameters())[:10]
]

In [None]:
# GPU memory currently held by tensors vs. reserved by PyTorch's caching allocator,
# both reported in decimal gigabytes.
allocated_gb = torch.cuda.memory_allocated() / 1e9
reserved_gb = torch.cuda.memory_reserved() / 1e9
print(f"Allocated VRAM: {allocated_gb:.2f} GB")
print(f"Reserved VRAM : {reserved_gb:.2f} GB")

In [None]:
def _logical_numel(param):
    """Return how many model weights a parameter tensor represents.

    bitsandbytes keeps 4-bit weights packed inside a wider storage dtype, so
    ``numel()`` of a ``Params4bit`` tensor counts storage elements, not weights.
    Every storage byte holds two 4-bit values, hence the correction below
    (the same one PEFT applies when it counts parameters).
    """
    count = param.numel()
    if param.__class__.__name__ == "Params4bit":
        count *= 2 * param.element_size()
    return count


# Number of weights the model represents, independent of how they are packed.
total_params = sum(_logical_numel(p) for p in model_for_lora.parameters())
# Bytes occupied by the parameter tensors as stored (packed for the 4-bit layers).
total_size_bytes = sum(p.numel() * p.element_size() for p in model_for_lora.parameters())
print(f"Total parameters: {total_params:,}")
print(f"Model size: {total_size_bytes / (1024 ** 3):.2f} GB")

In [None]:
# Show the concrete class of the object that from_pretrained returned.
type(model_for_lora)

In [None]:
from bitsandbytes.nn import Linear4bit

# Walk the module tree and stop at the first bitsandbytes 4-bit linear layer found.
any_4bit = False
for submodule in model_for_lora.modules():
    if isinstance(submodule, Linear4bit):
        any_4bit = True
        break
print(f"Uses 4-bit layers: {any_4bit}")

## Finetune with LoRA

In [None]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset

In [None]:
# Names of the sub-modules that receive LoRA adapters.
lora_target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]

# LoRA hyper-parameters for causal-LM fine-tuning; bias terms are left untouched.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_target_modules,
)

In [None]:
# Wrap the quantized model with the LoRA adapters described by `lora_config`.
# NOTE(review): PEFT's quantization guide suggests calling
# prepare_model_for_kbit_training() on a k-bit model before this step — confirm
# whether it was skipped deliberately (e.g. to save memory).
model = get_peft_model(model_for_lora, lora_config)

In [None]:
# Print the qualified name of every sub-module whose type name mentions "Linear".
for module_name, submodule in model.named_modules():
    if "Linear" not in str(type(submodule)):
        continue
    print(module_name)


In [None]:
# Fine-tuning data: the "train" split of the tatsu-lab/alpaca dataset.
dataset = load_dataset("tatsu-lab/alpaca")["train"]

In [None]:
# Display the dataset's summary representation.
dataset

In [None]:
# Look at a single example record (index 2).
dataset[2]

In [None]:
# Padding to a fixed length needs a pad token. Keep the tokenizer's own if it has one;
# otherwise reuse the unknown token, then the end-of-sequence token, and fail right here
# (instead of later inside tokenization) if none of them exists.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token
assert tokenizer.pad_token is not None, "tokenizer has no pad, unk or eos token to pad with"

In [None]:
def format_batch(batch, max_length=512):
    """Turn a batch of instruction records into fixed-length causal-LM features.

    Each record is rendered as ``instruction\\ninput\\noutput`` (the ``input`` line is
    omitted when it is blank), then tokenized with the notebook-level ``tokenizer``,
    truncated and padded to ``max_length``.

    Args:
        batch: Mapping with parallel lists under "instruction", "input" and "output".
        max_length: Length every sequence is truncated / padded to.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of "input_ids" in
        which every padding position is replaced by -100.
    """
    texts = []
    for inst, inp, out in zip(batch["instruction"], batch["input"], batch["output"]):
        parts = [inst.strip()]
        if inp.strip():
            parts.append(inp.strip())
        parts.append(out.strip())
        texts.append("\n".join(parts))

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Mask by attention_mask rather than by comparing ids with pad_token_id: the pad
    # token may share its id with a real token (it can fall back to the unknown
    # token), and genuine occurrences of that id must keep their label.
    # NOTE(review): a collator that rebuilds labels from input_ids (such as
    # DataCollatorForLanguageModeling) would overwrite this column — confirm.
    tokenized["labels"] = [
        [tid if keep else -100 for tid, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized


In [None]:
# Record the tokenizer's pad token id on the model config so both agree on padding.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
import os
# Turn off Weights & Biases reporting via its environment switch.
os.environ["WANDB_DISABLED"] = "true"
# Sets CUDA_LAUNCH_BLOCKING=1 for this process (commonly used to get synchronous CUDA
# errors while debugging).
# NOTE(review): presumably left over from debugging; it slows GPU work, and because it is
# set after the models were already loaded it may not take effect — confirm and remove.
%env CUDA_LAUNCH_BLOCKING=1

In [None]:
# Run format_batch over the dataset in batches; its returned columns are added to each row.
tokenized = dataset.map(format_batch, batched=True)

In [None]:
# Display the summary of the tokenized dataset.
tokenized

In [None]:
# Sanity check: show the pad token id next to the first ten input ids of example 0.
print(tokenizer.pad_token_id)
print(tokenized[0]["input_ids"][:10])

In [None]:
# Trainer settings: one epoch, four examples per device per step, mixed-precision (fp16)
# training, a log entry every 10 steps, outputs written under ./lora-output.
training_args = TrainingArguments(
    output_dir="./lora-output",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    fp16=True,
    logging_steps=10,
)

In [None]:
# vocab_size = tokenizer.vocab_size

# def check_for_bad_labels(dataset):
#     for i, row in enumerate(dataset):
#         for token_id in row["labels"]:
#             if token_id >= vocab_size or token_id < -100:
#                 print(f"Bad token at row {i}: {token_id}")
#                 return
#     print("All labels look good.")

# check_for_bad_labels(tokenized)

In [None]:
# Collator configured for causal language modelling (mlm=False, i.e. no token masking).
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Bundle the LoRA-wrapped model, the training settings and the tokenized data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=causal_lm_collator,
    tokenizer=tokenizer,
)

In [None]:
# Start the fine-tuning run with the Trainer built in the previous cell.
trainer.train()

## Evaluations

In [None]:
import evaluate
from datasets import load_dataset

In [None]:
# Evaluation data: the first 100 examples of the GSM8K "main" test split.
# NOTE(review): this rebinds `dataset`, which previously held the Alpaca training data;
# re-running earlier cells after this one would pick up the wrong dataset.
dataset = load_dataset("gsm8k", "main")["test"].select(range(100))

In [None]:
# Display the summary of the evaluation subset.
dataset

In [None]:
# Load the "accuracy" metric from the evaluate library.
# NOTE(review): `metric` is not used by run_eval in the visible cells — confirm it is needed.
metric = evaluate.load("accuracy")

In [None]:
import re

# A number as it appears in free text: optional sign, digits with thousands separators,
# optional decimal part.
_NUMBER_RE = re.compile(r"-?\d[\d,]*(?:\.\d+)?")


def _to_number(text):
    """Parse a numeric string such as "1,200" or "-3.5" into a float; None if it is not one."""
    try:
        return float(text.replace(",", ""))
    except ValueError:
        return None


def _reference_answer(solution):
    """Return the final numeric answer of a GSM8K solution (the value after '####')."""
    match = _NUMBER_RE.search(solution.split("####")[-1])
    return _to_number(match.group()) if match else None


def _predicted_answer(completion):
    """Return the last number the model stated before it started a new 'Q:' block."""
    numbers = _NUMBER_RE.findall(completion.split("Q:")[0])
    return _to_number(numbers[-1]) if numbers else None


def run_eval(model, tokenizer, dataset, max_new_tokens=64):
    """Measure exact-match accuracy of `model` on GSM8K-style question/answer records.

    Every sample is prompted as ``"Q: <question>\\nA: "``. Only the newly generated text
    (the prompt tokens are cut off) is searched for a number, and that number is compared
    with the final answer of the reference solution, i.e. the value after ``####``.
    Comparing against the whole reference text would practically never match, because
    the reference also contains the full worked solution.

    Args:
        model: Causal LM exposing ``generate`` (and optionally ``device``).
        tokenizer: Tokenizer used to encode the prompt and decode the generation.
        dataset: Sized iterable of mappings with "question" and "answer" keys.
        max_new_tokens: Generation budget per sample.

    Returns:
        Fraction of samples answered correctly, in [0, 1]; 0.0 for an empty dataset.
    """
    total = len(dataset)
    if total == 0:
        return 0.0

    # Put the inputs on the device the model lives on; fall back to "cuda" if unknown.
    device = getattr(model, "device", "cuda")

    correct = 0
    for sample in dataset:
        prompt = "Q: " + sample["question"] + "\nA: "
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # The generated sequence starts with the prompt tokens; score the continuation only,
        # otherwise numbers from the question itself could count as the answer.
        prompt_len = inputs["input_ids"].shape[-1]
        completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        expected = _reference_answer(sample["answer"])
        if expected is not None and _predicted_answer(completion) == expected:
            correct += 1
    return correct / total