# **Setup & installs**

In [1]:
# Quietly (-q) install/upgrade the libraries used for the QLoRA fine-tuning below.
# NOTE(review): no versions are pinned here, so a re-run may pull newer releases.
!pip install --upgrade -q bitsandbytes transformers accelerate datasets peft trl huggingface_hub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# TensorBoard (pinned to 2.19.0) backs the `report_to="tensorboard"` logging and the dashboard cell further down.
!pip install tensorboard==2.19.0

In [3]:
import os, torch, gc, locale
# gc -> Python's garbage collector; used later to free memory between the training and merging steps.
# locale -> language/encoding settings of the environment. Saving tokenizer/model files can hit encoding errors in some environments; forcing the preferred encoding to "UTF-8" avoids that.

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    BitsAndBytesConfig, TrainingArguments, pipeline, logging
)
# BitsAndBytesConfig -> quantization settings for loading the model in lower precision (4-bit or 8-bit) so it fits in GPU memory.


from peft import LoraConfig, PeftModel
from trl import SFTTrainer
'''
Trl: SFTTrainer
“Supervised Fine-Tuning” trainer.
Simplifies fine-tuning language models on instruction datasets.
It uses your dataset’s text field and trains the model with LoRA efficiently.
'''

from huggingface_hub import login

In [4]:
# Optional: authenticate with the Hugging Face Hub up front so later steps don't prompt.
# Paste an HF token for an account that has been granted access to the gated Llama 2 model.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Force UTF-8 as the preferred encoding (avoids tokenizer save issues in some environments).
# The replacement keeps the stdlib signature getpreferredencoding(do_setlocale=True), so
# callers that pass the argument (e.g. locale.getpreferredencoding(False)) keep working
# instead of raising TypeError against a zero-argument lambda.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

# Pick the compute dtype: bf16 when a CUDA device with compute capability >= 8 is present,
# otherwise fall back to fp16.
USE_BF16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
DTYPE = torch.bfloat16 if USE_BF16 else torch.float16
print("Using dtype:", DTYPE)


Using dtype: torch.float16


# **Config: model, dataset, QLoRA, LoRA, training args**

In [6]:
# Identifiers for the base model, the training data and the Hub destination
MODEL_NAME   = "meta-llama/Llama-2-7b-chat-hf"       # gated repo: HF access must be approved
DATASET_NAME = "mlabonne/guanaco-llama2-1k"          # samples live in a 'text' field
NEW_MODEL_ID = "Riyan213/llama2-7b-chat-qlora-demo"  # repo id used as the push target

# Quantization settings for QLoRA: load the base model in 4-bit NF4
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=DTYPE,   # bf16 when USE_BF16 is set, fp16 otherwise
)

# LoRA adapter hyperparameters
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Trainer hyperparameters
training_args = TrainingArguments(
    output_dir="./results",
    # schedule / batch sizing
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # optimizer + learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # mixed precision: exactly one of bf16 / fp16 is enabled, mirroring DTYPE
    bf16=USE_BF16,
    fp16=not USE_BF16,
    # logging / checkpointing
    logging_steps=25,
    save_strategy="epoch",
    report_to="tensorboard",
)


# **Load dataset, tokenizer, base model (4-bit)**

In [None]:
# Training split of the instruction dataset
dataset = load_dataset(DATASET_NAME, split="train")

# The Llama 2 tokenizer is implemented inside transformers, so trust_remote_code=True is
# not needed; leaving it off means no Python code hosted in the Hub repo can run locally.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token   # reuse EOS as the padding token
tokenizer.padding_side = "right"

# Base model, quantized on load according to bnb_config and placed by device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False      # turn off the generation cache for the training run
model.config.pretraining_tp = 1     # NOTE(review): presumably selects the non-tensor-parallel path — confirm


# **Trainer (TRL SFTTrainer) & training**

In [8]:
def formatting_prompts_func(example):
    """Return the already-formatted prompt stored under the ``"text"`` key of ``example``."""
    prompt_text = example["text"]
    return prompt_text

# Wire the quantized model, dataset, LoRA config and training arguments into TRL's SFT trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    formatting_func=formatting_prompts_func,
    peft_config=peft_config,
)

# Run the fine-tuning loop
trainer.train()

# Write the trained adapter and the tokenizer into the same local directory
ADAPTER_DIR = "./adapter-qlora"
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print("Adapter saved to:", ADAPTER_DIR)

Applying formatting function to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
25,1.5886
50,1.3234
75,1.2719
100,1.3277
125,1.2176
150,1.2124
175,1.2496
200,1.271
225,1.2098
250,1.1856


Adapter saved to: ./adapter-qlora


# **Monitor with TensorBoard**
---
TensorBoard = a dashboard for training deep learning models.

It shows live graphs and charts of how your model is doing during training.

In [9]:
# in Colab:
# Load the TensorBoard notebook extension, then open the dashboard on `results`,
# the directory passed as output_dir to TrainingArguments.
%load_ext tensorboard
%tensorboard --logdir results


Output hidden; open in https://colab.research.google.com to view.

# **Quick test generation (with adapters still attached)**

In [11]:
# Smoke-test the model held by the trainer with a single Llama-2-style prompt
gen = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    dtype=DTYPE,
    device_map="auto",
)
prompt = "[INST] What is a large language model? [/INST]"
completions = gen(prompt)
print(completions[0]["generated_text"])


Device set to use cuda:0


[INST] What is a large language model? [/INST] A large language model is a machine learning model that is trained on a large dataset of text, and is capable of generating text that is similar to human language. It is typically trained using a variant of the transformer architecture, and is trained on large datasets such as the internet or a large corpus of text. Large language models are capable of generating text that is coherent and contextually appropriate, and can be used for a wide range of applications such as language translation, text summarization, and chatbots. They are also often used for research purposes, such as studying the properties of language and understanding how to improve machine learning models. 


# **Free VRAM (cleanly) before merging**

In [12]:
# Drop the names that hold the pipeline, the trainer and the 4-bit model,
# then collect garbage and release PyTorch's cached CUDA memory.
del gen, trainer, model
gc.collect()
torch.cuda.empty_cache()


# **Merge LoRA adapters into a full 16-bit (FP16/BF16) model (for easy deployment)**

In [None]:
# Reload the base model without quantization, in DTYPE (bf16/fp16 — no 4-bit here)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=DTYPE,
    device_map="auto",
    return_dict=True,
    low_cpu_mem_usage=True,
)

# Attach the saved adapter, then fold its weights into base_model and drop the PEFT wrapper
merged = PeftModel.from_pretrained(base_model, ADAPTER_DIR).merge_and_unload()

# Write the merged model and the tokenizer to one directory
MERGED_DIR = "./merged-fp16"
merged.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)
print("Merged model saved to:", MERGED_DIR)


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# **(Optional) Quick test on merged model**

In [None]:
# Run one prompt through the merged model as a sanity check
merged_pipe = pipeline(
    task="text-generation",
    model=merged,
    tokenizer=tokenizer,
    dtype=DTYPE,
    device_map="auto",
    max_new_tokens=200,
)
print(merged_pipe("[INST] Explain QLoRA in simple terms. [/INST]")[0]["generated_text"])

# cleanup: `base_model` has to be deleted as well. merge_and_unload() merged the adapter
# into that same model, so while the name is alive the weights stay referenced and
# gc.collect() / empty_cache() cannot release their memory.
del merged_pipe, merged, base_model
gc.collect()
torch.cuda.empty_cache()


# **Push to Hugging Face Hub (adapters or merged)**

Push only LoRA adapters (lightweight, preferred for sharing):

In [None]:
from huggingface_hub import create_repo

In [None]:
# Publish the adapter directory as-is. ADAPTER_DIR already contains what was saved after
# training (adapter weights/config via trainer.model.save_pretrained, plus the tokenizer
# files), so uploading the folder avoids re-loading the whole base model on CPU just to
# call push_to_hub on a freshly wrapped PeftModel.
from huggingface_hub import upload_folder

ADAPTER_REPO_ID = NEW_MODEL_ID + "-adapters"
create_repo(ADAPTER_REPO_ID, exist_ok=True)   # no error if the repo already exists
upload_folder(
    repo_id=ADAPTER_REPO_ID,
    folder_path=ADAPTER_DIR,
    commit_message="Upload QLoRA adapter and tokenizer",
)
print("Adapters pushed to:", ADAPTER_REPO_ID)
