<a href="https://colab.research.google.com/github/rajaranjith/HCL-GenAI-Training/blob/main/Assignment_4_Gold_Badge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Install dependencies
!pip -q install -U pip

# Pinned versions for reproducibility (known compatible as of late 2025)
!pip -q install -U \
  transformers==4.57.3 \
  trl==0.26.2 \
  peft==0.18.0 \
  bitsandbytes==0.49.0 \
  accelerate \
  datasets \
  sentencepiece

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.8 MB[0m [31m21.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m39.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#Imports + GPU / dtype detection
import os
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

os.environ["TOKENIZERS_PARALLELISM"] = "false"

assert torch.cuda.is_available(), "❌ No GPU detected. In Colab: Runtime → Change runtime type → GPU"

gpu_name = torch.cuda.get_device_name(0)
cc_major, cc_minor = torch.cuda.get_device_capability(0)

# bf16 is typically supported on A100/L4 (Ampere+), not on T4
bf16_supported = (cc_major >= 8)
compute_dtype = torch.bfloat16 if bf16_supported else torch.float16

print(f"✅ GPU: {gpu_name}")
print(f"✅ Compute capability: {cc_major}.{cc_minor}")
print(f"✅ Using compute dtype: {compute_dtype}")

✅ GPU: Tesla T4
✅ Compute capability: 7.5
✅ Using compute dtype: torch.float16


In [3]:
#Pick model + load tokenizer
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Many causal LMs don't ship a pad token; for training we set it to EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("pad_token:", tokenizer.pad_token)
print("eos_token:", tokenizer.eos_token)

#Load an example instruction dataset (Dolly 15k) and convert to TRL prompt/completion format
#TRL SFTTrainer accepts prompt-completion datasets, including the conversational prompt/completion format we’ll build here.
dataset_id = "databricks/databricks-dolly-15k"
raw = load_dataset(dataset_id, split="train")

print(raw)
print("Example row keys:", raw.column_names)
print("First row:", raw[0])

#Now convert Dolly → {"prompt": [...], "completion": [...]}.
def to_prompt_completion(example):
    instruction = (example.get("instruction") or "").strip()
    context = (example.get("context") or "").strip()
    response = (example.get("response") or "").strip()

    if context:
        user_msg = f"{instruction}\n\nContext:\n{context}"
    else:
        user_msg = instruction

    return {
        "prompt": [{"role": "user", "content": user_msg}],
        "completion": [{"role": "assistant", "content": response}],
    }

# Shuffle + small subset for a quick Colab run (increase later)
raw = raw.shuffle(seed=42)

max_train_samples = 2000   # increase for real training (e.g., 15000 for full Dolly)
max_eval_samples  = 200

split = raw.train_test_split(test_size=max_eval_samples, seed=42)
train_ds = split["train"].select(range(min(max_train_samples, len(split["train"]))))
eval_ds  = split["test"]

train_ds = train_ds.map(to_prompt_completion, remove_columns=train_ds.column_names)
eval_ds  = eval_ds.map(to_prompt_completion, remove_columns=eval_ds.column_names)

print("Train example:", train_ds[0])
print("Eval example:", eval_ds[0])

#Dataset license note: Dolly 15k is under CC BY‑SA 3.0 and states it can be used for commercial purposes under that license.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

pad_token: </s>
eos_token: </s>


README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 15011
})
Example row keys: ['instruction', 'context', 'response', 'category']
First row: {'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Train example: {'prompt': [{'content': 'Which notable person in this list with surname Liebers was born last?\n\nContext:\nLiebers is a German language surname. Notable people with the name include:\n\nMario Liebers (born 1960), German former competitive figure skater\nMartin Liebers (born 1985), German former competitive figure skater\nMatthias Liebers (born 1958), former German footballer\nPeter Liebers (born 1988), German former figure skater.', 'role': 'user'}], 'completion': [{'content': 'Peter Liebers', 'role': 'assistant'}]}
Eval example: {'prompt': [{'content': 'What are the key components of an MLOps pipeline?', 'role': 'user'}], 'completion': [{'content': 'MLOps unlike traditional DevOps in software development is not only concerned with the management of code but must additionally account for data and models. A functional MLOps pipeline must be able to link together, and track changes, to the code used to develop a model and prepare the data, the data used for training and v

In [4]:
#QLoRA: 4-bit quantization config (bitsandbytes)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",         # QLoRA commonly uses NF4
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

#LoRA adapter config (what gets trained)
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)
#(bitsandbytes describes QLoRA/4-bit training support)

In [5]:
#Training config (SFTConfig)
#This is tuned to be Colab-friendly (T4-compatible).
output_dir = "mistral7b-qlora-dolly-adapter"

training_args = SFTConfig(
    output_dir=output_dir,

    # sequence length (lower if you OOM)
    max_length=512, # Reduced max_length to save memory

    # batch / accumulation (effective batch = batch_size * grad_accum)
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16, # Increased gradient_accumulation_steps to save memory

    # training schedule
    learning_rate=2e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    max_steps=50, #200, bump this up for better results (e.g., 1000+)

    # logging/saving
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,

    # precision
    bf16=False,
    fp16=False,

    # memory savers
    gradient_checkpointing=True, # Re-enabled to save memory, assuming BFloat16 issue is resolved by fp32
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # optimizer (8-bit / paged to reduce VRAM)
    optim="paged_adamw_8bit", # Reverted to paged_adamw_8bit for memory efficiency

    # IMPORTANT: prompt-completion dataset → loss on completion only
    completion_only_loss=True,

    report_to="none",

    # passed through to from_pretrained() when model is given as a string
    model_init_kwargs={
        "quantization_config": bnb_config,
        "device_map": "auto",
        # "torch_dtype": compute_dtype, # Removed to prevent implicit AMP activation
    },
)

In [6]:
#Train (this is the actual QLoRA fine-tune)
trainer = SFTTrainer(
    model=model_id,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    peft_config=peft_config,
    processing_class=tokenizer,  # TRL uses "processing_class" for tokenizers
)

trainer.train()

# Save the trained adapter (LoRA weights) + tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("✅ Saved adapter to:", output_dir)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
50,1.4221,,1.690695,128713.0,0.693731


✅ Saved adapter to: mistral7b-qlora-dolly-adapter


In [7]:
#Quick inference test (base + adapter)
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Reload base model in 4-bit
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

# Attach trained adapter
ft_model = PeftModel.from_pretrained(base_model, output_dir)
ft_model.eval()

def generate_chat(user_text, max_new_tokens=200, temperature=0.7, top_p=0.9):
    messages = [{"role": "user", "content": user_text}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # Put inputs on same device as model
    device = next(ft_model.parameters()).device
    input_ids = input_ids.to(device)

    with torch.no_grad():
        out = ft_model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )

    return tokenizer.decode(out[0], skip_special_tokens=True)

print(generate_chat("Give me 5 practical tips for debugging Python code."))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Give me 5 practical tips for debugging Python code. 1. Use the `print()` function to output the current state of your variables to the console.
2. Use the `breakpoint()` function to stop the program at the line where the error occurs.
3. Use the `assert` statement to check whether the program is doing what it should be doing.
4. Use the `pdb` module to execute the code line by line and check the state of the program.
5. Use the `ipdb` module to execute the code line by line and check the state of the program.
