<a href="https://colab.research.google.com/github/rajaranjith/HCL-GenAI-Training/blob/main/Assignment_2_Gold_Badge_Working_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Install dependencies
!pip -q install -U \
  "trl[peft]==0.26.2" \
  "transformers>=4.42.0" \
  "datasets>=2.18.0" \
  "accelerate>=0.30.0" \
  bitsandbytes \
  evaluate \
  sentencepiece


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/518.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#Check GPU + select dtype
import torch

assert torch.cuda.is_available(), "❌ No GPU found. In Colab: Runtime → Change runtime type → GPU"

gpu_name = torch.cuda.get_device_name(0)
cc_major, cc_minor = torch.cuda.get_device_capability(0)
print("✅ GPU:", gpu_name, "| Compute Capability:", (cc_major, cc_minor))

# bfloat16 is best on newer GPUs (A100/L4 etc). T4 usually uses float16.
use_bf16 = (cc_major >= 8)
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16
print("Using dtype:", compute_dtype)

✅ GPU: Tesla T4 | Compute Capability: (7, 5)
Using dtype: torch.float16


In [24]:
#Load a small dataset (SST‑2) and convert to prompt/completion format
from datasets import load_dataset
import torch # Added for torch.Tensor check

# SST-2 sentiment dataset (short texts = faster + less VRAM)
raw = load_dataset("glue", "sst2")

# Small subsets (adjust as you want)
train_raw = raw["train"].shuffle(seed=42).select(range(400))
eval_raw  = raw["validation"].shuffle(seed=123).select(range(200))

SYSTEM_PROMPT = (
    "You are a sentiment classifier. "
    "Reply with exactly one word: Positive or Negative."
)

def to_prompt_completion(example):
    gold = "Positive" if int(example["label"]) == 1 else "Negative"
    user_msg = (
        "Classify the sentiment of the following sentence.\n"
        "Answer with exactly one word: Positive or Negative.\n\n"
        f"Sentence: {example['sentence']}"
    )
    return {
        # Conversational prompt-completion format supported by TRL SFTTrainer
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
        ],
        "completion": [
            {"role": "assistant", "content": gold}
        ],
    }

train_ds = train_raw.map(to_prompt_completion, remove_columns=train_raw.column_names)
eval_ds  = eval_raw.map(to_prompt_completion, remove_columns=eval_raw.column_names)


def tokenize_function(examples):
    # 'examples' is a dict where keys are column names and values are lists of items for the batch
    # e.g., examples['prompt'] is a list of conversation lists (one per example in the batch)

    input_ids_batch = []
    attention_mask_batch = []
    labels_batch = []

    for i in range(len(examples['prompt'])):
        prompt_conversation = examples['prompt'][i]
        completion_conversation = examples['completion'][i]

        full_conversation = prompt_conversation + completion_conversation

        # Tokenize the full conversation
        full_sequence_input_ids = tokenizer.apply_chat_template(
            full_conversation,
            tokenize=True,
            padding='max_length',
            truncation=True,
            max_length=256,
            return_tensors="pt",
            add_generation_prompt=False,
        )

        # Create labels, masking out the prompt part
        labels = full_sequence_input_ids.clone()

        # Tokenize the prompt part to determine its length accurately
        prompt_tokens = tokenizer.apply_chat_template(
            prompt_conversation,
            tokenize=True,
            add_generation_prompt=True, # Keep True here as we want to mask out the prompt *up to the point of generation*
            return_tensors="pt",
        )
        prompt_length = prompt_tokens.shape[1] # Get sequence length of the prompt

        labels[0, :prompt_length] = -100 # Mask out prompt tokens

        input_ids_batch.append(full_sequence_input_ids[0])
        attention_mask_batch.append(full_sequence_input_ids[0].ne(tokenizer.pad_token_id).int())
        labels_batch.append(labels[0])

    return {
        "input_ids": input_ids_batch,
        "attention_mask": attention_mask_batch,
        "labels": labels_batch
    }

# NOTE: This manual tokenization is a workaround for the IndexError.
# In a typical TRL setup, SFTTrainer handles tokenization, but given the persistent error,
# we are pre-tokenizing and setting labels with padding and truncation to ensure consistency.

# Remove existing columns that are not `input_ids`, `attention_mask`, `labels`
# so that the Trainer does not try to re-tokenize or mishandle them.
final_train_ds = train_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=train_ds.column_names
)
final_eval_ds = eval_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=eval_ds.column_names
)

print(final_train_ds[0])

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'input_ids': [1, 3, 5718, 2343, 1040, 22558, 1070, 1040, 3064, 13039, 29491, 781, 3588, 17749, 1163, 5436, 1392, 2475, 29515, 8970, 3236, 1210, 12933, 1965, 29491, 781, 781, 27736, 1404, 29515, 15370, 1030, 1968, 26212, 1065, 27445, 1265, 1505, 22187, 4215, 1072, 4462, 29501, 1034, 1065, 8305, 1968, 29473, 4, 8970, 3236, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [32]:
#Load Mistral in 4‑bit (QLoRA) + prepare LoRA adapters
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch # Ensure torch is imported for dtype

# You can switch to "mistral-community/Mistral-7B-Instruct-v0.3" if needed
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16, # Explicitly set to float16 to avoid bfloat16 conflict
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16, # Explicitly set to float16 to avoid bfloat16 conflict
    quantization_config=bnb_config,
)

# Important for training stability / memory
model.config.use_cache = False

# Prepare quantized model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA config (good default for 7B on small GPU)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

model = get_peft_model(model, lora_config)

# Explicitly move model to float16 to avoid bfloat16 issues on T4
model.to(torch.float16)

model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 20,971,520 || all params: 7,268,995,072 || trainable%: 0.2885


In [36]:
#Fine‑tune with TRL SFTTrainer
from trl import SFTTrainer, SFTConfig

OUTPUT_DIR = "/content/mistral_sst2_qlora_adapter"

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,

    # Memory-friendly settings (works on many Colab GPUs)
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16, # Increased for memory stability

    num_train_epochs=1,
    learning_rate=2e-4,          # LoRA typically uses higher LR than full finetune
    warmup_ratio=0.05,

    logging_steps=10,

    # Avoid extra integrations
    report_to="none",

    # Change optimizer to avoid bfloat16 conflict on T4
    optim="adamw_torch", # Changed from "paged_adamw_8bit"

    # Removed fp16/bf16 to rely on model's explicit dtype (float16) and accelerate defaults

    # Keep it simple for a demo
    save_strategy="no",
    eval_strategy="no",
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=final_train_ds, # Use pre-tokenized dataset
    eval_dataset=final_eval_ds, # Use pre-tokenized dataset for evaluation
)

trainer.train()
print("✅ Training finished.")

Step,Training Loss
10,1.1633
20,0.0


✅ Training finished.


In [37]:
#Save the adapter + tokenizer
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✅ Saved LoRA adapter to: {OUTPUT_DIR}")

✅ Saved LoRA adapter to: /content/mistral_sst2_qlora_adapter


In [38]:
#Inference helper + quick evaluation on the raw validation subset
import re
import torch

@torch.inference_mode()
def predict_label(sentence: str) -> str:
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content":
            "Classify the sentiment of the following sentence.\n"
            "Answer with exactly one word: Positive or Negative.\n\n"
            f"Sentence: {sentence}"
        },
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    out = model.generate(
        **inputs,
        max_new_tokens=5,
        do_sample=False,
        temperature=0.0,
    )

    gen = tokenizer.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
    # Normalize to first word
    gen_clean = re.sub(r"[^A-Za-z]+", " ", gen).strip().split(" ")[0].lower()

    if gen_clean.startswith("pos"):
        return "Positive"
    if gen_clean.startswith("neg"):
        return "Negative"
    return gen.strip()

def accuracy_on_eval(n=50):
    correct = 0
    for ex in eval_raw.select(range(n)):
        pred = predict_label(ex["sentence"])
        gold = "Positive" if int(ex["label"]) == 1 else "Negative"
        correct += int(pred == gold)
    return correct / n

print("Sample prediction:", predict_label(eval_raw[0]["sentence"]))
print("Accuracy on 50 examples:", accuracy_on_eval(50))

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Caching is incompatible with gradient checkpointing in MistralDecoderLayer. Setting `past_key_values=None`.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Sample prediction: Positive


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Accuracy on 50 examples: 0.9
