In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created: 1st January, 2025
Author: Aaron Maladry
Last Modified: January 20, 2026
Maintainer: Pranaydeep Singh
Description: Script for fine-tuning a Llama model with in-context learning.
"""

'\nCreated: 1st January, 2025\nAuthor: Aaron Maladry\nLast Modified: January 20, 2026\nMaintainer: Pranaydeep Singh\nDescription: Script for fine-tuning a Llama model with in-context learning.\n'

# Fine-tuning Llama with “In-Context Learning” Style Prompts

This notebook shows how to **fine-tune an instruction-tuned Llama model** (example: **Llama-3.2-1B-Instruct**) using **prompted training examples** that include **few-shot in-context demonstrations**.

## What you’ll learn
- What *in-context learning (ICL)* is, and how it differs from standard supervised fine-tuning (SFT)
- How to turn a dataset into **chat-style prompts** with **k-shot examples** inside the prompt
- How to fine-tune efficiently using **4-bit quantization** + **LoRA (PEFT)**
- How to sanity-check outputs and save your trained adapter

> Note: This is still a form of supervised training (we train on the correct labels), but the **input prompt** includes a few examples to encourage the model to follow the pattern.

> Tip: Please follow the instructions in the README to install the required packages before proceeding.



## 0) Quick prerequisites
- A GPU is strongly recommended (even for 1B models).
- You may need a Hugging Face token if the model is gated.
- This notebook uses:
  - `transformers` for the model/tokenizer
  - `datasets` for loading data
  - `bitsandbytes` for 4-bit quantization
  - `peft` for LoRA
  - `trl` for `SFTTrainer`


In [3]:
import os
import random
import numpy as np
import torch

from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)

from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
import bitsandbytes as bnb

# Optional but useful for metrics
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Reproducibility helpers

def set_seed(seed: int = 42):
    """Seed Python's `random`, NumPy and PyTorch with one shared value.

    Args:
        seed: Value passed to every seeding call. Defaults to 42.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG once, up front, so later sampling and splitting are repeatable.
SEED = 42
set_seed(SEED)

# Default to CPU and switch to the GPU only when PyTorch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Environment summary for the reader.
print("Device:", device)
print("CUDA available:", torch.cuda.is_available())
print("Torch:", torch.__version__)

Device: cuda
CUDA available: True
Torch: 2.9.1+cu128


## 1) Hugging Face access (token)
Some models (including some Llama checkpoints) require accepting a license / being authenticated.

Best practice: **do not hard-code tokens in notebooks**. Use an environment variable:
- `HF_TOKEN` on your machine/Colab secrets

If you don’t need it, you can leave it blank.

In [6]:
# Option A (recommended): export HF_TOKEN in your environment; it is read here.
# An unset variable yields the empty string.
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Option B (not recommended): paste token here temporarily
# HF_TOKEN = "..."

if not HF_TOKEN:
    print("HF_TOKEN not set. This is OK if your model is not gated.")
else:
    # Imported lazily: huggingface_hub is only needed when a token is present.
    from huggingface_hub import login
    login(token=HF_TOKEN)
    print("Logged into Hugging Face.")

HF_TOKEN not set. This is OK if your model is not gated.


## 2) Choose a model
We use an **instruction-tuned** causal LM so it understands chat prompts.

Good beginner-friendly options:
- `meta-llama/Llama-3.2-1B-Instruct` (small, but may be gated)
- other small instruction-tuned models available to you

We’ll load the model with **4-bit quantization** to reduce VRAM usage.

In [7]:
# Hugging Face Hub id of the checkpoint to fine-tune.
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Everything after the last "/" (the bare checkpoint name); used for output folder naming.
save_as_name = model_name.rsplit("/", 1)[-1]
print("Model:", model_name)


Model: meta-llama/Llama-3.2-1B-Instruct


## 3) Load tokenizer + model (4-bit)
**Quantization** stores weights in lower precision so the model uses less memory.

- 4-bit quantization is great for fitting models on smaller GPUs.
- We then use **LoRA adapters** so we only train a small number of parameters.

If you are on CPU (slow) or `bitsandbytes` isn’t supported, set `use_4bit = False` in the next cell (this disables `load_in_4bit` and loads the model without a quantization config).


In [8]:
# Single switch for quantized loading; it feeds both the bitsandbytes config
# and the `quantization_config` argument below.
use_4bit = True

# NF4 4-bit weights, float16 compute, no double quantization.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    load_in_4bit=use_4bit,
)

# An empty HF_TOKEN is passed as None.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN or None)

# Llama-style chat models typically expect right padding
tokenizer.padding_side = "right"

# Many Llama tokenizers ship without a pad token; reuse EOS so batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN or None,
    device_map=None if not torch.cuda.is_available() else "auto",
    quantization_config=None if not use_4bit else bnb_config,
)

# KV cache off for training (noted by the author as important with
# gradient checkpointing / TRL).
base_model.config.use_cache = False

# PEFT helper that prepares the (quantized) model for k-bit training.
base_model = prepare_model_for_kbit_training(base_model)

print("Loaded model + tokenizer.")


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct.
401 Client Error. (Request ID: Root=1-696f9f57-1cb7b89b0fd6e51a1e7e5eca;deb1ac6e-e59c-4397-9161-0aa9e710142d)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
# Set up the quantization config.
# Quantization reduces the memory footprint of the model, making it possible
# to fine-tune on smaller GPUs. 4-bit quantization is used here; you can also
# try 8-bit quantization by changing the config below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model and tokenizer with quantization.
# The token comes from HF_TOKEN (read from the environment in section 1); the
# previous `access_token` name was never defined in this notebook and raised a
# NameError on a fresh kernel. An empty token is passed as None.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    token=HF_TOKEN or None,
)
base_model.config.use_cache = False
base_model = prepare_model_for_kbit_training(base_model)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN or None)

# Only fall back to EOS when the tokenizer has no pad token of its own
# (same guard as the main loading cell above).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# For Llama models the padding side needs to be set to right; check for other models.
tokenizer.padding_side = "right"



ImportError: The installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.

## 4) What task are we training?
We’ll use **AG News** (topic classification) as a simple demo.

AG News has 4 labels:
- World
- Sports
- Business
- Sci/Tech

### Key idea: training data becomes chat prompts
For each training example, we build a chat like:

1. System: instructions
2. Few-shot examples (k demonstrations)
3. The new input to classify
4. During training only: the correct label as the assistant answer

That means the model learns **to follow your prompt format**.

In [9]:
# Only the first 1% of the AG News train split is loaded.
# Increase this if you have enough compute.
raw = load_dataset("ag_news", split="train[:1%]")

# ag_news usually uses column name "label" (int) and "text" (string)
text_col = "text"
label_col = "labels" if "label" not in raw.column_names else "label"

# Convert label ids -> label names if available; any failure while reading
# the names leaves `label_names` as None.
try:
    label_names = raw.features[label_col].names
except Exception:
    label_names = None

def id_to_label(i: int) -> str:
    """Translate an integer class id into its label string.

    Uses the module-level `label_names` when it is set; otherwise falls back
    to a hard-coded AG News mapping, returning `str(i)` for unknown ids.
    """
    idx = int(i)
    if label_names is None:
        # fallback: simple mapping (AG News)
        ag_news_fallback = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
        return ag_news_fallback.get(idx, str(i))
    return str(label_names[idx])

# Work in pandas from here on: sampling and splitting are simpler on a DataFrame.
import pandas as pd

df = raw.to_pandas()
# Human-readable label next to the integer id.
df["label_text"] = df[label_col].map(id_to_label)

# 90/10 split, seeded, stratified on the label column when it is present.
stratify_on = df[label_col] if label_col in df.columns else None
df_train, df_val = train_test_split(
    df,
    test_size=0.1,
    stratify=stratify_on,
    random_state=SEED,
)

print("Train size:", len(df_train))
print("Val size:", len(df_val))
print(df_train[[text_col, "label_text"]].head(2))


Train size: 1080
Val size: 120
                                                   text label_text
1088  Defense Could Dominate in the Big Ten (AP) AP ...     Sports
1131  Traders Bet Oracle Wins in Antitrust Case  CHI...   Business


## 5) Build the few-shot chat prompts
We’ll create a function that constructs a list of chat messages.

### Prompt design tips
- Keep the **output format** very explicit (e.g., “Answer with one label only”).
- Keep labels **exactly spelled** and consistent.
- Few-shot examples should be representative.

We’ll keep it simple: random k-shot sampling from the training set.
(You can later upgrade this to similarity search / balanced sampling.)


In [10]:
SYSTEM_PROMPT = (
    "You are an expert news topic classifier. "
    "Classify the given news text into exactly one of these labels: "
    "World, Sports, Business, Sci/Tech. "
    "Reply with the label only."
)

def build_chat(text: str, kshot_examples: list[dict], train: bool, gold_label: str | None = None):
    """Return a chat (list of role/content dicts) in Llama chat format."""
    chat = [{"role": "system", "content": SYSTEM_PROMPT}]

    # Add k-shot demonstrations
    for ex in kshot_examples:
        chat.append({"role": "user", "content": f"Text:\n{ex['text']}\n\nLabel:"})
        chat.append({"role": "assistant", "content": ex["label_text"]})

    # Add the new sample
    chat.append({"role": "user", "content": f"Text:\n{text}\n\nLabel:"})

    # During training, include the gold answer as assistant output
    if train:
        assert gold_label is not None
        chat.append({"role": "assistant", "content": gold_label})

    return chat

def chat_to_text(chat, add_generation_prompt: bool | None = None) -> str:
    """Convert chat messages into a single string using the tokenizer's chat template.

    Args:
        chat: List of {"role", "content"} message dicts.
        add_generation_prompt: Whether to append the template's assistant
            header after the last message. When None (default) it is derived
            from the chat: True if the last message is not an assistant turn
            (an inference prompt), False otherwise (a complete training
            example).

    Returns:
        The rendered prompt string (not tokenized).
    """
    if add_generation_prompt is None:
        # A chat that ends on a user turn is meant to be completed by the
        # model, so it needs the assistant header.
        add_generation_prompt = bool(chat) and chat[-1]["role"] != "assistant"
    return tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=add_generation_prompt,
    )

# Quick sanity check: render one training-style prompt (two seeded
# demonstrations, first training row as the query) and show its first 700 characters.
example = df_train.sample(n=2, random_state=SEED).to_dict(orient="records")
chat = build_chat(
    kshot_examples=example,
    text=df_train[text_col].iloc[0],
    gold_label=df_train["label_text"].iloc[0],
    train=True,
)
print(chat_to_text(chat)[:700])


NameError: name 'tokenizer' is not defined

## 6) Turn the dataset into prompt-text for TRL
`trl.SFTTrainer` expects a dataset with a text field (by default, named `text`).

We will create:
- `trainset`: prompts WITH the gold label (teacher forcing)
- `valset`: prompts WITHOUT the gold label (not materialized as a dataset in this notebook; section 10 builds one such prompt from `df_val` for a quick check)

To keep this fast, we’ll generate prompts on the fly into new columns.

In [None]:
# Training rows as plain dicts; they serve both as prompt targets and as the
# pool that few-shot demonstrations are drawn from.
train_records = df_train.to_dict(orient="records")

# Seeded NumPy generator used for demonstration sampling.
rng = np.random.default_rng(SEED)

# Number of in-context demonstrations per prompt.
K_SHOT = 2

def make_training_prompt(row, pool, k_shot=2):
    """Render one training prompt: k-shot demonstrations, the row's text, and its gold label.

    Args:
        row: Record providing `text_col` and "label_text".
        pool: List of records to draw demonstrations from.
        k_shot: Number of demonstrations to include.

    Returns:
        The prompt string produced by `chat_to_text`.

    The row itself is never used as its own demonstration (identity check
    against the pool entries), so the gold answer cannot leak into the
    few-shot context. If the pool is too small, fewer than `k_shot`
    demonstrations are used instead of raising.
    """
    # Draw one spare candidate so the target row can be dropped if it is
    # sampled while still leaving `k_shot` demonstrations.
    n_draw = min(len(pool), k_shot + 1)
    idxs = rng.choice(len(pool), size=n_draw, replace=False)
    kshot = [pool[i] for i in idxs if pool[i] is not row][:k_shot]
    chat = build_chat(text=row[text_col], kshot_examples=kshot, train=True, gold_label=row["label_text"])
    return chat_to_text(chat)

# Build prompts (for larger datasets, consider batched mapping with datasets.map).
# Work on a copy so `df_train` keeps the raw text; the "text" column of the
# copy is overwritten with the full rendered prompt.
df_train_out = df_train.copy()
df_train_out["text"] = [make_training_prompt(r, train_records, K_SHOT) for r in train_records]

# `df_train` carries the shuffled index from train_test_split; without
# preserve_index=False that index would be stored as an extra
# `__index_level_0__` column in the dataset.
trainset = Dataset.from_pandas(df_train_out[["text"]], preserve_index=False)

print(trainset[0]["text"][:700])

## 7) Configure LoRA (PEFT)
LoRA adds small trainable matrices into selected linear layers.

Why this matters:
- Full fine-tuning updates **all** weights (expensive)
- LoRA updates a **small** set of parameters (cheap)

We’ll automatically find the 4-bit linear layers and target them.


In [None]:
def find_all_linear_names(model, linear_cls=None, exclude=("lm_head",)):
    """Find the leaf names of linear layers to target with LoRA.

    Args:
        model: Module whose `named_modules()` are scanned.
        linear_cls: Layer class to look for. When None (default), 4-bit
            bitsandbytes layers (`bnb.nn.Linear4bit`) are searched first; if
            the model has none (e.g. it was loaded without quantization), the
            search falls back to `torch.nn.Linear` so the result is not empty.
        exclude: Leaf names to leave out. Defaults to the output head
            ("lm_head"); pass `()` to keep everything.

    Returns:
        Sorted list of unique leaf module names (the last dotted component,
        e.g. "q_proj").
    """
    def _leaf_names(cls):
        return {
            name.split(".")[-1]
            for name, module in model.named_modules()
            if isinstance(module, cls)
        }

    if linear_cls is not None:
        found = _leaf_names(linear_cls)
    else:
        found = _leaf_names(bnb.nn.Linear4bit)
        if not found:
            # Non-quantized model: target the regular linear layers instead.
            found = _leaf_names(torch.nn.Linear)

    return sorted(found - set(exclude))

# Leaf names of the layers that will receive LoRA adapters.
target_modules = find_all_linear_names(base_model)

# Report how many distinct names were found, then preview at most ten of them.
print("Number of target modules:", len(target_modules))
print("Example target modules:", target_modules[0:10])

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 32,
# 5% dropout, applied to the layer names collected in the previous cell.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
)

## 8) Training configuration
These settings are deliberately small/safe for a starter run.

Tips:
- If you get CUDA OOM: lower `max_seq_length`, lower `K_SHOT`, lower batch size.
- If training is very slow: use fewer examples, fewer epochs.


In [None]:
# Checkpoints and the final adapter are written to this folder.
output_dir = f"./trained_{save_as_name}_icl_lora"

# One epoch, batch size 1 with 8 accumulation steps (8 examples per optimizer
# step per device), fp16 only when a GPU is present, no external logging.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    report_to=[],  # add "wandb" if you want
)

# NOTE(review): `tokenizer`, `dataset_text_field` and `max_seq_length` as
# SFTTrainer keyword arguments are TRL-version dependent — confirm against the
# TRL version pinned in the README.
trainer = SFTTrainer(
    model=base_model,
    args=training_args,
    train_dataset=trainset,
    tokenizer=tokenizer,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
)

In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()


## 9) Save your trained adapter
With PEFT/LoRA, you usually save the **adapter weights** (small!) and the tokenizer.

You can later load them on top of the base model for inference.


In [None]:
# Write the tokenizer and the trained model side by side in `output_dir`.
tokenizer.save_pretrained(output_dir)
trainer.model.save_pretrained(output_dir)
print("Saved to:", output_dir)


## 10) Quick inference sanity check (optional)
This is a *minimal* sanity check: we run a single example with the same prompt format (but **without** the gold label).

For a proper evaluation, you’d:
- run inference over the validation set
- parse predicted labels
- compute accuracy + confusion matrix


In [None]:
from peft import PeftModel

# Reload base model + adapter (useful if you've restarted runtime)
base = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config if use_4bit else None,
    device_map="auto" if torch.cuda.is_available() else None,
    token=HF_TOKEN or None,
)
# Generation benefits from the KV cache (it was disabled for training).
base.config.use_cache = True

model = PeftModel.from_pretrained(base, output_dir)
model.eval()

# Pick a validation sample
sample = df_val.sample(n=1, random_state=SEED).iloc[0]

# Build k-shot prompt (without gold label)
train_pool = df_train.sample(n=min(200, len(df_train)), random_state=SEED).to_dict(orient="records")
kshot = random.sample(train_pool, k=K_SHOT)

chat = build_chat(text=sample[text_col], kshot_examples=kshot, train=False)

# The chat ends on a user turn, so ask the template to append the assistant
# header; without it the model is not cued to answer.
prompt_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# The rendered template already contains the special tokens it needs;
# add_special_tokens=False keeps the tokenizer from prepending them a second
# time (e.g. a duplicate BOS).
inputs = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False).to(model.device)

with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=10,
        # Greedy decoding; no `temperature` here, since sampling parameters
        # are ignored (and warned about) when do_sample=False.
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode only the newly generated tokens, not the echoed prompt.
prompt_len = inputs["input_ids"].shape[1]
gen = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

print("--- PROMPT (truncated) ---")
print(prompt_text[-800:])
print("\n--- GENERATED ---")
print(gen)
print("\n--- GOLD LABEL ---")
print(sample["label_text"])


## 11) Recommended Next Steps

- **Try your own dataset**, but remember to adapt the system prompt (**important**) and, if needed, the few-shot example setup
- Test larger or specialized models for your domain or language
- If you have a lot more training data, look into full fine-tuning
- If you have no training data, look into zero-shot