In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created: 1st January, 2025
Author: Aaron Maladry
Last Modified: November 6, 2025
Modified by: Pranaydeep Singh
Description: Script for fine-tuning a Llama model with in-context learning.
"""

'\nCreated: 1st January, 2025\nAuthor: Aaron Maladry\nLast Modified: November 6, 2025\nModified by: Pranaydeep Singh\nDescription: Script for fine-tuning a Llama model with in-context learning.\n'

In [None]:
#install dependencies
!pip install transformers datasets scikit-learn accelerate bitsandbytes peft trl

Collecting bitsandbytes
  Using cached bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting peft
  Using cached peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting trl
  Downloading trl-0.25.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.8 kB)
Using cached bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
Using cached peft-0.17.1-py3-none-any.whl (504 kB)
Downloading trl-0.25.0-py3-none-any.whl (462 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hUsing cached transformers-4.57.1-py3-none-any.whl (12.0 MB)
Using cached huggingface_hub-0.36.0-py

In [1]:
#imports

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
import transformers
import torch
import pandas as pd
import re
from peft import LoraConfig, prepare_model_for_kbit_training
from peft import prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM
import datasets
import bitsandbytes as bnb
import huggingface_hub
from transformers.pipelines.pt_utils import KeyDataset
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


### Starter Script 2: Fine-tuning a Llama Model with In-context Learning.

Fine-tune an instruction-tuned model (Llama-3.2-1B-Instruct in this notebook) with in-context learning, i.e. providing examples of inputs and their expected outputs in the prompt.

This form of training is less compute-intensive than SFT and will be much faster, but it is also less performant.

Refer to Starter Script 3 for Inference!

In [None]:
import os

# Access to some models is restricted, so a Hugging Face access token may be needed.
# The token is read from the HF_TOKEN environment variable so that no secret has to be
# stored in the notebook; the placeholder is only used when the variable is not set.
access_token = os.environ.get("HF_TOKEN", "PLACEHOLDER_TOKEN")
huggingface_hub.login(token=access_token) #only needed if you are using private models or pushing to the hub

# TODO: set your model here
# take care, you need transformers version 4.43, 4.45 is not yet supported, same for ipex-llm==2.1.0b2
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# removes the repository name (here meta-llama) for saving the output file;
# taking the last component also works for model ids without a repository prefix
save_as_name = model_name.split("/")[-1]

In [3]:
# Quantization config.
# Quantizing the weights reduces the memory footprint of the model, which makes it
# possible to fine-tune on smaller GPUs. 4-bit (NF4) quantization is used here; an
# 8-bit setup can be tried by changing the config below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the quantized base model, switch off the generation cache and prepare it for
# k-bit training.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    token=access_token,
)
base_model.config.use_cache = False
base_model = prepare_model_for_kbit_training(base_model)

# Load the matching tokenizer and reuse the end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
tokenizer.pad_token = tokenizer.eos_token
# for Llama models the padding side needs to be set to right, check for other models
tokenizer.padding_side = "right"



ImportError: The installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.

In [None]:
# TODO: replace this prompt for your task, this will be asked during inference for each sample
# NOTE: here we also have a placeholder for the label, this is only used during training
# The \begin[...] / \end[...] markers have to stay literal (hence the raw strings), while
# the blank line before "### Text" has to be a real newline: inside a raw string "\n" would
# stay a backslash followed by the letter n, so that piece is a normal string.
prompt = (
    r"\begin[user]Find the topic of this news article:"
    "\n\n"
    r"### Text: {PLACEHOLDER_FOR_INPUTTEXT}\end[user]"
    r"\begin[assistant]### Label:{PLACEHOLDER_FOR_LABEL}\end[assistant]"
)
# You can modify the prompt as per your requirement, just make sure to keep the {PLACEHOLDER_FOR_INPUTTEXT} and {PLACEHOLDER_FOR_LABEL} intact

In [None]:
# function to set up the prompt template with in-context examples
# this is used for both training and inference
# it's a bit long and dense but should be easy to modify for your own use-case
# only thing you need to change is the system prompt for your task
# NOTE: you can also implement more sophisticated selection strategies for in-context examples

def _extract_turn(filled_prompt, role):
    """Return the cleaned text of the section delimited by the begin/end markers of `role`.

    The markers themselves are removed, double spaces are collapsed once and the result
    is passed through ``str.capitalize`` (first character upper-cased, the rest
    lower-cased), exactly as the prompts were normalised before this helper existed.

    Raises:
        ValueError: if `filled_prompt` has no section for `role`.
    """
    begin_marker = "\\begin[" + role + "]"
    end_marker = "\\end[" + role + "]"
    match = re.search(re.escape(begin_marker) + r"(.*?)" + re.escape(end_marker), filled_prompt, re.DOTALL)
    if match is None:
        raise ValueError(f"prompt does not contain a \\begin[{role}] ... \\end[{role}] section")
    return (
        match.group(0)
        .replace(begin_marker, "")
        .replace(end_marker, "")
        .replace("  ", " ")
        .capitalize()
    )


def set_prompt_template(dataframe, example_df, prompt, tokenizer, n_incontext=1, train=True):
    """Build one chat-formatted prompt per row of `dataframe`, with in-context examples.

    For every row, `n_incontext` rows are drawn at random from `example_df` and added as
    user/assistant turn pairs, followed by the row itself as the final user turn. The
    system prompt is prepended to the first user turn of each chat.

    Args:
        dataframe: pandas DataFrame with a "text" column; a "task_labels" column is only
            required when `train` is True.
        example_df: pandas DataFrame with "text" and "task_labels" columns from which the
            in-context examples are sampled.
        prompt: template containing {PLACEHOLDER_FOR_INPUTTEXT} and {PLACEHOLDER_FOR_LABEL}
            inside \\begin[user]...\\end[user] and \\begin[assistant]...\\end[assistant].
        tokenizer: object providing ``apply_chat_template(chat, tokenize=False)``.
        n_incontext: number of in-context examples per row (0 gives a zero-shot prompt).
        train: if True, the label of the row is appended as the final assistant turn.

    Returns:
        `dataframe` itself; its "text" column is overwritten in place with the prompts.

    Raises:
        ValueError: if `prompt` lacks a required user/assistant section.
    """
    # TODO: set up the system prompt, this can include personality description AND general guidelines.
    systemprompt = " You are an expert trained in identifying different news articles topics based on their content.\n"
    # adding general task description.
    systemprompt += " Your task is to analyze the news headlines and classify them into one of the following categories: World, Sports, Business, Sci/Tech.\n"

    def fill(record, with_label):
        # str() keeps non-string labels/texts (e.g. integer class ids) from crashing str.replace
        label = str(record['task_labels']) if with_label else ""
        return prompt.replace("{PLACEHOLDER_FOR_INPUTTEXT}", str(record['text']))\
            .replace("{PLACEHOLDER_FOR_LABEL}", label)

    full_explanations = []
    for _, row in dataframe.iterrows():
        chat = []

        # NOTE: here you can implement more sophisticated selection strategies for in-context examples, such as similarity search, selecting an example for each label or something like contrastive examples.
        # for each sample, new examples are selected and then shuffled
        incontext_examples = example_df.sample(n=n_incontext)
        incontext_examples = incontext_examples.sample(frac=1).reset_index(drop=True)

        # in-context examples always come with their assistant response
        for _, sample in incontext_examples.iterrows():
            filled_example = fill(sample, True)
            chat.append({"role": "user", "content": _extract_turn(filled_example, "user")})
            chat.append({"role": "assistant", "content": _extract_turn(filled_example, "assistant")})

        # the actual text to classify; its label is only needed (and only read) for training
        filled_sample = fill(row, train)
        chat.append({"role": "user", "content": _extract_turn(filled_sample, "user")})
        if train:
            chat.append({"role": "assistant", "content": _extract_turn(filled_sample, "assistant")})

        # the system prompt goes on the first user turn only; doing this after the chat is
        # assembled also covers n_incontext=0, where the sample itself is the first turn
        chat[0]["content"] = systemprompt + chat[0]["content"]

        # NOTE(review): str.capitalize in _extract_turn lower-cases the article text and the
        # label; kept because already trained/inference prompts rely on it - confirm intent.
        # this function automatically sets up the chat tokens for instruction tuning based on the model tokenizer.
        full_explanations.append(tokenizer.apply_chat_template(chat, tokenize=False))

    #add to dataframe, overwrite "text" column for training
    dataframe["text"] = full_explanations
    return dataframe

In [None]:
# Load the train data as a pandas DataFrame with two columns: "text" and "task_labels".
# The labels should be text, so the integer class ids of ag_news are converted to their names.

raw_train = datasets.load_dataset("ag_news", split="train[:1%]") # using a small subset for demonstration, replace with your dataset

# load_dataset returns a datasets.Dataset, which supports neither column assignment nor the
# pandas methods (.iterrows / .sample) used by set_prompt_template, so convert it first.
label_names = raw_train.features["label"].names  # ag_news keeps its classes in a ClassLabel column named "label"
train_data = raw_train.to_pandas()

# cannot name this labels for training purposes
train_data["task_labels"] = train_data["label"].map(lambda class_id: label_names[class_id])
train_data = train_data.drop(columns=["label"])

# set up the prompt template
train_data = set_prompt_template(dataframe=train_data, example_df=train_data, prompt=prompt, tokenizer=tokenizer)
trainset = datasets.Dataset.from_pandas(train_data)



In [None]:
# Find all linear layer names for LoRA adaptation
# We only adapt the linear layers in the model, others are frozen
# No changes needed here, can be used directly for any model and task

def find_all_linear_names(model): # copied from https://github.com/mzbac/llama2-fine-tune/blob/master/utils.py
    """Return the distinct leaf names of all 4-bit linear sub-modules of `model`.

    Every entry of ``model.named_modules()`` that is a ``bnb.nn.Linear4bit`` instance
    contributes the last dot-separated component of its qualified name. The result is a
    list without duplicates, in no particular order.
    """
    linear_4bit = bnb.nn.Linear4bit
    return list({
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_4bit)
    })

In [None]:
# LoRA (PEFT) configuration for fine-tuning: adapters are attached to every linear layer
# reported by find_all_linear_names for the loaded base model.
# Parameter reference: https://huggingface.co/docs/peft/en/package_reference/lora
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=find_all_linear_names(base_model),
    r=32,
    lora_alpha=16,
)
#max_seq_length = 1024

# Supervised fine-tuning on the prompts stored in `trainset`, writing to ./trained_<model name>.
trainer = SFTTrainer(
    model=base_model,
    peft_config=lora_config,
    train_dataset=trainset,
    args=TrainingArguments(
        output_dir=f"./trained_{save_as_name}",
        num_train_epochs=2,
        per_device_train_batch_size=1,  # original note: "actually the global batch size for SPMD" - unverified here
        dataloader_drop_last = True,  # original note: "required for SPMD" - unverified here
        eval_accumulation_steps=10,
        hub_private_repo=True,
    ),
)

trainer.train()