### Installation

In [None]:
# Install dependencies
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install evaluate
!pip install rouge_score



## Imports

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from functools import partial
import torch
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import evaluate

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Unsloth

In [None]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

original_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

==((====))==  Unsloth 2025.5.9: Fast Llama patching. Transformers: 4.52.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

Unsloth 2025.5.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## data


In [None]:
huggingface_dataset_name = "neil-code/dialogsum-test"
dataset = load_dataset(huggingface_dataset_name)

README.md:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/1.81M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/441k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/447k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1999 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/499 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/499 [00:00<?, ? examples/s]

In [None]:
dataset['train'].column_names

['id', 'dialogue', 'summary', 'topic']

In [None]:
def create_prompt_formats(sample):
    """
    Format various fields of the sample ('instruction','output')
    Then concatenate them using two newline characters
    :param sample: Sample dictionnary
    """
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruct: Summarize the below conversation."
    RESPONSE_KEY = "### Output:"
    END_KEY = "### End"

    blurb = f"\n{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f"{sample['dialogue']}" if sample["dialogue"] else None
    response = f"{RESPONSE_KEY}\n{sample['summary']}"
    end = f"{END_KEY}"

    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)
    sample["text"] = formatted_prompt

    return sample

In [None]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )

In [None]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer, max_length: int,seed, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Apply preprocessing to each batch of the dataset & and remove 'instruction', 'context', 'response', 'category' fields
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=['id', 'topic', 'dialogue', 'summary'],
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

In [None]:
## Pre-process dataset
max_length = get_max_length(original_model)
print(max_length)
seed = 4012
train_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['validation'])

Found max lenth: 8192
8192
Preprocessing dataset...


Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1999 [00:00<?, ? examples/s]

Preprocessing dataset...


Map:   0%|          | 0/499 [00:00<?, ? examples/s]

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

Filter:   0%|          | 0/499 [00:00<?, ? examples/s]

## Fine Tune Model


In [None]:
"""
apply LoRA adapters to a pre-trained language model. This allows for efficient
fine-tuning by only updating a small percentage of the model's parameters.

attr->
model: The pre-trained language model to be finetuned.
r: This is the LoRA rank. It determines the dimensionality of the
   low-rank matrices used in the adapters.
   higher rank -> more params trained -> better perf -> more vram
traget_modules: module name where LoRA adapters will be applied
lora_alpha: This is the scaling factor for the LoRA adapters
lora_dropout: droput applied to LoRA layers
bias: This specifies whether to train the bias parameters in the LoRA layers
use_gradient_checkpointing: enables gradient checkpointing, which can reduce
memory usage during training by recomputing gradients instead of storing them.
"""
# r = Choose any number > 0 ! Suggested 8, 16, 32, 64, 128

patched_model = FastLanguageModel.get_peft_model(
    original_model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [None]:
trainer = SFTTrainer(
    model = patched_model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 60,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        gradient_checkpointing=True,
        eval_steps=1,
        do_eval=True,
        report_to = "none"
    ),
)

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,999 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080/8,000,000,000 (1.05% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.8255
2,0.8161
3,0.8884
4,0.8426
5,0.8197
6,0.8553
7,0.9401
8,0.683
9,0.9407
10,1.064


## Model Inference

In [None]:
def gen(model, prompt, max_length=1000):
    """
    Generates text from a given model and prompt.

    Args:
        model: The language model to use for generation.
        prompt (str): The input prompt.
        max_length (int): The maximum number of tokens to generate.

    Returns:
        list: A list containing the generated text.
    """
    # Encode the prompt
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
    # Move inputs to the same device as the model
    inputs = inputs.to(model.device)

    # Generate text
    outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)

    # Decode the generated tokens
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return generated_text

In [None]:
def get_output(model, dataset, index):
    prompt = dataset[index]['dialogue']
    summary = dataset[index]['summary']

    formatted_prompt = f"Instruct: Summarize the following conversation.\n{prompt}\nOutput:\n"
    res = gen(model,formatted_prompt,1000)

    output = res[0].split('Output:\n')[1]

    dash_line = '-'.join('' for x in range(100))
    print(dash_line)
    print(f'INPUT PROMPT:\n{formatted_prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

In [None]:
get_output(original_model, dataset['test'], 1)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Summarize the following conversation.
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to 

In [None]:
FastLanguageModel.for_inference(patched_model)
get_output(patched_model, dataset['test'], 1)

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Summarize the following conversation.
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to 

## Save

In [None]:
patched_model.save_pretrained("llama3_8B_summarise_fine_tune")
tokenizer.save_pretrained("llama3_8B_summarise_fine_tune")
patched_model.push_to_hub("romitraj-ds/llama3_8B_summarise_fine_tune", token = "TOKEN")
tokenizer.push_to_hub("romitraj-ds/llama3_8B_summarise_fine_tune", token = "TOKEN")

README.md:   0%|          | 0.00/578 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

Saved model to https://huggingface.co/romitraj-ds/llama3_8B_summarise_fine_tune


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

## Evaluation


In [None]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    human_baseline_text_output = human_baseline_summaries[idx]
    prompt = f"Instruct: Summarize the following conversation.\n{dialogue}\nOutput:\n"

    original_model_res = gen(original_model,prompt,400)
    original_model_text_output = original_model_res[0].split('Output:\n')[1]

    peft_model_res = gen(patched_model,prompt,400)
    peft_model_output = peft_model_res[0].split('Output:\n')[1]
    peft_model_text_output, success, result = peft_model_output.partition('###')

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)
    print(f"Done: {idx+1}")

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
Done: 8
Done: 9
Done: 10


Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1# asks Ms. Dawson to take a dictation ...,#Person1# asks Ms. Dawson to take a dictation ...
1,In order to prevent employees from wasting tim...,#Person1# asks Ms. Dawson to take a dictation ...,#Person1# asks Ms. Dawson to take a dictation ...
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1# asks Ms. Dawson to take a dictation....,#Person1# asks Ms. Dawson to take a dictation ...
3,#Person2# arrives late because of traffic jam....,#Person2# got stuck in the traffic jam and #Pe...,#Person2# got stuck in a traffic jam on the wa...
4,#Person2# decides to follow #Person1#'s sugges...,#Person2# got stuck in traffic and #Person1# s...,#Person2# got stuck in traffic again. #Person1...
5,#Person2# complains to #Person1# about the tra...,#Person2# got stuck in the traffic jam and #Pe...,#Person2# got stuck in a traffic jam and #Pers...
6,#Person1# tells Kate that Masha and Hero get d...,#Person1# tells Kate Masha and Hero are gettin...,#Person1# tells Kate Masha and Hero are gettin...
7,#Person1# tells Kate that Masha and Hero are g...,#Person1# tells Kate that Masha and Hero are g...,#Person1# tells Kate Masha and Hero are gettin...
8,#Person1# and Kate talk about the divorce betw...,#Person1# tells Kate Masha and Hero are gettin...,#Person1# tells Kate Masha and Hero are gettin...
9,#Person1# and Brian are at the birthday party ...,#Person1# gives a necklace to Brian for his bi...,#Person1# gives a birthday gift to Brian and i...


In [None]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('PEFT MODEL:')
print(peft_model_results)

print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ORIGINAL MODEL:
{'rouge1': np.float64(0.31705290845026823), 'rouge2': np.float64(0.10047710327274166), 'rougeL': np.float64(0.26201278643250203), 'rougeLsum': np.float64(0.24806785489138694)}
PEFT MODEL:
{'rouge1': np.float64(0.42695474575617476), 'rouge2': np.float64(0.16893660011240563), 'rougeL': np.float64(0.3473905790841749), 'rougeLsum': np.float64(0.34841494823852615)}
Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL
rouge1: 10.99%
rouge2: 6.85%
rougeL: 8.54%
rougeLsum: 10.03%


## Result

**ORIGINAL MODEL:**

{'rouge1': np.float64(0.31705290845026823), 'rouge2': np.float64(0.10047710327274166), 'rougeL': np.float64(0.26201278643250203), 'rougeLsum': np.float64(0.24806785489138694)}

**PEFT MODEL:**

{'rouge1': np.float64(0.42695474575617476), 'rouge2': np.float64(0.16893660011240563), 'rougeL': np.float64(0.3473905790841749), 'rougeLsum': np.float64(0.34841494823852615)}

**Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL**

rouge1: 10.99%
rouge2: 6.85%
rougeL: 8.54%
rougeLsum: 10.03%