## Installation

In [None]:
# Install dependencies
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install evaluate
!pip install rouge_score

## Imports

In [None]:
import os
from functools import partial

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
# unsloth stays ahead of trl / transformers, as in the original ordering.
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import evaluate

## Unsloth

In [None]:
# Loader settings. These three names are kept at notebook level because the
# trainer cell further down reads max_seq_length again.
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Collect the loader arguments in one place, then load the 4-bit Llama-3 8B
# checkpoint together with its tokenizer.
loader_options = {
    "model_name": "unsloth/llama-3-8b-bnb-4bit",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
original_model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

## Data


In [None]:
# Hub identifier of the dialogue-summarisation dataset. Later cells index the
# loaded object by split name: 'train', 'validation' and 'test'.
huggingface_dataset_name = "neil-code/dialogsum-test"
dataset = load_dataset(huggingface_dataset_name)

In [None]:
# Show which columns the training split provides.
dataset['train'].column_names

In [None]:
def create_prompt_formats(sample):
    """
    Build the training prompt for one sample and store it under 'text'.

    The prompt consists of an intro blurb, the instruction line, the dialogue
    (left out when it is empty), the response header followed by the summary,
    and an end marker. The sections are joined with two newline characters.

    :param sample: Sample dictionary holding 'dialogue' and 'summary' entries
    :return: The same dictionary, with the finished prompt added as 'text'
    """
    intro_text = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    instruction_line = "### Instruct: Summarize the below conversation."
    response_header = "### Output:"
    end_marker = "### End"

    # The blurb carries a leading newline, so every prompt starts with "\n".
    sections = ["\n" + intro_text, instruction_line]

    # The dialogue section only appears when there is something to show.
    if sample["dialogue"]:
        dialogue_text = f"{sample['dialogue']}"
        if dialogue_text:
            sections.append(dialogue_text)

    sections.append(f"{response_header}\n{sample['summary']}")
    sections.append(end_marker)

    sample["text"] = "\n\n".join(sections)
    return sample

In [None]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """
    Look up the maximum sequence length recorded on a model's config.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are checked in that order; the first one holding a truthy
    value is used. When none of them is set, 1024 is returned instead.

    :param model: Object exposing a ``config`` attribute
    :return: The length found on the config, or 1024 as the fallback
    """
    conf = model.config
    max_length = None
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        # Read from the config fetched once above instead of re-reading model.config.
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the 'text' entries of a batch.

    :param batch: Mapping with a 'text' entry to tokenize
    :param tokenizer: Callable tokenizer accepting ``max_length`` and ``truncation``
    :param max_length: Length limit forwarded to the tokenizer (truncation is on)
    :return: Whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    tokenizer_options = {"max_length": max_length, "truncation": True}
    return tokenizer(texts, **tokenizer_options)

In [None]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer, max_length: int, seed, dataset):
    """Format and tokenize a dataset so it is ready for training.

    :param tokenizer: Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: Seed handed to the final shuffle
    :param dataset: Dataset to process; it must offer ``map``, ``filter`` and
        ``shuffle`` and contain the 'id', 'topic', 'dialogue' and 'summary' columns
    :return: The formatted, tokenized, filtered and shuffled dataset
    """
    print("Preprocessing dataset...")

    def tokenize_batch(batch):
        # Bind the tokenizer and the length limit for use inside map().
        return preprocess_batch(batch, tokenizer, max_length)

    def is_below_limit(sample):
        # Rows whose token count reaches max_length are discarded.
        return len(sample["input_ids"]) < max_length

    # Step 1: attach the formatted prompt to every sample under 'text'.
    with_prompts = dataset.map(create_prompt_formats)

    # Step 2: tokenize batch-wise and drop the raw source columns
    # (the generated 'text' column is kept).
    tokenized = with_prompts.map(
        tokenize_batch,
        batched=True,
        remove_columns=['id', 'topic', 'dialogue', 'summary'],
    )

    # Step 3: filter by length, then shuffle with the given seed.
    return tokenized.filter(is_below_limit).shuffle(seed=seed)

In [None]:
## Pre-process dataset
max_length = get_max_length(original_model)
print(max_length)
seed = 4012
train_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['validation'])

## Fine Tune Model


In [None]:
# Apply LoRA adapters to the pre-trained language model. This allows for
# efficient fine-tuning by only updating a small percentage of the model's
# parameters.
#
# Arguments passed below:
#   model (positional)          the pre-trained language model to be fine-tuned
#   r                           the LoRA rank, i.e. the dimensionality of the
#                               low-rank matrices used in the adapters
#                               (higher rank -> more params trained -> more VRAM);
#                               any number > 0, suggested 8, 16, 32, 64, 128
#   target_modules              module names where LoRA adapters will be applied
#   lora_alpha                  scaling factor for the LoRA adapters
#   lora_dropout                dropout applied to the LoRA layers
#                               (any value is supported, 0 is optimized)
#   bias                        whether bias parameters are trained
#                               (any value is supported, "none" is optimized)
#   use_gradient_checkpointing  True or "unsloth"; per the original note,
#                               "unsloth" uses 30% less VRAM and fits 2x larger
#                               batch sizes
#
# NOTE(review): PEFT documents get_peft_model as modifying the model in place,
# so `original_model` presumably carries the adapters after this call - confirm
# before treating it as an untouched baseline.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

patched_model = FastLanguageModel.get_peft_model(
    original_model,
    r=32,
    target_modules=lora_target_modules,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [None]:
# Build the supervised fine-tuning trainer around the LoRA-patched model.
# Both datasets were already formatted and tokenized by preprocess_dataset;
# the 'text' column named below is the prompt created by create_prompt_formats.
trainer = SFTTrainer(
    model = patched_model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step per device.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 60,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        gradient_checkpointing=True,
        # NOTE(review): no evaluation strategy is set in this call, so it is
        # unclear whether eval_steps / do_eval trigger periodic evaluation -
        # confirm against the TrainingArguments docs. If they do, evaluating
        # after every single step will be slow.
        eval_steps=1,
        do_eval=True,
        report_to = "none"
    ),
)

In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

## Model Inference

In [None]:
def gen(model, prompt, max_length=1000):
    """
    Generate text from ``prompt`` with ``model``.

    Encoding and decoding use the notebook-level ``tokenizer``.

    Args:
        model: The language model to use for generation.
        prompt (str): The input prompt.
        max_length (int): Forwarded unchanged to ``model.generate`` as
            ``max_length``.

    Returns:
        list: The decoded output strings, with special tokens skipped.
    """
    # NOTE(review): in Hugging Face `generate`, max_length presumably bounds the
    # prompt plus the newly generated tokens, not just the new ones - confirm.
    encoded = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
    # The encoded prompt has to live on the same device as the model.
    encoded = encoded.to(model.device)

    generated_ids = model.generate(
        **encoded,
        max_length=max_length,
        num_return_sequences=1,
    )

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
def get_output(model, dataset, index):
    """
    Print the prompt, the human summary and the model's summary for one row.

    Args:
        model: Model handed to ``gen`` for generation.
        dataset: Indexable dataset whose rows hold 'dialogue' and 'summary'.
        index: Position of the row to show.
    """
    dialogue_text = dataset[index]['dialogue']
    reference_summary = dataset[index]['summary']

    formatted_prompt = f"Instruct: Summarize the following conversation.\n{dialogue_text}\nOutput:\n"
    decoded = gen(model, formatted_prompt, 1000)

    # Keep only what follows the 'Output:' header of the prompt.
    model_summary = decoded[0].split('Output:\n')[1]

    # Separator line of 99 dashes.
    separator = '-' * 99

    print(separator)
    print(f'INPUT PROMPT:\n{formatted_prompt}')
    print(separator)
    print(f'BASELINE HUMAN SUMMARY:\n{reference_summary}\n')
    print(separator)
    print(f'MODEL GENERATION - ZERO SHOT:\n{model_summary}')

In [None]:
# PEFT injects the LoRA layers into the base model in place, so after
# get_peft_model `original_model` shares its adapter-wrapped modules with
# `patched_model`. Disable the adapters while sampling so this output reflects
# the base weights rather than the fine-tuned ones.
with patched_model.disable_adapter():
    get_output(original_model, dataset['test'], 1)

In [None]:
# Prepare the fine-tuned model via FastLanguageModel.for_inference, then print
# its summary for test example 1.
FastLanguageModel.for_inference(patched_model)
get_output(patched_model, dataset['test'], 1)

## Save

In [None]:
# Local folder and Hub repository that receive the fine-tuned model and tokenizer.
LOCAL_SAVE_DIR = "llama3_8B_summarise_fine_tune"
HUB_REPO_ID = "romitraj-ds/llama3_8B_summarise_fine_tune"

# Read the Hugging Face token from the environment instead of embedding it in
# the notebook, where it would leak through version control or sharing.
# When HF_TOKEN is unset this is None; the Hub client is then expected to fall
# back to locally stored credentials (e.g. from `huggingface-cli login`).
hf_token = os.environ.get("HF_TOKEN")

patched_model.save_pretrained(LOCAL_SAVE_DIR)
tokenizer.save_pretrained(LOCAL_SAVE_DIR)
patched_model.push_to_hub(HUB_REPO_ID, token = hf_token)
tokenizer.push_to_hub(HUB_REPO_ID, token = hf_token)

## Evaluation


In [None]:
# Evaluate on the first 10 test examples.
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []


def _extract_summary(decoded_text):
    """Return the text after the 'Output:' header, cut at the first '###' marker."""
    completion = decoded_text.split('Output:\n')[1]
    return completion.partition('###')[0]


for idx, dialogue in enumerate(dialogues):
    prompt = f"Instruct: Summarize the following conversation.\n{dialogue}\nOutput:\n"

    # get_peft_model injected the LoRA layers into the base model in place, so
    # `original_model` would otherwise generate with the fine-tuned adapters.
    # Disable them for the baseline so the comparison uses the base weights.
    with patched_model.disable_adapter():
        original_model_res = gen(original_model, prompt, 400)

    peft_model_res = gen(patched_model, prompt, 400)

    # Both outputs get the same post-processing; truncating only the fine-tuned
    # output at '###' would bias the ROUGE comparison in its favour.
    original_model_summaries.append(_extract_summary(original_model_res[0]))
    peft_model_summaries.append(_extract_summary(peft_model_res[0]))
    print(f"Done: {idx+1}")

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

In [None]:
rouge = evaluate.load('rouge')


def _rouge_scores(predictions):
    """Score predictions against the same number of leading human summaries."""
    return rouge.compute(
        predictions=predictions,
        references=human_baseline_summaries[0:len(predictions)],
        use_aggregator=True,
        use_stemmer=True,
    )


original_model_results = _rouge_scores(original_model_summaries)
peft_model_results = _rouge_scores(peft_model_summaries)

print('ORIGINAL MODEL:')
print(original_model_results)
print('PEFT MODEL:')
print(peft_model_results)

print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

# Differences are taken position by position over the two result dicts and
# labelled with the PEFT result's keys.
peft_scores = np.array(list(peft_model_results.values()))
original_scores = np.array(list(original_model_results.values()))
improvement = peft_scores - original_scores

for metric_name, delta in zip(peft_model_results.keys(), improvement):
    print(f'{metric_name}: {delta*100:.2f}%')

## Result

**ORIGINAL MODEL:**

{'rouge1': np.float64(0.31705290845026823), 'rouge2': np.float64(0.10047710327274166), 'rougeL': np.float64(0.26201278643250203), 'rougeLsum': np.float64(0.24806785489138694)}

**PEFT MODEL:**

{'rouge1': np.float64(0.42695474575617476), 'rouge2': np.float64(0.16893660011240563), 'rougeL': np.float64(0.3473905790841749), 'rougeLsum': np.float64(0.34841494823852615)}

**Absolute improvement of PEFT MODEL over ORIGINAL MODEL (percentage points)**

- rouge1: 10.99%
- rouge2: 6.85%
- rougeL: 8.54%
- rougeLsum: 10.03%