In [2]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install evaluate  # Install the evaluate library for metrics

import torch
from unsloth import FastLanguageModel
import math
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template
from unsloth.chat_templates import train_on_responses_only
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from torch.utils.data import DataLoader
from tqdm import tqdm
import evaluate


In [4]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=242d321503c4feb7db07c7eac0533524e6257a55aaf38c8ce2108fdf6c37fd66
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [5]:
# Initialize metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.


In [6]:
# 4bit pre-quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",  # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.10.6: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [7]:
# Prepare the model for PEFT
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",     # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,   # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

# Set the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

In [8]:
# Function to format prompts
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]

    # Combine instruction, input, and output
    convos = [{"role": "user", "content": f"Instruction: {instruction}\nInput: {input_text}"} for instruction, input_text in zip(instructions, inputs)]
    responses = [{"role": "assistant", "content": output} for output in outputs]

    # Combine conversations and apply the chat template
    conversations = [{"conversations": [convo, response]} for convo, response in zip(convos, responses)]

    texts = [tokenizer.apply_chat_template(convo["conversations"], tokenize=False, add_generation_prompt=False)
             for convo in conversations]

    return {"text": texts}

# Load and split the dataset
dataset = load_dataset("tatsu-lab/alpaca", split="train")

# Split dataset into training and validation sets
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Apply formatting to both datasets
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

# Initialize the trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # Use this for WandB etc
        evaluation_strategy="steps",
        eval_steps=10,
    ),
)


README.md:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

(…)-00000-of-00001-a09b74b3ef9c3b56.parquet:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Map:   0%|          | 0/46801 [00:00<?, ? examples/s]

Map:   0%|          | 0/5201 [00:00<?, ? examples/s]



Map (num_proc=2):   0%|          | 0/46801 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/5201 [00:00<?, ? examples/s]

In [9]:
# Adjust trainer to train on responses only
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# Start training
trainer_stats = trainer.train()

Map:   0%|          | 0/46801 [00:00<?, ? examples/s]

Map:   0%|          | 0/5201 [00:00<?, ? examples/s]

**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


Step,Training Loss,Validation Loss
10,1.2737,1.405264
20,1.3504,1.365965
30,1.2754,1.342801
40,1.6331,1.340452
50,1.2919,1.333517
60,1.3115,1.330356


In [11]:
# Evaluate the model to get perplexity
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity}")

# Prepare the model for inference
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Function to generate predictions and compute metrics
def evaluate_model(model, tokenizer, eval_dataset, batch_size=8):
    from torch.utils.data import DataLoader

    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)
    predictions = []
    references = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            instructions = batch['instruction']
            inputs_text = batch['input']
            outputs_text = batch['output']

            batch_messages = []
            for instruction, input_text in zip(instructions, inputs_text):
                message = [
                    {
                        "role": "user",
                        "content": f"Instruction: {instruction}\nInput: {input_text}"
                    }
                ]
                batch_messages.append(message)

            # Apply chat template and tokenize
            inputs = tokenizer.apply_chat_template(
                batch_messages,
                tokenize=True,
                add_generation_prompt=True,  # Must add for generation
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_seq_length,
            ).to("cuda")

            # Compute attention mask
            attention_mask = inputs.ne(tokenizer.pad_token_id).long()

            # Generate outputs from the model
            outputs = model.generate(
                input_ids=inputs,
                attention_mask=attention_mask,
                max_new_tokens=64,
                temperature=1.0,
                min_p=0.1,
            )

            # Get the length of the inputs
            input_length = inputs.shape[1]

            # Decode the generated text
            generated_tokens = outputs[:, input_length:]
            decoded_outputs = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )

            # Append the generated text to predictions
            predictions.extend(decoded_outputs)
            # Append the reference outputs
            references.extend(outputs_text)

    # Compute BLEU and ROUGE scores
    bleu_score = bleu_metric.compute(predictions=predictions, references=references)
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)

    print(f"BLEU score: {bleu_score['bleu']}")
    print(f"ROUGE score: {rouge_score}")

# Evaluate the model
evaluate_model(model, tokenizer, eval_dataset)


Perplexity: 3.7824032125564866


100%|██████████| 651/651 [1:15:33<00:00,  6.96s/it]


BLEU score: 0.0909689813037571
ROUGE score: {'rouge1': 0.41231944107458596, 'rouge2': 0.1970377769672788, 'rougeL': 0.3435583318106801, 'rougeLsum': 0.3571616172839701}
