In [1]:
# Import necessary packages
!pip install unsloth "xformers==0.0.28.post2"
!pip install evaluate
!pip install rouge_score
# Get the latest nightly Unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Import libraries
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
import evaluate
from transformers import TrainingArguments, DataCollatorWithPadding
from trl import SFTTrainer
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import nltk
import numpy as np
from math import exp

# Install NLTK data
nltk.download('punkt')

# Define model parameters
max_seq_length = 2048  # Adjust as needed
dtype = None  # Auto-detection
load_in_4bit = True  # Use 4-bit quantization

# Load the pre-trained model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Apply PEFT (Parameter-Efficient Fine-Tuning)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Define the prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Modify the code to format prompts based on Alpaca's data structure
EOS_TOKEN = tokenizer.eos_token  # Ensure EOS_TOKEN is defined

def formatting_prompts_func_alpaca(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Load and prepare the Alpaca training dataset
alpaca_dataset = load_dataset("yahma/alpaca-cleaned", split="train")
alpaca_dataset = alpaca_dataset.map(formatting_prompts_func_alpaca, batched=True)

# Initialize the trainer for fine-tuning
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=alpaca_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)


Collecting unsloth
  Downloading unsloth-2024.11.5-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers==0.0.28.post2
  Downloading xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting unsloth-zoo>=2024.11.1 (from unsloth)
  Downloading unsloth_zoo-2024.11.4-py3-none-any.whl.metadata (16 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting transformers>=4.46.1 (from unsloth)
  Downloading transfo

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Unsloth 2024.11.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [2]:
# Train the model
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1,1.8736
2,2.3641
3,2.0199
4,2.2092
5,2.0774
6,1.9473
7,1.4669
8,1.6698
9,1.5427
10,1.7228


In [3]:
# Load the Dolly dataset
test_dataset = load_dataset("databricks/databricks-dolly-15k", split="train[:10%]")

# Define prompt formatting for Dolly dataset
def formatting_prompts_func_dolly(examples):
    instructions = examples["instruction"]
    contexts = examples.get("context", [""] * len(instructions))
    responses = examples["response"]
    texts = []
    refs = []
    for instruction, context, response in zip(instructions, contexts, responses):
        # Prepare input prompt without the response
        text = alpaca_prompt.format(instruction, context, "") + EOS_TOKEN
        texts.append(text)
        refs.append(response)
    return {"text": texts, "references": refs}

# Apply formatting function
test_dataset = test_dataset.map(formatting_prompts_func_dolly, batched=True)

README.md:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/1501 [00:00<?, ? examples/s]

In [4]:
# Tokenize the test dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=max_seq_length,
    )

# Tokenize and remove unnecessary columns
test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=[column for column in test_dataset.column_names if column != 'references']
)

# Initialize the data collator
data_collator = DataCollatorWithPadding(
    tokenizer, padding='longest', return_tensors='pt'
)

# Create DataLoader for the test dataset
def custom_collate_fn(batch):
    batch_output = data_collator(batch)
    batch_output['references'] = [item['references'] for item in batch]
    return batch_output

test_loader = DataLoader(test_dataset, batch_size=2, collate_fn=custom_collate_fn)

Map:   0%|          | 0/1501 [00:00<?, ? examples/s]

In [5]:
# Initialize metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

model.eval()
predictions = []
references = []

# Generate predictions and collect references
for batch in tqdm(test_loader):
    input_ids = batch["input_ids"].to(model.device)
    attention_mask = batch["attention_mask"].to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=512,
            num_beams=1,
            do_sample=False,
            early_stopping=True,
        )
    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    refs = batch["references"]
    predictions.extend(generated_texts)
    references.extend(refs)

# Compute BLEU score
tokenized_preds = [nltk.word_tokenize(pred) for pred in predictions]
tokenized_refs = [[nltk.word_tokenize(ref)] for ref in references]
bleu = bleu_metric.compute(predictions=tokenized_preds, references=tokenized_refs)

# Compute ROUGE scores
rouge_scores = rouge_metric.compute(
    predictions=predictions,
    references=references,
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)

print("BLEU score:", bleu["bleu"])
print("\nROUGE scores:")
for rouge_type, score in rouge_scores.items():
    print(f"{rouge_type.upper()}:")
    print(f"  Precision: {score['precision']:.4f}")
    print(f"  Recall:    {score['recall']:.4f}")
    print(f"  F1 Score:  {score['fmeasure']:.4f}")

# Compute Perplexity
total_loss = 0
total_tokens = 0
for batch in tqdm(test_loader):
    input_ids = batch["input_ids"].to(model.device)
    attention_mask = batch["attention_mask"].to(model.device)
    labels = input_ids.clone()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    total_loss += loss.item() * attention_mask.sum().item()
    total_tokens += attention_mask.sum().item()

avg_loss = total_loss / total_tokens
perplexity = exp(avg_loss)
print("\nPerplexity:", perplexity)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  0%|          | 0/751 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`references` in this case) have excessive nesting (inputs type `list` where type `int` is expected).