References:
- https://huggingface.co/blog/mlabonne/sft-llama3
- https://colab.research.google.com/drive/164cg_O7SV7G8kZr_JXqLd6VC7pd86-1Z#scrollTo=zGX9wG7Lhc-z

In [1]:
# !pip install -qqq "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --progress-bar off
# from torch import __version__
# from packaging.version import Version as V

# xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# !pip install -qqq --no-deps {xformers} trl peft accelerate bitsandbytes triton --progress-bar off

## 1. Load model for PEFT

In [2]:
from datasets import load_dataset
from unsloth import FastLanguageModel

from trl import SFTConfig, SFTTrainer
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [10]:
# Pull the IMDB training split and keep just its first 100 rows so the
# fine-tuning demo finishes quickly.
dataset = load_dataset("stanfordnlp/imdb", split="train").select(range(100))

In [11]:
# Load model
# NOTE(review): the saved output of this cell is a ValueError about modules
# being dispatched to CPU/disk, i.e. the quantized model did not fit on the GPU
# at that point. Presumably an earlier load was still resident in the kernel --
# restart the kernel (or free the previous model) before re-running; confirm.

# Maximum sequence length (in tokens) the model is configured for; the trainer
# cell reuses this value.
max_seq_length = 512  # 1024 # 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # unsloth/gemma-2-9b-it-bnb-4bit
    max_seq_length=max_seq_length,
    dtype=None,  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit=True,  # Use 4bit quantization to reduce memory usage. Can be False
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Prepare model for PEFT: attach rank-16 LoRA adapters to the attention and MLP
# projection layers listed in target_modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Dropout = 0 is currently optimized
    bias="none",  # Bias = "none" is currently optimized
    use_gradient_checkpointing=True,  # True or "unsloth" for very long context
    random_state=3407,
)

# print_trainable_parameters() prints the summary itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
model.print_trainable_parameters()

==((====))==  Unsloth 2024.9.post1: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla V100-PCIE-16GB. Max memory: 15.773 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1. CUDA = 7.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [5]:
# from trl import SFTConfig, SFTTrainer

# args = SFTConfig(
#     output_dir="./output",
#     max_seq_length=max_seq_length,
#     dataset_text_field="text",
# )

# trainer = SFTTrainer(
#     model=model,
#     args=args,
#     train_dataset=dataset,
# )
# trainer.train()

In [12]:
# Supervised fine-tuning of the LoRA-wrapped model on the dataset's "text" column.
#
# The SFT-specific options (text field, sequence length, preprocessing workers,
# packing) are set on SFTConfig instead of being passed to SFTTrainer directly:
# TRL deprecated those SFTTrainer keyword arguments in favour of SFTConfig, which
# extends TrainingArguments and therefore also carries the optimizer settings.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(
        # --- data handling ---
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=True,  # concatenate short examples into full-length sequences
        # --- optimization ---
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,  # effective batch size = 4 * 4 = 16
        num_train_epochs=1,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        # NOTE(review): the recorded run of this cell reports only 3 total steps,
        # fewer than the 10 warmup steps, so the learning rate never reaches its
        # peak there -- consider a smaller warmup or more data; confirm intent.
        warmup_steps=10,
        output_dir="output",
        seed=0,
    ),
)

trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 60 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 3
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.7676
2,2.8398
3,2.832


TrainOutput(global_step=3, training_loss=2.8131510416666665, metrics={'train_runtime': 20.4234, 'train_samples_per_second': 2.938, 'train_steps_per_second': 0.147, 'total_flos': 1112830925340672.0, 'train_loss': 2.8131510416666665, 'epoch': 0.8})