In [1]:
import os
import torch
import pickle

# Import unsloth components
from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments, is_bfloat16_supported
# Import the unsloth utility module to patch its offload function
import unsloth.models._utils as unsloth_utils

# === Monkey Patch Start ===
def offload_to_disk_patch(W, model, name, temporary_location):
    os.makedirs(temporary_location, exist_ok=True)
    filename = os.path.join(temporary_location, f"{name}.pt")
    # If W is an Embedding module, use its weight tensor
    if hasattr(W, "weight"):
        W = W.weight
    torch.save(W, filename, pickle_module=pickle, pickle_protocol=pickle.HIGHEST_PROTOCOL)
    # Explicitly pass weights_only=False to avoid unpickling issues
    offloaded_W = torch.load(filename, map_location="cpu", mmap=True, weights_only=False)
    offloaded_W._offloaded_file_location = filename
    return offloaded_W

# Apply the patch so unsloth uses the corrected offload function
unsloth_utils.offload_to_disk = offload_to_disk_patch
# === Monkey Patch End ===

from datasets import load_dataset
from transformers import TextStreamer

max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load the base model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Define the unlikelihood loss function
def unlikelihood_loss(logits, labels):
    # logits: [batch_size, seq_length, vocab_size]
    # labels: [batch_size, seq_length] (with -100 for ignored tokens)
    valid_mask = (labels != -100)
    # Compute probabilities
    probs = torch.softmax(logits, dim=-1)
    # Clamp labels to avoid negative indices (ignored ones will be masked later)
    clamped_labels = labels.clamp(min=0)
    # Create one-hot encoding for valid labels
    one_hot = torch.zeros_like(probs)
    one_hot = one_hot.scatter(-1, clamped_labels.unsqueeze(-1), 1.0)
    # Zero out positions corresponding to ignored labels (-100)
    one_hot = one_hot * valid_mask.unsqueeze(-1)
    
    # Set probabilities for the true tokens to 1.0 so they do not affect the min computation
    masked_probs = probs.masked_fill(one_hot.bool(), 1.0)
    # Get the minimum probability across the vocabulary for each position
    min_probs = torch.min(masked_probs, dim=-1)[0]
    # Compute loss only on valid positions, add epsilon for numerical stability
    loss = -torch.log(min_probs + 1e-8)
    loss = (loss * valid_mask).sum() / valid_mask.sum()
    return loss


# Wrap the model with PEFT (LoRA) configuration
model = FastLanguageModel.get_peft_model(
    model,
    r=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",
                    "embed_tokens", "lm_head"],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=True,
    loftq_config=None,
)

# Define a prompt format for Wikipedia articles
wikipedia_prompt = """Wikipedia Article
### Title: {}

### Article:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    titles = examples["title"]
    texts = examples["text"]
    outputs = []
    for title, text in zip(titles, texts):
        text = wikipedia_prompt.format(title, text) + EOS_TOKEN
        outputs.append(text)
    return {"text": outputs}

# Load and prepare a subset of the Wikipedia dataset
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
dataset = dataset.train_test_split(train_size=0.01)["train"]
dataset = dataset.map(formatting_prompts_func, batched=True)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX 6000 Ada Generation. Max memory: 44.521 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [2]:

# Custom trainer that uses unlikelihood loss
class UnlikelihoodTrainer(UnslothTrainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs["labels"]
        loss = unlikelihood_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Train on the Wikipedia dataset
trainer = UnlikelihoodTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=UnslothTrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        max_steps=120,
        warmup_steps=10,
        learning_rate=5e-5,
        embedding_learning_rate=1e-5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

trainer_stats = trainer.train()

# Define a prompt format for Alpaca-style instruction following
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

alpaca_dataset = load_dataset("vicgalle/alpaca-gpt4", split="train")

def formatting_prompts_func_alpaca(conversations):
    texts = []
    for instruction, output in zip(conversations["instruction"], conversations["output"]):
        text = alpaca_prompt.format(instruction, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

alpaca_dataset = alpaca_dataset.map(formatting_prompts_func_alpaca, batched=True)

# Train on the Alpaca dataset
trainer = UnlikelihoodTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=alpaca_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=8,
    args=UnslothTrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        max_steps=120,
        warmup_steps=10,
        learning_rate=5e-5,
        embedding_learning_rate=1e-5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.00,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

trainer_stats = trainer.train()

# Finalize model for inference and save both model and tokenizer
FastLanguageModel.for_inference(model)

model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")


Tokenizing train dataset (num_proc=2): 100%|██████████| 64078/64078 [01:32<00:00, 689.37 examples/s]
Tokenizing train dataset (num_proc=2): 100%|██████████| 64078/64078 [00:34<00:00, 1876.13 examples/s]
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 64,078 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 1,386,217,472
/pytorch/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:365: operator(): block: [0,0,0], thread: [127,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:365: operator(): block: [0,0,0], thread: [32,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:365: operator(): block: [0,0,0], thread: [33,0,0]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
