In [1]:
import os
import torch
import pickle

os.environ['UNSLOTH_RETURN_LOGITS'] = '1'

# Import unsloth components
from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments, is_bfloat16_supported
# Import the unsloth utility module to patch its offload function
import unsloth.models._utils as unsloth_utils




🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:

# === Monkey Patch Start ===
def offload_to_disk_patch(W, model, name, temporary_location):
    os.makedirs(temporary_location, exist_ok=True)
    filename = os.path.join(temporary_location, f"{name}.pt")
    # If W is an Embedding module, use its weight tensor
    if hasattr(W, "weight"):
        W = W.weight
    torch.save(W, filename, pickle_module=pickle, pickle_protocol=pickle.HIGHEST_PROTOCOL)
    # Explicitly pass weights_only=False to avoid unpickling issues
    offloaded_W = torch.load(filename, map_location="cpu", mmap=True, weights_only=False)
    offloaded_W._offloaded_file_location = filename
    return offloaded_W

# Apply the patch: rebind the module attribute `unsloth.models._utils.offload_to_disk`
# to the replacement defined in this cell, so unsloth uses the corrected offload function.
# NOTE(review): this only affects call sites that look the function up through the
# `unsloth.models._utils` module at call time; any module that already did
# `from ... import offload_to_disk` keeps the original — confirm against the unsloth source.
unsloth_utils.offload_to_disk = offload_to_disk_patch
# === Monkey Patch End ===

from datasets import load_dataset
from transformers import TextStreamer

# Sequence length handed to the model loader here and to both trainers later in the notebook.
max_seq_length = 2048
# Forwarded unchanged to from_pretrained; presumably None lets unsloth pick the dtype — confirm.
dtype = None
# Ask the loader for 4-bit weights.
load_in_4bit = True

# Load the base model and tokenizer
# (`model` and `tokenizer` are module-level names consumed by every later cell.)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX 6000 Ada Generation. Max memory: 44.521 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
# Wrap the model with PEFT (LoRA) configuration
# NOTE: `model` is rebound to the wrapped model, so this cell is not idempotent —
# re-running it would wrap the already-wrapped model. Re-run the load cell first.
model = FastLanguageModel.get_peft_model(
    model,
    r=128,
    # Attention and MLP projections plus the input embedding and output head.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",
                    "embed_tokens", "lm_head"],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    # Same seed value (3407) that the training arguments use later in the notebook.
    random_state=3407,
    use_rslora=True,
    loftq_config=None,
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [4]:
# Define a prompt format for Wikipedia articles.
# The two `{}` placeholders are filled positionally via str.format:
# first the article title, then the article body.
wikipedia_prompt = """Wikipedia Article
### Title: {}

### Article:
{}"""

# End-of-sequence string taken from the tokenizer; it is appended to every formatted example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of Wikipedia rows into training strings.

    Args:
        examples: Batched mapping with parallel ``"title"`` and ``"text"``
            sequences (the form ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        ``{"text": [...]}`` with one string per row: the row's title and text
        substituted into ``wikipedia_prompt``, followed by ``EOS_TOKEN``.
    """
    rendered = [
        wikipedia_prompt.format(title, body) + EOS_TOKEN
        for title, body in zip(examples["title"], examples["text"])
    ]
    return {"text": rendered}

# Load and prepare a subset of the Wikipedia dataset.
# `train_test_split` shuffles before splitting, so without a seed the 0.01% sample
# changes on every kernel restart. Pin it to the same seed (3407) used for LoRA
# initialisation and the training arguments so a re-run trains on the same articles.
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
dataset = dataset.train_test_split(train_size=0.0001, seed=3407)["train"]
# Overwrites the "text" column with the prompt-formatted article (title + body + EOS).
dataset = dataset.map(formatting_prompts_func, batched=True)


In [None]:
# def unlikelihood_loss(logits, labels):
#     """
#     Compute unlikelihood loss based on lowest non-zero probabilities.
    
#     Args:
#         logits: Model output logits [batch_size, seq_len, vocab_size]
#         labels: Target labels [batch_size, seq_len]
#     """
#     # Input validation
#     if logits is None:
#         raise ValueError("Logits cannot be None - check model outputs")
        
#     # Get probability distribution
#     probs = torch.softmax(logits.float(), dim=-1)  # [batch_size, seq_len, vocab_size]
    
#     # Create mask for valid positions (non-padding)
#     valid_mask = (labels != -100)  # [batch_size, seq_len]
    
#     # Create mask for correct labels to exclude them
#     label_mask = torch.zeros_like(probs).scatter_(-1, labels.unsqueeze(-1).clamp(min=0), 1.0)
    
#     # Mask out correct labels by setting their probabilities to 1.0
#     masked_probs = probs.masked_fill(label_mask.bool(), 1.0)
    
#     # Add small epsilon to avoid exact zeros
#     eps = 1e-8
#     masked_probs = masked_probs + eps
    
#     # Find minimum non-one probability (which will be lowest non-zero + eps)
#     min_probs = torch.min(masked_probs, dim=-1)[0]  # [batch_size, seq_len]
    
#     # Only consider valid positions
#     loss = min_probs * valid_mask
    
#     # Average over valid positions
#     num_valid = valid_mask.sum()
#     if num_valid > 0:
#         loss = loss.sum() / num_valid
#     else:
#         loss = loss.sum() * 0.0
    
#     return loss

In [5]:
def unlikelihood_loss(logits, labels, eps=1e-8):
    """
    Makes tokens with lowest non-zero probabilities become most likely.

    For every position, the token whose softmax probability is the smallest
    value strictly above ``eps`` is chosen as the target. The loss is the
    negative log-probability of that token, averaged over the positions whose
    label is not -100.

    Args:
        logits: Tensor whose last dimension is the vocabulary, e.g.
            [batch_size, seq_len, vocab_size]. Cast to float32 internally.
        labels: Tensor shaped like ``logits`` without its last dimension,
            e.g. [batch_size, seq_len]. Only used as a mask: positions equal
            to -100 do not contribute.
        eps: Probabilities at or below this value are treated as zero and are
            never selected; it is also added inside the log and to the
            denominator so neither can hit log(0) or 0/0.

    Returns:
        Scalar tensor; 0 when no position is valid.

    NOTE(review): ``labels`` is applied unshifted (position t is masked by
    labels[t], not labels[t+1]) — confirm this is the intended alignment for
    a causal LM.
    """
    probs = torch.softmax(logits.float(), dim=-1)

    # Choosing the target token is a discrete decision that carries no
    # gradient, so it is done on a detached copy. Entries treated as zero are
    # lifted to +inf so the minimum never lands on them.
    detached = probs.detach()
    candidates = detached.masked_fill(detached <= eps, float("inf"))
    _, target_idx = torch.min(candidates, dim=-1, keepdim=True)

    # gather() reads exactly the entries a one-hot mask would keep, without
    # materialising extra vocab-sized tensors for the mask and the full
    # -log(probs) product.
    target_probs = probs.gather(-1, target_idx).squeeze(-1)
    token_loss = -torch.log(target_probs + eps)

    # Average over non-ignored positions only.
    valid_mask = labels != -100
    return (token_loss * valid_mask).sum() / (valid_mask.sum() + eps)

In [None]:

# Trainer subclass whose training objective is `unlikelihood_loss`
class UnlikelihoodTrainer(UnslothTrainer):
    """UnslothTrainer variant that computes its loss with ``unlikelihood_loss``.

    NOTE(review): ``num_items_in_batch`` is accepted but ignored, and the loss
    is returned as a per-call mean. The logged training loss (~141 with 8
    gradient-accumulation steps, against a per-call ceiling of roughly
    -log(2e-8) ≈ 17.7) suggests the values are summed rather than averaged
    across accumulation steps — confirm against the installed
    transformers/unsloth versions.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and score it with ``unlikelihood_loss``.

        Args:
            model: Called as ``model(**inputs)``; its result must expose ``.logits``.
            inputs: Batch mapping; must contain a ``"labels"`` entry.
            return_outputs: When true, also return the forward-pass outputs.
            num_items_in_batch: Unused.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        model_outputs = model(**inputs)
        loss_value = unlikelihood_loss(model_outputs.logits, inputs["labels"])
        if return_outputs:
            return (loss_value, model_outputs)
        return loss_value

# Train on the Wikipedia dataset
# Stage 1: trains on the formatted "text" column of the Wikipedia subset with the custom loss.
trainer = UnlikelihoodTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=UnslothTrainingArguments(
        # 2 sequences per step x 8 accumulation steps = 16 sequences per optimizer update.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        max_steps=120,
        warmup_steps=10,
        learning_rate=5e-5,
        # Separate, smaller rate; presumably applied to embed_tokens / lm_head — confirm in unsloth docs.
        embedding_learning_rate=1e-5,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to=["tensorboard"]
    ),
)

# The returned stats are kept in `trainer_stats`, which the second training stage later overwrites.
trainer_stats = trainer.train()

# Define a prompt format for Alpaca-style instruction following.
# The two `{}` placeholders are filled positionally via str.format:
# first the instruction, then the response.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

# Full train split of the instruction dataset; unlike the Wikipedia data it is not subsampled here.
alpaca_dataset = load_dataset("vicgalle/alpaca-gpt4", split="train")

def formatting_prompts_func_alpaca(conversations):
    """Render a batch of instruction/response rows into training strings.

    Args:
        conversations: Batched mapping with parallel ``"instruction"`` and
            ``"output"`` sequences.

    Returns:
        ``{"text": [...]}`` with one string per row: the instruction and
        output substituted into ``alpaca_prompt``, followed by ``EOS_TOKEN``.
    """
    pairs = zip(conversations["instruction"], conversations["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, response) + EOS_TOKEN
            for instruction, response in pairs
        ]
    }

# Write the prompt-formatted example into the "text" column of every row.
alpaca_dataset = alpaca_dataset.map(formatting_prompts_func_alpaca, batched=True)

# Train on the Alpaca dataset
# Stage 2: reuses the same `model` object, so training continues from whatever the
# Wikipedia stage left in it. Compared with the first trainer only the dataset,
# `dataset_num_proc` (8 instead of 2) and `weight_decay` (0.00 instead of 0.01) differ.
trainer = UnlikelihoodTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=alpaca_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=8,
    args=UnslothTrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        max_steps=120,
        warmup_steps=10,
        learning_rate=5e-5,
        embedding_learning_rate=1e-5,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.00,
        lr_scheduler_type="linear",
        seed=3407,
        # Same output directory as the Wikipedia stage.
        output_dir="outputs",
        report_to=["tensorboard"]
    ),
)

# Rebinds `trainer_stats`; the stats from the Wikipedia stage are no longer reachable by name.
trainer_stats = trainer.train()

# Finalize model for inference and save both model and tokenizer
# Switch the model to unsloth's inference mode before writing anything to disk.
FastLanguageModel.for_inference(model)

# Model and tokenizer are written to the same "lora_model" directory (relative to the working directory).
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")


Tokenizing train dataset (num_proc=2): 100%|██████████| 640/640 [00:02<00:00, 253.36 examples/s]
Tokenizing train dataset (num_proc=2): 100%|██████████| 640/640 [00:00<00:00, 760.85 examples/s] 
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 640 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 1,386,217,472


Step,Training Loss
1,141.4599
2,141.123
3,141.2666
4,141.0008
5,141.103
6,141.3393
7,141.3109
8,141.4927
9,141.5543
10,141.7718


# Finito