In [None]:
# Adapted from unsloths's notebooks

# import os
# if "COLAB_" not in "".join(os.environ.keys()):
#     print("Not in Colab. Assuming Unsloth is already installed.")
#     print("If not, please install with: pip install unsloth[colab-new]")
# else:
#     # Do this only in Colab notebooks! Otherwise use pip install unsloth
#     print("Installing Unsloth and dependencies for Colab...")
#     !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
#     !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
#     !pip install --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#     print("Installation complete.")

from unsloth import FastLanguageModel # Changed from FastModel for DPO consistency
from transformers import AutoTokenizer
import torch
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template

max_seq_length = 2048 # Choose any for long context!
# Use "unsloth" for LoRA optimization to fit 2x larger batch sizes!
use_gradient_checkpointing = "unsloth" # True or "unsloth" for very long context

model_id = "unsloth/gemma-3-4b-it"

# Load Gemma-3 model
model, _ = FastLanguageModel.from_pretrained(
    model_name = model_id, # Using the instruction-tuned base
    max_seq_length = max_seq_length,
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    # token = "hf_...", # use one if using gated models
    # You can add dtype=torch.bfloat16 if your GPU supports it for potentially faster training
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # LoRA rank. 8-64 is common. DPO template used 64. Let's try 16.
    lora_alpha = 32, # Recommended lora_alpha = 2 * r
    lora_dropout = 0.05, # Supports any, but = 0 is optimized for some Unsloth features
    bias = "none",    # Supports any, but = "none" is optimized
    # Target modules for Gemma-3. These are typical for Llama-like architectures.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    use_gradient_checkpointing = use_gradient_checkpointing,
    random_state = 3407,
    max_seq_length = max_seq_length,
)

# Set up chat template for Gemma-3
# DPOTrainer will use this to format prompts and responses
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
    # map_eos_token = True, # Gemma-3 tokenizer has an EOS token, so this might not be needed
)
# It's good practice to set pad_token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# --- Data Prep for DPO -------------------------------------------------------
# set to the dpo pairs dataset that was generated by auto_unslop
dpo_file = "./dpo_pairs_dataset.jsonl"
print(f"Loading DPO dataset from {dpo_file}")
dpo_dataset = load_dataset("json", data_files=str(dpo_file), split="train")

# quick sanity-check – drop any rows that somehow lack the three fields
req_cols = {"prompt", "chosen", "rejected"}
before   = len(dpo_dataset)
dpo_dataset = dpo_dataset.filter(lambda x: all(col in x and x[col] for col in req_cols))
after    = len(dpo_dataset)
if after == 0:
    raise ValueError("All rows were filtered out – check dataset contents.")
if after < before:
    print(f"Filtered out {before - after} malformed rows; {after} remain.")

print(f"DPO dataset ready with {after} samples.")


# --- Train the model with DPO ---
# Note: For DPO, you typically don't need a separate eval_dataset during training,
# but it can be useful for monitoring.
# The `ref_model` is set to None for LoRA, DPOTrainer will handle creating a reference.

# Ensure the dataset is not empty
if len(dpo_dataset) == 0:
    raise ValueError("DPO dataset is empty. Please provide a valid dataset.")

dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None, # Handled automatically by DPOTrainer for LoRA
    train_dataset = dpo_dataset,
    # eval_dataset = YOUR_EVAL_DPO_DATASET_HERE, # Optional
    tokenizer = tokenizer,
    args = DPOConfig(
        per_device_train_batch_size = 1, # Adjust based on your GPU memory
        gradient_accumulation_steps = 4, # Effective batch size = 1 * 4 = 4
        warmup_ratio = 0.1, # Or warmup_steps
        num_train_epochs = 3, # For a quick demo. Set to 1-3 for a real run.
        # max_steps = 60, # Alternatively, use max_steps for a fixed number of steps
        learning_rate = 5e-5, # Common DPO learning rate
        logging_steps = 10,
        optim = "adamw_8bit", # Unsloth optimizes this
        seed = 42,
        output_dir = "outputs_gemma3_4b_dpo",
        max_length = max_seq_length,         # Max length of combined prompt + response
        max_prompt_length = max_seq_length // 2, # Max length of prompt
        beta = 0.1, # DPO beta parameter
        report_to = "none", # "wandb" or "tensorboard"
        lr_scheduler_type = "linear",
        # bf16 = True, # Set to True if your GPU supports bfloat16 and you loaded model with bfloat16
        # fp16 = False, # Set to True for mixed precision if bf16 is not available (and not using 4bit)
    ),
)

print("Starting DPO training...")
trainer_stats = dpo_trainer.train()
print("DPO training finished.")


# --- Show current memory stats (optional) ---
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3) # Corrected: use memory_reserved
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved after training.") # This will be peak if called after train

    # Show final memory and time stats
    if hasattr(trainer_stats, 'metrics') and 'train_runtime' in trainer_stats.metrics:
        used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
        # Note: start_gpu_memory was captured *after* model loading.
        # A more accurate "used_memory_for_lora" would require capturing memory before model load and after.
        # For simplicity, we'll just show peak reserved.
        used_percentage = round(used_memory / max_memory * 100, 3)
        print(f"{trainer_stats.metrics['train_runtime']:.2f} seconds used for training.")
        print(f"{trainer_stats.metrics['train_runtime']/60:.2f} minutes used for training.")
        print(f"Peak reserved memory = {used_memory} GB.")
        print(f"Peak reserved memory % of max memory = {used_percentage} %.")
else:
    print("CUDA not available. Memory stats not shown.")


# --- Inference after DPO ---
# For inference, make sure the tokenizer has the chat template for Gemma-3
# The model is already LoRA-adapted.

# Reload tokenizer with chat template if needed (should be already set)
# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "gemma-3",
# )

# Example prompt
messages = [{
    "role": "user",
    "content": "What are the pros and cons of pineapple on pizza?"
}]

# Apply chat template for generation
# For DPO, the model has learned from preferences, so it should generate better responses.
# The prompt format should match what it saw during training (user turn).
# `add_generation_prompt=True` adds the necessary tokens to signal the model to start generating.
# For Gemma-3, this means it will end with `<start_of_turn>model\n`
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Crucial for generation
    return_tensors = "pt",
).to("cuda" if torch.cuda.is_available() else "cpu")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

print("\n--- Generating DPO model response (streaming) ---")
_ = model.generate(
    inputs,
    max_new_tokens = 128, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 0.7, # Slightly lower temperature for more focused output after DPO
    top_p = 0.95,
    top_k = 64,
    streamer = text_streamer,
    pad_token_id = tokenizer.eos_token_id # Important for generation
)
print("\n--- End of DPO model response ---")

print("\nScript finished. Remember to replace placeholder DPO dataset with your actual data.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Gemma3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.553 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Unsloth: Making `base_model.model.vision_tower.vision_model` require gradients
Loading DPO dataset from ./dpo_pairs_dataset.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/320 [00:00<?, ? examples/s]

DPO dataset ready with 320 samples.


Extracting prompt in train dataset (num_proc=48):   0%|          | 0/320 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=48):   0%|          | 0/320 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=48):   0%|          | 0/320 [00:00<?, ? examples/s]

Starting DPO training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 320 | Num Epochs = 3 | Total steps = 240
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 32,788,480/4,000,000,000 (0.82% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
10,1.9029,-0.283318,0.889211,0.25,-1.172529,-2506.524902,-2402.157471,-4.831528,-4.792796,0,0,0,0
20,1.6954,3.861495,2.541639,0.7,1.319857,-2508.290283,-2387.523193,-4.650035,-4.648216,No Log,No Log,No Log,No Log
30,1.1474,-5.592238,-7.692761,0.75,2.100524,-2668.98877,-2509.349365,-4.461812,-4.574172,No Log,No Log,No Log,No Log
40,1.2196,0.287206,-7.624055,0.9,7.911261,-2660.171387,-2558.669678,-4.217445,-4.168513,No Log,No Log,No Log,No Log
50,1.2615,-4.089805,-11.590117,0.8,7.500314,-2561.205566,-2509.316895,-4.044028,-4.072822,No Log,No Log,No Log,No Log
60,1.7538,-11.867795,-20.503782,0.85,8.635985,-2714.646484,-2605.021973,-4.631408,-4.674015,No Log,No Log,No Log,No Log
70,1.9163,-9.156513,-22.363285,0.85,13.206772,-2681.920898,-2538.767578,-4.751256,-4.794691,No Log,No Log,No Log,No Log
80,0.4712,-2.798518,-19.194332,0.95,16.395815,-2564.045654,-2537.972412,-4.697979,-4.782566,No Log,No Log,No Log,No Log
90,0.0004,6.759625,-21.737743,1.0,28.49737,-2529.057373,-2598.914551,-4.560782,-4.692301,No Log,No Log,No Log,No Log
100,0.2462,6.345956,-22.088821,0.975,28.434778,-2464.271973,-2587.531006,-4.474179,-4.543868,No Log,No Log,No Log,No Log


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


DPO training finished.
GPU = NVIDIA GeForce RTX 4090. Max memory = 23.553 GB.
20.564 GB of memory reserved after training.
1447.59 seconds used for training.
24.13 minutes used for training.
Peak reserved memory = 20.564 GB.
Peak reserved memory % of max memory = 87.309 %.

--- Generating DPO model response (streaming) ---
Okay, let's dive into the highly debated topic of pineapple on pizza! Here’s a breakdown of the pros and cons, considering both the objective and subjective aspects:

**Pros (Arguments in favor):**

* **Flavor Contrast:** This is the biggest argument. The sweetness and acidity of pineapple provide a fantastic contrast to the savory, salty, and often fatty elements of cheese, ham, and tomato sauce. It's a complex, multi-layered taste experience.
* **Acidity Cuts Through Fat:** Pineapple's acidity helps to break down the richness of the cheese and meat, preventing the pizza from feeling too

--- End of DPO model response ---
DPO LoRA adapters and tokenizer saved to ./g

In [5]:
p = "In this world, salaries are determined by the desirability of the work: if everybody wants to do the job and it's fun, it pays minimum wage. But if it's hard or awful work that nobody wants to do, the pay is high. You decide to apply for the highest-paying job in the world."
p = "Write 1000 word short story to this prompt: A close friend of yours can read minds. It was their dream to work for the FBI or CIA to catch bad guys. You accompanied them to their first interview, but instead they walk straight back out. They whisper to you to walk calmly out to the car and not to say a word or make eye contact, act calm."

# Example prompt
messages = [{
    "role": "user",
    "content": p
}]

# Apply chat template for generation
# For DPO, the model has learned from preferences, so it should generate better responses.
# The prompt format should match what it saw during training (user turn).
# `add_generation_prompt=True` adds the necessary tokens to signal the model to start generating.
# For Gemma-3, this means it will end with `<start_of_turn>model\n`
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Crucial for generation
    return_tensors = "pt",
).to("cuda" if torch.cuda.is_available() else "cpu")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

print("\n--- Generating DPO model response (streaming) ---")
_ = model.generate(
    inputs,
    max_new_tokens = 600, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 0.7, # Slightly lower temperature for more focused output after DPO
    top_p = 0.95,
    top_k = 64,
    streamer = text_streamer,
    pad_token_id = tokenizer.eos_token_id # Important for generation
)
print("\n--- End of DPO model response ---")


--- Generating DPO model response (streaming) ---
The air in the Sterling Federal Building was a specific kind of heavy, a blend of legal documents, nervous energy, and the low, constant hum of fluorescent lights. It pressed down on me, a physical weight, and it was nothing compared to the weight on Leo’s shoulders. He was a study in controlled anxiety, a fact I’d become intimately familiar with over the last decade. Leo could read minds. Not with a dramatic, telepathic projection, but a quiet, internal awareness, a constant, low-level reception of thoughts and feelings. It was a gift, a curse, and, as he’d always dreamed, a potential weapon against the bad guys. 

He’d applied to the FBI, specifically the Behavioral Analysis Unit, a path he’d mapped out with the single-minded determination of a man who’d spent years processing the internal monologues of criminals. I, on the other hand, was a freelance writer, content with observing the world from a comfortable distance. I’d agreed to

In [6]:

# --- Saving the DPO-finetuned model (LoRA adapters) ---
dpo_model_save_path = "gemma-3-4b-dpo-lora"
dpo_trainer.save_model(dpo_model_save_path) # Saves LoRA adapters
tokenizer.save_pretrained(dpo_model_save_path)
print(f"DPO LoRA adapters and tokenizer saved to ./{dpo_model_save_path}")

# To load the LoRA adapters later for inference:
if False: # Set to True to test loading
    from unsloth import FastLanguageModel
    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name = dpo_model_save_path, # Path to your saved LoRA adapters
        max_seq_length = max_seq_length,
        load_in_4bit = True,
    )
    # Now `loaded_model` is ready for inference.
    # Ensure tokenizer has chat template
    loaded_tokenizer = get_chat_template(
        loaded_tokenizer,
        chat_template = "gemma-3",
    )
    if loaded_tokenizer.pad_token is None:
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

    print("\n--- Generating response from loaded DPO LoRA model (streaming) ---")
    _ = loaded_model.generate(
        inputs, # Using the same inputs as before
        max_new_tokens = 128,
        temperature = 0.7, top_p = 0.95, top_k = 64,
        streamer = TextStreamer(loaded_tokenizer, skip_prompt = True),
        pad_token_id = loaded_tokenizer.eos_token_id
    )
    print("\n--- End of loaded DPO model response ---")


# --- Saving to float16 for VLLM or other deployments (merged model) ---
if True: # Change to True to save merged finetune!
    # Merges LoRA adapters into the base model and saves
    # This creates a standalone model, not just adapters
    merged_model_path = "gemma-3-4b-dpo-unslopped"
    model.save_pretrained_merged(merged_model_path, tokenizer, save_method = "merged_16bit")
    print(f"Merged 16-bit DPO model saved to ./{merged_model_path}")
    # For GGUF, you can then convert this merged model or use Unsloth's direct GGUF saving
    # model.push_to_hub_merged("YOUR_HF_USERNAME/gemma-3-4b-dpo-merged", tokenizer, save_method = "merged_16bit", token = "YOUR_HF_TOKEN")

# --- GGUF / llama.cpp Conversion ---
if False: # Change to True to save to GGUF
    # Saves the LoRA model directly to GGUF by first merging.
    # Quantization types: "Q8_0", "F16", "BF16", "Q4_K_M", "Q5_K_M" etc.
    gguf_model_path = "gemma-3-4b-dpo-gguf"
    model.save_pretrained_gguf(gguf_model_path, tokenizer, quantization_method = "q8_0")
    print(f"GGUF (Q8_0) DPO model saved to ./{gguf_model_path}.gguf") # Unsloth adds .gguf
    # model.push_to_hub_gguf("YOUR_HF_USERNAME/gemma-3-4b-dpo-gguf", tokenizer, quantization_method = "q8_0", token = "YOUR_HF_TOKEN")


DPO LoRA adapters and tokenizer saved to ./gemma-3-4b-dpo-lora
Downloading safetensors index for unsloth/gemma-3-4b-it...


model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 1/2 [00:51<00:51, 51.90s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [01:34<00:00, 47.49s/it]


Merged 16-bit DPO model saved to ./gemma-3-4b-dpo-unslopped
