### Installation

In [None]:
%%capture
# Install Unsloth and its dependencies; the %%capture cell magic above silences
# the pip output so the notebook stays readable.
import os
# Colab is detected by looking for "COLAB_" anywhere in the concatenated names
# of the environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # NOTE(review): the explicit dependency list plus --no-deps presumably keeps
    # pip from replacing packages that Colab already ships — confirm.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

In [None]:
# Load the base model and its tokenizer through Unsloth, requesting 4-bit weights.
# NOTE(review): unsloth is imported before torch; Unsloth announces that it
# patches other libraries on import, so this order is presumably intentional —
# confirm before reordering.
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096  # context length; later cells derive the prompt/completion budgets from it
dtype = None  # NOTE(review): presumably lets Unsloth choose the dtype automatically — confirm
load_in_4bit = True  # forwarded unchanged to from_pretrained below

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.5: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# LoRA adapter settings, collected in one place so they are easy to tweak.
lora_settings = dict(
    r = 16,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Wrap the base model with the adapters; `model` is rebound to the wrapped
# model, so run this cell only once per kernel.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.6.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


<a name="Data"></a>
### Data Prep


In [None]:
# Jinja chat template that renders a conversation in the Llama-3 turn format.
#
# * `<|begin_of_text|>` is emitted exactly once, in front of the first message,
#   regardless of its role. This matches the prompt strings built for training,
#   which start with `<|begin_of_text|>` followed directly by the user turn.
# * Each system/user/assistant message becomes
#   `<|start_header_id|>ROLE<|end_header_id|>\n\nCONTENT<|eot_id|>`;
#   messages with any other role are skipped.
# * When `add_generation_prompt` is true, an open assistant header is appended
#   so that generation starts inside the assistant turn.
LLAMA3_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if loop.first %}"
    "{{ '<|begin_of_text|>' }}"
    "{% endif %}"
    "{% if message['role'] in ['system', 'user', 'assistant'] %}"
    "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
    "{% endif %}"
)

tokenizer.chat_template = LLAMA3_CHAT_TEMPLATE
# Setting pad_token to eos_token is a common practice for autoregressive models.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
def format_dataset(sample):
    """Rewrite one preference record into raw Llama-3 turn strings.

    Reads the `prompt` value and the `content` of the last message in both
    `chosen` and `rejected`, then overwrites those three fields in place:

    * `prompt`   -> a user turn prefixed with `<|begin_of_text|>`
    * `chosen`   -> an assistant turn holding the preferred answer
    * `rejected` -> an assistant turn holding the dispreferred answer

    Every turn is terminated with `<|eot_id|>`. The same (mutated) mapping is
    returned; any other fields are left untouched.
    """
    user_turn = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
    assistant_turn = "<|start_header_id|>assistant<|end_header_id|>\n\n{}<|eot_id|>"

    # Read everything first so the originals are not overwritten mid-way.
    question = sample["prompt"]
    preferred = sample["chosen"][-1]["content"]
    dispreferred = sample["rejected"][-1]["content"]

    sample["prompt"] = user_turn.format(question)
    sample["chosen"] = assistant_turn.format(preferred)
    sample["rejected"] = assistant_turn.format(dispreferred)
    return sample


from datasets import load_dataset

# Fetch the Turkish ORPO preference data and keep a reproducible 1000-row sample.
dataset = load_dataset("hasancanonder/turkish-orpo-250k", split="train")
dataset = dataset.shuffle(seed=42)
dataset = dataset.select(range(1000)) # Use your subset

# Token budgets for the length filter; keep them equal to the values given to
# ORPOConfig (each is half of the context window).
max_completion_length = max_seq_length // 2
max_prompt_length = max_seq_length // 2

# Turn every record into raw Llama-3 turn strings.
dataset = dataset.map(format_dataset)

def is_within_length_limits(sample):
    """Return True when a formatted record fits the token budgets.

    The prompt must tokenize to fewer than `max_prompt_length` tokens, and
    BOTH completions (`chosen` and `rejected`) must tokenize to fewer than
    `max_completion_length` tokens. The rejected answer is checked as well
    because it is subject to the same completion budget as the chosen one.

    Relies on the notebook-level `tokenizer`, `max_prompt_length` and
    `max_completion_length`.

    NOTE(review): the tokenizer is called with its default settings, so any
    special tokens it adds by itself are included in the counts — confirm this
    matches how the trainer tokenizes.
    """
    def token_count(text):
        # Number of token ids the tokenizer produces for `text`.
        return len(tokenizer(text)["input_ids"])

    if token_count(sample["prompt"]) >= max_prompt_length:
        return False
    return all(
        token_count(sample[field]) < max_completion_length
        for field in ("chosen", "rejected")
    )

# Drop the records that exceed the length budgets and report how many remain.
original_size = len(dataset)
dataset = dataset.filter(is_within_length_limits, num_proc=4)
filtered_size = len(dataset)
print(f"Filtered dataset from {original_size} to {filtered_size} records due to length constraints.")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Filtered dataset from 1000 to 980 records due to length constraints.


Let's print an example to see what the formatted dataset should look like.

In [None]:
import pprint

# Show one formatted record: each field is preceded by a labelled separator line.
row = dataset[1]
for heading, field in (
    ("INSTRUCTION: ", "prompt"),
    ("ACCEPTED: ", "chosen"),
    ("REJECTED: ", "rejected"),
):
    print(heading + "=" * 50)
    pprint.pprint(row[field])

('<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n'
 '\n'
 'Aşağıdaki kavramların hepsini içeren bir cümle üretin: bitirmek, proje, '
 'okul<|eot_id|>')
('<|start_header_id|>assistant<|end_header_id|>\n'
 '\n'
 'Öğrenciler okul projelerini başarıyla bitirmek için birlikte özenle '
 'çalıştılar, her adımı dikkatlice planladılar ve görevleri aralarında '
 'paylaştırdılar.<|eot_id|>')
('<|start_header_id|>assistant<|end_header_id|>\n'
 '\n'
 'Okul projesini bitirmeye çalıştık.<|eot_id|>')


In [None]:
# Enable reward modelling stats
# NOTE(review): this runs before the cell that imports the trainer from `trl`;
# presumably the patch has to be applied first — confirm before moving it.
from unsloth import PatchDPOTrainer

PatchDPOTrainer()

<a name="Train"></a>
### Train the model

In [None]:
from trl import ORPOConfig, ORPOTrainer
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 where the hardware supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Prompt and completion each get half of the context window.
half_context = max_seq_length//2

orpo_args = ORPOConfig(
    # Sequence-length budgets
    max_length = max_seq_length,
    max_prompt_length = half_context,
    max_completion_length = half_context,
    # Batching: 2 samples per device x 4 accumulation steps
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # Optimisation
    beta = 0.1,
    optim = "adamw_8bit",
    lr_scheduler_type = "linear",
    max_steps = 30, # Change to num_train_epochs = 1 for full training runs
    fp16 = not use_bf16,
    bf16 = use_bf16,
    # Logging and outputs
    logging_steps = 1,
    output_dir = "outputs",
    report_to = "none", # Use this for WandB etc
)

orpo_trainer = ORPOTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    args = orpo_args,
)

orpo_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 980 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / rejected,logps / chosen,logits / rejected,logits / chosen,log_odds_ratio,log_odds_chosen,eval_logits / chosen,eval_logits / rejected,nll_loss
1,10.1035,-0.215025,-0.342178,1.0,0.127152,-3.421778,-2.150255,-0.296907,-0.924895,-0.284381,1.373828,0,0,2.497449
2,12.4777,-0.307905,-0.409524,0.875,0.101619,-4.095243,-3.079052,-0.160057,-0.981877,-0.490697,1.056331,No Log,No Log,3.070358
3,12.0538,-0.329335,-0.308284,0.75,-0.021051,-3.08284,-3.293353,-0.174453,-0.782391,-1.076322,-0.178879,No Log,No Log,2.905806
4,12.9973,-0.310762,-0.414177,0.75,0.103416,-4.141775,-3.107617,-0.561664,-0.750493,-0.409703,1.065524,No Log,No Log,3.208359


KeyboardInterrupt: 

<a name="Inference"></a>
### Inference

In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# A single-turn conversation to try the model on.
messages = [
    {
        "role": "user",
        "content": "Fransa'nın başkenti Paris'te bulunan meşhur yüksek kulenin adı nedir?",
    },
]

# Render the conversation with the tokenizer's chat template and move the
# resulting tensor to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True, # Important!
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Stream the generated text to the cell output while it is being produced.
text_streamer = TextStreamer(tokenizer)
generation_kwargs = dict(streamer=text_streamer, max_new_tokens=128, use_cache=True)
_ = model.generate(inputs, **generation_kwargs)

NameError: name 'FastLanguageModel' is not defined

### Saving to float16 for vLLM


In [None]:
# Every export below is switched off; flip the relevant guard to True to run it.
# The push variants take a `token` argument, which is left empty here.

# Merge to 16bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")
if False:
    model.push_to_hub("hf/model", token = "")
    tokenizer.push_to_hub("hf/model", token = "")


### GGUF / llama.cpp Conversion

In [None]:
# Save to q4_k_m GGUF
# Both steps are switched off; flip a guard to True to run it.
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")