In [1]:
# Install/upgrade the fine-tuning stack. `%pip` installs into the environment
# of the running kernel; `-q` keeps the resolver log out of the notebook.
%pip install -q -U unsloth transformers trl datasets accelerate bitsandbytes


Collecting unsloth
  Downloading unsloth-2026.1.2-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.26.2-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting unsloth_zoo>=2026.1.2 (from unsloth)
  Downloading unsloth_zoo-2026.1.2-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.met

In [2]:
# Identifiers handed to FastLanguageModel.from_pretrained and load_dataset.
MODEL_NAME = "unsloth/mistral-7b-v0.3"
DATASET_NAME = "yahma/alpaca-cleaned"

# Model-loading options, forwarded unchanged by load_model().
MAX_SEQ_LEN = 2048
LOAD_4BIT = True
DTYPE = None  # presumably lets the loader choose the dtype - TODO confirm

# LoRA adapter hyper-parameters.
# Unsloth reports at training time that only dropout = 0 is supported by its
# fast patching; a non-zero value still trains but with a performance hit.
LORA_CONFIG = dict(r=64, lora_alpha=128, lora_dropout=0.05)

# Optimisation settings consumed when the SFTConfig is built.
TRAIN_CONFIG = dict(
    batch_size=2,
    grad_acc=8,
    learning_rate=5e-5,
    max_steps=300,
    warmup_ratio=0.05,
)


In [3]:
from unsloth import FastLanguageModel
import torch, time, pandas as pd

def load_model(use_lora=False):
    """Load the configured checkpoint and optionally attach LoRA adapters.

    Args:
        use_lora: When truthy, the freshly loaded model is passed through
            ``FastLanguageModel.get_peft_model`` with the values from
            ``LORA_CONFIG``; otherwise it is returned as loaded.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded, tok = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LEN,
        dtype=DTYPE,
        load_in_4bit=LOAD_4BIT,
    )
    if not use_lora:
        return loaded, tok

    # Modules that receive adapters.
    adapter_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]
    adapter_targets += ["gate_proj", "up_proj", "down_proj"]

    # Only these three hyper-parameters are taken from the shared config.
    lora_kwargs = {
        key: LORA_CONFIG[key] for key in ("r", "lora_alpha", "lora_dropout")
    }

    adapted = FastLanguageModel.get_peft_model(
        loaded,
        target_modules=adapter_targets,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
        **lora_kwargs,
    )
    return adapted, tok


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
from datasets import load_dataset

# Training template with three positional slots, filled by format_fn in the
# order instruction, input, output.
PROMPT = (
    "### Instruction:\n{}\n\n"
    "### Input:\n{}\n\n"
    "### Response:\n{}"
)

def format_fn(examples):
    """Render a batch of dataset rows into training strings.

    Args:
        examples: Mapping holding parallel ``"instruction"``, ``"input"`` and
            ``"output"`` sequences.

    Returns:
        ``{"text": [...]}`` with one ``PROMPT``-formatted string per row, each
        followed by ``tokenizer.eos_token``.

    Note:
        ``tokenizer`` is looked up in the global namespace at call time, so it
        has to be assigned before this function is first invoked.
    """
    columns = (examples["instruction"], examples["input"], examples["output"])

    rendered = []
    for instruction, context, response in zip(*columns):
        rendered.append(
            PROMPT.format(instruction, context, response) + tokenizer.eos_token
        )
    return {"text": rendered}

# Fetch the "train" split of the configured dataset.
dataset = load_dataset(path=DATASET_NAME, split="train")


README.md: 0.00B [00:00, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [5]:
# Benchmark cases. "task" is the label constraint_score dispatches on;
# "prompt" is the exact text tokenized by run_benchmark.
BASE_BENCHMARK_PROMPTS = [
    dict(
        task="format_control",
        prompt="Explain LoRA fine-tuning in exactly 3 bullet points.",
    ),
    dict(
        task="style_control",
        prompt="Explain LoRA fine-tuning like a research paper abstract.",
    ),
    dict(
        task="reasoning",
        prompt="Derive the gradient update rule for LoRA.",
    ),
]

def run_benchmark(model, tokenizer, prompts, stage, max_new_tokens=128):
    """Generate a completion for each benchmark prompt and time the call.

    Args:
        model: Model exposing ``generate``; it is first passed to
            ``FastLanguageModel.for_inference``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        prompts: Iterable of dicts with ``"task"`` and ``"prompt"`` keys.
        stage: Label copied into every result row (e.g. ``"before_finetune"``).
        max_new_tokens: Generation budget per prompt. Defaults to 128.

    Returns:
        A list with one dict per prompt, with keys ``stage``, ``task``,
        ``prompt``, ``latency_sec`` (seconds, rounded to 3 decimals) and
        ``output_text``.

    Notes:
        * ``output_text`` is the decoding of the whole returned sequence, not
          only the newly generated part; in the recorded run it starts with
          the prompt itself.
        * Prompts are tokenized as-is and are not wrapped in the ``PROMPT``
          template used to build the training text.
        * NOTE(review): the first call of each run is noticeably slower in the
          recorded results, presumably one-time warm-up - confirm before
          comparing latencies across stages.
    """
    FastLanguageModel.for_inference(model)

    # CUDA work is asynchronous with respect to the host; flush the stream at
    # both ends of the timed region so the measurement covers the whole call.
    # The guard keeps the function usable where CUDA is not initialised.
    sync_gpu = torch.cuda.is_available()
    rows = []

    for item in prompts:
        inputs = tokenizer(item["prompt"], return_tensors="pt").to("cuda")

        if sync_gpu:
            torch.cuda.synchronize()
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted.
        start = time.perf_counter()
        out = model.generate(**inputs, max_new_tokens=max_new_tokens)
        if sync_gpu:
            torch.cuda.synchronize()
        latency = time.perf_counter() - start

        text = tokenizer.decode(out[0], skip_special_tokens=True)

        rows.append({
            "stage": stage,
            "task": item["task"],
            "prompt": item["prompt"],
            "latency_sec": round(latency, 3),
            "output_text": text,
        })

    return rows


# Baseline pass: benchmark the checkpoint as loaded, without LoRA adapters.
# NOTE(review): base_model stays referenced after this cell, so it is
# presumably still resident on the GPU when the LoRA copy is loaded later.
base_model, base_tokenizer = load_model(use_lora=False)
before_results = run_benchmark(
    model=base_model,
    tokenizer=base_tokenizer,
    prompts=BASE_BENCHMARK_PROMPTS,
    stage="before_finetune",
)


==((====))==  Unsloth 2026.1.2: Fast Mistral patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [6]:
from trl import SFTTrainer, SFTConfig

# Load the checkpoint again, this time wrapped with LoRA adapters.
model, tokenizer = load_model(use_lora=True)

# format_fn reads the global `tokenizer`, so the mapping has to come after
# the assignment above.
train_data = dataset.map(format_fn, batched=True)

# Optimisation settings come from TRAIN_CONFIG; the rest is fixed here.
sft_args = SFTConfig(
    output_dir="mistral_lora_exp",
    per_device_train_batch_size=TRAIN_CONFIG["batch_size"],
    gradient_accumulation_steps=TRAIN_CONFIG["grad_acc"],
    learning_rate=TRAIN_CONFIG["learning_rate"],
    warmup_ratio=TRAIN_CONFIG["warmup_ratio"],
    max_steps=TRAIN_CONFIG["max_steps"],
    optim="adamw_8bit",
    logging_steps=25,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    args=sft_args,
)

# Reset the CUDA peak-memory counters immediately before training starts.
torch.cuda.reset_peak_memory_stats()
train_stats = trainer.train()


==((====))==  Unsloth 2026.1.2: Fast Mistral patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.2 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/51760 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 167,772,160 of 7,415,795,712 (2.26% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,1.0583
50,0.9486
75,0.9311
100,0.9187
125,0.9195
150,0.8982
175,0.924
200,0.9164
225,0.8971
250,0.9152


In [7]:
# Second pass: same prompts, now against the LoRA-trained model.
after_results = run_benchmark(
    model=model,
    tokenizer=tokenizer,
    prompts=BASE_BENCHMARK_PROMPTS,
    stage="after_finetune",
)


In [8]:
def constraint_score(task, text):
    """Return 1 if ``text`` passes the heuristic check for ``task``, else 0.

    Checks per task label:
        * ``"format_control"``: exactly three lines begin with a bullet marker
          (``-``, ``*`` or ``•``) followed by whitespace and some content.
          Hyphens inside words such as "fine-tuning" are not counted.
        * ``"style_control"``: the text contains "abstract" (case-insensitive).
        * ``"reasoning"``: the text contains "gradient" (case-insensitive).
        * any other label: always 0.

    Args:
        task: Task label selecting the check.
        text: Text to evaluate. The keyword checks are plain substring tests,
            so pass only the model's completion if the prompt itself contains
            the keyword.

    Returns:
        ``1`` when the check passes, otherwise ``0``.
    """
    import re  # local import: `re` is not among the notebook's imports

    if task == "format_control":
        # Count bullet *lines*; counting every "-" character would also count
        # hyphenated words and misreport an otherwise correct answer.
        bullet_lines = re.findall(
            r"^[ \t]*[-*•][ \t]+\S", text, flags=re.MULTILINE
        )
        return int(len(bullet_lines) == 3)

    keyword = {"style_control": "abstract", "reasoning": "gradient"}.get(task)
    if keyword is None:
        return 0
    return int(keyword in text.lower())


# run_benchmark decodes the whole generated sequence, so output_text normally
# starts with the prompt. Score only the text after it: the prompts themselves
# contain "abstract" and "gradient", which would otherwise satisfy the keyword
# checks no matter what the model wrote.
for row in before_results + after_results:
    scored_text = row["output_text"]
    if scored_text.startswith(row["prompt"]):
        scored_text = scored_text[len(row["prompt"]):]
    row["constraint_score"] = constraint_score(row["task"], scored_text)


In [9]:
# One table for both passes, baseline rows first.
df = pd.DataFrame([*before_results, *after_results])

# Write the table to CSV and Excel without the index column.
df.to_csv(path_or_buf="mistral_behavior_benchmark.csv", index=False)
df.to_excel(excel_writer="mistral_behavior_benchmark.xlsx", index=False)

# Last expression of the cell, so the frame is rendered as its output.
df


Unnamed: 0,stage,task,prompt,latency_sec,output_text,constraint_score
0,before_finetune,format_control,Explain LoRA fine-tuning in exactly 3 bullet p...,12.463,Explain LoRA fine-tuning in exactly 3 bullet p...,0
1,before_finetune,style_control,Explain LoRA fine-tuning like a research paper...,7.204,Explain LoRA fine-tuning like a research paper...,1
2,before_finetune,reasoning,Derive the gradient update rule for LoRA.,6.812,Derive the gradient update rule for LoRA.\n\n#...,1
3,after_finetune,format_control,Explain LoRA fine-tuning in exactly 3 bullet p...,10.058,Explain LoRA fine-tuning in exactly 3 bullet p...,0
4,after_finetune,style_control,Explain LoRA fine-tuning like a research paper...,9.841,Explain LoRA fine-tuning like a research paper...,1
5,after_finetune,reasoning,Derive the gradient update rule for LoRA.,9.516,Derive the gradient update rule for LoRA.\n\nT...,1
