In [1]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2026.1.2-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2026.1.2 (from unsloth)
  Downloading unsloth_zoo-2026.1.2-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from unsl

In [2]:
!pip install trl datasets




In [3]:
MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
DATASET_NAME = "philschmid/guanaco-sharegpt-style"

MAX_SEQ_LEN = 2048
LOAD_4BIT = True
DTYPE = None

LORA_CONFIG = {
    "r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
}

TRAIN_CONFIG = {
    "batch_size": 1,
    "grad_acc": 8,
    "learning_rate": 5e-5,
    "max_steps": 200,
    "warmup_ratio": 0.05,
}


In [4]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch, time, pandas as pd
import unsloth

def load_model(use_lora=False):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LEN,
        dtype=DTYPE,
        load_in_4bit=LOAD_4BIT,
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-3",
        mapping={"role":"from","content":"value","user":"human","assistant":"gpt"},
    )

    if use_lora:
        model = FastLanguageModel.get_peft_model(
            model,
            r=LORA_CONFIG["r"],
            lora_alpha=LORA_CONFIG["lora_alpha"],
            lora_dropout=LORA_CONFIG["lora_dropout"],
            target_modules=[
                "q_proj","k_proj","v_proj","o_proj",
                "gate_proj","up_proj","down_proj"
            ],
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=42,
        )

    return model, tokenizer


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
from datasets import load_dataset

def format_fn(examples):
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False
        )
        for convo in examples["conversations"]
    ]
    return {"text": texts}

dataset = load_dataset(DATASET_NAME, split="train")


README.md:   0%|          | 0.00/442 [00:00<?, ?B/s]

data/train-00000-of-00001-8aae24b47ddaaf(…):   0%|          | 0.00/8.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9033 [00:00<?, ? examples/s]

In [6]:
BENCHMARK_MESSAGES = [
    {
        "task": "format_control",
        "messages": [
            {"from":"human","value":"Explain LoRA in exactly 3 bullet points."}
        ]
    },
    {
        "task": "style_control",
        "messages": [
            {"from":"human","value":"Explain LoRA like a research paper abstract."}
        ]
    },
    {
        "task": "conversation_control",
        "messages": [
            {"from":"human","value":"Ask me a question, then answer it yourself."}
        ]
    },
]


In [7]:
def run_chat_benchmark(model, tokenizer, items, stage):
    FastLanguageModel.for_inference(model)
    rows = []

    for item in items:
        inputs = tokenizer.apply_chat_template(
            item["messages"],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        start = time.time()
        out = model.generate(input_ids=inputs, max_new_tokens=128)
        latency = time.time() - start

        text = tokenizer.decode(out[0], skip_special_tokens=True)

        rows.append({
            "stage": stage,
            "task": item["task"],
            "latency_sec": round(latency, 3),
            "output_text": text,
        })

    return rows


In [8]:
base_model, tokenizer = load_model(use_lora=False)

before_results = run_chat_benchmark(
    base_model, tokenizer, BENCHMARK_MESSAGES, "before_finetune"
)


==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [9]:
from trl import SFTTrainer, SFTConfig

model, tokenizer = load_model(use_lora=True)
train_data = dataset.map(format_fn, batched=True)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    args=SFTConfig(
        per_device_train_batch_size=TRAIN_CONFIG["batch_size"],
        gradient_accumulation_steps=TRAIN_CONFIG["grad_acc"],
        learning_rate=TRAIN_CONFIG["learning_rate"],
        max_steps=TRAIN_CONFIG["max_steps"],
        warmup_ratio=TRAIN_CONFIG["warmup_ratio"],
        optim="adamw_8bit",
        logging_steps=25,
        output_dir="phi35_lora_exp",
        report_to="none",
    ),
)

torch.cuda.reset_peak_memory_stats()
trainer.train()


==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.2 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Map:   0%|          | 0/9033 [00:00<?, ? examples/s]



Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/9033 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,033 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,1.2906
50,1.1941
75,1.2169
100,1.1199
125,1.1574
150,1.2123
175,1.1887
200,1.1513


TrainOutput(global_step=200, training_loss=1.1913877391815186, metrics={'train_runtime': 1685.2295, 'train_samples_per_second': 0.949, 'train_steps_per_second': 0.119, 'total_flos': 1.5918626793332736e+16, 'train_loss': 1.1913877391815186, 'epoch': 0.1771283073176132})

In [10]:
after_results = run_chat_benchmark(
    model, tokenizer, BENCHMARK_MESSAGES, "after_finetune"
)


In [11]:
def score(task, text):
    if task == "format_control":
        return int(text.count("-") == 3)
    if task == "style_control":
        return int("abstract" in text.lower())
    if task == "conversation_control":
        return int(text.lower().count("?") >= 1)
    return 0

for r in before_results + after_results:
    r["constraint_score"] = score(r["task"], r["output_text"])


In [12]:
df = pd.DataFrame(before_results + after_results)

df.to_csv("phi35_chat_benchmark.csv", index=False)
df.to_excel("phi35_chat_benchmark.xlsx", index=False)

df


Unnamed: 0,stage,task,latency_sec,output_text,constraint_score
0,before_finetune,format_control,21.227,Explain LoRA in exactly 3 bullet points. - LoR...,0
1,before_finetune,style_control,5.503,Explain LoRA like a research paper abstract. T...,1
2,before_finetune,conversation_control,6.37,"Ask me a question, then answer it yourself. Qu...",1
3,after_finetune,format_control,8.817,Explain LoRA in exactly 3 bullet points. - LoR...,0
4,after_finetune,style_control,8.203,Explain LoRA like a research paper abstract. L...,1
5,after_finetune,conversation_control,8.748,"Ask me a question, then answer it yourself. Su...",1
