In [1]:
!pip install unsloth

In [2]:
!pip install trl datasets




In [3]:
MODEL_NAME = "unsloth/Phi-3.5-mini-instruct"
DATASET_NAME = "philschmid/guanaco-sharegpt-style"

MAX_SEQ_LEN = 2048
LOAD_4BIT = True
DTYPE = None

LORA_CONFIG = {
    "r": 16,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
}

TRAIN_CONFIG = {
    "batch_size": 1,
    "grad_acc": 8,
    "learning_rate": 5e-5,
    "max_steps": 200,
    "warmup_ratio": 0.05,
}


In [4]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch, time, pandas as pd
import unsloth

def load_model(use_lora=False):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LEN,
        dtype=DTYPE,
        load_in_4bit=LOAD_4BIT,
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-3",
        mapping={"role":"from","content":"value","user":"human","assistant":"gpt"},
    )

    if use_lora:
        model = FastLanguageModel.get_peft_model(
            model,
            r=LORA_CONFIG["r"],
            lora_alpha=LORA_CONFIG["lora_alpha"],
            lora_dropout=LORA_CONFIG["lora_dropout"],
            target_modules=[
                "q_proj","k_proj","v_proj","o_proj",
                "gate_proj","up_proj","down_proj"
            ],
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=42,
        )

    return model, tokenizer


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [5]:
from datasets import load_dataset

def format_fn(examples):
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False
        )
        for convo in examples["conversations"]
    ]
    return {"text": texts}

dataset = load_dataset(DATASET_NAME, split="train")


In [6]:
BENCHMARK_MESSAGES = [
    {
        "task": "format_control",
        "messages": [
            {"from":"human","value":"Explain LoRA in exactly 3 bullet points."}
        ]
    },
    {
        "task": "style_control",
        "messages": [
            {"from":"human","value":"Explain LoRA like a research paper abstract."}
        ]
    },
    {
        "task": "conversation_control",
        "messages": [
            {"from":"human","value":"Ask me a question, then answer it yourself."}
        ]
    },
]


In [7]:
def run_chat_benchmark(model, tokenizer, items, stage):
    FastLanguageModel.for_inference(model)
    rows = []

    for item in items:
        inputs = tokenizer.apply_chat_template(
            item["messages"],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        start = time.time()
        out = model.generate(input_ids=inputs, max_new_tokens=128)
        latency = time.time() - start

        text = tokenizer.decode(out[0], skip_special_tokens=True)

        rows.append({
            "stage": stage,
            "task": item["task"],
            "latency_sec": round(latency, 3),
            "output_text": text,
        })

    return rows


In [8]:
base_model, tokenizer = load_model(use_lora=False)

before_results = run_chat_benchmark(
    base_model, tokenizer, BENCHMARK_MESSAGES, "before_finetune"
)


==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [9]:
from trl import SFTTrainer, SFTConfig

model, tokenizer = load_model(use_lora=True)
train_data = dataset.map(format_fn, batched=True)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    args=SFTConfig(
        per_device_train_batch_size=TRAIN_CONFIG["batch_size"],
        gradient_accumulation_steps=TRAIN_CONFIG["grad_acc"],
        learning_rate=TRAIN_CONFIG["learning_rate"],
        max_steps=TRAIN_CONFIG["max_steps"],
        warmup_ratio=TRAIN_CONFIG["warmup_ratio"],
        optim="adamw_8bit",
        logging_steps=25,
        output_dir="phi35_lora_exp",
        report_to="none",
    ),
)

torch.cuda.reset_peak_memory_stats()
trainer.train()


==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.2 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
The model is already on multiple devices. Skipping the move to device specified in `args`.


ðŸ¦¥ Unsloth: Padding-free auto-enabled, enabling faster training.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,033 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 29,884,416 of 3,850,963,968 (0.78% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,1.3017
50,1.1903
75,1.2137
100,1.1173
125,1.1546
150,1.2101
175,1.1864
200,1.1495


TrainOutput(global_step=200, training_loss=1.1904420852661133, metrics={'train_runtime': 1591.729, 'train_samples_per_second': 1.005, 'train_steps_per_second': 0.126, 'total_flos': 1.554717617344512e+16, 'train_loss': 1.1904420852661133, 'epoch': 0.1771283073176132})

In [1]:
after_results = run_chat_benchmark(
    model, tokenizer, BENCHMARK_MESSAGES, "after_finetune"
)


NameError: name 'run_chat_benchmark' is not defined

In [2]:
def score(task, text):
    if task == "format_control":
        return int(text.count("-") == 3)
    if task == "style_control":
        return int("abstract" in text.lower())
    if task == "conversation_control":
        return int(text.lower().count("?") >= 1)
    return 0

for r in before_results + after_results:
    r["constraint_score"] = score(r["task"], r["output_text"])


NameError: name 'before_results' is not defined

In [12]:
df = pd.DataFrame(before_results + after_results)

df.to_csv("phi35_chat_benchmark.csv", index=False)
df.to_excel("phi35_chat_benchmark.xlsx", index=False)

df


Unnamed: 0,stage,task,latency_sec,output_text,constraint_score
0,before_finetune,format_control,14.764,Explain LoRA in exactly 3 bullet points. - LoR...,0
1,before_finetune,style_control,14.127,Explain LoRA like a research paper abstract. T...,1
2,before_finetune,conversation_control,12.229,"Ask me a question, then answer it yourself. Qu...",1
3,after_finetune,format_control,8.694,Explain LoRA in exactly 3 bullet points. - LoR...,0
4,after_finetune,style_control,8.784,Explain LoRA like a research paper abstract. L...,1
5,after_finetune,conversation_control,8.779,"Ask me a question, then answer it yourself. Su...",1
