In [1]:
!pip install -U unsloth transformers trl datasets accelerate bitsandbytes


Collecting unsloth
  Downloading unsloth-2026.1.2-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.26.2-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting unsloth_zoo>=2026.1.2 (from unsloth)
  Downloading unsloth_zoo-2026.1.2-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.met

In [2]:
MODEL_NAME = "unsloth/codegemma-7b"
DATASET_NAME = "philschmid/guanaco-sharegpt-style"

MAX_SEQ_LEN = 2048
LOAD_4BIT = True
DTYPE = None

LORA_CONFIG = {
    "r": 32,            # code models need less rank
    "lora_alpha": 64,
    "lora_dropout": 0.05,
}

TRAIN_CONFIG = {
    "batch_size":1,
    "grad_acc": 8,
    "learning_rate": 3e-5,  # lower LR for code stability
    "max_steps": 200,
    "warmup_ratio": 0.05,
}


In [3]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch, time, pandas as pd

def load_model(use_lora=False):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LEN,
        dtype=DTYPE,
        load_in_4bit=LOAD_4BIT,
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template="gemma",
        mapping={"role":"from","content":"value","user":"human","assistant":"gpt"},
    )

    if use_lora:
        model = FastLanguageModel.get_peft_model(
            model,
            r=LORA_CONFIG["r"],
            lora_alpha=LORA_CONFIG["lora_alpha"],
            lora_dropout=LORA_CONFIG["lora_dropout"],
            target_modules=[
                "q_proj","k_proj","v_proj","o_proj",
                "gate_proj","up_proj","down_proj"
            ],
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=42,
        )

    return model, tokenizer


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
from datasets import load_dataset

def format_fn(examples):
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False
        )
        for convo in examples["conversations"]
    ]
    return {"text": texts}

dataset = load_dataset(DATASET_NAME, split="train")


README.md:   0%|          | 0.00/442 [00:00<?, ?B/s]

data/train-00000-of-00001-8aae24b47ddaaf(…):   0%|          | 0.00/8.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9033 [00:00<?, ? examples/s]

In [5]:
CODE_BENCHMARKS = [
    {
        "task": "bug_fixing",
        "messages": [
            {"from":"human","value":
             "Fix the bug:\n\n```python\ndef add(a,b):\n return a-b\n```"}
        ]
    },
    {
        "task": "refactoring",
        "messages": [
            {"from":"human","value":
             "Refactor this to be more Pythonic:\n\n```python\nfor i in range(0,len(x)):\n print(x[i])\n```"}
        ]
    },
    {
        "task": "constraint_code",
        "messages": [
            {"from":"human","value":
             "Write a Python function to check palindrome. Use recursion only."}
        ]
    },
    {
        "task": "style_consistency",
        "messages": [
            {"from":"human","value":
             "Write a Python function with type hints and docstring to compute factorial."}
        ]
    },
]


In [6]:
def run_code_benchmark(model, tokenizer, items, stage):
    FastLanguageModel.for_inference(model)
    rows = []

    for item in items:
        inputs = tokenizer.apply_chat_template(
            item["messages"],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        start = time.time()
        out = model.generate(input_ids=inputs, max_new_tokens=200)
        latency = time.time() - start

        text = tokenizer.decode(out[0], skip_special_tokens=True)

        rows.append({
            "stage": stage,
            "task": item["task"],
            "latency_sec": round(latency, 3),
            "output_text": text,
        })

    return rows


In [7]:
base_model, tokenizer = load_model(use_lora=False)

before_results = run_code_benchmark(
    base_model, tokenizer, CODE_BENCHMARKS, "before_finetune"
)


==((====))==  Unsloth 2026.1.2: Fast Gemma patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.57G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Unsloth: Will map <end_of_turn> to EOS = <eos>.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [8]:
from trl import SFTTrainer, SFTConfig

model, tokenizer = load_model(use_lora=True)
train_data = dataset.map(format_fn, batched=True)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    args=SFTConfig(
        per_device_train_batch_size=TRAIN_CONFIG["batch_size"],
        gradient_accumulation_steps=TRAIN_CONFIG["grad_acc"],
        learning_rate=TRAIN_CONFIG["learning_rate"],
        max_steps=TRAIN_CONFIG["max_steps"],
        warmup_ratio=TRAIN_CONFIG["warmup_ratio"],
        optim="adamw_8bit",
        logging_steps=25,
        output_dir="codegemma_lora_exp",
        report_to="none",
    ),
)

trainer.train()


==((====))==  Unsloth 2026.1.2: Fast Gemma patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.2 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Map:   0%|          | 0/9033 [00:00<?, ? examples/s]



Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/9033 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,033 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 50,003,968 of 8,587,684,864 (0.58% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,17.3319
50,1.688
75,1.6525
100,1.4727
125,1.5432
150,1.5327
175,1.5369
200,1.4713


TrainOutput(global_step=200, training_loss=3.5286365127563477, metrics={'train_runtime': 2763.5399, 'train_samples_per_second': 0.579, 'train_steps_per_second': 0.072, 'total_flos': 2.580769905953587e+16, 'train_loss': 3.5286365127563477, 'epoch': 0.1771283073176132})

In [9]:
after_results = run_code_benchmark(
    model, tokenizer, CODE_BENCHMARKS, "after_finetune"
)


In [10]:
def code_score(task, text):
    if task == "bug_fixing":
        return int("a + b" in text)
    if task == "constraint_code":
        return int("def" in text and "return" in text)
    if task == "style_consistency":
        return int("->" in text and '"""' in text)
    return None

for r in before_results + after_results:
    r["constraint_score"] = code_score(r["task"], r["output_text"])


In [11]:
import pandas as pd

df = pd.DataFrame(before_results + after_results)

df.to_csv("codegemma_code_benchmark.csv", index=False)
df.to_excel("codegemma_code_benchmark.xlsx", index=False)

df


Unnamed: 0,stage,task,latency_sec,output_text,constraint_score
0,before_finetune,bug_fixing,17.191,<start_of_turn>user\nFix the bug:\n\n```python...,0.0
1,before_finetune,refactoring,12.966,<start_of_turn>user\nRefactor this to be more ...,
2,before_finetune,constraint_code,13.321,<start_of_turn>user\nWrite a Python function t...,0.0
3,before_finetune,style_consistency,13.327,<start_of_turn>user\nWrite a Python function w...,0.0
4,after_finetune,bug_fixing,4.593,<start_of_turn>user\nFix the bug:\n\n```python...,1.0
5,after_finetune,refactoring,3.881,<start_of_turn>user\nRefactor this to be more ...,
6,after_finetune,constraint_code,10.398,<start_of_turn>user\nWrite a Python function t...,1.0
7,after_finetune,style_consistency,13.912,<start_of_turn>user\nWrite a Python function w...,1.0
