# Experiment v4 — Chat Alignment + Code Execution Eval (Qwen)


In [1]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2026.1.1-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2026.1.1 (from unsloth)
  Downloading unsloth_zoo-2026.1.1-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from unsl

In [2]:
import json, csv, re, torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import (
    get_chat_template,
    standardize_sharegpt,
    train_on_responses_only,
)
from transformers import DataCollatorForSeq2Seq
from trl import SFTTrainer, SFTConfig

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# CONFIG
MODEL_NAME = "unsloth/Qwen2.5-Coder-14B-Instruct"
MAX_SEQ_LEN = 2048
ADAPTER_DIR = "qwen25-chat-lora"

GEN_KWARGS_CHAT = dict(
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
)

GEN_KWARGS_CODE = dict(
    max_new_tokens=300,
    temperature=0.2,   # low temp for determinism
)

In [4]:
# CHAT EVAL PROMPTS (LOCK THESE)
# ------------------------------------------------------------
EVAL_CHAT_PROMPTS = [
    [{"role": "user", "content": "Explain attention in transformers in simple terms."}],
    [{"role": "user", "content": "Refactor this code:\nfor i in range(10):print(i)"}],
    [{"role": "user", "content": "Explain gradient accumulation and why it is useful."}],
    [{"role": "user", "content": "Continue this conversation logically:\nUser: I keep hitting CUDA OOM errors.\nAssistant:"}],
]

In [5]:
# ------------------------------------------------------------
# CODE EXECUTION TASKS Just evlaution before and after
# ------------------------------------------------------------
CODE_EVAL_TASKS = [
    {
        "id": "is_prime",
        "messages": [
            {"role": "user", "content": "Write a Python function is_prime(n) that returns True if n is prime."}
        ],
        "tests": [
            ("is_prime(2)", True),
            ("is_prime(4)", False),
            ("is_prime(13)", True),
            ("is_prime(1)", False),
        ],
    },
    {
        "id": "reverse_string",
        "messages": [
            {"role": "user", "content": "Write a Python function reverse_string(s) that reverses a string."}
        ],
        "tests": [
            ("reverse_string('abc')", "cba"),
            ("reverse_string('')", ""),
            ("reverse_string('racecar')", "racecar"),
        ],
    },
]

In [6]:
# ------------------------------------------------------------
# UTILITIES
# ------------------------------------------------------------
def chat_eval(model, tokenizer, prompts, title):
    FastLanguageModel.for_inference(model)
    outputs = []

    print(f"\n===== {title} =====\n")
    for i, messages in enumerate(prompts, 1):
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        with torch.no_grad():
            out = model.generate(input_ids=inputs, **GEN_KWARGS_CHAT)

        text = tokenizer.decode(out[0], skip_special_tokens=True)
        outputs.append(text)

        print(f"[PROMPT {i}]")
        print(text)
        print("-" * 80)

    return outputs


def extract_python_code(text):
    blocks = re.findall(r"```python(.*?)```", text, re.S)
    if blocks:
        return blocks[0]
    return text


def run_code_tests(code, tests):
    local_env = {}
    try:
        exec(code, {}, local_env)
    except Exception as e:
        return False, f"exec error: {e}"

    for expr, expected in tests:
        try:
            result = eval(expr, {}, local_env)
            if result != expected:
                return False, f"{expr} -> {result} (expected {expected})"
        except Exception as e:
            return False, f"{expr} error: {e}"

    return True, "all tests passed"


def run_code_eval(model, tokenizer, tasks, title):
    FastLanguageModel.for_inference(model)
    results = []

    print(f"\n===== CODE EVAL: {title} =====\n")
    for task in tasks:
        inputs = tokenizer.apply_chat_template(
            task["messages"],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        with torch.no_grad():
            out = model.generate(input_ids=inputs, **GEN_KWARGS_CODE)

        text = tokenizer.decode(out[0], skip_special_tokens=True)
        code = extract_python_code(text)
        passed, message = run_code_tests(code, task["tests"])

        print(f"[{task['id']}] → {'PASS' if passed else 'FAIL'}")
        if not passed:
            print("Reason:", message)
        print("-" * 80)

        results.append({
            "task_id": task["id"],
            "passed": passed,
            "message": message,
            "code": code,
        })

    return results

In [7]:
# ------------------------------------------------------------
# LOAD BASE MODEL (BEFORE)
# ------------------------------------------------------------
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    load_in_4bit=True,
    dtype=None,
)

tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

before_chat = chat_eval(
    base_model, tokenizer, EVAL_CHAT_PROMPTS, "BEFORE FINETUNING"
)

before_code = run_code_eval(
    base_model, tokenizer, CODE_EVAL_TASKS, "BEFORE FINETUNING"
)


==((====))==  Unsloth 2026.1.1: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



===== BEFORE FINETUNING =====

[PROMPT 1]
system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Explain attention in transformers in simple terms.
assistant
Sure! In the context of Transformers, which are a type of artificial intelligence model used for natural language processing and other tasks, "attention" refers to a mechanism that allows the model to focus on different parts of the input data when making predictions.

Imagine you're reading a book, and you want to understand what's happening in a particular sentence. You might need to look at not just that sentence, but also the ones before and after it to get the full picture. Attention in Transformers works similarly—it helps the model pay more attention to certain parts of the input when it's trying to process or generate output.

Here’s a simple breakdown:

1. **Input Sequence**: The model receives a sequence of inputs, like words in a sentence.
2. **Attention Weights**: For each part of the input, 

In [8]:
# ------------------------------------------------------------
# ADD LORA
# ------------------------------------------------------------
model = FastLanguageModel.get_peft_model(
    base_model,
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj"
    ],
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

Unsloth 2026.1.1 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


In [9]:
# ------------------------------------------------------------
# DATASET (CHAT)
# ------------------------------------------------------------
dataset = load_dataset("mlabonne/FineTome-100k", split="train")
dataset = standardize_sharegpt(dataset)

def format_chat(examples):
    texts = [
        tokenizer.apply_chat_template(c, tokenize=False)
        for c in examples["conversations"]
    ]
    return {"text": texts}

dataset = dataset.map(format_chat, batched=True)


README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [10]:

# ------------------------------------------------------------
# TRAIN
# ------------------------------------------------------------
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    max_seq_length=MAX_SEQ_LEN,
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        max_steps=30,              # demo run
        learning_rate=2e-4,
        logging_steps=1,
        optim="paged_adamw_8bit",
        output_dir="outputs",
        report_to="none",
    ),
)

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",
    response_part="<|im_start|>assistant\n",
)

trainer.train()

model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/100000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/100000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 68,812,800 of 14,838,846,464 (0.46% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.618
2,0.5963
3,0.6303
4,0.5492
5,1.0251
6,0.7798
7,0.6138
8,0.7176
9,0.7032
10,0.4652


('qwen25-chat-lora/tokenizer_config.json',
 'qwen25-chat-lora/special_tokens_map.json',
 'qwen25-chat-lora/chat_template.jinja',
 'qwen25-chat-lora/vocab.json',
 'qwen25-chat-lora/merges.txt',
 'qwen25-chat-lora/added_tokens.json',
 'qwen25-chat-lora/tokenizer.json')

In [12]:
# ------------------------------------------------------------
# AFTER EVAL
# ------------------------------------------------------------
ft_model, _ = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    load_in_4bit=True,
)

ft_model.load_adapter(ADAPTER_DIR)

after_chat = chat_eval(
    ft_model, tokenizer, EVAL_CHAT_PROMPTS, "AFTER FINETUNING"
)

after_code = run_code_eval(
    ft_model, tokenizer, CODE_EVAL_TASKS, "AFTER FINETUNING"
)


==((====))==  Unsloth 2026.1.1: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
# ------------------------------------------------------------
# EXPORT REPORTS
# ------------------------------------------------------------
chat_rows = []
for i, (b, a) in enumerate(zip(before_chat, after_chat)):
    chat_rows.append({
        "prompt_id": i,
        "before_len": len(b.split()),
        "after_len": len(a.split()),
        "before": b,
        "after": a,
    })

