In [6]:
!pip install trl

Collecting trl
  Using cached trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting accelerate>=1.4.0 (from trl)
  Using cached accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Using cached trl-0.19.1-py3-none-any.whl (376 kB)
Using cached accelerate-1.9.0-py3-none-any.whl (367 kB)
Installing collected packages: accelerate, trl
Successfully installed accelerate-1.9.0 trl-0.19.1


In [None]:
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig # setting up SFT training process

In [None]:
# helper function for inference

def generate_responses(model, tokenizer, user_message, system_message=None, max_new_tokens=100):
    """Produce one non-sampled completion for a single-turn chat prompt.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        user_message: Text of the single user turn.
        system_message: Optional system prompt; omitted when falsy.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation only (prompt tokens removed), with special
        tokens skipped and surrounding whitespace stripped.
    """
    # Single-turn conversation: optional system turn, then the user turn.
    user_turn = {"role": "user", "content": user_message}
    if system_message:
        conversation = [{"role": "system", "content": system_message}, user_turn]
    else:
        conversation = [user_turn]

    # Render the conversation to text (not token ids) and append the
    # generation prompt so the model continues as the assistant.
    chat_text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # NOTE(review): the templated text is tokenized with the tokenizer's default
    # special-token handling; for tokenizers that prepend BOS this may duplicate
    # a BOS already emitted by the template - confirm for non-Qwen models.
    encoded = tokenizer(chat_text, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,  # no sampling, so repeated calls give the same text
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    # Inference only: no gradients are needed.
    # (vLLM, sglang or TensorRT could be used here for faster inference.)
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_kwargs)

    # The returned sequence starts with the prompt; keep only what follows it.
    prompt_length = encoded["input_ids"].shape[1]
    continuation_ids = sequences[0][prompt_length:]
    return tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()

In [12]:
# helper function to test model with questions

def test_model(model, tokenizer, questions, system_message=None, title="Model output"):
    print(f"\n****** {title} ******")
    for i, question in enumerate(questions, 1): # start indexing from 1
        response = generate_responses(model, tokenizer, question, system_message)
        print(f"\nModel input {i}: \n{question} \nModel output: {response} \n")

In [11]:
# helper function to load model and tokenizer

def load_model_and_tokenizer(model_name, cuda=False):
    """Load a causal LM and its tokenizer, filling in missing chat defaults.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.
        cuda: When truthy, the model is moved to the ``"cuda"`` device.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is given a simple
        System/User/Assistant chat template if it has none, and its pad
        token is set to the EOS token if it has none.
    """
    tok = AutoTokenizer.from_pretrained(model_name)        # Hugging Face AutoTokenizer
    lm = AutoModelForCausalLM.from_pretrained(model_name)  # Hugging Face AutoModelForCausalLM

    # Optional GPU placement.
    if cuda:
        lm.to("cuda")

    # Padding falls back to the EOS token when the tokenizer defines no pad token.
    if not tok.pad_token:
        tok.pad_token = tok.eos_token

    # Tokenizers without a chat template get a minimal role-prefixed one.
    if not tok.chat_template:
        tok.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    return lm, tok

In [10]:
# Baseline check configuration: probe the Qwen3-0.6B-Base model
# on CPU (no GPU) before any fine-tuning.

CUDA = False  # False keeps the model on CPU

# Prompts used to probe the model; each one is a single-turn conversation.
questions = [
    "Introduce quantum mechanics in 1-line?",
    "Calculate 2+3",
    "What's the difference between linear and logistic regression?",
]

In [15]:
# Load the base checkpoint and its tokenizer (device placement follows CUDA).
model, tokenizer = load_model_and_tokenizer("Qwen/Qwen3-0.6B-Base", cuda=CUDA)

# Print the base model's answers to the probe questions.
test_model(
    model,
    tokenizer,
    questions,
    title="Base Qwen3 (No SFT) Output",
)

# Remove both names from the namespace once the check is done.
del model
del tokenizer

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]


****** Base Qwen3 (No SFT) Output ******

Model input 1: 
Introduce quantum mechanics in 1-line? 
Model output: ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ � 


Model input 2: 
Calculate 2+3 
Model output: ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ � 


Model input 3: 
What's the difference between linear and logistic regression? 
Model output: ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ � 



#### Performing SFT on a base model