## Packages

In [2]:
import os, importlib.util

!pip install --upgrade -qqq uv

if importlib.util.find_spec("torch") is None or "COLAB_" in "".join(os.environ.keys()):
    try: import numpy, PIL; get_numpy = f"numpy=={numpy.__version__}"; get_pil = f"pillow=={PIL.__version__}"
    except: get_numpy = "numpy"; get_pil = "pillow"
    !uv pip install -qqq \
        "torch>=2.8.0" "triton>=3.4.0" {get_numpy} {get_pil} torchvision bitsandbytes "transformers==4.56.2" \
        "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
        "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
        git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
elif importlib.util.find_spec("unsloth") is None:
    !uv pip install -qqq unsloth

!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers trl==0.22.2 unsloth unsloth_zoo

Using Python 3.12.12 environment at: /usr
Resolving dependencies...
transformers==4.56.2
trl==0.22.2
tokenizers==0.22.1
unsloth==2025.11.3
unsloth-zoo==2025.11.3
Resolved 5 packages

## Imports

In [4]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer

## Model

In [5]:
max_seq_length = 1024  # maximum sequence length handed to the loader
dtype = None           # no explicit dtype requested; left to Unsloth (TODO confirm it auto-selects)
# Device the tokenized inputs are moved to later; falls back to CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "unsloth/gpt-oss-20b"

# Define and load the base model in 4-bit, with full fine-tuning disabled
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    dtype = dtype,
    max_seq_length = max_seq_length,
    load_in_4bit = True,
    full_finetuning = False
)

# Add LoRA adapters for PEFT.
# NOTE: the result is deliberately NOT chained with `.to(device)`. The model is
# loaded with `load_in_4bit = True`; the loader already places it on the GPU, so an
# explicit move is redundant there, and on the CPU fallback it would attempt to
# relocate quantized weights instead of leaving the model where the loader put it.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,                                   # LoRA rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,                         # LoRA scaling factor
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,                     # fixed seed for reproducible adapter init
    use_rslora = False,
    loftq_config = None
)

==((====))==  Unsloth 2025.11.3: Fast Gpt_Oss patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.37G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/165 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth: Making `model.base_model.model.model` require gradients


## Inputs

In [28]:
# Single-turn conversation: one user message holding the arithmetic question.
messages = [dict(role = "user", content = "What is 78645 * 1290?")]

In [29]:
# Render the chat template as a plain string (no tokenization) so the exact prompt
# can be inspected. No `reasoning_effort` is passed here, so the rendered system
# prompt carries whatever level the template applies on its own.
raw_inputs = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
raw_inputs

"<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-11-12\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.\nCalls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>What is 78645 * 1290?<|end|><|start|>assistant"

In [30]:
# Inference test: build the tokenized prompt that is fed to `model.generate`.

# The chat template turns `messages` into the model's own text format,
# i.e. "<start> <role> <message> ...".
chat_template_options = dict(
    add_generation_prompt = True,  # end the prompt with the marker for the model's turn
    return_tensors = "pt",
    return_dict = True,
    reasoning_effort = "low",      # reasoning intensity - low for easy, high for complex queries
)

# The tokenizer output starts on the CPU, so move it to the selected device.
inputs = tokenizer.apply_chat_template(messages, **chat_template_options).to(device)

# Notes:
# 1. Changing the roles in messages might lead to unpredictable outputs (i.e. "assistant" in place of "user")

In [31]:
# Stream the decoded text to the cell output as it is generated; generation is
# capped at 128 new tokens and the returned token ids are discarded.
streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = streamer, max_new_tokens = 128)

<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-11-12

Reasoning: low

# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>What is 78645 * 1290?<|end|><|start|>assistant<|channel|>analysis<|message|>We need to compute 78645 * 1290. 78645*1000=78,645,000. 78645*200=15,729,000. 78645*90=7,078,050? Wait 78645*90=7,078,050 (yes). Sum: 78,645,000+15,729,000=94,374,000. Add 7,078,050=101,452,050. Check: 78645*(1000+200+90)=78645*1000+78645*200+78645
