In [None]:
# Install PyTorch into the kernel's environment from PyTorch's "cu128" wheel index
# (the CUDA 12.8 builds) instead of the default package index.
%pip install torch --index-url https://download.pytorch.org/whl/cu128

In [None]:
# Install the fine-tuning stack with minimum versions for TRL, PEFT and Transformers.
# trackio is installed unpinned; it is the `report_to` target in the training config below.
%pip install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0" trackio

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): the training config sets push_to_hub=True, which presumably relies on
# these credentials having write access — confirm.
notebook_login()

In [None]:
from datasets import load_dataset

# Hub identifier of the dataset used for both training and evaluation.
dataset_name = "AI-MO/NuminaMath-CoT"

# Fetch the two named splits in order (train first, then test) and unpack them.
train_dataset, test_dataset = (
    load_dataset(dataset_name, split=split_name)
    for split_name in ("train", "test")
)

# Report how many examples each split contains.
print(f"Train size: {len(train_dataset)}")
print(f"Test size: {len(test_dataset)}")

In [None]:
# Display the first training example to inspect its fields; a later cell reads its
# "messages" entry and renders it through the chat template.
train_dataset[0]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the base checkpoint.
base_model_id = "openai/gpt-oss-20b"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

In [None]:
# Render the first training conversation as plain text (tokenize=False) so the
# chat-template formatting can be read directly.
first_example = train_dataset[0]
messages = first_example["messages"]
conversation = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
)
print(conversation)

In [None]:
import torch
from transformers import AutoModelForCausalLM, Mxfp4Config

# Ask for the checkpoint's MXFP4 weights to be dequantized at load time.
quantization_config = Mxfp4Config(dequantize=True)

# Loading options: eager attention, bfloat16 weights, automatic device placement.
# NOTE(review): use_cache=False is presumably set because the model is loaded for
# training (gradient checkpointing is enabled in the training config) — confirm.
model_kwargs = {
    "attn_implementation": "eager",
    "torch_dtype": torch.bfloat16,
    "quantization_config": quantization_config,
    "use_cache": False,
    "device_map": "auto",
}

model = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",
    **model_kwargs,
)

In [None]:
# Sanity-check the loaded model on a sample math problem before fine-tuning.
# `messages` is the full reference conversation: the user's problem followed by the
# reference assistant solution.
messages = [
    {
        "content": "A farmer has a rectangular field with dimensions $3m+8$ and $m-3$ where $m$ is a positive integer. If the field has an area of 76 square meters, find the value of $m$.",
        "role": "user",
    },
    {
        "content": "Using the given dimensions, we set up the area equation:\n\\[\n(3m+8)(m-3) = 76.\n\\]\nExpanding this, we get:\n\\[\n3m^2 - 9m + 8m - 24 = 76,\n\\]\n\\[\n3m^2 - m - 24 = 76,\n\\]\n\\[\n3m^2 - m - 100 = 0.\n\\]\nFactoring the quadratic, we find:\n\\[\n(3m+25)(m-4) = 0.\n\\]\nThis gives two potential solutions for $m$: $m=-\\frac{25}{3}$ and $m=4$. Since $m$ must be a positive integer, the only valid solution is $m = \\boxed{4}$.",
        "role": "assistant",
    },
]

# Prompt with the non-assistant turns only. Feeding the reference solution and then
# appending a generation prompt would show the model the very answer it is asked to
# produce, so the output would not reflect its own problem-solving ability.
prompt_messages = [message for message in messages if message["role"] != "assistant"]

# return_dict=True returns input_ids together with the attention mask, so generate()
# receives an explicit mask instead of having to infer one.
inputs = tokenizer.apply_chat_template(
    prompt_messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
input_ids = inputs["input_ids"]  # kept under its original name for inspection

output_ids = model.generate(**inputs, max_new_tokens=512)

# Decode the full sequence (prompt + completion) of the single batch element.
response = tokenizer.batch_decode(output_ids)[0]
print(response)

In [None]:
from trl import SFTConfig

# Hyperparameters for the supervised fine-tuning run.
training_args = SFTConfig(
    # Optimisation
    learning_rate=2e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr_rate": 0.1},  # floor the LR at 10% of its peak
    num_train_epochs=1,
    # Batching: 4 sequences x 4 accumulation steps = 16 sequences per optimizer step
    # on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,  # trade extra compute for lower activation memory
    max_length=2048,
    # Logging / publishing
    logging_steps=1,
    output_dir="gpt-oss-20b-aimo-numina-cot-sft",
    report_to="trackio",
    push_to_hub=True,
    # Evaluation settings
    eval_strategy="steps",  # or "epoch" to evaluate at end of each epoch
    eval_steps=100,  # evaluate every 100 steps
    per_device_eval_batch_size=4,
    # Checkpointing: the save schedule matches the eval schedule, which
    # load_best_model_at_end needs in order to pick the best checkpoint.
    save_strategy="steps",
    save_steps=100,
    # Cap the number of checkpoints kept on disk. Without a limit, a checkpoint is
    # written every 100 steps for the whole run and none are ever deleted. The best
    # checkpoint is still retained because load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
from trl import SFTTrainer

# Wire the model, tokenizer, training config and both dataset splits into TRL's
# supervised fine-tuning trainer.
# NOTE(review): peft is installed at the top of the notebook, but no PEFT/LoRA config
# is passed here, so this appears to train the model as loaded — confirm intended.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Start training; as the cell's last expression, the returned value is displayed.
trainer.train()

In [None]:
# Run one last evaluation pass with the trainer's configured eval dataset and list
# every metric it reports, one per line.
eval_results = trainer.evaluate()
print("Final evaluation results:")
for key in eval_results:
    value = eval_results[key]
    print(f"{key}: {value}")