# Step 1: Mounting Google Drive and Importing Libraries

In [None]:
# Mount Google Drive so the project directory is reachable from the Colab VM.
from google.colab import drive
drive.mount("/content/drive")
# Work from the project root: the relative model paths defined later resolve against it.
%cd /content/drive/MyDrive/grpo-verified-reasoner
# List the project root as a quick sanity check that the mount succeeded.
!ls

Mounted at /content/drive
/content/drive/MyDrive/grpo-verified-reasoner
data			      LICENSE	 outputs    unsloth_compiled_cache
grpo_trainer_lora_model       models	 README.md  _unsloth_sentencepiece_temp
huggingface_tokenizers_cache  notebooks  src	    wandb


In [None]:
# Install uv (a faster pip-compatible installer).
# NOTE(review): the following install cell still calls plain `pip`, so uv appears
# to be unused in this notebook — confirm whether this cell is still needed.
!pip install --upgrade -qqq uv

In [None]:
!pip install -q unsloth

In [None]:
import os
import gc
import torch
import safetensors.torch
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Step 2: Loading Models

In [None]:
# System prompt shared by every generation in this notebook.
# This is the same prompt that we used during SFT, so keep it byte-for-byte unchanged:
# it defines the <START_WORKING_OUT>/<SOLUTION> response format the models are asked to follow.
system_prompt = """You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags."""

In [None]:
# Single probe task sent to every model; generate_response() reads this global.
user_prompt = "Write a Python function that returns the factorial of a number."

In [None]:
# Model locations. Relative paths resolve against the project root selected with %cd above.
# NOTE(review): BASE_MODEL_PATH is not referenced by any cell visible in this notebook.
BASE_MODEL_PATH = "unsloth/Qwen3-4B-Base"
# Used to load the tokenizer and, until reassigned in later steps, the SFT side of the comparison.
SFT_MODEL_PATH  = "models/qwen3-4b-sft-merged"
# Used in the "Base + LoRA Adapter" steps — presumably an adapter directory; confirm.
SFT_MODEL_PATH_2  = "models/qwen3-4b-sft"
# NOTE(review): reassigned in Step 4 before any visible cell reads this value.
GRPO_MODEL_PATH = "models/qwen3-4b-grpo-checkpoint90-merged"
# Used in Step 7 (Base + LoRA adapter comparison).
GRPO_MODEL_PATH_2 = "outputs/checkpoint-188"
# Used in Step 8 (different GRPO adapter).
GRPO_MODEL_PATH_3 = "models/qwen3-4b-grpo-final-2"
# Used in Step 3 as the GRPO side of the first comparison.
GRPO_MODEL_PATH_4 = "models/qwen3-4b-grpo-final-2-merged"

In [None]:
# Only the tokenizer is needed at notebook scope; generate_response() loads each model itself.
# Bind the model to a real name so it can be released immediately — leaving it bound to `_`
# would keep its weights resident for the rest of the session and shrink the memory
# available to the models being compared.
_tokenizer_source_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=SFT_MODEL_PATH,
)
del _tokenizer_source_model
gc.collect()              # drop the Python-side references first ...
torch.cuda.empty_cache()  # ... then hand the freed blocks back to the CUDA driver

==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_response(
    model_path,
    model_name,
    dtype=torch.float16,
    load_in_4bit=False,
    max_new_tokens=512,
):
    """Load a model, greedily answer the shared prompt, then release the model.

    Relies on the notebook-level ``tokenizer``, ``system_prompt`` and
    ``user_prompt`` globals.

    Args:
        model_path: Path or hub id passed to ``FastLanguageModel.from_pretrained``.
        model_name: Human-readable label; only used in the progress banner.
        dtype: Torch dtype the weights are loaded in (defaults to float16).
        load_in_4bit: Forwarded to ``from_pretrained`` (defaults to False).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded sequence returned by ``model.generate`` (the prompt is
        included in the text), with special tokens stripped.
    """
    print(f"\n--- LOADING {model_name} ---")

    # Pre-bind so the cleanup in `finally` is valid even if loading fails part-way.
    model = inputs = outputs = None
    try:
        model, _ = FastLanguageModel.from_pretrained(
            model_name=model_path,
            load_in_4bit=load_in_4bit,
            dtype=dtype,
        )
        FastLanguageModel.for_inference(model)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cuda")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                # Greedy decoding must be requested explicitly: temperature=0.0 is not a
                # valid sampling temperature and is ignored when sampling is off.
                do_sample=False,
                use_cache=True,
            )

        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    finally:
        # CLEANUP TO SAVE VRAM — runs on success and on failure.
        # Collect garbage before emptying the cache so the memory released by the
        # collector is actually returned to the device.
        del model, inputs, outputs
        gc.collect()
        torch.cuda.empty_cache()

# Step 3: Generating Outputs


In [None]:
# Run the shared prompt through both merged checkpoints, one model at a time
# (generate_response loads the model, generates, and releases it before returning).
sft_output = generate_response(SFT_MODEL_PATH, "SFT MODEL")
grpo_output = generate_response(GRPO_MODEL_PATH_4, "GRPO MODEL")


--- LOADING SFT MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


--- LOADING GRPO MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Print a banner, then report whether the two decoded strings are exactly equal.
rule = "="*40
print("\n" + rule)
print("COMPARISON RESULT")
print(rule)

outputs_differ = sft_output != grpo_output
if outputs_differ:
    print("OUTPUTS ARE DIFFERENT")
    print("Conclusion: The models have diverged!")
else:
    print("OUTPUTS ARE IDENTICAL (Token-for-Token)")
    print("Conclusion: GRPO updates were too weak to shift the greedy path.")


COMPARISON RESULT
OUTPUTS ARE IDENTICAL (Token-for-Token)
Conclusion: GRPO updates were too weak to shift the greedy path.


In [None]:
# Show both decoded texts in full (nothing is truncated here), each under its own header.
sections = (
    ("\n--- SFT START ---", sft_output),
    ("\n--- GRPO START ---", grpo_output),
)
for header, body in sections:
    print(header)
    print(body)


--- SFT START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags.
user
Write a Python function that returns the factorial of a number.
assistant
<START_WORKING_OUT>
To solve this problem, we need to create a function that calculates the factorial of a given number. The factorial of a non-negative integer n is the product of all positive integers less than or equal to n. For example, the factorial of 5 (denoted as 5!) is 5 * 4 * 3 * 2 * 1 = 120. We can implement this using a simple loop or recursion. Here, we'll use a loop for clarity and efficiency.
</END_WORKING_OUT>
<SOLUTION>
def factorial(n):
    if n < 0:
        return "Factorial is not defined for negative numbers."
    result = 1
    for i in range(2, n + 1):
        result *= i
    return res

In [None]:
# The output recorded under this cell comes from the previous run, when the
# `load_in_4bit` and `dtype` arguments were not specified.
# Both decoded texts are printed in full, each under its own header.
sections = (
    ("\n--- SFT START ---", sft_output),
    ("\n--- GRPO START ---", grpo_output),
)
for header, body in sections:
    print(header)
    print(body)


--- SFT START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags.
user
Write a Python function that returns the factorial of a number.
assistant
<START_WORKING_OUT>
To write a Python function that returns the factorial of a number, we need to follow these steps:
1. Define a function named `factorial` that takes an integer `n` as an argument.
2. Initialize a variable `result` to 1, which will store the factorial value.
3. Use a for loop to iterate from 1 to `n` (inclusive).
4. In each iteration, multiply `result` by the current loop index.
5. After the loop completes, return the `result` variable.
</END_WORKING_OUT>
<SOLUTION>
def factorial(n):
    result = 1
    for i in range(1, n + 1):
        result *= i
    return result
</SOLUTION>

--- GRPO STAR

# Step 4: Generating Outputs (FP32)


In [None]:
# Point the comparison at the checkpoints whose directory names end in "f32".
# These overwrite the earlier SFT_MODEL_PATH / GRPO_MODEL_PATH values for all later cells.
SFT_MODEL_PATH  = "models/qwen3-4b-sft-merged-f32"
GRPO_MODEL_PATH = "models/qwen3-4b-grpo-merged-f32-final"

In [None]:
# Generate with the f32-merged checkpoints, one model at a time.
# NOTE(review): generate_response as defined in this notebook loads weights with
# dtype=torch.float16, so these checkpoints are presumably cast on load — confirm
# that this step really exercises FP32.
sft_output = generate_response(SFT_MODEL_PATH, "SFT MODEL")
grpo_output = generate_response(GRPO_MODEL_PATH, "GRPO MODEL")


--- LOADING SFT MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


--- LOADING GRPO MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Print a banner, then report whether the two decoded strings are exactly equal.
rule = "="*40
print("\n" + rule)
print("COMPARISON RESULT")
print(rule)

outputs_differ = sft_output != grpo_output
if outputs_differ:
    print("OUTPUTS ARE DIFFERENT")
    print("Conclusion: The models have diverged!")
else:
    print("OUTPUTS ARE IDENTICAL (Token-for-Token)")
    print("Conclusion: GRPO updates were too weak to shift the greedy path.")


COMPARISON RESULT
OUTPUTS ARE DIFFERENT
Conclusion: The models have diverged!


In [None]:
# Show both decoded texts in full (nothing is truncated here), each under its own header.
sections = (
    ("\n--- SFT START ---", sft_output),
    ("\n--- GRPO START ---", grpo_output),
)
for header, body in sections:
    print(header)
    print(body)


--- SFT START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags.
user
Write a Python function that returns the factorial of a number.
assistant
<START_WORKING_OUT>
Define a function factorial(n) that calculates the product of all positive integers up to n.
Handle non-positive integers by returning 1 (factorial of 0 and negative is 1).
Implement iterative approach for efficiency.
Return the computed factorial.
</END_WORKING_OUT>
<SOLUTION>
def factorial(n):
    if n <= 0:
        return 1
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result
</SOLUTION>

--- GRPO START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps requir

# Step 5: Generating Outputs (FP16)


In [None]:
# Point the comparison at the checkpoints whose directory names end in "f16".
# These overwrite the SFT_MODEL_PATH / GRPO_MODEL_PATH values set in the previous step.
SFT_MODEL_PATH  = "models/qwen3-4b-sft-merged-f16"
GRPO_MODEL_PATH = "models/qwen3-4b-grpo-merged-f16-final"

In [None]:
# Generate with the f16-merged checkpoints, one model at a time
# (generate_response loads the model, generates, and releases it before returning).
sft_output = generate_response(SFT_MODEL_PATH, "SFT MODEL")
grpo_output = generate_response(GRPO_MODEL_PATH, "GRPO MODEL")


--- LOADING SFT MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


--- LOADING GRPO MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Print a banner, then report whether the two decoded strings are exactly equal.
rule = "="*40
print("\n" + rule)
print("COMPARISON RESULT")
print(rule)

outputs_differ = sft_output != grpo_output
if outputs_differ:
    print("OUTPUTS ARE DIFFERENT")
    print("Conclusion: The models have diverged!")
else:
    print("OUTPUTS ARE IDENTICAL (Token-for-Token)")
    print("Conclusion: GRPO updates were too weak to shift the greedy path.")


COMPARISON RESULT
OUTPUTS ARE IDENTICAL (Token-for-Token)
Conclusion: GRPO updates were too weak to shift the greedy path.


In [None]:
# Show both decoded texts in full (nothing is truncated here), each under its own header.
sections = (
    ("\n--- SFT START ---", sft_output),
    ("\n--- GRPO START ---", grpo_output),
)
for header, body in sections:
    print(header)
    print(body)


--- SFT START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags.
user
Write a Python function that returns the factorial of a number.
assistant
<START_WORKING_OUT>
To solve this problem, we need to create a function that calculates the factorial of a given number. The factorial of a non-negative integer n is the product of all positive integers less than or equal to n. For example, the factorial of 5 (denoted as 5!) is 5 * 4 * 3 * 2 * 1 = 120. We can implement this using a simple loop or recursion. Here, we'll use a loop for clarity and efficiency.
</END_WORKING_OUT>
<SOLUTION>
def factorial(n):
    if n < 0:
        return "Factorial is not defined for negative numbers."
    result = 1
    for i in range(2, n + 1):
        result *= i
    return res

# Step 6: Generating Outputs (FP32 --> FP16)


In [None]:
# Switch back to the "f32" checkpoint directories (the same paths as in Step 4).
# NOTE(review): the section title suggests these are then loaded in FP16 — confirm
# against the dtype used by generate_response.
SFT_MODEL_PATH  = "models/qwen3-4b-sft-merged-f32"
GRPO_MODEL_PATH = "models/qwen3-4b-grpo-merged-f32-final"

In [None]:
# Generate with the f32-merged checkpoints again, one model at a time
# (generate_response loads the model, generates, and releases it before returning).
sft_output = generate_response(SFT_MODEL_PATH, "SFT MODEL")
grpo_output = generate_response(GRPO_MODEL_PATH, "GRPO MODEL")


--- LOADING SFT MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


--- LOADING GRPO MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Print a banner, then report whether the two decoded strings are exactly equal.
rule = "="*40
print("\n" + rule)
print("COMPARISON RESULT")
print(rule)

outputs_differ = sft_output != grpo_output
if outputs_differ:
    print("OUTPUTS ARE DIFFERENT")
    print("Conclusion: The models have diverged!")
else:
    print("OUTPUTS ARE IDENTICAL (Token-for-Token)")
    print("Conclusion: GRPO updates were too weak to shift the greedy path.")


COMPARISON RESULT
OUTPUTS ARE DIFFERENT
Conclusion: The models have diverged!


In [None]:
# Show both decoded texts in full (nothing is truncated here), each under its own header.
sections = (
    ("\n--- SFT START ---", sft_output),
    ("\n--- GRPO START ---", grpo_output),
)
for header, body in sections:
    print(header)
    print(body)


--- SFT START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags.
user
Write a Python function that returns the factorial of a number.
assistant
<START_WORKING_OUT>
Define a function factorial(n) that calculates the product of all positive integers up to n.
Handle non-positive integers by returning 1 (factorial of 0 and negative is 1).
Implement iterative approach for efficiency.
Return the computed factorial.
</END_WORKING_OUT>
<SOLUTION>
def factorial(n):
    if n <= 0:
        return 1
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result
</SOLUTION>

--- GRPO START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps requir

# Step 7: Generating Outputs for Base + LoRA Adapter Models


In [None]:
# Generate from the *_PATH_2 locations, one model at a time.
# NOTE(review): per this section's heading these are presumably LoRA adapter
# directories applied on top of the base model rather than merged weights — confirm.
sft_output = generate_response(SFT_MODEL_PATH_2, "SFT MODEL")
grpo_output = generate_response(GRPO_MODEL_PATH_2, "GRPO MODEL")


--- LOADING SFT MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

Unsloth 2026.1.2 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.



--- LOADING GRPO MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Print a banner, then report whether the two decoded strings are exactly equal.
rule = "="*40
print("\n" + rule)
print("COMPARISON RESULT")
print(rule)

outputs_differ = sft_output != grpo_output
if outputs_differ:
    print("OUTPUTS ARE DIFFERENT")
    print("Conclusion: The models have diverged!")
else:
    print("OUTPUTS ARE IDENTICAL (Token-for-Token)")
    print("Conclusion: GRPO updates were too weak to shift the greedy path.")


COMPARISON RESULT
OUTPUTS ARE DIFFERENT
Conclusion: The models have diverged!


In [None]:
# Show both decoded texts in full (nothing is truncated here), each under its own header.
sections = (
    ("\n--- SFT START ---", sft_output),
    ("\n--- GRPO START ---", grpo_output),
)
for header, body in sections:
    print(header)
    print(body)


--- SFT START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags.
user
Write a Python function that returns the factorial of a number.
assistant
<START_WORKING_OUT>
Define a function factorial(n) that calculates the product of all positive integers up to n.
Handle non-positive integers by returning 1 (factorial of 0 and negative is 1).
Implement iterative approach for efficiency.
Return the computed factorial.
</END_WORKING_OUT>
<SOLUTION>
def factorial(n):
    if n <= 0:
        return 1
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result
</SOLUTION>

--- GRPO START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps requir

In [None]:
# The output recorded under this cell is from the previous run.
# Both decoded texts are printed in full, each under its own header.
sections = (
    ("\n--- SFT START ---", sft_output),
    ("\n--- GRPO START ---", grpo_output),
)
for header, body in sections:
    print(header)
    print(body)


--- SFT START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags.
user
Write a Python function that returns the factorial of a number.
assistant
<START_WORKING_OUT>
Define a function factorial that takes an integer n.
Handle non-positive input by returning 1 (factorial of 0 or negative is 1).
Initialize result to 1.
Multiply result by each integer i from 1 to n.
Return result.
</END_WORKING_OUT>
<SOLUTION>
def factorial(n):
    if n <= 0:
        return 1
    result = 1
    for i in range(1, n + 1):
        result *= i
    return result
</SOLUTION>

--- GRPO START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WOR

# Step 8: Different GRPO Adapter

In [None]:
# Same SFT location as in Step 7, compared against a different GRPO location (GRPO_MODEL_PATH_3).
# Models are processed one at a time; generate_response releases each before returning.
sft_output = generate_response(SFT_MODEL_PATH_2, "SFT MODEL")
grpo_output = generate_response(GRPO_MODEL_PATH_3, "GRPO MODEL")


--- LOADING SFT MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.32G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

Unsloth 2026.1.2 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.



--- LOADING GRPO MODEL ---
==((====))==  Unsloth 2026.1.2: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Print a banner, then report whether the two decoded strings are exactly equal.
rule = "="*40
print("\n" + rule)
print("COMPARISON RESULT")
print(rule)

outputs_differ = sft_output != grpo_output
if outputs_differ:
    print("OUTPUTS ARE DIFFERENT")
    print("Conclusion: The models have diverged!")
else:
    print("OUTPUTS ARE IDENTICAL (Token-for-Token)")
    print("Conclusion: GRPO updates were too weak to shift the greedy path.")


COMPARISON RESULT
OUTPUTS ARE DIFFERENT
Conclusion: The models have diverged!


In [None]:
# Show both decoded texts in full (nothing is truncated here), each under its own header.
sections = (
    ("\n--- SFT START ---", sft_output),
    ("\n--- GRPO START ---", grpo_output),
)
for header, body in sections:
    print(header)
    print(body)


--- SFT START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WORKING_OUT>
<SOLUTION>
Valid Python code only.
</SOLUTION>
Do not output anything outside these tags.
user
Write a Python function that returns the factorial of a number.
assistant
<START_WORKING_OUT>
Define a function factorial that takes an integer n.
Handle non-positive input by returning 1 (factorial of 0 or negative is 1).
Initialize result to 1.
Multiply result by each integer i from 1 to n.
Return result.
</END_WORKING_OUT>
<SOLUTION>
def factorial(n):
    if n <= 0:
        return 1
    result = 1
    for i in range(1, n + 1):
        result *= i
    return result
</SOLUTION>

--- GRPO START ---
system
You are a code-generation engine.
You must output your response in the following exact format:
<START_WORKING_OUT>
Concise reasoning steps required to solve the problem.
</END_WOR

# Step 9: Fixing Metadata



In [None]:
import nbformat
import os

In [None]:
# Re-mount Google Drive; force_remount=True is passed so an existing mount is replaced.
# NOTE(review): `files` is imported but not used by any cell visible in this notebook.
from google.colab import drive, files
drive.mount('/content/drive', force_remount=True)

In [None]:
# Display the contents of the notebooks directory to verify the target file is present.
notebooks_dir = "/content/drive/MyDrive/grpo-verified-reasoner/notebooks"
os.listdir(notebooks_dir)

In [None]:
# Remove the "widgets" entry from the metadata of the notebook at notebook_path
# and write the notebook back to the same file.
notebook_path = "/content/drive/MyDrive/grpo-verified-reasoner/notebooks/09_compare_model_outputs.ipynb"

# Notebook files are UTF-8 JSON; pass the encoding explicitly instead of relying on
# the platform default so non-ASCII output (e.g. emoji in logs) round-trips intact.
with open(notebook_path, "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)

# pop() with a default is a no-op when the key is absent.
nb.metadata.pop("widgets", None)

with open(notebook_path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

print("Notebook fixed and saved successfully!")