# Step 1: Mounting Google Drive and Importing Libraries


In [None]:
# Mount Google Drive into the Colab filesystem and switch into the project folder,
# so that the relative paths used in the rest of this notebook (outputs/..., models/...)
# resolve against /content/drive/MyDrive/grpo-verified-reasoner.
from google.colab import drive
drive.mount("/content/drive")
%cd /content/drive/MyDrive/grpo-verified-reasoner
# List the project folder as a quick check that the mount and cd worked.
!ls

In [None]:
# Install UV (Faster pip)
# NOTE(review): the install cell that follows still calls plain `pip`, so uv is installed
# here but not used by any cell visible in this notebook — confirm whether it is needed.
!pip install --upgrade -qqq uv

In [None]:
# Install Unsloth, which provides FastLanguageModel used for loading and saving below.
# NOTE(review): the version is not pinned, so a fresh run may pick up a different release.
!pip install -q unsloth

In [None]:
import os
import gc
import torch
import safetensors.torch
from safetensors import safe_open
from unsloth import FastLanguageModel

In [None]:
# Report the PyTorch build and whether a CUDA device is visible to this runtime;
# when one is, also print the name of device 0.
cuda_ok = torch.cuda.is_available()
print("Torch version:", torch.__version__)
print("CUDA available:", cuda_ok)
if cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))

# Step 2: Loading the Base Model and the GRPO LoRA Adapter

In [4]:
# Paths used by the loading / merging cells below. The relative ones resolve against
# the current working directory.

# Adapter inputs that get loaded for merging.
CHECKPOINT_PATH = "outputs/checkpoint-188"
GRPO_MODEL_PATH = "models/qwen3-4b-grpo-final-2"
SFT_MODEL_PATH = "models/qwen3-4b-sft"

# Base model identifier (presumably a Hugging Face Hub id — verify).
# NOTE(review): not referenced by any cell visible in this notebook.
BASE_MODEL_PATH = "unsloth/Qwen3-4B-Base"

# Destination of the first merged GRPO model.
MERGED_PATH = "models/qwen3-4b-grpo-final-2-merged"

In [None]:
# Load the adapter checkpoint at CHECKPOINT_PATH through Unsloth, requesting float16
# weights and no 4-bit quantization so that the adapter can be merged afterwards.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = CHECKPOINT_PATH,
    max_seq_length = 3072,
    load_in_4bit = False,    # Must be False for merging
    dtype = torch.float16,   # Standard 16-bit precision
)

In [5]:
# Fold the LoRA update into the base weights: W_new = W_base + delta_W, where delta_W is
# the low-rank product of the adapter's A and B matrices.
# NOTE(review): the merged model returned by merge_and_unload() is discarded here; the
# save cell that follows relies on `model` having been modified in place — confirm that
# the saved weights actually contain the adapter update.
model.merge_and_unload()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layerno

In [6]:
# Write a standalone merged model (safetensors weight files, no separate adapter folder)
# to MERGED_PATH using Unsloth's "merged_16bit" save method; the tokenizer is passed
# along with the model.
model.save_pretrained_merged(
    MERGED_PATH,
    tokenizer,
    save_method = "merged_16bit",
)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 2 files from cache to `models/qwen3-4b-grpo-final-2-merged`: 100%|██████████| 2/2 [00:22<00:00, 11.33s/it]


Successfully copied all 2 files from cache to `models/qwen3-4b-grpo-final-2-merged`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [00:00<00:00, 20410.24it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [00:46<00:00, 23.37s/it]


Unsloth: Merge process complete. Saved to `/content/drive/MyDrive/grpo-verified-reasoner/models/qwen3-4b-grpo-final-2-merged`


In [4]:
# Read the raw adapter tensors stored at two checkpoints so they can be compared directly.
# NOTE(review): `sft` is loaded from outputs/checkpoint-90 — confirm that this checkpoint
# really is the SFT adapter the name suggests.
sft = safetensors.torch.load_file("outputs/checkpoint-90/adapter_model.safetensors")
grpo = safetensors.torch.load_file("outputs/checkpoint-188/adapter_model.safetensors")

# Spot check on one tensor: take the first key of `sft` and display the norm of the
# difference between the two adapters for that tensor.
k = list(sft)[0]
(sft[k] - grpo[k]).norm()

tensor(0.0062)

In [5]:
# Relative size of the difference between the two adapters: the sum of per-tensor
# difference norms divided by the sum of per-tensor norms of `sft`.
# Assumes `grpo` contains every key of `sft` with matching shapes — TODO confirm.
sum(torch.norm(sft[k] - grpo[k]) for k in sft.keys()) / sum(torch.norm(sft[k]) for k in sft.keys())

tensor(0.0023)

# Step 3: Merging at 32-Bit Precision (GRPO Model)

In [7]:
# Output folders for this section: SAVE_PATH receives the Hugging Face save_pretrained
# output, SAVE_PATH_2 receives the Unsloth "merged_16bit" output of the same model.
SAVE_PATH = "models/qwen3-4b-grpo-merged-f32-final"
SAVE_PATH_2 = "models/qwen3-4b-grpo-merged-f16-final"

In [None]:
# Reload the adapter checkpoint at CHECKPOINT_PATH, this time requesting float32 weights
# so that the following merge is carried out at 32-bit precision.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = CHECKPOINT_PATH,
    max_seq_length = 3072,
    load_in_4bit = False,    # Must be False for merging
    dtype = torch.float32,   # Full 32-bit precision
)

In [10]:
# Fold the LoRA update into the base weights: W_new = W_base + delta_W, where delta_W is
# the low-rank product of the adapter's A and B matrices.
# The model was loaded as float32 in the previous cell, so the addition is intended to run
# in 32-bit and keep the small (~0.002 relative) adapter difference measured earlier.
# NOTE(review): the return value of merge_and_unload() is discarded; later cells rely on
# `model` having been modified in place — confirm.
model.merge_and_unload()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layerno

In [None]:
# Save the merged weights with Hugging Face's own save_pretrained, called on the wrapped
# model (`model.model`) instead of going through Unsloth's save_pretrained_merged helper.
# Rationale: the Unsloth helper can implicitly change the tensor dtype (the precision check
# at the end of this notebook reports BF16 for the "merged_16bit" outputs), whereas this
# path is expected to keep the float32 weights loaded above.
# safe_serialization=True writes the weights in the safetensors format rather than as
# pickle-based files.
# The tokenizer is saved into the same folder so the directory holds both the model
# weights and the tokenization files.
model.model.save_pretrained(SAVE_PATH, safe_serialization=True)
tokenizer.save_pretrained(SAVE_PATH)

('models/qwen3-4b-grpo-merged-f32-final/tokenizer_config.json',
 'models/qwen3-4b-grpo-merged-f32-final/special_tokens_map.json',
 'models/qwen3-4b-grpo-merged-f32-final/chat_template.jinja',
 'models/qwen3-4b-grpo-merged-f32-final/vocab.json',
 'models/qwen3-4b-grpo-merged-f32-final/merges.txt',
 'models/qwen3-4b-grpo-merged-f32-final/added_tokens.json',
 'models/qwen3-4b-grpo-merged-f32-final/tokenizer.json')

In [12]:
# Second copy of the same model, written to SAVE_PATH_2 through Unsloth's "merged_16bit"
# save method (safetensors weight files, no separate adapter folder) — presumably kept
# for comparison with the float32 save above.
model.save_pretrained_merged(
    SAVE_PATH_2,
    tokenizer,
    save_method = "merged_16bit",
)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 2 files from cache to `models/qwen3-4b-grpo-merged-f16-final`: 100%|██████████| 2/2 [00:45<00:00, 22.60s/it]


Successfully copied all 2 files from cache to `models/qwen3-4b-grpo-merged-f16-final`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [00:00<00:00, 17403.75it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [00:43<00:00, 21.58s/it]


Unsloth: Merge process complete. Saved to `/content/drive/MyDrive/grpo-verified-reasoner/models/qwen3-4b-grpo-merged-f16-final`


In [14]:
# Release the current model before another one is loaded.
# Order matters: drop the Python references first, then run the garbage collector so that
# objects kept alive only by reference cycles are destroyed, and only afterwards ask
# PyTorch to return its cached, now-unused CUDA blocks. Calling empty_cache() before
# gc.collect() would leave memory freed by the collector sitting in the cache.
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

78

# Step 4: Merging at BF16 Precision (GRPO Model)

In [5]:
# Output folder for the bf16 merge. This rebinds SAVE_PATH, which an earlier section
# pointed at the f32 output folder.
SAVE_PATH = "models/qwen3-4b-grpo-merged-bf16-final"

In [None]:
# Load the GRPO adapter stored at GRPO_MODEL_PATH through Unsloth, requesting bfloat16
# weights and no 4-bit quantization so that the adapter can be merged afterwards.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = GRPO_MODEL_PATH,
    max_seq_length = 3072,
    load_in_4bit = False,    # Must be False for merging
    dtype = torch.bfloat16,
)

In [None]:
# Merge the LoRA adapter deltas into the base weights:
# W_new = W_base + delta_W, where delta_W is the low-rank product of the adapter's A and B
# matrices.
# NOTE(review): the model above is loaded with dtype=torch.bfloat16, so this merge is
# presumably carried out in bf16 rather than 32-bit — confirm before relying on it to
# preserve small (~0.002) update signals.
# NOTE(review): the return value of merge_and_unload() is discarded; the save cell relies
# on `model` having been modified in place — confirm.
model.merge_and_unload()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layerno

In [None]:
# Save the merged model with Hugging Face's save_pretrained on the wrapped model
# (`model.model`), in safetensors format, and put the tokenizer files in the same folder.
model.model.save_pretrained(SAVE_PATH, safe_serialization=True)
tokenizer.save_pretrained(SAVE_PATH)

('models/qwen3-4b-grpo-merged-bf16-final/tokenizer_config.json',
 'models/qwen3-4b-grpo-merged-bf16-final/special_tokens_map.json',
 'models/qwen3-4b-grpo-merged-bf16-final/chat_template.jinja',
 'models/qwen3-4b-grpo-merged-bf16-final/vocab.json',
 'models/qwen3-4b-grpo-merged-bf16-final/merges.txt',
 'models/qwen3-4b-grpo-merged-bf16-final/added_tokens.json',
 'models/qwen3-4b-grpo-merged-bf16-final/tokenizer.json')

# Step 5: Merging SFT Model

In [6]:
# Output folder for the merged SFT model. This rebinds MERGED_PATH, which was first set
# to the merged GRPO folder near the top of the notebook.
MERGED_PATH = "models/qwen3-4b-sft-merged"

In [None]:
# Load the SFT adapter stored at SFT_MODEL_PATH through Unsloth, requesting float16
# weights and no 4-bit quantization so that the adapter can be merged afterwards.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = SFT_MODEL_PATH,
    max_seq_length = 3072,
    load_in_4bit = False,    # Must be False for merging
    dtype = torch.float16,   # Standard 16-bit precision
)

In [8]:
# Fold the SFT LoRA update into the base weights: W_new = W_base + delta_W, where delta_W
# is the low-rank product of the adapter's A and B matrices.
# NOTE(review): the return value of merge_and_unload() is discarded; the save cell relies
# on `model` having been modified in place — confirm.
model.merge_and_unload()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layerno

In [None]:
# Write the merged SFT model to MERGED_PATH using Unsloth's "merged_16bit" save method;
# the tokenizer is passed along with the model.
model.save_pretrained_merged(
    MERGED_PATH,
    tokenizer,
    save_method = "merged_16bit",
)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 2 files from cache to `models/qwen3-4b-sft-merged`: 100%|██████████| 2/2 [00:22<00:00, 11.06s/it]


Successfully copied all 2 files from cache to `models/qwen3-4b-sft-merged`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [00:00<00:00, 16980.99it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [00:50<00:00, 25.35s/it]


Unsloth: Merge process complete. Saved to `/content/drive/MyDrive/grpo-verified-reasoner/models/qwen3-4b-sft-merged`


# Step 6: Merging at 32-Bit Precision (SFT Model)

In [15]:
# Output folders for the SFT model in this section: SAVE_PATH receives the Hugging Face
# save_pretrained output, SAVE_PATH_2 the Unsloth "merged_16bit" output.
# Both names are rebound here; earlier sections used them for the GRPO outputs.
SAVE_PATH = "models/qwen3-4b-sft-merged-f32"
SAVE_PATH_2 = "models/qwen3-4b-sft-merged-f16"

In [None]:
# Reload the SFT adapter at SFT_MODEL_PATH, this time requesting float32 weights so that
# the following merge is carried out at 32-bit precision.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = SFT_MODEL_PATH,
    max_seq_length = 3072,
    load_in_4bit = False,    # Must be False for merging
    dtype = torch.float32,   # Full 32-bit precision
)

In [None]:
# Fold the SFT LoRA update into the float32 base weights loaded in the previous cell.
# NOTE(review): the return value of merge_and_unload() is discarded; the save cells rely
# on `model` having been modified in place — confirm.
model.merge_and_unload()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layerno

In [None]:
# Save the merged SFT model with Hugging Face's save_pretrained on the wrapped model
# (`model.model`), in safetensors format, and put the tokenizer files in the same folder.
model.model.save_pretrained(SAVE_PATH, safe_serialization=True)
tokenizer.save_pretrained(SAVE_PATH)

In [None]:
# Second copy of the same SFT model, written to SAVE_PATH_2 through Unsloth's
# "merged_16bit" save method — presumably kept for comparison with the float32 save above.
model.save_pretrained_merged(
    SAVE_PATH_2,
    tokenizer,
    save_method = "merged_16bit",
)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 2 files from cache to `models/qwen3-4b-sft-merged-f16`: 100%|██████████| 2/2 [00:31<00:00, 15.60s/it]


Successfully copied all 2 files from cache to `models/qwen3-4b-sft-merged-f16`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [00:00<00:00, 13530.01it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [00:56<00:00, 28.07s/it]


Unsloth: Merge process complete. Saved to `/content/drive/MyDrive/grpo-verified-reasoner/models/qwen3-4b-sft-merged-f16`


In [20]:
# Release the current model before another one is loaded.
# Order matters: drop the Python references first, then run the garbage collector so that
# objects kept alive only by reference cycles are destroyed, and only afterwards ask
# PyTorch to return its cached, now-unused CUDA blocks. Calling empty_cache() before
# gc.collect() would leave memory freed by the collector sitting in the cache.
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

104

# Step 7: Merging Model at Checkpoint 90

In [4]:
# Point the loader at checkpoint 90 and choose the output folder for its merged model.
# WARNING: this rebinds CHECKPOINT_PATH and MERGED_PATH, which earlier cells set to
# checkpoint-188 and other output folders. Re-running an earlier cell after this one
# will silently use these values.
CHECKPOINT_PATH = "outputs/checkpoint-90"
MERGED_PATH = "models/qwen3-4b-grpo-checkpoint90-merged"

In [None]:
# Load the adapter checkpoint at CHECKPOINT_PATH (set to outputs/checkpoint-90 in the
# previous cell) through Unsloth, requesting float16 weights and no 4-bit quantization.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = CHECKPOINT_PATH,
    max_seq_length = 3072,
    load_in_4bit = False,    # Must be False for merging
    dtype = torch.float16,   # Standard 16-bit precision
)

In [None]:
# Fold the checkpoint-90 LoRA update into the base weights loaded in the previous cell.
# NOTE(review): the return value of merge_and_unload() is discarded; the save cell relies
# on `model` having been modified in place — confirm.
model.merge_and_unload()

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layerno

In [None]:
# Write the merged checkpoint-90 model to MERGED_PATH using Unsloth's "merged_16bit" save
# method; the tokenizer is passed along with the model.
model.save_pretrained_merged(
    MERGED_PATH,
    tokenizer,
    save_method = "merged_16bit",
)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 2 files from cache to `models/qwen3-4b-grpo-checkpoint90-merged`: 100%|██████████| 2/2 [00:23<00:00, 11.81s/it]


Successfully copied all 2 files from cache to `models/qwen3-4b-grpo-checkpoint90-merged`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [00:00<00:00, 3818.21it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [01:44<00:00, 52.33s/it]


Unsloth: Merge process complete. Saved to `/content/drive/MyDrive/grpo-verified-reasoner/models/qwen3-4b-grpo-checkpoint90-merged`


# Step 8: Checking Precision

In [None]:
# Map a human-readable label to one representative safetensors file per saved artifact.
# For the merged models only the first shard is listed, so any size computed from these
# paths is that of a single shard, not of the whole model.
paths_to_check = {
    "SFT Adapter (Manual Save)": "models/qwen3-4b-sft/adapter_model.safetensors",
    "SFT Adapter (Merged, fp16)": "models/qwen3-4b-sft-merged-f16/model-00001-of-00002.safetensors",
    "SFT Adapter (Merged, fp32)": "models/qwen3-4b-sft-merged-f32/model-00001-of-00004.safetensors",
    "GRPO Checkpoint (Adapter, Auto Save)": "outputs/checkpoint-188/adapter_model.safetensors",
    "GRPO Checkpoint (Adapter, save_lora method)": "models/qwen3-4b-grpo-final-2/adapter_model.safetensors",
    "GRPO Checkpoint (Merged, bf16)": "models/qwen3-4b-grpo-merged-bf16-final/model-00001-of-00002.safetensors",
    "GRPO Checkpoint (Merged, fp16)": "models/qwen3-4b-grpo-merged-f16-final/model-00001-of-00002.safetensors",
    # Label typo fixed ("Merfed" -> "Merged"); the label is printed verbatim in the report.
    "GRPO Checkpoint (Merged, fp32)": "models/qwen3-4b-grpo-merged-f32-final/model-00001-of-00004.safetensors",
}

In [None]:
# Report, for every entry of paths_to_check, the size of the file on disk and the dtype of
# the first tensor stored in it. Only that one tensor is inspected, so the "PRECISION"
# column is a spot check rather than a guarantee for the whole file.

# Table header
print(f"{'FILE PATH':<50} | {'SIZE (MB)':<10} | {'PRECISION':<10}")
print("-" * 75)

for label, path in paths_to_check.items():
    # Missing files get their own row instead of aborting the report.
    if not os.path.exists(path):
        print(f"{label:<50} | FILE NOT FOUND")
        continue

    # 1. File size in MiB (bytes / 1024**2).
    size_mb = os.path.getsize(path) / (1024 * 1024)

    # 2. Dtype of the first tensor, read through safe_open / get_slice rather than by
    #    loading the whole file with load_file.
    try:
        with safe_open(path, framework="pt", device="cpu") as f:
            first_key = next(iter(f.keys()), None)
            if first_key is None:
                # A file without tensors is not a read error; say so explicitly.
                dtype_str = "No tensors"
            else:
                dtype_str = str(f.get_slice(first_key).get_dtype())
    except Exception as e:
        # Best effort: keep the report going, but surface which kind of error occurred
        # instead of hiding it behind a generic message.
        dtype_str = f"Error: {type(e).__name__}"

    # One row per artifact: label, size and dtype.
    print(f"{label:<50} | {size_mb:<10.1f} | {dtype_str:<10}")

FILE PATH                                          | SIZE (MB)  | PRECISION 
---------------------------------------------------------------------------
SFT Adapter (Manual Save)                          | 252.1      | F32       
SFT Adapter (Merged, fp16)                         | 4737.1     | BF16      
SFT Adapter (Merged, fp32)                         | 4758.9     | F32       
GRPO Checkpoint (Adapter, Auto Save)               | 252.1      | F32       
GRPO Checkpoint (Adapter, save_lora method)        | 126.1      | BF16      
GRPO Checkpoint (Merged, bf16)                     | 4737.1     | BF16      
GRPO Checkpoint (Merged, fp16)                     | 4737.1     | BF16      
GRPO Checkpoint (Merfed, fp32)                     | 4758.9     | F32       
