## This script tests whether the adapter works well, then saves the parameters and prepares them to be imported into Unity.
Note: I use this script because saving and loading with Unsloth is a little strange. If the model was loaded in 4-bit, you must load it correctly, dequantize it, and only then save.
   Sometimes I got errors when trying to load from a save. SO JUST USE THIS SCRIPT AND THAT'S ALL.

In [None]:
from unsloth import FastLanguageModel
import torch

# Smoke test for the fine-tuned adapter: load the base model, attach the
# adapter checkpoint, merge it into the base weights and generate one reply.
max_seq_len = 2048
dtype = torch.bfloat16  # or float32 if you want full precision
load_in_4bit = True   # or False

# The base checkpoint must be the same one the adapter was trained on.
# NOTE(review): Unsloth documents this keyword as `dtype`; confirm that
# `torch_dtype` is actually honoured by the installed version.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-3-270m-it-unsloth-bnb-4bit",
    max_seq_length = max_seq_len,
    torch_dtype    = dtype,
    load_in_4bit   = load_in_4bit,
)

from peft import PeftModel

# Wrap the base model with the adapter weights stored in the training checkpoint.
model = PeftModel.from_pretrained(
    model,
    "../models/gemma-3-270m-it_2912_v6/checkpoint-155",
)

messages = [
    {
        "role": "system",
        # "\u2011" is a non-breaking hyphen. The prompt previously contained
        # the mojibake "â€‘" (U+2011 encoded as UTF-8 and decoded as cp1252).
        # It is written as an escape so it survives future re-encodings.
        # NOTE(review): keep this identical to the system prompt used during
        # fine-tuning.
        "content": "You are Kira Solara, a hardened scavenger and former solar architect from the sci\u2011fi RPG 'Echoes of the Void'."
    },
    {
        "role": "user",
        "content": "Who are you?"
    }
]

# Render the chat with the tokenizer's template and place the ids on the
# model's device before the model object is replaced by the merged one.
tokenized = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
input_ids = tokenized.to(model.device)

# Fold the adapter into the base weights so later cells see plain modules
# (no PEFT wrapper) when they walk the model to export parameters.
model = model.merge_and_unload()
FastLanguageModel.for_inference(model)

out = model.generate(input_ids=input_ids, max_new_tokens=128)
print(tokenizer.decode(out[0], skip_special_tokens=True))


In [None]:
# Dequantize the model (it was loaded with load_in_4bit=True) so that the
# export cells below write real floating-point weights.
# NOTE(review): the return value is discarded, so this relies on dequantize()
# converting `model` in place — confirm for the installed transformers version.
model.dequantize()
# Optional: also save the merged, dequantized model as a regular checkpoint.
# model.save_pretrained("../models/gemma-3-270m-ft", tokenizer)

In [None]:
import math
import os
import torch
import numpy as np
from tqdm import tqdm
# Export the model's final norm weight and its LM head as raw float32
# binaries under ./params (no header: just the flattened values).
os.makedirs("params", exist_ok=True)

norm = model._modules["model"].norm

# Cast to float32 in torch *before* .numpy(): NumPy has no bfloat16 dtype, so
# calling .numpy() directly on a bf16 tensor raises a TypeError. This also
# matches how every other tensor in this notebook is exported.
with open("params/norm.bin", "wb") as f:
    f.write(norm.weight.detach().cpu().float().numpy().tobytes())


lm_head = model._modules["lm_head"].weight.detach()
lm_head_flat = lm_head.flatten()

os.makedirs("params/lm_head", exist_ok=True)

# The flattened LM head is split across several part_<i>.bin files.
# torch.chunk uses ceil-sized pieces, so the last part may be shorter than the
# others (and fewer than `num_chunks` parts can be produced); the reader must
# concatenate the parts in index order.
num_chunks = 14
chunks = torch.chunk(lm_head_flat, num_chunks)

for idx, chunk in enumerate(chunks):
    # Move one chunk at a time to the CPU and convert it to a float32 array.
    np_chunk = chunk.cpu().float().numpy()

    # Write raw binary
    with open(f"params/lm_head/part_{idx}.bin", "wb") as f:
        f.write(np_chunk.tobytes())

    print(f"Saved chunk {idx} with {np_chunk.size} weights")

In [None]:
import os
import numpy as np
from tqdm import tqdm
# Export every decoder layer's weights as raw float32 binaries, one directory
# per layer: params/layer_<idx>/<name>.bin.
#
# Each entry is the dotted path of a sub-module (relative to the decoder layer)
# whose `.weight` is exported. The output file name is the path with "." turned
# into "_", e.g. "self_attn.q_proj" -> "self_attn_q_proj.bin".
_LAYER_TENSOR_PATHS = (
    # Attention projections and their q/k norms
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "self_attn.q_norm",
    "self_attn.k_norm",
    # MLP
    "mlp.gate_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    # Norms around the attention and feed-forward sub-blocks
    "input_layernorm",
    "post_attention_layernorm",
    "pre_feedforward_layernorm",
    "post_feedforward_layernorm",
)

# Wrap the layer list itself (not the enumerate object) so tqdm can read its
# length and show a total / ETA.
for idx, layer in enumerate(tqdm(model._modules["model"].layers)):
    layer_dir = f"params/layer_{idx}"
    os.makedirs(layer_dir, exist_ok=True)

    for attr_path in _LAYER_TENSOR_PATHS:
        weight = layer.get_submodule(attr_path).weight.detach()

        # A non-floating-point weight means the layer is still stored in its
        # packed quantized form; casting that to float would silently write
        # garbage, so fail loudly instead.
        if not weight.is_floating_point():
            raise RuntimeError(
                f"layer {idx}: {attr_path}.weight has dtype {weight.dtype}; "
                "dequantize the model before exporting"
            )

        file_stem = attr_path.replace(".", "_")
        with open(f"{layer_dir}/{file_stem}.bin", "wb") as f:
            f.write(weight.cpu().float().flatten().numpy().tobytes())