In [1]:
import numpy as np
import torch

# Load the training checkpoint onto the CPU so no GPU is needed.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints from a trusted source; if this file holds
# nothing but tensors and plain containers, consider weights_only=True.
checkpoint = torch.load("checkpoints/model.pt", map_location="cpu")

# Inspect the top-level keys; the model weights are read from 'model' below.
print(checkpoint.keys())

# Convert every model tensor to a NumPy array, keyed by its PyTorch name.
# detach() guards against tensors that still require grad, for which
# Tensor.numpy() raises a RuntimeError; cpu() is kept as a safety net.
params_np = {
    name: tensor.detach().cpu().numpy()
    for name, tensor in checkpoint["model"].items()
}

# params_np is now a dict mapping parameter name -> NumPy array


dict_keys(['model', 'optimizer', 'scheduler', 'scaler'])


In [2]:
# Display the converted parameters (dict of PyTorch name -> NumPy array).
params_np

{'pos_emb': array([[ 0.01765298,  0.1509603 , -0.02300312, ..., -0.02270602,
          0.02222843,  0.01217857],
        [ 0.05000131,  0.03053647, -0.03375666, ..., -0.03249162,
          0.0179548 ,  0.00756455],
        [ 0.05904716, -0.0301312 , -0.03745729, ..., -0.01414995,
          0.01011304, -0.04104885],
        ...,
        [-0.02065871, -0.07007099,  0.0541271 , ...,  0.02158256,
         -0.01228334,  0.01542529],
        [ 0.00090509, -0.04632602,  0.04650491, ...,  0.02214322,
         -0.02901969, -0.0014961 ],
        [-0.01636651, -0.01464839,  0.03936961, ..., -0.01202546,
         -0.00017766,  0.040844  ]], shape=(512, 1024), dtype=float32),
 'token_emb.weight': array([[ 0.01789019,  0.01250056,  0.02061453, ..., -0.02228284,
          0.01846989, -0.0053018 ],
        [-0.01292583, -0.07405201, -0.06376181, ...,  0.00762339,
          0.02328694, -0.01242082],
        [ 0.01697432, -0.00109404,  0.00110288, ..., -0.01558423,
         -0.00830132,  0.01966907],
  

In [None]:
# Count the scalar weights stored under checkpoint['model'], skipping every
# entry whose name contains "lora".
total = sum(
    tensor.numel()
    for name, tensor in checkpoint['model'].items()
    if "lora" not in name
)
total


In [1]:
import pathlib
import sys

# Resolve the projects directory and put it at the end of the import search path.
projects_root = pathlib.Path(r"c:\ml-projects").resolve()
sys.path.append(str(projects_root))

In [3]:
import sys
import os

# Put the dlx checkout at the front of the import search path.
sys.path.insert(0, r'c:\ml-projects\dlx')

# Force the NumPy backend *before* importing anything else from dlx, so that
# modules imported below see the override.
# NOTE(review): this matters if dlx modules read `backend.xp` at import time;
# confirm against dlx.nn.optim / dlx.nn.tensor.
import numpy as np
import dlx.utils.backend as backend_module
backend_module.xp = np

# Now import the pieces that sit on top of the backend.
from dlx.nn.optim import AdamW
from dlx.nn.tensor import Tensor
from training.model import Model

CUDA not available
CUDA not available


In [4]:
# Architecture hyperparameters.
VOCAB_SIZE = 51682   # rows of the token embedding table
MAX_SEQ_LEN = 512    # rows of the positional embedding table
D_MODEL = 1024
N_HEADS = 16
DEPTH = 12
PAD_IDX = 0

# Gather the constructor arguments in one place, then build the model.
model_kwargs = dict(
    # architecture
    vocab_size=VOCAB_SIZE,
    d_model=D_MODEL,
    max_seq_len=MAX_SEQ_LEN,
    pad_idx=PAD_IDX,
    n_heads=N_HEADS,
    transformer_depth=DEPTH,
    # training / bookkeeping
    checkpoint_interval_seconds=3600,
    train_dir="data/train",
    validation_dir="data/validation",
    checkpoint_dir="checkpoints",
    epochs=1,
    mini_batch_per_step=8,
)
model = Model(**model_kwargs)

In [5]:
# Display the parameter count exposed by the model.
model.num_parameters

257575394

In [6]:
# Build an AdamW optimizer over the model's parameters with a
# (float32, float32) precision pair, then restore its state from disk.
fp32_pair = (np.float32,) * 2
optim = AdamW(model.parameters(), precision=fp32_pair)
optim.load_state("checkpoints/optimizer.pt")

In [None]:
# Bare integer literal: evaluating this cell only echoes the number. It is the
# same value that model.num_parameters displays.
257575394

In [None]:
# Display the converted checkpoint parameters (PyTorch name -> NumPy array).
params_np

In [50]:
# Display the model's parameters (library parameter name -> Tensor).
model.parameters()

{'0_1_embedding_1_embed': Tensor(data=[[ 0.07019392 -0.2832381  -1.1384004  ... -1.1295269  -0.20130764
   -0.14001897]
  [-0.33148566  0.384914   -0.35404798 ... -0.87128067  1.2180834
   -0.08382234]
  [-0.55190337  1.1672734   0.35614648 ...  0.5316783  -0.17524076
    1.9957914 ]
  ...
  [ 0.17484015 -0.08599275  0.3767216  ...  0.12994485  0.55956554
   -2.006995  ]
  [-2.0693846  -0.69263977 -1.4579229  ...  0.47035637  1.1721647
   -1.0219879 ]
  [ 1.068521    0.5603047   0.8409303  ... -0.876948   -0.82341367
   -2.0573068 ]], shape=(51682, 1024), dtype=float32),
 '0_1_embedding_1_pe': Tensor(data=[[-1.0783703  -1.6108844  -0.95125586 ...  0.04364998 -1.5458739
   -0.37326247]
  [ 1.5877779  -0.59582114 -0.10530231 ... -0.8420451   0.48114806
    0.6374782 ]
  [ 0.22568625 -0.5068762   0.38410732 ...  1.3213568   0.33657807
   -1.0035617 ]
  ...
  [ 1.4168954   1.5867693   1.1641514  ... -1.1059155  -0.15548274
   -1.4258014 ]
  [ 1.5364544   1.1958843  -0.19473498 ...  0.86139

In [None]:
# Build the lookup table: PyTorch checkpoint name -> library parameter name.
num_blocks = 12

# (PyTorch suffix, library suffix) for every parameter inside one transformer
# block, listed in the order they are inserted into the table.
block_suffix_pairs = (
    # LayerNorm
    ("ln1.weight", "layernorm_1_gamma"),
    ("ln1.bias", "layernorm_1_beta"),
    ("ln2.weight", "layernorm_2_gamma"),
    ("ln2.bias", "layernorm_2_beta"),
    # Attention
    ("attn.qkv.weight", "linear_1_qkv_weight"),
    ("attn.qkv.bias", "linear_1_qkv_bias"),
    ("attn.out_proj.weight", "linear_2_o_weight"),
    ("attn.out_proj.bias", "linear_2_o_bias"),
    # MLP / feed-forward
    ("mlp.0.weight", "linear_3_proj_up_weight"),
    ("mlp.0.bias", "linear_3_proj_up_bias"),
    ("mlp.2.weight", "linear_4_proj_down_weight"),
    ("mlp.2.bias", "linear_4_proj_down_bias"),
)

# Entries that live outside the transformer blocks: embeddings and LM head.
pt_to_my_lib = {
    "token_emb.weight": "0_1_embedding_1_embed",
    "pos_emb": "0_1_embedding_1_pe",
    "lm_head.weight": "linear_1_linear_1_project_weight",
    "lm_head.bias": "linear_1_linear_1_project_bias",
}

# PyTorch blocks are numbered from 0, library blocks from 1.
for lib_idx in range(1, num_blocks + 1):
    pt_idx = lib_idx - 1
    pt_to_my_lib.update(
        (f"blocks.{pt_idx}.{pt_suffix}", f"transformer_{lib_idx}_{lib_suffix}")
        for pt_suffix, lib_suffix in block_suffix_pairs
    )

# pt_to_my_lib now maps the embeddings, the LM head and all 12 blocks.


In [36]:
# Keep a handle on the mapping returned by model.parameters(); a later cell
# indexes it by library parameter name and overwrites its entries.
# NOTE(review): whether this is the model's own storage or a copy is not
# visible here; confirm that assignments into `d` actually reach the model.
d = model.parameters()

In [52]:
# Copy the converted PyTorch weights (`params_np`) into the library parameter
# mapping `d`, using `pt_to_my_lib` to translate names.
#
# Phase 1 prepares and shape-checks every array without touching `d`, so a
# mismatch raises ValueError before any entry has been replaced.
converted = {}
for pt_k, lib_k in pt_to_my_lib.items():
    param = params_np[pt_k]

    # Transpose only real 2-D linear weights; embedding tables are kept as-is.
    # The PyTorch keys name LayerNorm "ln1"/"ln2" (never "layernorm"), so
    # LayerNorm weights are excluded by the ndim test: they are 1-D, and
    # transposing a 1-D array is a no-op anyway.
    # NOTE(review): presumably the library stores linear weights as the
    # transpose of the PyTorch layout; the shape check below enforces this.
    is_linear_weight = "weight" in pt_k and "emb" not in pt_k and param.ndim == 2
    if is_linear_weight:
        param = param.T

    if d[lib_k].shape != param.shape:
        raise ValueError(f"Shape mismatch for {lib_k}: {d[lib_k].shape} != {param.shape}")

    converted[lib_k] = param

# Phase 2: every shape matched, so replace the entries in `d`.
for lib_k, param in converted.items():
    d[lib_k] = Tensor(param, requires_grad=False)


In [53]:
# Display the PyTorch-name -> library-name mapping for inspection.
pt_to_my_lib

{'token_emb.weight': '0_1_embedding_1_embed',
 'pos_emb': '0_1_embedding_1_pe',
 'lm_head.weight': 'linear_1_linear_1_project_weight',
 'lm_head.bias': 'linear_1_linear_1_project_bias',
 'blocks.0.ln1.weight': 'transformer_1_layernorm_1_gamma',
 'blocks.0.ln1.bias': 'transformer_1_layernorm_1_beta',
 'blocks.0.ln2.weight': 'transformer_1_layernorm_2_gamma',
 'blocks.0.ln2.bias': 'transformer_1_layernorm_2_beta',
 'blocks.0.attn.qkv.weight': 'transformer_1_linear_1_qkv_weight',
 'blocks.0.attn.qkv.bias': 'transformer_1_linear_1_qkv_bias',
 'blocks.0.attn.out_proj.weight': 'transformer_1_linear_2_o_weight',
 'blocks.0.attn.out_proj.bias': 'transformer_1_linear_2_o_bias',
 'blocks.0.mlp.0.weight': 'transformer_1_linear_3_proj_up_weight',
 'blocks.0.mlp.0.bias': 'transformer_1_linear_3_proj_up_bias',
 'blocks.0.mlp.2.weight': 'transformer_1_linear_4_proj_down_weight',
 'blocks.0.mlp.2.bias': 'transformer_1_linear_4_proj_down_bias',
 'blocks.1.ln1.weight': 'transformer_2_layernorm_1_gamma',

In [54]:
# Create a new AdamW optimizer over `d`, the parameter mapping whose entries
# were replaced with the converted checkpoint weights, using a
# (float32, float32) precision pair.
optimizer = AdamW(d, precision=(np.float32, np.float32))

In [55]:
# Persist the new optimizer's state to disk.
# NOTE(review): this is the same path that `optim.load_state(...)` reads in an
# earlier cell, so running this cell overwrites that file. Confirm this is
# intended.
optimizer.save_state("checkpoints/optimizer.pt")