# Prune InkubaLM

In [1]:
from importlib.metadata import version

# Packages whose installed versions are recorded alongside the notebook output.
pkgs = [
    "numpy", "matplotlib", "torch", "transformers",
    "accelerate", "sentencepiece", "protobuf", "sacrebleu",
]
for pkg in pkgs:
    installed = version(pkg)
    print(f"{pkg} version: {installed}")

numpy version: 1.26.3
matplotlib version: 3.7.3
torch version: 2.6.0
transformers version: 4.49.0
accelerate version: 1.5.2
sentencepiece version: 0.2.0
protobuf version: 4.25.6
sacrebleu version: 2.0.0


In [None]:
from pathlib import Path
# Directory that the tokenizer folder and the saved weight files are resolved against.
basedir = Path("./")

In [None]:
# Pruning presets, keyed by the name of the model variant.
# Keys read from a preset:
#   save_model_weights_path - file the pruned weights are saved to (required)
#   custom_tokenizer_path   - optional tokenizer location (see the note below)
#   reduce_dim_ratio        - divisor applied to the model dimensions (defaults to 1)
#   num_hidden_layers       - number of layers in the pruned model (defaults to 8)
model_configs = {
    "50M": {
        "save_model_weights_path": basedir/"pruned_model_init.pth",
        # if `custom_tokenizer_path` is not specified then it will use the default tokenizer from the model
        "custom_tokenizer_path": basedir/"tokenizer",
        "reduce_dim_ratio": 2,
    },
    "100M": {
        "save_model_weights_path": basedir/"pruned_model_60k_init.pth",
        "reduce_dim_ratio": 2,
    },
    "40M": {
        "save_model_weights_path": basedir/"pruned_model_6layer_init.pth",
        "custom_tokenizer_path": basedir/"tokenizer",
        "reduce_dim_ratio": 2,
        "num_hidden_layers": 6,
    }
}

In [4]:
# Selects which preset from `model_configs` this run builds.
model_type = "50M"

In [5]:
base_model_name = "lelapa/InkubaLM-0.4B"
# Resolve the selected preset; optional keys fall back to defaults.
custom_config = model_configs[model_type]
save_path = custom_config["save_model_weights_path"]
# Without a custom tokenizer, the tokenizer is loaded from the base model.
tokenizer_path = custom_config.get("custom_tokenizer_path", base_model_name)
# A ratio of 1 leaves the dimensions unchanged.
reduce_dim_ratio = custom_config.get("reduce_dim_ratio", 1)
# 8 is the `num_hidden_layers` value shown in the base model's config.
num_hidden_layers = custom_config.get("num_hidden_layers", 8)

In [6]:
import torch
# Use CUDA when it is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
from transformers import AutoTokenizer

# `tokenizer_path` was already resolved from the selected preset (custom tokenizer if
# configured, otherwise the base model). `custom_config` comes from a dict lookup and
# is never None, so no fallback branch is needed here.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [8]:
from copy import deepcopy
from collections import OrderedDict
from transformers import AutoModelForCausalLM

# Reuse the checkpoint name from the configuration cell so that the weights and the
# default tokenizer can never come from two different checkpoints.
model_name = base_model_name
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)

# Remap the tokenizer weights. For example, if the token for "hello" is 1234 in the
# old tokenizer and 5678 in the new tokenizer, we need to remap the weights accordingly.
# The resulting model will have the same weights as the original model, the only difference
# is that weights for tokens that are not in the new tokenizer are removed from the
# embedding layers.
old_weights = model.state_dict()
old_tokenizer = AutoTokenizer.from_pretrained(model_name)
# `keep[i]` is the row of the old embedding table that belongs to new token id `i`.
vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(tokenizer.vocab_size)]
keep = [old_tokenizer.convert_tokens_to_ids(t) for t in vocab]

# Every tensor is copied here (indexing with a list of ids and `.clone()` both allocate
# new storage), so the state dict does not need to be deep-copied beforehand.
new_weights = OrderedDict()
for k, v in old_weights.items():
    if k in ("model.embed_tokens.weight", "lm_head.weight"):
        new_weights[k] = v[keep]
    else:
        new_weights[k] = v.clone()

# Rebuild the model with the reduced vocabulary size and load the remapped weights.
config = deepcopy(model.config)
config.vocab_size = tokenizer.vocab_size
model = AutoModelForCausalLM.from_config(config).to(torch.float32).to(device)
model.load_state_dict(new_weights)
model.eval();

  warn(
2025-04-09 15:37:04.586882: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-09 15:37:04.626513: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-09 15:37:04.626566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-09 15:37:04.627777: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 15:37:04.635292: I tensorflow/core/platform/cpu_feat

Size of the model with the modified tokenizer, but before any dimension or layer pruning:

In [9]:
# Report the memory footprint (bytes divided by 1024**3) and the parameter count.
print(f"memory (GB): {model.get_memory_footprint() / 1024**3:.2f}")
print(f"num params:  {model.num_parameters():,.0f}")

memory (GB): 0.76
num params:  201,885,696


Look at the number of parameters per layer:

In [10]:
# Print every parameter name (left-aligned in a 50-character column) with its shape.
for n, p in model.named_parameters():
    print(f"{n :<50}", p.shape)

model.embed_tokens.weight                          torch.Size([8064, 2048])
model.layers.0.self_attn.q_proj.weight             torch.Size([2048, 2048])
model.layers.0.self_attn.k_proj.weight             torch.Size([2048, 2048])
model.layers.0.self_attn.v_proj.weight             torch.Size([2048, 2048])
model.layers.0.self_attn.o_proj.weight             torch.Size([2048, 2048])
model.layers.0.mlp.gate_proj.weight                torch.Size([5632, 2048])
model.layers.0.mlp.down_proj.weight                torch.Size([2048, 5632])
model.layers.0.mlp.up_proj.weight                  torch.Size([5632, 2048])
model.layers.0.input_layernorm.weight              torch.Size([2048])
model.layers.0.post_attention_layernorm.weight     torch.Size([2048])
model.layers.1.self_attn.q_proj.weight             torch.Size([2048, 2048])
model.layers.1.self_attn.k_proj.weight             torch.Size([2048, 2048])
model.layers.1.self_attn.v_proj.weight             torch.Size([2048, 2048])
model.layers.1.self_attn

Update the model config by reducing dimensions and/or reducing the number of layers.

In [11]:
# Display the current configuration as the reference for the changes made below.
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "lelapa/InkubaLM-0.4B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoModelForCausalLM": "lelapa/InkubaLM-0.4B--vulavulaslm.VulavulaLlamaForCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 8,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "use_cache": true,
  "vocab_size": 8064
}

In [12]:
# Start from a copy of the current configuration and shrink the size-related fields.
config_prn = deepcopy(model.config)
for field in ("hidden_size", "intermediate_size", "max_position_embeddings"):
    # Integer-divide the reference value by the pruning ratio.
    setattr(config_prn, field, getattr(config, field) // reduce_dim_ratio)
config_prn.num_hidden_layers = num_hidden_layers
config_prn

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "lelapa/InkubaLM-0.4B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoModelForCausalLM": "lelapa/InkubaLM-0.4B--vulavulaslm.VulavulaLlamaForCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 2816,
  "max_position_embeddings": 1024,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 8,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "use_cache": true,
  "vocab_size": 8064
}

Initialize the pruned model:

In [13]:
# Build a model from the reduced configuration. Its initial weights are placeholders:
# they are replaced by weights pruned from the original model further below.
model_prn = AutoModelForCausalLM.from_config(config_prn).to(device)
model_prn.eval();
print(f"memory (GB): {model_prn.get_memory_footprint() / 1024**3:.2f}")
print(f"num params:  {model_prn.num_parameters():,.0f}")

memory (GB): 0.22
num params:  58,737,664


The size of the model can be further reduced by setting `tie_word_embeddings = True` in the config. This changes the model so that the token embeddings in the first and last layer of the model are shared. We do that in the `finetune` notebooks.
 - For the `50M` model the number of parameters is reduced by a further 8064*1024 = 8,257,536
 - For the `40M` model the number of parameters is reduced by a further 8064*1024 = 8,257,536
 - For the `100M` model the number of parameters is reduced by a further 61788*1024 = 63,270,912

Compare the pruned model to the original layer-by-layer:

In [14]:
# For each original parameter, show its shape next to the shape of the parameter with
# the same name in the pruned model (if the pruned model still has it).
for name, param in model.named_parameters():
    original_shape = str(param.shape)
    try:
        pruned_shape = model_prn.get_parameter(name).shape
    except AttributeError:
        print(f"{name :<50} {original_shape :<25} ->  Parameter not in pruned model")
    else:
        print(f"{name :<50} {original_shape :<25} ->  {pruned_shape}")

model.embed_tokens.weight                          torch.Size([8064, 2048])  ->  torch.Size([8064, 1024])
model.layers.0.self_attn.q_proj.weight             torch.Size([2048, 2048])  ->  torch.Size([1024, 1024])
model.layers.0.self_attn.k_proj.weight             torch.Size([2048, 2048])  ->  torch.Size([1024, 1024])
model.layers.0.self_attn.v_proj.weight             torch.Size([2048, 2048])  ->  torch.Size([1024, 1024])
model.layers.0.self_attn.o_proj.weight             torch.Size([2048, 2048])  ->  torch.Size([1024, 1024])
model.layers.0.mlp.gate_proj.weight                torch.Size([5632, 2048])  ->  torch.Size([2816, 1024])
model.layers.0.mlp.down_proj.weight                torch.Size([2048, 5632])  ->  torch.Size([1024, 2816])
model.layers.0.mlp.up_proj.weight                  torch.Size([5632, 2048])  ->  torch.Size([2816, 1024])
model.layers.0.input_layernorm.weight              torch.Size([2048])        ->  torch.Size([1024])
model.layers.0.post_attention_layernorm.weight     t

We need to account for shared parameters when mapping weights from the original model to the pruned version. The function below identifies all shared weights in InkubaLM.

In [15]:
def _get_shared_params(model):
    param_names = set([n for n, _ in model.named_parameters()])
    weight_names = list(model.state_dict().keys())
    shared_params = []
    shared_params_names = []
    for n1 in weight_names:
        # Check if the weight is a nn.Parameter object
        try:
            _ = model.get_parameter(n1)
        except AttributeError:
            continue
        # Check if n1 is already in the list of shared params
        for i, p in enumerate(shared_params):
            if model.get_parameter(n1).data_ptr() == p.data_ptr():
                shared_params_names[i].append(n1)
                break
        else:
            shared_params.append(model.get_parameter(n1))
            shared_params_names.append([n1])
    shared_params_names = [group for group in shared_params_names if len(group) > 1]

    # create dict of shared params
    shared_params_dict = {}
    for group in shared_params_names:
        name = set(group)&param_names
        assert len(name) == 1
        shared_params_dict[name.pop()] = group
    return shared_params_dict

In [16]:
# Maps each canonical parameter name to all names that share its data.
shared_params_names = _get_shared_params(model)
shared_params_names

{'model.layers.0.mlp.gate_proj.weight': ['model.layers.0.mlp.gate_proj.weight',
  'model.layers.1.mlp.gate_proj.weight',
  'model.layers.2.mlp.gate_proj.weight',
  'model.layers.3.mlp.gate_proj.weight',
  'model.layers.4.mlp.gate_proj.weight',
  'model.layers.5.mlp.gate_proj.weight',
  'model.layers.6.mlp.gate_proj.weight',
  'model.layers.7.mlp.gate_proj.weight'],
 'model.layers.0.mlp.down_proj.weight': ['model.layers.0.mlp.down_proj.weight',
  'model.layers.1.mlp.down_proj.weight',
  'model.layers.2.mlp.down_proj.weight',
  'model.layers.3.mlp.down_proj.weight',
  'model.layers.4.mlp.down_proj.weight',
  'model.layers.5.mlp.down_proj.weight',
  'model.layers.6.mlp.down_proj.weight',
  'model.layers.7.mlp.down_proj.weight'],
 'model.layers.0.mlp.up_proj.weight': ['model.layers.0.mlp.up_proj.weight',
  'model.layers.1.mlp.up_proj.weight',
  'model.layers.2.mlp.up_proj.weight',
  'model.layers.3.mlp.up_proj.weight',
  'model.layers.4.mlp.up_proj.weight',
  'model.layers.5.mlp.up_proj.we

In [17]:
# Invert the mapping: every name in a group (including the canonical name itself)
# points back to the canonical name of that group.
lup_shared_params = {
    alias: canonical
    for canonical, aliases in shared_params_names.items()
    for alias in aliases
}
lup_shared_params

{'model.layers.0.mlp.gate_proj.weight': 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.1.mlp.gate_proj.weight': 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.2.mlp.gate_proj.weight': 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.3.mlp.gate_proj.weight': 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.4.mlp.gate_proj.weight': 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.5.mlp.gate_proj.weight': 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.6.mlp.gate_proj.weight': 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.7.mlp.gate_proj.weight': 'model.layers.0.mlp.gate_proj.weight',
 'model.layers.0.mlp.down_proj.weight': 'model.layers.0.mlp.down_proj.weight',
 'model.layers.1.mlp.down_proj.weight': 'model.layers.0.mlp.down_proj.weight',
 'model.layers.2.mlp.down_proj.weight': 'model.layers.0.mlp.down_proj.weight',
 'model.layers.3.mlp.down_proj.weight': 'model.layers.0.mlp.down_proj.weight',
 'model.layers.4.mlp.down_proj.weight': 'model.layer

## Let's do the pruning!

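We reduce the size of a layer by essentially applying dropout to the layer. In other words, we randomly remove a fraction of the neurons in each layer. Note that `reduce_dim` below draws the entries to keep independently for every row (or column) of a weight matrix.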

In [18]:
def reduce_dim(x, dim, out_sz):
    """Randomly shrink dimension ``dim`` of ``x`` to ``out_sz`` entries.

    The entries to keep are drawn independently for every 1-D slice along ``dim``
    (for a matrix: per column when ``dim == 0``, per row when ``dim == 1``), and the
    surviving entries keep their original relative order.

    Args:
        x (torch.Tensor): Floating-point tensor to prune, of rank >= 1.
        dim (int): Dimension to shrink, in ``[0, rank of x)``.
        out_sz (int): Number of entries to keep along ``dim``. It is applied as a
            slice bound, so a value of at least ``x.shape[dim]`` keeps everything.

    Returns:
        torch.Tensor: New tensor with the dtype and device of ``x``.

    Raises:
        AssertionError: If ``dim`` is out of range.
    """
    rank = len(x.shape)
    assert 0 <= dim < rank, "`dim` must be in the range [0, rank of x)"

    # Ranking uniform noise along `dim` yields an independent random permutation of
    # the indices for every slice; the first `out_sz` ranks are the survivors.
    keep = torch.rand_like(x).argsort(dim=dim)
    head = [slice(None)] * rank
    head[dim] = slice(out_sz)
    keep = keep[tuple(head)]

    # Sorting restores the original order of the kept indices, and `gather` selects
    # them for all slices at once (any rank) instead of looping in Python.
    keep = keep.sort(dim=dim).values
    return x.gather(dim, keep)

Let's check with a toy example:

In [19]:
import torch
# Random matrix used to try `reduce_dim` along both dimensions.
v = torch.randn(1024, 1024)
# Alternative rank-1 input: v = torch.randn(1024)

In [20]:
# Shape of the toy input before any reduction.
v.shape

torch.Size([1024, 1024])

In [21]:
# Keep 256 randomly chosen entries in every row (dimension 1).
w = reduce_dim(v, 1, 256)
w.shape

torch.Size([1024, 256])

In [22]:
# Then keep 256 randomly chosen entries in every column (dimension 0) of that result.
w = reduce_dim(w, 0, 256)
w.shape

torch.Size([256, 256])

We start with InkubaLM's original weights:

In [23]:
# State dict of the model with the remapped vocabulary, before any pruning.
weights = model.state_dict()

We use the keys of the newly initialized pruned model to make sure we're only loading weights from the original model that are also in the pruned model

In [24]:
# Names of all weights that exist in the pruned model.
weights_keys_prn = set(model_prn.state_dict().keys())

In [25]:
# Fix the RNG seed so the random selection made by `reduce_dim` is repeatable.
torch.manual_seed(123)

<torch._C.Generator at 0x7f8dac119890>

We go through the model layer-by-layer. If a layer was pruned entirely, we skip it. Otherwise, we reduce the dimension of the layer by `reduce_dim_ratio`. We keep track of shared parameters so that each shared weight is pruned only once and the result is reused for all of its names.

In [26]:
weights_prn = OrderedDict()
# Pruned tensor of each shared-parameter group, keyed by the group's canonical name.
pruned_shared = {}
for k, v in weights.items():
    # Skip weights that do not exist in the pruned model.
    if k not in weights_keys_prn:
        continue

    # A shared parameter is pruned only once; every other name of the group reuses
    # that tensor so that all aliases stay identical.
    shared_name = lup_shared_params.get(k)
    if shared_name in pruned_shared:
        weights_prn[k] = pruned_shared[shared_name]
        continue

    if k in ("model.embed_tokens.weight", "lm_head.weight"):
        # The embedding tables keep every row (one per token); only the hidden
        # dimension shrinks.
        w = reduce_dim(v, 1, config_prn.hidden_size)
    else:
        w = reduce_dim(v, 0, v.shape[0] // reduce_dim_ratio)
        if len(v.shape) == 2:
            w = reduce_dim(w, 1, v.shape[1] // reduce_dim_ratio)

    # Always store under the name being visited (`k`), never under another alias,
    # so the result contains exactly the keys visited for the pruned model.
    weights_prn[k] = w
    if shared_name is not None:
        pruned_shared[shared_name] = w

model_prn.load_state_dict(weights_prn)

<All keys matched successfully>

Let's look at the output of the pruned model without fine-tuning. Since we pruned quite aggressively, we expect that we'll need some finetuning before the model will generate coherent responses.

In [27]:
inputs = tokenizer.encode("This is a test", return_tensors="pt").to(device)
# A single unpadded prompt: every position is a real token, so the mask is all ones.
# Passing it explicitly avoids relying on `generate` to infer the mask.
attention_mask = torch.ones_like(inputs)
tokenizer.decode(model_prn.generate(inputs, attention_mask=attention_mask)[0].cpu().tolist())

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'<s> This is a test kasiativeative kasiative kasi kasi kasi kasi kasi kasi kasi kasi kasi kasijidjidjidjidjid'

Save the pruned model's weights:

In [None]:
# Write the pruned state dict to the path configured for the selected preset.
torch.save(weights_prn, save_path)