In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import nn
from torch.utils.data import DataLoader
import os
from tqdm import tqdm
# from evaluation_pruning import PPLMetric
import copy
from model_config import ModelConfig

from evaluation_pruning import global_evaluation
from data_loading import get_wikitext2


In [5]:

model_config = ModelConfig(model_name='meta-llama/Llama-3.2-1B')



# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [6]:
prompt = "Tell me a storie. Once upon a time, there was a little"

In [7]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

In [8]:
model_name = 'meta-llama/Llama-3.2-1B'
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

original_param_count = count_parameters(model)

In [9]:
max_seq_len = 250
model.eval()
with torch.no_grad():
    input_ids = tokenizer(prompt, return_tensors="pt")['input_ids'].to(device)

    generation_output = model.generate(
        input_ids=input_ids,
        do_sample=True,
        top_k=50,
        max_length=max_seq_len,
        temperature=1,
    )
        
result = tokenizer.decode(generation_output[0])
print(result)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|>Tell me a storie. Once upon a time, there was a little girl who loved to eat. Her mother used to say, “Eat a lot of fish, sweet potatoes, and oranges. They are good for your bones.” This was her secret.
One day, the little girl’s mother was not home, and the little girl was very hungry. She decided to take the little girl’s bicycle to the market.
At the market she saw lots of vegetables and fruits. The little girl was not interested in the vegetables and fruits, but her eyes went toward the fish.
The little girl said, “I see fish at the market. What kind of fish are they?”
The little girl said, “Are they good for my bones?”
She said, “Yes, they are, because they are very rich in calcium.”
The little girl said, “I would like to eat the fish, but I am not allowed to by my mother.” The little girl saw the owner of the fish. The owner of the fish looked sad and angry. “I would love to sell you some fish, but I can’t do it.”
The little girl said, “Why?”
The little girl said

In [7]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [10]:
def compute_neuron_pair_importance_product(gate_weight, up_weight):

    gate_norms = torch.norm(gate_weight, p=1, dim=1)
    up_norms = torch.norm(up_weight, p=1, dim=1)
    importance_scores = gate_norms * up_norms
    return importance_scores


def compute_neuron_pair_importance_variance(gate_weight, up_weight):
    gate_variance = torch.var(gate_weight, dim=1)
    up_variance = torch.var(up_weight, dim=1)
    importance_scores = gate_variance + up_variance
    return importance_scores


def compute_neuron_pair_importance_absolute(gate_weight, up_weight):
  gate_max_abs = torch.max(gate_weight, dim=1).values + torch.abs(torch.min(gate_weight, dim=1).values)
  up_max_abs = torch.max(up_weight, dim=1).values + torch.abs(torch.min(up_weight, dim=1).values)
  importance_scores = gate_max_abs + up_max_abs
  return importance_scores

In [11]:
def prune_neuron_pairs(mlp, prune_percent, type):
    assert type in ['product', 'variance', 'absolute']
    gate_weight = mlp.gate_proj.weight.data.float()
    up_weight = mlp.up_proj.weight.data.float()

   
    if type == 'product':
        importance_scores = compute_neuron_pair_importance_product(gate_weight, up_weight)
    elif type == 'variance':
        importance_scores = compute_neuron_pair_importance_variance(gate_weight, up_weight)
    elif type == 'absolute':
        importance_scores = compute_neuron_pair_importance_absolute(gate_weight, up_weight)

    original_intermediate_size = gate_weight.size(0)

    num_neuron_pairs_to_prune = min(int(prune_percent * original_intermediate_size), original_intermediate_size - 1)

    k = original_intermediate_size - num_neuron_pairs_to_prune

    if k <= 0:
        raise ValueError(f"Invalid number of neuron pairs to keep: {k}. Adjust the prune_percent.")

    _, indices_to_keep = torch.topk(importance_scores, k, largest=True, sorted=True)
    indices_to_keep = indices_to_keep.sort().values

    #create the new layers
    new_gate_proj = nn.Linear(mlp.gate_proj.in_features, k, bias=False).to(device)
    new_up_proj = nn.Linear(mlp.up_proj.in_features, k, bias=False).to(device)
    new_down_proj = nn.Linear(k, mlp.down_proj.out_features, bias=False).to(device)

    #copy weights to the new layers.
    new_gate_proj.weight.data = mlp.gate_proj.weight.data[indices_to_keep, :]
    new_up_proj.weight.data = mlp.up_proj.weight.data[indices_to_keep, :]
    new_down_proj.weight.data = mlp.down_proj.weight.data[:, indices_to_keep]

    #return new layers and intermediate size.
    return new_gate_proj, new_up_proj, new_down_proj, k

In [12]:
def update_model(model, prune_percent, type):
    assert type in ['product', 'variance', 'absolute']
    new_intermediate_size = None

    #loop for each model layer.
    for idx, layer in enumerate(model.model.layers):
        #Since each layer is a LlamaDecoderLayer it contains multiple components
        # Attention, MLP and Layer norms. We're targetting MLP component
        # by accesing layer.mlp.
        mlp = layer.mlp

        #Call the prune_neiron_pairs with the layers and receiving the pruned.
        new_gate_proj, new_up_proj, new_down_proj, new_size = prune_neuron_pairs(mlp, prune_percent, type)

        #Replace the Origiginal Layers with Pruned Layers.
        mlp.gate_proj = new_gate_proj
        mlp.up_proj = new_up_proj
        mlp.down_proj = new_down_proj

        #new_intermediate_size only needs to be set once
        if new_intermediate_size is None:
            new_intermediate_size = new_size

    #Update the model config file.
    model.config.intermediate_size = new_intermediate_size

    return model

In [13]:
prune_percent = 0.2  # Prune 20% of neurons

# ppl_model = PPLMetric(model, tokenizer, ['wikitext2'], device)

model_absolute = update_model(model, prune_percent, 'absolute')




In [14]:
model_config.model = model_absolute

In [15]:
pruned_param_count = count_parameters(model_config.model)
reduction_in_params = original_param_count - pruned_param_count
percentage_savings = (reduction_in_params / original_param_count) * 100

print(f"Pruned model parameters: {pruned_param_count}")
print(f"Reduction in parameters: {reduction_in_params}")
print(f"Percentage of weight savings: {percentage_savings:.2f}%")

Pruned model parameters: 1074792448
Reduction in parameters: 161021952
Percentage of weight savings: 13.03%


In [16]:
trainloader , testloader = get_wikitext2(model_config.nsamples, model_config.seed, model_config.seqlen, model_config.tokenizer)

global_evaluation(model_config, ratio='30', trainloader=trainloader, testloader=testloader, is_structured=True, device=device)


Token indices sequence length is longer than the specified maximum sequence length for this model (2458791 > 131072). Running this sequence through the model will result in indexing errors


Average Inference Time: 0.0245 seconds


AttributeError: 'LlamaForCausalLM' object has no attribute 'seqlen'

In [18]:
with torch.no_grad():
    input_ids = tokenizer(prompt, return_tensors="pt")['input_ids'].to(device)

    generation_output = model_absolute.generate(
        input_ids=input_ids,
        do_sample=True,
        top_k=50,
        max_length=max_seq_len,
        temperature=1,
    )
        
result = tokenizer.decode(generation_output[0])
print(result)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|>Tell me a storie. Once upon a time, there was a little boy and his mother who owned a chicken egg business. The child wanted to help, but didn't even know how to get into a tractor box. So he asked the mother how to open the box.
The mother told him to keep the finger under her lip and punch the door.
Oh, oh, oh!
In the end, the boy helped the mother in removing the chicken egg and filling it with egg shells.
The child returned to the truck box.
Oh, oh, ow!
The father turned and said "How did you do that?"
The child told him he simply pushed his thumb over the door.
He didn't even know how to manage a little thing, but he did get the egg out of the container.
The mother thanked him for his help.
What's wrong with this boy? You shouldn't have to work at all! Just let the kid do whatever he likes.
My mother was the chicken egg woman
When she asked me, how do you open a chicken egg box?
It's okay, my dear,
Tell me a storie, if you want to be a happy chicken!
Once I was go

In [None]:


global_evaluation(model_config, ,  ['wikitext2'], 1, 1, 1)