## Setup: imports and model loading

In [1]:
from model_config import ModelConfig
from pruning_methods.wanda import wanda_pruning
from pruning_methods.magnitude import magnitude_pruning
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
from dotenv import load_dotenv
from evaluation_pruning import generate_text


# Load variables from a local .env file into the process environment.
load_dotenv()

# Hugging Face access token; os.getenv returns None when the variable is unset.
token = os.getenv("HUGGINGFACE_TOKEN")

# NOTE(review): llama_model is not passed to ModelConfig below and is not used
# in any other visible cell; the recorded output of this cell reports
# 'facebook/opt-350m' being loaded instead — confirm which model is intended.
llama_model = "meta-llama/Llama-3.2-1B"
modelConfig = ModelConfig(token=token)
model = modelConfig.load_llm()

Loading model 'facebook/opt-350m' from cache directory '.cache/llm_weights/'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Magnitude Pruning

In [6]:
def count_parameters(model):
    """
    Tally the non-zero entries across every parameter tensor of a model.

    Args:
        model (torch.nn.Module): Model whose parameters are inspected.

    Returns:
        tuple: ``(total, trainable)`` where ``total`` is the number of
        non-zero elements over all parameters and ``trainable`` restricts
        that count to parameters with ``requires_grad`` set.
    """
    # Pair each tensor's non-zero count with its trainable flag in one pass.
    per_tensor = [
        (torch.count_nonzero(tensor).item(), tensor.requires_grad)
        for tensor in model.parameters()
    ]

    total = sum(nonzero for nonzero, _ in per_tensor)
    trainable = sum(nonzero for nonzero, is_trainable in per_tensor if is_trainable)
    return total, trainable


In [7]:
# Reference (unpruned) model held by the config object.
# NOTE(review): presumably the same object returned by load_llm() earlier — confirm.
original_model = modelConfig.model

# Prune a separate object obtained from copy_model() rather than the reference
# (assumes copy_model() returns an independent copy — TODO confirm).
prunned_model = modelConfig.copy_model()

# Run magnitude pruning with 0.5 as the second argument (presumably the target
# sparsity ratio — verify against pruning_methods.magnitude). The return value
# is stored but not used in any visible cell.
pruning_result = magnitude_pruning(prunned_model, 0.5)

# count_parameters returns (total, trainable) counts of NON-ZERO elements, so
# the figures printed here are non-zero counts, not raw parameter counts.
print(f"number of parameters in original model: {count_parameters(original_model)}")
print(f"number of parameters in prunned model: {count_parameters(prunned_model)}")

number of parameters in original model: (331195120, 331195120)
number of parameters in prunned model: (166761506, 166761506)


In [8]:
from evaluation_pruning import generate_text

# Quick qualitative check: print text generated by the pruned model for a short prompt.
# NOTE(review): the final argument (50) is presumably a generation length limit —
# confirm against evaluation_pruning.generate_text.
print(generate_text(prunned_model, modelConfig.tokenizer, "the cat is", 50))

the cat is a big, and he is in the, he, the one, is the only, no one. no no, but, just no he.


In [9]:
from evaluation_pruning import global_evaluation

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Put both models on the selected device before evaluating them.
original_model.to(device)
prunned_model.to(device)

# NOTE(review): the recorded run of this cell printed both perplexities and then
# raised AttributeError: 'OPTForCausalLM' object has no attribute 'tokenizer'.
# That suggests a later step inside global_evaluation receives a model where it
# expects an object exposing `.tokenizer` (e.g. the ModelConfig) — check the
# argument order/signature in evaluation_pruning before re-running.
global_evaluation(modelConfig, original_model, prunned_model, modelConfig.tokenizer, device=device)

Wikitext Perplexity: 100%|██████████| 20/20 [01:26<00:00,  4.34s/it]
Wikitext Perplexity: 100%|██████████| 20/20 [01:26<00:00,  4.34s/it]


Original Model Perplexity:  23.599618911743164
Pruned Model Perplexity:  1771.341064453125


AttributeError: 'OPTForCausalLM' object has no attribute 'tokenizer'

## Wanda Pruning

In [None]:
# TODO: Wanda pruning has not been run in this notebook yet; enable the call below.
# wanda_pruning(modelConfig)