In [1]:
from pruning import *
from train_utils import *
import json
import copy 
from datasets import load_dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# # Generate all the acceptable model sizes for the GPT-2

# num_heads_options = [8, 10, 12]
# hidden_size_options = [2.5, 3, 3.5, 4]
# embed_size_options = [512, 640, 768]

# param_range = (115_000_000, 135_000_000)

# model_name = "openai-community/gpt2-medium"
# base_model, tokenizer = load_model(model_name)

# acceptable_params = find_acceptable_model_sizes(base_model, tokenizer, num_heads_options, hidden_size_options, embed_size_options, param_range)

# Read the precomputed pruning configurations from disk instead of
# regenerating them with the (commented-out) search above.
with open("pruning_params.json", "r") as params_file:
    acceptable_params = json.load(params_file)


In [3]:
# Calibration forward pass over a sample of OpenWebText, run on the freshly
# loaded GPT-2 medium model before any pruning call in this notebook.
# NOTE(review): trust_remote_code=True allows the dataset repository to run its
# own loading script locally; keep it only for sources you trust.
dataset = load_dataset("stas/openwebtext-10k", trust_remote_code=True)
model_name = "openai-community/gpt2-medium"
base_model, tokenizer = load_model(model_name)

# 128 samples in batches of 4 -> 32 batches.
# Presumably this records the statistics the pruning helpers rely on —
# confirm in pruning.py.
calibration_settings = {"sample_size": 128, "batch_size": 4}
calibration_pass(
    model=base_model,
    tokenizer=tokenizer,
    dataset=dataset,
    **calibration_settings,
)

100%|██████████| 32/32 [00:12<00:00,  2.48it/s]


In [4]:
import copy
import torch

# Snapshot the model before pruning so the two versions can be diffed below.
# Re-running this cell snapshots whatever base_model currently holds, so it is
# only a true "before" copy on the first execution after loading.
original_model = copy.deepcopy(base_model)

# Ask prune_mlp for a width of 4 * 1024 = 4096; base_model is modified in place.
# (Presumably this is the MLP intermediate size — confirm in pruning.py.)
# Embedding-pruning variant kept for reference:
# new_embed_dim = base_model.config.hidden_size  # Should match existing embed size
# prune_embeddings(base_model, new_embed_dim)
target_mlp_width = 4 * 1024
prune_mlp(base_model, target_mlp_width)

# Function to check if parameters have changed
def compare_models(model1, model2):
    """Print which parameters differ between two models.

    Parameters are paired positionally, in ``named_parameters()`` order. A
    line is printed for every pair whose values differ, and a warning for
    every structural difference (different parameter counts or names). The
    "No changes detected" message is printed only when the two models have
    the same parameter names, the same number of parameters, and equal values.

    Args:
        model1: Reference model exposing ``named_parameters()``.
        model2: Model to compare against ``model1``.

    Returns:
        None. All results are reported via ``print``.
    """
    params1 = list(model1.named_parameters())
    params2 = list(model2.named_parameters())
    changes_detected = False

    # zip() stops at the shorter sequence, so surplus parameters on either
    # side would otherwise go unnoticed.
    if len(params1) != len(params2):
        print(f"WARNING: Parameter count mismatch {len(params1)} != {len(params2)}")
        changes_detected = True

    for (name1, param1), (name2, param2) in zip(params1, params2):
        if name1 != name2:
            print(f"WARNING: Parameter name mismatch {name1} != {name2}")
            # Structurally different models must not be reported as unchanged.
            changes_detected = True
        elif not torch.equal(param1, param2):
            print(f"Layer {name1} has changed!")
            changes_detected = True

    if not changes_detected:
        print("No changes detected. Function is behaving as expected.")

# Compare original vs. pruned model: prints one line per parameter whose
# values differ between the pre-pruning snapshot and the pruned base_model.
compare_models(original_model, base_model)


Layer transformer.h.0.mlp.c_fc.weight has changed!
Layer transformer.h.0.mlp.c_fc.bias has changed!
Layer transformer.h.0.mlp.c_proj.weight has changed!
Layer transformer.h.0.mlp.c_proj.bias has changed!
Layer transformer.h.1.mlp.c_fc.weight has changed!
Layer transformer.h.1.mlp.c_fc.bias has changed!
Layer transformer.h.1.mlp.c_proj.weight has changed!
Layer transformer.h.1.mlp.c_proj.bias has changed!
Layer transformer.h.2.mlp.c_fc.weight has changed!
Layer transformer.h.2.mlp.c_fc.bias has changed!
Layer transformer.h.2.mlp.c_proj.weight has changed!
Layer transformer.h.2.mlp.c_proj.bias has changed!
Layer transformer.h.3.mlp.c_fc.weight has changed!
Layer transformer.h.3.mlp.c_fc.bias has changed!
Layer transformer.h.3.mlp.c_proj.weight has changed!
Layer transformer.h.3.mlp.c_proj.bias has changed!
Layer transformer.h.4.mlp.c_fc.weight has changed!
Layer transformer.h.4.mlp.c_fc.bias has changed!
Layer transformer.h.4.mlp.c_proj.weight has changed!
Layer transformer.h.4.mlp.c_pro

In [5]:
# Pick one parameter tensor and show it in the snapshot and in the pruned model.
layer_name = "transformer.h.0.mlp.c_proj.weight"

# (heading, model) pairs, printed in order: snapshot first, pruned model second.
weight_views = (
    ("🔹 Original Model Weights:", original_model),
    ("\n🔸 Modified Model Weights:", base_model),
)
for heading, inspected_model in weight_views:
    print(heading)
    print(inspected_model.state_dict()[layer_name])


🔹 Original Model Weights:
tensor([[ 0.0081,  0.0388,  0.0474,  ...,  0.0294,  0.0138,  0.0815],
        [-0.0767, -0.0549,  0.1030,  ..., -0.0155,  0.0830, -0.0381],
        [ 0.0076, -0.0200,  0.1875,  ..., -0.0320,  0.0437,  0.0430],
        ...,
        [-0.1138, -0.1455, -0.0947,  ..., -0.1650, -0.0216,  0.0559],
        [-0.0413,  0.0464,  0.1475,  ...,  0.1147, -0.0249,  0.1299],
        [-0.0036,  0.0415, -0.0116,  ..., -0.0131,  0.0422,  0.0181]],
       device='cuda:0', dtype=torch.bfloat16)

🔸 Modified Model Weights:
tensor([[ 0.0320, -0.1670,  0.0874,  ...,  0.0591,  0.0408, -0.0034],
        [ 0.0806, -0.1660,  0.0562,  ..., -0.1030,  0.2539, -0.0068],
        [-0.0923, -0.0583, -0.0036,  ..., -0.0713, -0.0206, -0.1206],
        ...,
        [-0.0320,  0.0535,  0.1631,  ..., -0.0649, -0.0089,  0.0113],
        [-0.0171,  0.0337,  0.0023,  ..., -0.0035, -0.0981,  0.0052],
        [ 0.0540,  0.0025,  0.0378,  ...,  0.1069,  0.0618,  0.0674]],
       device='cuda:0', dtype=tor

In [6]:
# Display the hidden size stored in the model config (checked after the MLP pruning cell).
base_model.config.hidden_size

1024

In [7]:
# Deliberate guard (per its message): raising here halts a top-to-bottom run
# before the training cells that follow.
raise ValueError("Stop here")

ValueError: Stop here

In [None]:
# Fine-tune and evaluate one model per pruning configuration. Metrics are
# written to JSON after every training/evaluation step, and configurations
# already present in those files are skipped, so an interrupted run can resume.
tokenized_dataset = tokenize_dataset(tokenizer, dataset)

os.makedirs("./saved_metrics", exist_ok=True)

training_metrics_path = "./saved_metrics/training_metrics.json"
eval_metrics_path = "./saved_metrics/eval_metrics.json"


def _load_metrics(path):
    """Return the metrics stored as JSON at ``path``, or ``{}`` if the file is absent."""
    if not os.path.exists(path):
        return {}
    with open(path, "r") as f:
        return json.load(f)


def _save_metrics(path, metrics):
    """Write ``metrics`` to ``path`` as indented JSON, replacing any previous file."""
    with open(path, "w") as f:
        json.dump(metrics, f, indent=4)


training_metrics = _load_metrics(training_metrics_path)
eval_metrics = _load_metrics(eval_metrics_path)

for param in acceptable_params:
    num_heads = param["num_heads"]
    mult_hidden = param["hidden_size"]
    embed_size = param["embed_size"]

    param_key = f"num_heads={num_heads}_hidden_size={mult_hidden}_embed_size={embed_size}"

    # Model fine-tuned during this iteration; stays None when training is skipped.
    trained_model = None

    if param_key in training_metrics:
        print(f"Skipping training for {param_key}, already exists.")
    else:
        print(f"Training model for {param_key}...")

        model = copy.deepcopy(base_model)

        # NOTE(review): the active call below uses a fixed size of 1024 and
        # ignores num_heads / mult_hidden / embed_size, so every configuration
        # currently trains an identically pruned model. The commented-out
        # prune_model call looks like the intended final form — confirm.
        # prune_model(model, int(mult_hidden * embed_size), num_heads, embed_size)
        # prune_mlp(model, int(mult_hidden * embed_size))
        prune_embeddings(model, 1024)
        # prune_heads(model, 16)

        remove_all_forward_hooks(model)
        torch.cuda.empty_cache()

        # `trainer` is intentionally left bound: later cells inspect it.
        trainer = trainer_gpt2(model, tokenizer, tokenized_dataset, batch_size=4, num_epochs=2)
        # trainer.evaluate()
        trainer.train()
        training_metrics[param_key] = trainer.state.log_history
        _save_metrics(training_metrics_path, training_metrics)

        # Assumes trainer_gpt2 trains `model` in place (as a Hugging Face
        # Trainer does) — confirm in train_utils.py.
        trained_model = model
        torch.cuda.empty_cache()

    if param_key in eval_metrics:
        print(f"Skipping evaluation for {param_key}, already exists.")
    elif trained_model is None:
        # Training was skipped, so no fine-tuned weights exist in memory.
        # Evaluating a copy of base_model here would store the base model's
        # perplexity under this configuration's key.
        print(f"Skipping evaluation for {param_key}, no trained model available in this run.")
    else:
        print(f"Evaluating perplexity for {param_key}...")

        # Evaluate the model that was just fine-tuned, not a fresh copy of
        # base_model. Switch off training-mode behaviour such as dropout first.
        trained_model.eval()
        eval_metrics[param_key] = evaluate_perplexity(trained_model, tokenizer, stride=1024)
        _save_metrics(eval_metrics_path, eval_metrics)

        del model
        trained_model = None
        torch.cuda.empty_cache()

Training model for num_heads=8_hidden_size=3_embed_size=640...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
2,318.9316,297.56781


KeyboardInterrupt: 

In [None]:
# Re-run training with the `trainer` object left over from the training-loop cell.
# NOTE(review): the recorded output is "ValueError: Attempting to unscale FP16
# gradients."; this usually indicates the model weights are already fp16 while
# mixed-precision gradient scaling is enabled — confirm in load_model / trainer_gpt2.
trainer.train()

ValueError: Attempting to unscale FP16 gradients.

In [None]:
# Inspect the metrics logged by the trainer so far (the recorded output is an empty list).
trainer.state.log_history

[]

In [None]:
import torch

# Demo: column selection follows the order of the index tensor, so indices
# listed in descending order flip the columns unless they are sorted first.
matrix = torch.arange(1, 10).reshape(3, 3)

# Column indices listed high-to-low on purpose.
idx = torch.tensor([2, 1])

# Raw indices: column 2 comes out first, then column 1.
unsorted_selection = matrix.index_select(1, idx)
print("Unsorted Selection:\n", unsorted_selection)

# Sorted indices: the columns keep their original left-to-right order.
sorted_idx = torch.sort(idx).values
sorted_selection = matrix.index_select(1, sorted_idx)
print("\nSorted Selection:\n", sorted_selection)


Unsorted Selection:
 tensor([[3, 2],
        [6, 5],
        [9, 8]])

Sorted Selection:
 tensor([[2, 3],
        [5, 6],
        [8, 9]])
