In [1]:
from lw_retrain_utils import *
import json
import copy 
from datasets import load_dataset
import os
from pruning import *
import gc
import shutil

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Candidate pruned-model configurations for GPT-2.
#
# The search that produced them is kept below for reference but is commented
# out; its results are read back from pruning_params.json instead.
# presumably `hidden_size` is a multiplier applied to `embed_size` (other cells
# compute int(mult_hidden * embed_size)) — TODO confirm in `pruning`.

# num_heads_options = [6, 8, 10, 12]
# hidden_size_options = [2.5, 3, 3.5, 4]
# embed_size_options = [1024]

# param_range = (240_000_000, 270_000_000)

# model_name = "openai-community/gpt2-medium"
# base_model, tokenizer = load_model(model_name)

# acceptable_params = find_acceptable_model_sizes(base_model, tokenizer, num_heads_options, hidden_size_options, embed_size_options, param_range)

# Load the previously computed configurations directly from file.
# The path is relative, so the notebook must run from the directory holding it.
with open("pruning_params.json", "r") as f:
    acceptable_params = json.load(f)


In [3]:
# Show the loaded configurations (dicts with num_heads / hidden_size /
# embed_size / model_size keys).
acceptable_params

[{'num_heads': 6,
  'hidden_size': 3.5,
  'embed_size': 1024,
  'model_size': 266684416},
 {'num_heads': 10,
  'hidden_size': 3,
  'embed_size': 1024,
  'model_size': 266690560},
 {'num_heads': 12,
  'hidden_size': 2.5,
  'embed_size': 1024,
  'model_size': 254104576}]

In [4]:
# Load the calibration corpus and the base model, then run one calibration
# forward pass over a sample of the corpus.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to execute locally — confirm the source is trusted / pinned.
dataset = load_dataset("stas/openwebtext-10k", trust_remote_code=True)
model_name = "openai-community/gpt2-medium"
base_model, tokenizer = load_model(model_name)
# 128 samples in batches of 4 -> 32 batches, matching the progress bar below.
# presumably this records statistics on `base_model` that the pruning step
# consumes later (other cells call remove_all_forward_hooks after pruning) —
# TODO confirm in lw_retrain_utils.
calibration_pass(model=base_model,
                 tokenizer=tokenizer,
                 dataset=dataset,
                 sample_size=128,
                 batch_size=4,)


100%|██████████| 32/32 [00:10<00:00,  2.97it/s]


In [None]:
# num_heads = 16
# mult_hidden = 4096
# embed_size = 1020

# param_key = f"num_heads={num_heads}_hidden_size={mult_hidden}_embed_size={embed_size}"
# model = copy.deepcopy(base_model)
# prune_model_width(model, int(mult_hidden * embed_size), num_heads, embed_size)
# print(sum(t.numel() for t in model.parameters()))
# remove_all_forward_hooks(model)
# torch.cuda.empty_cache()
# gc.collect()

# print(evaluate_perplexity(model, tokenizer, stride=1024).item())
# del model

In [31]:
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer

# Baseline: a GPT-2 model built from a config only (no pretrained weights).
config = GPT2Config()  # No arguments: library defaults (GPT-2 small)

# Create an untrained GPT-2 model.
# NOTE(review): the device is hard-coded to 'cuda', so this cell fails on a
# machine without a GPU. It also rebinds the kernel-global `model`.
model = GPT2LMHeadModel(config).to('cuda')

# Load tokenizer (for encoding/decoding text). This rebinds `tokenizer`,
# replacing the one returned by load_model in the calibration cell.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Train the currently bound `model` and record its perplexity before and after.
# The cell reads the kernel globals `model`, `tokenizer` and `dataset`, so the
# result depends on which cells were executed last.
tokenized_dataset = tokenize_dataset(tokenizer, dataset)

# Key under which this run's metrics are stored.
param_key = "gpt_2_small_untrained"

# Alternative run (disabled): prune a copy of the base model instead of using
# the untrained baseline.
# num_heads = 10
# mult_hidden = 2.5
# embed_size = 640

# param_key = f"num_heads={num_heads}_hidden_size={mult_hidden}_embed_size={embed_size}"
# model = copy.deepcopy(base_model)
# prune_model_width(model, int(mult_hidden * embed_size), num_heads, embed_size)
# print(sum(t.numel() for t in model.parameters()))
# remove_all_forward_hooks(model)
# torch.cuda.empty_cache()
# gc.collect()


# Perplexity before training (the helper's own output labels it
# "Perplexity on Wikitext-2").
print(evaluate_perplexity(model, tokenizer, stride=1024).item())

# NOTE(review): both dicts are re-created empty on every run of this cell, so
# metrics from earlier runs in the same kernel are dropped here.
training_metrics = {}
eval_metrics = {}
    
# 2 epochs, batch size 4, learning rate 1e-3.
trainer = trainer_gpt2(model, tokenizer, tokenized_dataset, batch_size=4, num_epochs=2, lr=1e-3)
trainer.train()
# Keep the trainer's full log history for this run.
training_metrics[param_key] = trainer.state.log_history
torch.cuda.empty_cache()
gc.collect()
    
# Perplexity after training.
eval_metrics[param_key] = evaluate_perplexity(model, tokenizer, stride=1024).item()
print(eval_metrics[param_key])


# Teardown (disabled).
# NOTE(review): if re-enabled, the rmtree line deletes the whole of /tmp, not
# just this notebook's scratch files — narrow the path before using it.
# del model
# torch.cuda.empty_cache()
# gc.collect()
# shutil.rmtree("/tmp", ignore_errors=True)

Map: 100%|██████████| 9000/9000 [00:49<00:00, 180.98 examples/s]
Map: 100%|██████████| 1000/1000 [00:07<00:00, 141.80 examples/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████▉| 280/281 [00:10<00:00, 27.40it/s]


Perplexity on Wikitext-2: 54769.81
54769.8125


Step,Training Loss,Validation Loss
30,7.4871,7.468016
60,7.2515,7.226694
90,6.9022,6.910994
120,6.6835,6.660523
150,6.5053,6.540416
180,6.4214,6.454809
210,6.3792,6.393113
240,6.3444,6.363271
270,6.3266,6.354678


100%|█████████▉| 280/281 [00:04<00:00, 64.13it/s]

Perplexity on Wikitext-2: 2586.67
2586.666259765625





In [33]:
# Stash this run's log history: the metrics-loading cell further down rebinds
# `training_metrics`, which would otherwise lose it.
k = training_metrics[param_key]

In [21]:
# NOTE(review): this cell's execution count (21) is lower than its neighbours'
# (33, 34), and the value shown below differs from the later display of
# param_key — the output appears to be stale from an earlier run.
param_key

'num_heads=10_hidden_size=2.5_embed_size=640'

In [34]:
# Load the metrics accumulated by previous sessions, or start empty if the
# file does not exist yet.
# NOTE(review): this rebinds `training_metrics`, discarding whatever the
# training cell stored in memory; the current run's history survives only
# because it was copied into `k` beforehand.
training_metrics_path = "./saved_metrics/training_metrics_embedding.json"

if os.path.exists(training_metrics_path):
    with open(training_metrics_path, "r") as f:
        training_metrics = json.load(f)
else:
    training_metrics = {}

In [35]:
# Confirm the key the current run will be saved under.
param_key

'gpt_2_small_untrained'

In [36]:
# Add the stashed log history of the current run to the loaded metrics
# (overwrites any existing entry with the same key).
training_metrics[param_key] = k

In [37]:
# Persist the merged metrics.
# Serialise first: if an entry is not JSON-serialisable the error is raised
# here, before the existing metrics file has been opened (and truncated).
serialized_metrics = json.dumps(training_metrics, indent=4)

# The loading cell tolerates a missing file, so the target directory may not
# exist yet on a fresh checkout.
metrics_dir = os.path.dirname(training_metrics_path)
if metrics_dir:
    os.makedirs(metrics_dir, exist_ok=True)

with open(training_metrics_path, "w") as f:
    f.write(serialized_metrics)

In [None]:
# Debugging embedding pruning

import torch

def is_A_equal_to_B_except_one(A, B, dim=1):
    """
    Check if A is equal to B except for one row (dim=0) or one column (dim=1).

    Rather than rebuilding B once for every candidate index, the first slice
    along `dim` at which A and B disagree is located and only that candidate
    is verified. This is sufficient: if removing some earlier slice also
    produced A, the slices in between would all be identical, so removing the
    first disagreeing slice produces the same tensor.

    Parameters:
        A (torch.Tensor): The smaller matrix.
        B (torch.Tensor): The original matrix.
        dim (int): 0 to check for row removal, 1 to check for column removal.

    Returns:
        bool: True if A is B with exactly one row/column removed, False
        otherwise (including when the shapes are incompatible, the tensors
        have fewer than two dimensions, or `dim` is neither 0 nor 1).
    """
    if dim not in (0, 1):
        return False  # Unsupported axis: no match

    # A must have B's shape with exactly one slice missing along `dim`.
    if A.dim() < 2 or A.dim() != B.dim():
        return False
    expected_shape = list(B.shape)
    expected_shape[dim] -= 1
    if list(A.shape) != expected_shape:
        return False

    kept = expected_shape[dim]
    B_prefix = B[:kept] if dim == 0 else B[:, :kept]

    # One flag per slice along `dim`: does A agree with B at the same index?
    slice_matches = (A == B_prefix).movedim(dim, 0).flatten(1).all(dim=1)
    mismatches = (~slice_matches).nonzero()
    # No disagreement at all means only the last slice of B can be the extra one.
    removed = int(mismatches[0, 0]) if mismatches.numel() > 0 else kept

    # Verify the single candidate with the same comparison the exhaustive
    # search used.
    if dim == 0:
        B_removed = torch.cat((B[:removed], B[removed + 1:]), dim=0)
    else:
        B_removed = torch.cat((B[:, :removed], B[:, removed + 1:]), dim=1)
    return torch.equal(A, B_removed)

def is_a_equal_to_b_except_one(a, b):
    """
    Check if a 1D tensor `a` is equal to `b` except for one removed element.

    Tensors with more dimensions are compared along their first dimension
    (one removed row), as slicing `b[:i]` / `b[i+1:]` implies.

    The first position at which `a` and `b` disagree is located and only that
    candidate is verified, instead of rebuilding `b` once per position. If
    removing an earlier element also produced `a`, the elements in between
    would be identical, so checking the first disagreement is sufficient.

    Parameters:
        a (torch.Tensor): The smaller 1D tensor.
        b (torch.Tensor): The original 1D tensor.

    Returns:
        bool: True if `a` is `b` with exactly one element removed, False
        otherwise (including 0-dimensional or shape-incompatible inputs).
    """
    if a.dim() == 0 or a.dim() != b.dim():
        return False
    # a must have one less element than b, and identical trailing dimensions.
    if a.shape[0] != b.shape[0] - 1 or a.shape[1:] != b.shape[1:]:
        return False

    kept = a.shape[0]
    # One flag per leading index: does `a` agree with `b` at the same position?
    agree = a == b[:kept]
    if agree.dim() > 1:
        agree = agree.flatten(1).all(dim=1)
    mismatches = (~agree).nonzero()
    # No disagreement at all means only the last element of b can be the extra one.
    removed = int(mismatches[0, 0]) if mismatches.numel() > 0 else kept

    b_removed = torch.cat((b[:removed], b[removed + 1:]))  # Remove the candidate
    return torch.equal(a, b_removed)



import copy
from torch.nn.modules.normalization import LayerNorm
from transformers.pytorch_utils import Conv1D
from torch.nn import Embedding, Linear


# Build a copy of the base model whose embedding width is reduced by exactly
# one, so that compare_models (defined below) can check each layer against
# the original.
num_heads = 16
# NOTE(review): in the training cell mult_hidden is a small multiplier (2.5)
# fed to int(mult_hidden * embed_size); here it is 4096, so the size passed to
# prune_model_width is 4096 * embed_size. Confirm this is intended (e.g. meant
# as "leave the hidden width alone") rather than a units mix-up.
mult_hidden = 4096
embed_size = base_model.config.hidden_size - 1

# This rebinds the kernel globals `param_key` and `model`.
param_key = f"num_heads={num_heads}_hidden_size={mult_hidden}_embed_size={embed_size}"
model = copy.deepcopy(base_model)  # Prune a copy; base_model stays intact for the comparison
prune_model_width(model, int(mult_hidden * embed_size), num_heads, embed_size)
# Total parameter count after pruning.
print(sum(t.numel() for t in model.parameters()))
remove_all_forward_hooks(model)
torch.cuda.empty_cache()
gc.collect()

def compare_models(model, base_model):
    """
    Print every layer of `model` that differs from `base_model` in an
    unexpected way.

    Layers are matched by their `named_modules()` name. Nothing is returned;
    findings are reported with `print`:
      * a layer name missing from `base_model` is reported as new;
      * a Conv1D / Embedding / Linear weight is reported unless it equals the
        base weight with exactly one column or exactly one row removed;
      * a bias is reported unless it equals the base bias, either as-is or
        with exactly one element removed;
      * a LayerNorm is reported unless both its weight and its bias equal the
        base ones with exactly one element removed.
    """
    reference_modules = dict(base_model.named_modules())  # name -> module of the base model

    for layer_name, pruned_module in model.named_modules():
        # Layers without a counterpart in the base model cannot be compared.
        if layer_name not in reference_modules:
            print(f"Layer {layer_name} is new!")
            continue

        reference_module = reference_modules[layer_name]

        # Weight matrices of the projection / embedding layers.
        has_comparable_weight = (
            hasattr(pruned_module, "weight")
            and hasattr(reference_module, "weight")
            and isinstance(pruned_module, (Conv1D, Embedding, Linear))
        )
        if has_comparable_weight:
            column_removed = is_A_equal_to_B_except_one(
                pruned_module.weight, reference_module.weight, dim=1
            )
            if not (
                column_removed
                or is_a_equal_to_b_except_one(pruned_module.weight, reference_module.weight)
            ):
                print(f"Layer {layer_name} weight has changed!")

        # Biases: unchanged, or exactly one element removed.
        pruned_bias = getattr(pruned_module, "bias", None)
        reference_bias = getattr(reference_module, "bias", None)
        if pruned_bias is not None and reference_bias is not None:
            if not (
                is_a_equal_to_b_except_one(pruned_bias, reference_bias)
                or torch.equal(pruned_bias, reference_bias)
            ):
                print(f"Layer {layer_name} bias has changed!")

        # LayerNorm: both weight and bias must have lost exactly one element.
        if isinstance(pruned_module, LayerNorm):
            weight_pruned_by_one = is_a_equal_to_b_except_one(
                pruned_module.weight, reference_module.weight
            )
            if not (
                weight_pruned_by_one
                and is_a_equal_to_b_except_one(pruned_module.bias, reference_module.bias)
            ):
                print(f"Layer {layer_name} (LayerNorm) has changed!")


354476829


In [48]:
# Compare the pruned copy with the base model; compare_models prints one line
# per layer it considers new or unexpectedly changed.
compare_models(model, base_model)