In [None]:
from train_utils import *
import json
import copy 
from datasets import load_dataset
import os
from pruning import *
import gc
import shutil

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Generate all acceptable model sizes for GPT-2 (search disabled here; the parameters are loaded from file below)

# num_heads_options = [6, 8, 10, 12]
# hidden_size_options = [2.5, 3, 3.5, 4]
# embed_size_options = [1024]

# param_range = (240_000_000, 270_000_000)

# model_name = "openai-community/gpt2-medium"
# base_model, tokenizer = load_model(model_name)

# acceptable_params = find_acceptable_model_sizes(base_model, tokenizer, num_heads_options, hidden_size_options, embed_size_options, param_range)

# Load the pruning configurations straight from the JSON file instead of
# re-running the (commented-out) search above.
with open("pruning_params.json", "r") as params_file:
    acceptable_params = json.loads(params_file.read())


In [8]:
# Show the loaded configurations; a bare trailing expression is rendered as the cell output.
acceptable_params

[{'num_heads': 6,
  'hidden_size': 3.5,
  'embed_size': 1024,
  'model_size': 266684416},
 {'num_heads': 10,
  'hidden_size': 3,
  'embed_size': 1024,
  'model_size': 266690560},
 {'num_heads': 12,
  'hidden_size': 2.5,
  'embed_size': 1024,
  'model_size': 254104576}]

In [9]:
# Calibration step: load the "stas/openwebtext-10k" dataset and the unpruned
# GPT-2 medium checkpoint, then run `calibration_pass` over 128 samples in
# batches of 4.
# NOTE(review): presumably this forward pass gathers the statistics that
# `prune_model` relies on later — confirm in the `pruning` module.
# NOTE(review): `trust_remote_code=True` allows the dataset repository's
# loading script to execute locally; keep it only for trusted datasets.
model_name = "openai-community/gpt2-medium"
dataset = load_dataset("stas/openwebtext-10k", trust_remote_code=True)
base_model, tokenizer = load_model(model_name)
calibration_pass(
    model=base_model,
    tokenizer=tokenizer,
    dataset=dataset,
    batch_size=4,
    sample_size=128,
)


100%|██████████| 32/32 [00:12<00:00,  2.51it/s]


In [None]:
# Fine-tune and evaluate every pruned configuration, caching results on disk.
#
# For each entry in `acceptable_params` the loop prunes a fresh copy of
# `base_model`, fine-tunes it and measures perplexity. Metrics are written to
# JSON right after each step so an interrupted run can be resumed.

# NOTE(review): this directory is wiped after every trained configuration,
# presumably to discard trainer checkpoints — confirm where `trainer_gpt2`
# writes its output and point this at a dedicated directory. Removing the
# system-wide "/tmp" can break other processes (and libraries in this kernel)
# that keep files there.
TRAINER_SCRATCH_DIR = "/tmp"


def _load_metrics(path):
    """Return the metrics dict stored as JSON at `path`, or {} if the file is absent."""
    if not os.path.exists(path):
        return {}
    with open(path, "r") as f:
        return json.load(f)


def _save_metrics(metrics, path):
    """Write `metrics` to `path` as indented JSON, replacing any previous file."""
    with open(path, "w") as f:
        json.dump(metrics, f, indent=4)


def _free_memory():
    """Collect unreachable Python objects, then release cached CUDA blocks.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed before the CUDA cache is emptied.
    """
    gc.collect()
    torch.cuda.empty_cache()


tokenized_dataset = tokenize_dataset(tokenizer, dataset)

os.makedirs("./saved_metrics", exist_ok=True)

training_metrics_path = "./saved_metrics/training_metrics.json"
eval_metrics_path = "./saved_metrics/eval_metrics.json"

training_metrics = _load_metrics(training_metrics_path)
eval_metrics = _load_metrics(eval_metrics_path)

for param in acceptable_params:
    num_heads = param["num_heads"]
    mult_hidden = param["hidden_size"]  # multiplier applied to embed_size below
    embed_size = param["embed_size"]

    param_key = f"num_heads={num_heads}_hidden_size={mult_hidden}_embed_size={embed_size}"

    needs_eval = param_key not in eval_metrics
    # Fine-tuned weights are never reloaded by this cell, so a model can only
    # be evaluated right after it was trained in the same iteration. A missing
    # evaluation therefore forces a (re-)training run; otherwise the recorded
    # perplexity would belong to a pruned but untrained model.
    needs_training = needs_eval or param_key not in training_metrics

    if not needs_training:
        # Both results are cached: skip the expensive deepcopy and pruning.
        print(f"Skipping training for {param_key}, already exists.")
        print(f"Skipping evaluation for {param_key}, already exists.")
        continue

    # Prune a private copy so `base_model` stays intact for the next configuration.
    model = copy.deepcopy(base_model)
    prune_model(model, int(mult_hidden * embed_size), num_heads, embed_size)
    print(sum(t.numel() for t in model.parameters()))
    remove_all_forward_hooks(model)
    _free_memory()

    if param_key in training_metrics:
        # The stored training log is replaced so that both metric files
        # describe the same training run.
        print(f"Re-training model for {param_key}: evaluation is missing and no trained weights are kept between runs...")
    else:
        print(f"Training model for {param_key}...")

    trainer = trainer_gpt2(model, tokenizer, tokenized_dataset, batch_size=4, num_epochs=2, lr=5e-4)
    trainer.train()
    training_metrics[param_key] = trainer.state.log_history
    _save_metrics(training_metrics, training_metrics_path)

    # Drop the trainer before evaluating; it presumably holds the model and
    # optimizer state, which would otherwise stay reachable until the next
    # configuration overwrites the name.
    del trainer
    _free_memory()

    if needs_eval:
        print(f"Evaluating perplexity for {param_key}...")
        eval_metrics[param_key] = evaluate_perplexity(model, tokenizer, stride=1024)
        _save_metrics(eval_metrics, eval_metrics_path)
    else:
        print(f"Skipping evaluation for {param_key}, already exists.")

    # Always release the pruned model, whether or not it was evaluated.
    del model
    _free_memory()
    shutil.rmtree(TRAINER_SCRATCH_DIR, ignore_errors=True)

Training model for num_heads=8_hidden_size=3_embed_size=640...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


320189268


Step,Training Loss,Validation Loss,Model Preparation Time
30,7.6023,7.527124,0.003
60,7.0718,7.035684,0.003
90,6.7219,6.696625,0.003
120,6.6854,6.646874,0.003


Evaluating perplexity for num_heads=8_hidden_size=3_embed_size=640...


Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors
  0%|          | 0/281 [00:00<?, ?it/s]


AttributeError: 'LayerNorm' object has no attribute 'importance_buffer'