In [1]:
import os
# Enumerate GPUs in PCI bus order and expose only devices 1, 2 and 3.
# This cell runs before the cell that imports torch.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "1,2,3",
    }
)

In [2]:
import torch
from clm_utils import *
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# NOTE(review): CUDA_VISIBLE_DEVICES is set to "1,2,3" in the first cell, so
# index 1 here should refer to the second *visible* GPU (physical GPU 2), not
# physical GPU 1 -- confirm this is the intended card.
device = torch.device("cuda", 1)

# Regularization Experiments

# distilgpt2

In [3]:
model_base, tokenizer = get_classification_gpt2_model(pre_trained_model_name="distilgpt2", embd_pdrop=0.0)

In [6]:
lm_datasets, data_collator, encodings_for_eval_pt = get_wikitext_dataset(tokenizer, block_size=128, fraction_of_train=0)

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [7]:
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/distilgpt2/vanilla_adam_no_drop.pth", map_location=device))

<All keys matched successfully>

In [5]:
# NOTE: depends on `encodings_for_eval_pt`, which is created by the
# get_wikitext_dataset cell above. The NameError saved below apparently comes
# from a run (In [5]) made before that cell (In [6]) had executed; re-run the
# notebook in order to refresh this output.
eval_perplexity(model_base, encodings_for_eval_pt, 32, device, max_length=128)

NameError: name 'encodings_for_eval_pt' is not defined

In [12]:
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=1)

Setting up gated layer transformer.h.5.mlp


In [13]:
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/distilgpt2/LC_adam_no_drop.pth", map_location=device))

<All keys matched successfully>

In [14]:
get_layer_gates_loss(model_LC)

Layer transformer.h.5.mlp: loss:  tensor([0.0009], grad_fn=<PowBackward0>)


In [15]:
eval_perplexity(model_LC, encodings_for_eval_pt, 64, device, max_length=128)

100%|█████████▉| 4493/4495 [00:34<00:00, 130.37it/s]


23.9670467376709

In [16]:
collapse_model(model_LC)

Collapsing layer transformer.h.5.mlp


In [20]:
# NOTE(review): the third argument is 1 here but 64 in the pre-collapse run
# above. The progress-bar totals (287644 here vs 4495 there) suggest that it
# controls the step over the token sequence, so 23.54 (post-collapse) vs
# 23.97 (pre-collapse) may not be a like-for-like comparison, and this run
# took ~36 min instead of ~34 s. Consider re-running with 64 -- TODO confirm
# against the eval_perplexity implementation.
eval_perplexity(model_LC, encodings_for_eval_pt, 1, device, max_length=128)

100%|█████████▉| 287516/287644 [36:15<00:00, 132.14it/s]


23.539182662963867

In [18]:
print(f"Number of parameters in the model LC: {get_num_parameters(model_LC)}")
print(f"Number of parameters in the model base: {get_num_parameters(model_base)}")

Number of parameters in the model LC: 77780736
Number of parameters in the model base: 81912576


# GPT2

In [48]:
model_base, tokenizer = get_classification_gpt2_model(pre_trained_model_name="gpt2", embd_pdrop=0.0)

In [49]:
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2/vanilla_adam_no_drop.pth", map_location=device))

<All keys matched successfully>

In [22]:
eval_perplexity(model_base, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [01:06<00:00, 134.75it/s]


18.870046615600586

In [25]:
print(f"Number of parameters in the model base: {get_num_parameters(model_base)}")

Number of parameters in the model base: 124439808


In [26]:
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=1)

Setting up gated layer transformer.h.11.mlp


In [27]:
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2/LC_adam_no_drop.pth", map_location=device))

<All keys matched successfully>

In [28]:
get_layer_gates_loss(model_LC)

Layer transformer.h.11.mlp: loss:  tensor([0.0004], grad_fn=<PowBackward0>)


In [29]:
collapse_model(model_LC)

Collapsing layer transformer.h.11.mlp


In [30]:
eval_perplexity(model_LC, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [01:03<00:00, 140.49it/s]


19.467987060546875

In [31]:
print(f"Number of parameters in the model LC: {get_num_parameters(model_LC)}")

Number of parameters in the model LC: 120307968


# Start temp

In [4]:
model_base, tokenizer = get_classification_gpt2_model(pre_trained_model_name="gpt2", embd_pdrop=0.0)

In [5]:
# NOTE(review): this loads `vanilla_adam.pth`, whereas the GPT2 section above
# loads `vanilla_adam_no_drop.pth` -- confirm which checkpoint the pruning
# numbers in this temp section are meant to be based on.
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2/vanilla_adam.pth", map_location=device))

<All keys matched successfully>

In [7]:
eval_perplexity(model_base, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [02:28<00:00, 60.56it/s] 


18.90892219543457

In [10]:
model_pruned_base = prune_l1_model(model_base, amount=0.1)
model_pruned_base = remove_prune(model_pruned_base).to(device)

In [11]:
eval_perplexity(model_pruned_base, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [02:19<00:00, 64.45it/s] 


18.964061737060547

In [12]:
get_num_parameters(model_pruned_base, count_nonzero_only=True)

tensor(115946340, device='cuda:1')

In [14]:
# Single all-ones sequence of 128 integer token ids (batch size 1), used as
# the probe input for the MAC counts below.
dummy_input = torch.ones(1, 128, dtype=torch.long)

In [16]:
# MAC count for one (1, 128) input on the model pruned with amount=0.1.
# The saved result is identical to the unpruned model's count in the next cell.
# NOTE(review): if the model is a torch nn.Module, `.to("cpu")` moves it in
# place, so model_pruned_base presumably stays on the CPU after this cell --
# confirm before evaluating it on `device` again.
get_model_macs(model_pruned_base.to("cpu"), dummy_input)



16135421952

In [17]:
get_model_macs(model_base.to("cpu"), dummy_input)



16135421952

# End temp

## Layer 6

In [59]:
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=1, only_list=["6"])

Setting up gated layer transformer.h.6.mlp


In [60]:
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2/LC_adam_no_drop_middle.pth", map_location=device))

<All keys matched successfully>

In [61]:
collapse_model(model_LC)   

Collapsing layer transformer.h.6.mlp


In [8]:
eval_perplexity(model_LC, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [01:03<00:00, 142.05it/s]


19.09252166748047

## Layer 10

In [5]:
# NOTE(review): this variable is called `model_GP`, but it is built with
# get_LC_model_gpt2 (and later collapsed with collapse_model) exactly like the
# `model_LC` objects in the other LC sections -- the name looks like a
# leftover; confirm and consider renaming it together with the cells below.
model_GP = get_LC_model_gpt2(model_base, num_GP_layers=1, only_list=["10"])

Setting up gated layer transformer.h.10.mlp


In [6]:
model_GP.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2/LC_adam_no_drop_middle_layer10.pth", map_location=device))

<All keys matched successfully>

In [7]:
collapse_model(model_GP)

Collapsing layer transformer.h.10.mlp


In [10]:
eval_perplexity(model_GP, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [01:03<00:00, 142.41it/s]


19.230045318603516

## 2 layers

In [4]:
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=2)

Setting up gated layer transformer.h.11.mlp
Setting up gated layer transformer.h.10.mlp


In [5]:
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2/LC_adam_no_drop_2layer.pth", map_location=device))

<All keys matched successfully>

In [8]:
eval_perplexity(model_LC, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [01:04<00:00, 140.30it/s]


20.066349029541016

In [9]:
collapse_model(model_LC)

Collapsing layer transformer.h.11.mlp
Collapsing layer transformer.h.10.mlp


In [10]:
eval_perplexity(model_LC, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [01:01<00:00, 145.39it/s]


20.086347579956055

In [11]:
print(f"Number of parameters in the model LC 2 layers: {get_num_parameters(model_LC)}")

Number of parameters in the model LC 2 layers: 116176128


## With GP

### Layer 6

In [46]:
model_base, tokenizer = get_classification_gpt2_model(pre_trained_model_name="gpt2", embd_pdrop=0.0)

In [16]:
model_GP = get_GP_model_gpt2(model_base, num_GP_layers=1, only_list=["6"])

Setting up gated layer transformer.h.6


In [17]:
model_GP.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2-GP/gpt2/LC_adam_no_drop_layer6.pth", map_location=device))

<All keys matched successfully>

In [18]:
collapse_GP_model(model_GP)

Collapsing layer transformer.h.6


In [19]:
eval_perplexity(model_GP, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [01:03<00:00, 141.63it/s]


19.295211791992188

In [20]:
get_num_parameters(model_GP)

119715841

## pruning

In [8]:
import torch
from torch import nn
import torch.nn.utils.prune as prune
import torch.nn.functional as F

In [9]:
from transformers.pytorch_utils import Conv1D
def prune_l1_model(model, amount):
    """Return a deep copy of ``model`` with L1 unstructured pruning applied.

    Every ``Conv1D`` submodule of the copy has its ``weight`` pruned through
    ``prune.l1_unstructured``; the model passed in is left untouched. The
    pruning re-parametrization stays attached to the copy until it is made
    permanent with ``prune.remove``.

    Args:
        model: Module to copy and prune.
        amount: Forwarded unchanged to ``prune.l1_unstructured`` as the
            amount of weights to prune in each ``Conv1D`` layer.

    Returns:
        The pruned deep copy of ``model``.
    """
    # Imported locally so the function does not rely on `copy` leaking into
    # the namespace through the notebook's star imports.
    import copy

    pruned_model = copy.deepcopy(model)
    for module in pruned_model.modules():
        if isinstance(module, Conv1D):
            prune.l1_unstructured(module, name='weight', amount=amount)
    return pruned_model
def remove_prune(model):
    """Make pruning permanent on every ``Conv1D`` submodule of ``model``.

    Calls ``prune.remove`` on the ``weight`` of each ``Conv1D`` module found
    in ``model``; the model is modified in place.

    Returns:
        The same ``model`` object that was passed in, so calls can be chained.
    """
    conv_layers = [layer for layer in model.modules() if isinstance(layer, Conv1D)]
    for layer in conv_layers:
        prune.remove(layer, name='weight')
    return model

In [65]:
model_pruned_base = prune_l1_model(model_base, amount=0.2)
model_pruned_base = remove_prune(model_pruned_base).to(device)

In [66]:
get_num_parameters(model_pruned_base, count_nonzero_only=True)

tensor(107452884, device='cuda:1')

In [67]:
eval_perplexity(model_pruned_base, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [02:13<00:00, 67.20it/s] 


19.17877197265625

In [68]:
# NOTE(review): `model_LC` is reassigned in several sections of this notebook.
# Going by the execution counts (this cell is In [68]; the layer-6 LC cells are
# In [59]-[61]), the model pruned here is presumably the collapsed layer-6
# model rather than the one defined in the nearest section above -- confirm.
model_LC_pruned = prune_l1_model(model_LC, amount=0.2)
model_LC_pruned = remove_prune(model_LC_pruned).to(device)

In [69]:
get_num_parameters(model_LC_pruned, count_nonzero_only=True)

tensor(104146797, device='cuda:1')

In [70]:
eval_perplexity(model_LC_pruned, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [01:03<00:00, 141.94it/s]


19.40266227722168

# gpt2 large

In [85]:
model_base, tokenizer = get_classification_gpt2_model(pre_trained_model_name="gpt2-large", embd_pdrop=0.0)

In [86]:
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2-large/vanilla_adam_no_drop.pth", map_location=device))

<All keys matched successfully>

In [8]:
eval_perplexity(model_base, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [03:16<00:00, 45.71it/s]


17.6498966217041

In [6]:
get_num_parameters(model_base)

774030080

In [7]:
# NOTE(review): num_GP_layers=1 is passed, yet the saved output shows three
# layers being gated (32, 16, 11), matching `only_list`; the GP cell further
# down passes num_GP_layers=3 with the same list. Presumably `only_list` takes
# precedence over `num_GP_layers` -- confirm and make the two calls consistent.
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=1, only_list=["32", "16", "11"])

Setting up gated layer transformer.h.32.mlp
Setting up gated layer transformer.h.16.mlp
Setting up gated layer transformer.h.11.mlp


In [8]:
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2-large/LC_adam_no_drop_middle.pth", map_location=device))

<All keys matched successfully>

In [9]:
collapse_model(model_LC)

Collapsing layer transformer.h.32.mlp
Collapsing layer transformer.h.16.mlp
Collapsing layer transformer.h.11.mlp


In [10]:
eval_perplexity(model_LC, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [03:08<00:00, 47.73it/s]


15.175317764282227

In [11]:
get_num_parameters(model_LC)

739608320

## With GP

In [6]:
model_GP = get_GP_model_gpt2(model_base, num_GP_layers=3, only_list=["32", "16", "11"])

Setting up gated layer transformer.h.32
Setting up gated layer transformer.h.16
Setting up gated layer transformer.h.11


In [7]:
model_GP.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2-GP/gpt2-large/LC_adam_no_drop.pth", map_location=device))

<All keys matched successfully>

In [10]:
collapse_GP_model(model_GP)

Collapsing layer transformer.h.32
Collapsing layer transformer.h.16
Collapsing layer transformer.h.11


In [11]:
eval_perplexity(model_GP, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [06:38<00:00, 22.53it/s]


15.615042686462402

In [12]:
get_num_parameters(model_GP)

734681603

## with pruning

In [89]:
model_pruned_base = prune_l1_model(model_base, amount=0.05)
model_pruned_base = remove_prune(model_pruned_base).to(device)

In [90]:
get_num_parameters(model_pruned_base, count_nonzero_only=True)

tensor(738640640, device='cuda:1')

In [91]:
eval_perplexity(model_pruned_base, encodings_for_eval_pt, 32, device, max_length=128)

100%|█████████▉| 8985/8989 [03:17<00:00, 45.44it/s]


17.650970458984375

# temp

In [4]:
# NOTE(review): 100 is larger than any layer index in the saved output (which
# starts at transformer.h.35 and counts down), so it is presumably used to
# mean "gate every MLP layer" -- confirm how get_LC_model_gpt2 handles a value
# above the number of layers.
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=100)

Setting up gated layer transformer.h.35.mlp
Setting up gated layer transformer.h.34.mlp
Setting up gated layer transformer.h.33.mlp
Setting up gated layer transformer.h.32.mlp
Setting up gated layer transformer.h.31.mlp
Setting up gated layer transformer.h.30.mlp
Setting up gated layer transformer.h.29.mlp
Setting up gated layer transformer.h.28.mlp
Setting up gated layer transformer.h.27.mlp
Setting up gated layer transformer.h.26.mlp
Setting up gated layer transformer.h.25.mlp
Setting up gated layer transformer.h.24.mlp
Setting up gated layer transformer.h.23.mlp
Setting up gated layer transformer.h.22.mlp
Setting up gated layer transformer.h.21.mlp
Setting up gated layer transformer.h.20.mlp
Setting up gated layer transformer.h.19.mlp
Setting up gated layer transformer.h.18.mlp
Setting up gated layer transformer.h.17.mlp
Setting up gated layer transformer.h.16.mlp
Setting up gated layer transformer.h.15.mlp
Setting up gated layer transformer.h.14.mlp
Setting up gated layer transform

In [5]:
collapse_model(model_LC)

Collapsing layer transformer.h.35.mlp
Collapsing layer transformer.h.34.mlp
Collapsing layer transformer.h.33.mlp
Collapsing layer transformer.h.32.mlp
Collapsing layer transformer.h.31.mlp
Collapsing layer transformer.h.30.mlp
Collapsing layer transformer.h.29.mlp
Collapsing layer transformer.h.28.mlp
Collapsing layer transformer.h.27.mlp
Collapsing layer transformer.h.26.mlp
Collapsing layer transformer.h.25.mlp
Collapsing layer transformer.h.24.mlp
Collapsing layer transformer.h.23.mlp
Collapsing layer transformer.h.22.mlp
Collapsing layer transformer.h.21.mlp
Collapsing layer transformer.h.20.mlp
Collapsing layer transformer.h.19.mlp
Collapsing layer transformer.h.18.mlp
Collapsing layer transformer.h.17.mlp
Collapsing layer transformer.h.16.mlp
Collapsing layer transformer.h.15.mlp
Collapsing layer transformer.h.14.mlp
Collapsing layer transformer.h.13.mlp
Collapsing layer transformer.h.12.mlp
Collapsing layer transformer.h.11.mlp
Collapsing layer transformer.h.10.mlp
Collapsing l

In [6]:
get_num_parameters(model_LC)

360968960

In [7]:
# Relative parameter reduction of the fully collapsed gpt2-large model.
# Both counts are copied from get_num_parameters outputs saved in this notebook.
N_PARAMS_BASE = 774030080       # get_num_parameters(model_base), gpt2-large
N_PARAMS_COLLAPSED = 360968960  # get_num_parameters(model_LC) after collapse_model
param_reduction = (N_PARAMS_BASE - N_PARAMS_COLLAPSED) / N_PARAMS_BASE
param_reduction

0.533649958409885

In [9]:
dummy_input = torch.ones((1, 128), dtype=torch.long).to(device)

In [11]:
get_model_macs(model_LC.to(device), dummy_input)



47504752640

In [12]:
get_model_macs(model_base.to(device), dummy_input)



100447354880

In [13]:
# Relative MAC reduction of the fully collapsed gpt2-large model.
# Both counts are copied from the get_model_macs outputs saved just above.
MACS_BASE = 100447354880      # get_model_macs(model_base, dummy_input)
MACS_COLLAPSED = 47504752640  # get_model_macs(model_LC, dummy_input)
macs_reduction = (MACS_BASE - MACS_COLLAPSED) / MACS_BASE
macs_reduction

0.527068157277493