In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

In [1]:
import torch
import transformers
import numpy as np
from utils import *
from clm_utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda:2")

In [3]:
MODELS = ["distilgpt2", "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"]

In [4]:
model_base, tokenizer = get_classification_gpt2_model(pre_trained_model_name="distilgpt2")

In [7]:
lm_datasets, data_collator, encodings_for_eval_pt = get_wikitext_dataset(tokenizer, block_size=10, fraction_of_train=0.01)

Map:   0%|          | 0/18013 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (69515 > 1024). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 18013/18013 [00:03<00:00, 5099.36 examples/s]
Map: 100%|██████████| 1215587/1215587 [00:01<00:00, 683136.77 examples/s]


In [6]:
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=1)

Setting up gated layer transformer.h.5.mlp


In [7]:
eval_perplexity_with_trainer(model_base, data_collator, lm_datasets["test"])

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


562.0391891125148

In [7]:
collapse_model(model_LC)

Collapsing layer transformer.h.5.mlp


In [8]:
get_num_parameters(model_LC)

77780736

In [8]:
eval_perplexity_with_trainer(model_LC, data_collator, lm_datasets["test"])

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


49589.38216628706

In [5]:
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=1)

Setting up gated layer transformer.h.5.mlp


In [6]:
collapse_bypass_only(model_LC)

Collapsing layer transformer.h.5.mlp


In [8]:
eval_perplexity_with_trainer(model_LC, data_collator, lm_datasets["test"])

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


49589.38216628706

# testing the gpt2 large sensitivity

In [15]:
model_base, tokenizer = get_classification_gpt2_model(pre_trained_model_name="gpt2-large", embd_pdrop=0.0)

In [16]:
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2-large/vanilla_adam_no_drop.pth", map_location=device))

<All keys matched successfully>

In [17]:
eval_perplexity_with_trainer(model_base, data_collator, lm_datasets["test"])

20.960155566243113

In [18]:
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=35)

Setting up gated layer transformer.h.35.mlp
Setting up gated layer transformer.h.34.mlp
Setting up gated layer transformer.h.33.mlp
Setting up gated layer transformer.h.32.mlp
Setting up gated layer transformer.h.31.mlp
Setting up gated layer transformer.h.30.mlp
Setting up gated layer transformer.h.29.mlp
Setting up gated layer transformer.h.28.mlp
Setting up gated layer transformer.h.27.mlp
Setting up gated layer transformer.h.26.mlp
Setting up gated layer transformer.h.25.mlp
Setting up gated layer transformer.h.24.mlp
Setting up gated layer transformer.h.23.mlp
Setting up gated layer transformer.h.22.mlp
Setting up gated layer transformer.h.21.mlp
Setting up gated layer transformer.h.20.mlp
Setting up gated layer transformer.h.19.mlp
Setting up gated layer transformer.h.18.mlp
Setting up gated layer transformer.h.17.mlp
Setting up gated layer transformer.h.16.mlp
Setting up gated layer transformer.h.15.mlp
Setting up gated layer transformer.h.14.mlp
Setting up gated layer transform

In [19]:
lm_datasets, data_collator, encodings_for_eval_pt = get_wikitext_dataset(tokenizer, block_size=128, fraction_of_train=0)

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [20]:
def collapse_bypass_only(model, num_layers = 100, single_layer_id=-1):
    if single_layer_id == -1:
        for name, module in list(model.named_modules())[::-1]:
            if (isinstance(module, GPT2MLPLC)) and num_layers > 0:
                print("Collapsing layer {}".format(name))
                module.act_LC.weight.data = torch.ones_like(module.act_LC.weight.data)
                num_layers -= 1
    else:
        for name, module in list(model.named_modules())[::-1]:
            if (isinstance(module, GPT2MLPLC)) and str(single_layer_id) == name.split(".")[-2]:
                print("Collapsing layer {}".format(name))
                module.act_LC.weight.data = torch.ones_like(module.act_LC.weight.data)

In [21]:
for i in range(1,36):
    temp_model = copy.deepcopy(model_LC)
    collapse_bypass_only(temp_model, single_layer_id=str(i))
    print("collapsing layer {}".format(i) + str(eval_perplexity_with_trainer(temp_model, data_collator, lm_datasets["test"])))

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Collapsing layer transformer.h.1.mlp


collapsing layer 114266.617445617192
Collapsing layer transformer.h.2.mlp


collapsing layer 26753.542468566336
Collapsing layer transformer.h.3.mlp


collapsing layer 35759.173816557974
Collapsing layer transformer.h.4.mlp


collapsing layer 414322.127596294511
Collapsing layer transformer.h.5.mlp


collapsing layer 52728.768902380818
Collapsing layer transformer.h.6.mlp


collapsing layer 63029.95504705818
Collapsing layer transformer.h.7.mlp


collapsing layer 75922.375533390121
Collapsing layer transformer.h.8.mlp


collapsing layer 82753.071900420797
Collapsing layer transformer.h.9.mlp


collapsing layer 92125.897101566556
Collapsing layer transformer.h.10.mlp


collapsing layer 102211.5194431576865
Collapsing layer transformer.h.11.mlp


collapsing layer 112065.905006995702
Collapsing layer transformer.h.12.mlp


collapsing layer 122216.305875465615
Collapsing layer transformer.h.13.mlp


collapsing layer 134480.456600043595
Collapsing layer transformer.h.14.mlp


collapsing layer 144130.035252441012
Collapsing layer transformer.h.15.mlp


collapsing layer 1514085.326633934228
Collapsing layer transformer.h.16.mlp


collapsing layer 164431.1641565180435
Collapsing layer transformer.h.17.mlp


collapsing layer 174062.667709125684
Collapsing layer transformer.h.18.mlp


collapsing layer 189339.730003818844
Collapsing layer transformer.h.19.mlp


collapsing layer 1921360.146470175074
Collapsing layer transformer.h.20.mlp


collapsing layer 2051318.378095415304
Collapsing layer transformer.h.21.mlp


collapsing layer 211238389.8976556729
Collapsing layer transformer.h.22.mlp


collapsing layer 22795026.7937400625
Collapsing layer transformer.h.23.mlp


collapsing layer 235542842.411090464
Collapsing layer transformer.h.24.mlp


collapsing layer 2410529115.117897887
Collapsing layer transformer.h.25.mlp


collapsing layer 2517513056.74005842
Collapsing layer transformer.h.26.mlp


collapsing layer 26947112.5823798378
Collapsing layer transformer.h.27.mlp


collapsing layer 272577417.750996246
Collapsing layer transformer.h.28.mlp


collapsing layer 2813959910.031898038
Collapsing layer transformer.h.29.mlp


collapsing layer 294618017.925338021
Collapsing layer transformer.h.30.mlp


collapsing layer 30882501.2799779467
Collapsing layer transformer.h.31.mlp


collapsing layer 3189276.80870329628
Collapsing layer transformer.h.32.mlp


collapsing layer 323952.300106145225
Collapsing layer transformer.h.33.mlp


collapsing layer 333992.939910207551
Collapsing layer transformer.h.34.mlp


collapsing layer 343.832860555905695e+35
Collapsing layer transformer.h.35.mlp


collapsing layer 3510474387911.711779


# sensitivity test for gpt2

In [22]:
model_base, tokenizer = get_classification_gpt2_model(pre_trained_model_name="gpt2", embd_pdrop=0.0)

In [23]:
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/GPT2/gpt2/vanilla_adam_no_drop.pth", map_location=device))

<All keys matched successfully>

In [24]:
model_LC = get_LC_model_gpt2(model_base, num_GP_layers=12)

Setting up gated layer transformer.h.11.mlp
Setting up gated layer transformer.h.10.mlp
Setting up gated layer transformer.h.9.mlp
Setting up gated layer transformer.h.8.mlp
Setting up gated layer transformer.h.7.mlp
Setting up gated layer transformer.h.6.mlp
Setting up gated layer transformer.h.5.mlp
Setting up gated layer transformer.h.4.mlp
Setting up gated layer transformer.h.3.mlp
Setting up gated layer transformer.h.2.mlp
Setting up gated layer transformer.h.1.mlp
Setting up gated layer transformer.h.0.mlp


In [26]:
eval_perplexity_with_trainer(model_LC, data_collator, lm_datasets["test"])



302.94478695273517

In [27]:
for i in range(0,13):
    temp_model = copy.deepcopy(model_LC)
    collapse_bypass_only(temp_model, single_layer_id=str(i))
    print("collapsing layer {}".format(i) + str(eval_perplexity_with_trainer(temp_model, data_collator, lm_datasets["test"])))

Collapsing layer transformer.h.1.mlp


collapsing layer 13210.4160152565
Collapsing layer transformer.h.2.mlp


collapsing layer 23121.042249534902
Collapsing layer transformer.h.3.mlp


collapsing layer 310175.220937934318
Collapsing layer transformer.h.4.mlp


collapsing layer 48964.618584207288
Collapsing layer transformer.h.5.mlp


collapsing layer 53792.91279070221
Collapsing layer transformer.h.6.mlp


collapsing layer 65257.848682024506
Collapsing layer transformer.h.7.mlp


collapsing layer 73571.4093076164586
Collapsing layer transformer.h.8.mlp


collapsing layer 825512.408079763332
Collapsing layer transformer.h.9.mlp


collapsing layer 93580.9211779836132
Collapsing layer transformer.h.10.mlp


collapsing layer 102800.8703570191306
Collapsing layer transformer.h.11.mlp


collapsing layer 11733567.1077700461


collapsing layer 12302.94478695273517


In [29]:
i = 12
temp_model = copy.deepcopy(model_LC)
collapse_bypass_only(temp_model, single_layer_id=str(i))
print("collapsing layer {}".format(i) + str(eval_perplexity_with_trainer(temp_model, data_collator, lm_datasets["test"])))

collapsing layer 12302.94478695273517
