In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
model, tokenizer = get_classification_bert_model("gchhablani/bert-base-cased-finetuned-qnli")
model.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-qnli/orig_qnli.pth"))
model_LC = get_LC_model_bert(model, num_GP_layers=1)

Setting up gated layer bert.encoder.layer.11


In [30]:
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('qnli', tokenizer)

In [31]:
standard_evaluate(model, dataset["validation"], "qnli")



{'eval_loss': 0.2749437689781189,
 'eval_accuracy': 0.9099395936298736,
 'eval_runtime': 20.7598,
 'eval_samples_per_second': 263.153,
 'eval_steps_per_second': 8.237}

In [5]:
get_layer_gates_loss(model_LC)

Layer bert.encoder.layer.11: loss:  tensor([0.9801], grad_fn=<PowBackward0>)


In [9]:
train_with_trainer(model_LC, dataset["train"], dataset["validation"], epochs=0.3,
                    eval_batch_size=8, train_batch_size=8, gp_weight=5, learning_rate=0.0001, task="sst2", lr_decay=0.99, weight_decay=0, use_sgd=False)

Step,Training Loss
10,29.1046
20,28.8169
30,28.0649
40,27.6302
50,27.1051
60,26.4942
70,26.1052
80,25.3684
90,25.0743
100,24.4711


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [10]:
get_layer_gates_loss(model_LC)

Layer bert.encoder.layer.11: loss:  tensor([0.0943], device='cuda:0', grad_fn=<PowBackward0>)


In [7]:
collapse_model(model_LC)

Collapsing layer bert.encoder.layer.11


In [11]:
standard_evaluate(model_LC, dataset["validation"], "sst2")

{'eval_loss': 0.36678770184516907,
 'eval_accuracy': 0.8864678899082569,
 'eval_runtime': 3.3662,
 'eval_samples_per_second': 259.045,
 'eval_steps_per_second': 8.318}

In [22]:
get_num_parameters(model_LC)

105351938

In [23]:
get_num_parameters(model)

109483778

In [24]:
d = model_LC.state_dict()

In [28]:
for k in d.keys():
    if "act" in k:
        print(k)

In [27]:
# remove key "bert.encoder.layer.11.intermediate.act.weight" from model_LC.state_dict()
d.pop("bert.encoder.layer.11.intermediate.act.weight")

tensor([0.6930], device='cuda:0')

# mac limit

In [2]:
huggingface_models = {"base": "bert-base-uncased",
                        "large": "bert-large-uncased",
                      "base-ft-sst2": "yoshitomo-matsubara/bert-base-uncased-sst2",
                        "large-ft-sst2": "yoshitomo-matsubara/bert-large-uncased-sst2",
                      "base-ft-stsb": "gchhablani/bert-base-cased-finetuned-stsb",
                        "large-ft-stsb": "yoshitomo-matsubara/bert-large-uncased-stsb",
                      "base-ft-mrpc": "textattack/bert-base-uncased-MRPC",
                        "large-ft-mrpc": "yoshitomo-matsubara/bert-large-uncased-mrpc",
                      "base-ft-cola": "yoshitomo-matsubara/bert-base-uncased-cola",
                        "large-ft-cola": "yoshitomo-matsubara/bert-large-uncased-cola",
                      "base-ft-qnli": "gchhablani/bert-base-cased-finetuned-qnli",
                        "large-ft-qnli": "yoshitomo-matsubara/bert-large-uncased-qnli",
                      "base-ft-mnli": "yoshitomo-matsubara/bert-base-uncased-mnli",
                        "large-ft-mnli": "yoshitomo-matsubara/bert-large-uncased-mnli",
                      "base-ft-rte": "anirudh21/bert-base-uncased-finetuned-rte",
                        "large-ft-rte": "yoshitomo-matsubara/bert-large-uncased-rte",
                      "base-ft-qqp": "A-bhimany-u08/bert-base-cased-qqp",
                        "large-ft-qqp": "yoshitomo-matsubara/bert-large-uncased-qqp",
                      "base-ft-wnli": "gchhablani/bert-base-cased-finetuned-wnli",
                        "large-ft-wnli": "yoshitomo-matsubara/bert-large-uncased-wnli",
                      }

In [3]:
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-qqp"])

In [4]:
num_params_base = get_num_parameters(model_base)

In [5]:
model_LC = get_LC_model_bert(model_base, num_GP_layers=100)

Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Setting up gated layer bert.encoder.layer.19
Setting up gated layer bert.encoder.layer.18
Setting up gated layer bert.encoder.layer.17
Setting up gated layer bert.encoder.layer.16
Setting up gated layer bert.encoder.layer.15
Setting up gated layer bert.encoder.layer.14
Setting up gated layer bert.encoder.layer.13
Setting up gated layer bert.encoder.layer.12
Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8
Setting up gated layer bert.encoder.layer.7
Setting up gated layer bert.encoder.layer.6
Setting up gated layer bert.encoder.layer.5
Setting up gated layer bert.encoder.layer.4
Setting up gated layer bert.encoder.layer.3
Setting up gated layer bert.encoder.layer.2
Setting up gated l

In [13]:
collapse_model(model_LC)

Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.19
Collapsing layer bert.encoder.layer.18
Collapsing layer bert.encoder.layer.17
Collapsing layer bert.encoder.layer.16
Collapsing layer bert.encoder.layer.15
Collapsing layer bert.encoder.layer.14
Collapsing layer bert.encoder.layer.13
Collapsing layer bert.encoder.layer.12
Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8
Collapsing layer bert.encoder.layer.7
Collapsing layer bert.encoder.layer.6
Collapsing layer bert.encoder.layer.5
Collapsing layer bert.encoder.layer.4
Collapsing layer bert.encoder.layer.3
Collapsing layer bert.encoder.layer.2
Collapsing layer bert.encoder.layer.1
Collapsing layer bert.encoder.layer.0


In [14]:
num_params_LC = get_num_parameters(model_LC)

In [15]:
num_params_base

335143938

In [16]:
num_params_LC

158884866

In [17]:
ratio = 1 - (num_params_LC/num_params_base)
print("Compression ratio: ", ratio)

Compression ratio:  0.5259205135913871


In [18]:
dummy_input = torch.ones((1, 128), dtype=torch.long)

In [21]:
get_model_macs(model_LC, dummy_input)



16918907008

In [22]:
get_model_macs(model_base, dummy_input)



39467485312

In [23]:
(39467485312 - 16918907008) / 39467485312

0.5713203698119615