In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Registry of Hugging Face Hub checkpoint ids, keyed as "<size>" for the plain
# pre-trained models and "<size>-ft-<glue task>" for the fine-tuned ones.
# NOTE(review): a few entries are *cased* checkpoints even though the plain
# "base"/"large" entries are uncased (base: stsb, qnli, qqp, wnli; large: rte).
huggingface_models = {
    # Pre-trained only
    "base": "bert-base-uncased",
    "large": "bert-large-uncased",
    # SST-2
    "base-ft-sst2": "yoshitomo-matsubara/bert-base-uncased-sst2",
    "large-ft-sst2": "yoshitomo-matsubara/bert-large-uncased-sst2",
    # STS-B
    "base-ft-stsb": "gchhablani/bert-base-cased-finetuned-stsb",
    "large-ft-stsb": "yoshitomo-matsubara/bert-large-uncased-stsb",
    # MRPC
    "base-ft-mrpc": "textattack/bert-base-uncased-MRPC",
    "large-ft-mrpc": "yoshitomo-matsubara/bert-large-uncased-mrpc",
    # CoLA
    "base-ft-cola": "yoshitomo-matsubara/bert-base-uncased-cola",
    "large-ft-cola": "yoshitomo-matsubara/bert-large-uncased-cola",
    # QNLI
    "base-ft-qnli": "gchhablani/bert-base-cased-finetuned-qnli",
    "large-ft-qnli": "yoshitomo-matsubara/bert-large-uncased-qnli",
    # MNLI
    "base-ft-mnli": "yoshitomo-matsubara/bert-base-uncased-mnli",
    "large-ft-mnli": "yoshitomo-matsubara/bert-large-uncased-mnli",
    # RTE
    "base-ft-rte": "anirudh21/bert-base-uncased-finetuned-rte",
    "large-ft-rte": "gchhablani/bert-large-cased-finetuned-rte",
    # QQP
    "base-ft-qqp": "A-bhimany-u08/bert-base-cased-qqp",
    "large-ft-qqp": "yoshitomo-matsubara/bert-large-uncased-qqp",
    # WNLI
    "base-ft-wnli": "gchhablani/bert-base-cased-finetuned-wnli",
    "large-ft-wnli": "yoshitomo-matsubara/bert-large-uncased-wnli",
}

# BERT Base

## QQP

In [3]:
# Build the QQP baseline classifier and its tokenizer from the checkpoint id
# registered under "base-ft-qqp" in huggingface_models.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["base-ft-qqp"])

In [4]:
# Prepare the GLUE QQP data with the model's tokenizer. The helper returns train and
# validation dataloaders plus the dataset object whose splits are indexed by name
# (e.g. dataset["validation"]) in the evaluation cells.
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('qqp', tokenizer)

In [12]:
# Restore the archived QQP baseline weights into model_base.
# map_location="cpu" keeps the load independent of the device the checkpoint was saved
# from; load_state_dict copies the tensors into the model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-qqp/orig_qqp.pth", map_location="cpu"))

<All keys matched successfully>

In [5]:
# Derive the LC variant of the baseline with 4 gated layers; the helper logs each
# encoder layer it sets up as gated.
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)

Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8


In [6]:
# Restore the archived 4-gated-layer QQP weights into the LC model.
# map_location="cpu" keeps the load independent of the device the checkpoint was saved
# from; load_state_dict copies the tensors into the model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-qqp/4_qqp.pth", map_location="cpu"))

<All keys matched successfully>

In [7]:
# Collapse the gated layers of the LC model; the helper logs each layer it collapses.
# The result is not captured, so this presumably modifies model_LC in place -- confirm.
collapse_model(model_LC)

Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8


In [9]:
# Evaluate the collapsed LC model on the QQP validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "qqp")



{'eval_loss': 0.2924763560295105,
 'eval_accuracy': 0.8851100667820925,
 'eval_f1': 0.8398331092031309,
 'eval_combined_score': 0.8624715879926117,
 'eval_runtime': 264.3965,
 'eval_samples_per_second': 152.914,
 'eval_steps_per_second': 4.781}

In [13]:
# Evaluate the baseline model on the QQP validation split for comparison with the LC
# model; the returned metrics dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "qqp")

{'eval_loss': 0.3376820385456085,
 'eval_accuracy': 0.9099183774424932,
 'eval_f1': 0.8776127427918543,
 'eval_combined_score': 0.8937655601171737,
 'eval_runtime': 175.4034,
 'eval_samples_per_second': 230.497,
 'eval_steps_per_second': 7.206}

In [8]:
# Report parameter counts for the QQP baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  108311810
LC model parameters:  91784450


## SST2 

In [35]:
# SST-2 / BERT-base: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["base-ft-sst2"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('sst2', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-sst2/4_sst2.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-sst2/orig_sst2.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Downloading data: 100%|██████████| 3.11M/3.11M [00:00<00:00, 12.8MB/s]
Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 690kB/s]
Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 886kB/s]
Generating train split: 100%|██████████| 67349/67349 [00:00<00:00, 906708.07 examples/s]
Generating validation split: 100%|██████████| 872/872 [00:00<00:00, 154641.79 examples/s]
Generating test split: 100%|██████████| 1821/1821 [00:00<00:00, 323581.92 examples/s]
Map: 100%|██████████| 67349/67349 [00:17<00:00, 3884.31 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 3953.92 examples/s]
Map: 100%|██████████| 1821/1821 [00:00<00:00, 3849.85 examples/s]


Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8
Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8


In [15]:
# Report parameter counts for the SST-2 baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  109483778
LC model parameters:  92956418


In [16]:
# Evaluate the baseline model on the SST-2 validation split; the returned metrics dict
# is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "sst2")



{'eval_loss': 0.24907992780208588,
 'eval_accuracy': 0.926605504587156,
 'eval_runtime': 3.5167,
 'eval_samples_per_second': 247.962,
 'eval_steps_per_second': 7.962}

In [36]:
# Evaluate the collapsed LC model on the SST-2 validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "sst2")

{'eval_loss': 0.320715993642807,
 'eval_accuracy': 0.911697247706422,
 'eval_runtime': 5.7665,
 'eval_samples_per_second': 151.219,
 'eval_steps_per_second': 4.856}

## MRPC

In [18]:
# MRPC / BERT-base: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["base-ft-mrpc"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('mrpc', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-mrpc/4_mrpc.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-mrpc/orig_mrpc.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8
Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8


In [19]:
# Report parameter counts for the MRPC baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  109483778
LC model parameters:  92956418


In [20]:
# Evaluate the baseline model on the MRPC validation split; the returned metrics dict
# is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "mrpc")



{'eval_loss': 0.7286306619644165,
 'eval_accuracy': 0.8308823529411765,
 'eval_f1': 0.8848080133555927,
 'eval_combined_score': 0.8578451831483846,
 'eval_runtime': 3.3672,
 'eval_samples_per_second': 121.168,
 'eval_steps_per_second': 3.861}

In [22]:
# Evaluate the collapsed LC model on the MRPC validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "mrpc")



{'eval_loss': 0.7689980864524841,
 'eval_accuracy': 0.8627450980392157,
 'eval_f1': 0.9044368600682594,
 'eval_combined_score': 0.8835909790537375,
 'eval_runtime': 2.5633,
 'eval_samples_per_second': 159.172,
 'eval_steps_per_second': 5.072}

## MNLI

In [26]:
# MNLI / BERT-base: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["base-ft-mnli"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('mnli', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-mnli/4_mnli.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-mnli/orig_mnli.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8
Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8


In [27]:
# Report parameter counts for the MNLI baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  109484547
LC model parameters:  92957187


In [29]:
# Evaluate the baseline model on the MNLI "validation_matched" split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation_matched"], "mnli")



{'eval_loss': 0.4682498276233673,
 'eval_accuracy': 0.8347427407030056,
 'eval_runtime': 35.9389,
 'eval_samples_per_second': 273.103,
 'eval_steps_per_second': 8.542}

In [32]:
# Evaluate the baseline model on the MNLI "validation_mismatched" split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation_mismatched"], "mnli")

{'eval_loss': 0.4504593312740326,
 'eval_accuracy': 0.8371643612693247,
 'eval_runtime': 35.9834,
 'eval_samples_per_second': 273.237,
 'eval_steps_per_second': 8.559}

In [33]:
# Evaluate the collapsed LC model on the MNLI "validation_matched" split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation_matched"], "mnli")

{'eval_loss': 0.5070212483406067,
 'eval_accuracy': 0.8254712175241976,
 'eval_runtime': 34.7803,
 'eval_samples_per_second': 282.2,
 'eval_steps_per_second': 8.827}

In [34]:
# Evaluate the collapsed LC model on the MNLI "validation_mismatched" split; the
# returned metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation_mismatched"], "mnli")

{'eval_loss': 0.4870189428329468,
 'eval_accuracy': 0.8281122864117169,
 'eval_runtime': 34.9968,
 'eval_samples_per_second': 280.94,
 'eval_steps_per_second': 8.801}

## STSB

In [8]:
# STS-B / BERT-base: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["base-ft-stsb"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('stsb', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-stsb/4_stsb.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-stsb/orig_stsb.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Downloading data: 100%|██████████| 502k/502k [00:00<00:00, 1.82MB/s]
Downloading data: 100%|██████████| 151k/151k [00:00<00:00, 839kB/s]
Downloading data: 100%|██████████| 114k/114k [00:00<00:00, 547kB/s]
Generating train split: 100%|██████████| 5749/5749 [00:00<00:00, 491301.01 examples/s]
Generating validation split: 100%|██████████| 1500/1500 [00:00<00:00, 268911.61 examples/s]
Generating test split: 100%|██████████| 1379/1379 [00:00<00:00, 250463.14 examples/s]
Map: 100%|██████████| 5749/5749 [00:01<00:00, 5027.05 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 4187.63 examples/s]
Map: 100%|██████████| 1379/1379 [00:00<00:00, 6362.51 examples/s]


Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8
Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8


In [9]:
# Report parameter counts for the STS-B baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  108311041
LC model parameters:  91783681


In [11]:
# Evaluate the baseline model on the STS-B validation split; the returned metrics dict
# is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "stsb")



{'eval_loss': 0.6192435622215271,
 'eval_pearson': 0.8641723498039111,
 'eval_spearmanr': 0.8636361112510562,
 'eval_combined_score': 0.8639042305274837,
 'eval_runtime': 11.1139,
 'eval_samples_per_second': 134.966,
 'eval_steps_per_second': 4.229}

In [12]:
# Evaluate the collapsed LC model on the STS-B validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "stsb")

{'eval_loss': 0.570755660533905,
 'eval_pearson': 0.8723157966536561,
 'eval_spearmanr': 0.872965843659959,
 'eval_combined_score': 0.8726408201568076,
 'eval_runtime': 9.8722,
 'eval_samples_per_second': 151.942,
 'eval_steps_per_second': 4.761}

## CoLA

In [14]:
# CoLA / BERT-base: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["base-ft-cola"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('cola', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the load independent of the device the checkpoint was saved
# from; load_state_dict copies the tensors into the model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-cola/4_cola.pth", map_location="cpu"))
# NOTE(review): the archived baseline checkpoint is NOT restored in this section (the
# line below is disabled), so model_base keeps whatever weights
# get_classification_bert_model returned -- unlike most other task sections.
# Confirm this is intended before comparing the baseline numbers.
# model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-cola/orig_cola.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Map: 100%|██████████| 1043/1043 [00:00<00:00, 4099.87 examples/s]


Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8
Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8


In [15]:
# Report parameter counts for the CoLA baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  109483778
LC model parameters:  92956418


In [16]:
# Evaluate the baseline model on the CoLA validation split; the returned metrics dict
# is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "cola")

{'eval_loss': 1.3386995792388916,
 'eval_matthews_correlation': 0.6104966084654571,
 'eval_runtime': 8.1307,
 'eval_samples_per_second': 128.279,
 'eval_steps_per_second': 4.059}

In [17]:
# Evaluate the collapsed LC model on the CoLA validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "cola")



{'eval_loss': 0.6876524686813354,
 'eval_matthews_correlation': 0.5963273779713936,
 'eval_runtime': 6.7481,
 'eval_samples_per_second': 154.563,
 'eval_steps_per_second': 4.89}

## QNLI

In [37]:
# QNLI / BERT-base: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["base-ft-qnli"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('qnli', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-qnli/4_qnli.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-qnli/orig_qnli.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8
Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8


In [38]:
# Report parameter counts for the QNLI baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  108311810
LC model parameters:  91784450


In [20]:
# Evaluate the baseline model on the QNLI validation split; the returned metrics dict
# is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "qnli")



{'eval_loss': 0.2749437689781189,
 'eval_accuracy': 0.9099395936298736,
 'eval_runtime': 40.4869,
 'eval_samples_per_second': 134.933,
 'eval_steps_per_second': 4.224}

In [39]:
# Evaluate the collapsed LC model on the QNLI validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "qnli")

{'eval_loss': 0.3908904194831848,
 'eval_accuracy': 0.8980413692110562,
 'eval_runtime': 19.5088,
 'eval_samples_per_second': 280.028,
 'eval_steps_per_second': 8.765}

## RTE

In [46]:
# RTE / BERT-base: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["base-ft-rte"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('rte', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-rte/4_rte.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/base-ft-rte/orig_rte.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.11
Setting up gated layer bert.encoder.layer.10
Setting up gated layer bert.encoder.layer.9
Setting up gated layer bert.encoder.layer.8
Collapsing layer bert.encoder.layer.11
Collapsing layer bert.encoder.layer.10
Collapsing layer bert.encoder.layer.9
Collapsing layer bert.encoder.layer.8


In [43]:
# Evaluate the baseline model on the RTE validation split; the returned metrics dict
# is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "rte")



{'eval_loss': 1.309451699256897,
 'eval_accuracy': 0.6570397111913358,
 'eval_runtime': 1.0872,
 'eval_samples_per_second': 254.775,
 'eval_steps_per_second': 8.278}

In [47]:
# Evaluate the collapsed LC model on the RTE validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "rte")

{'eval_loss': 1.0787622928619385,
 'eval_accuracy': 0.6137184115523465,
 'eval_runtime': 1.0057,
 'eval_samples_per_second': 275.426,
 'eval_steps_per_second': 8.949}

In [48]:
# Report parameter counts for the RTE baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  109483778
LC model parameters:  92956418


# BERT Large

## QQP

In [4]:
# QQP / BERT-large: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-qqp"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('qqp', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-qqp/4_qqp.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-qqp/orig_qqp.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20


In [5]:
# Report parameter counts for the large QQP baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  335143938
LC model parameters:  305767426


In [6]:
# Evaluate the large baseline model on the QQP validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "qqp")



{'eval_loss': 0.24002790451049805,
 'eval_accuracy': 0.9133316843927777,
 'eval_f1': 0.885227644939404,
 'eval_combined_score': 0.8992796646660908,
 'eval_runtime': 742.1176,
 'eval_samples_per_second': 54.479,
 'eval_steps_per_second': 1.703}

In [7]:
# Evaluate the collapsed large LC model on the QQP validation split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "qqp")



{'eval_loss': 0.27776554226875305,
 'eval_accuracy': 0.9102151867425179,
 'eval_f1': 0.878236951563129,
 'eval_combined_score': 0.8942260691528234,
 'eval_runtime': 710.5415,
 'eval_samples_per_second': 56.9,
 'eval_steps_per_second': 1.779}

## QNLI

In [8]:
# QNLI / BERT-large: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-qnli"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('qnli', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-qnli/4_qnli.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-qnli/orig_qnli.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20


In [9]:
# Report parameter counts for the large QNLI baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  335143938
LC model parameters:  305767426


In [10]:
# Evaluate the large baseline model on the QNLI validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "qnli")



{'eval_loss': 0.9608904719352722,
 'eval_accuracy': 0.8927329306241991,
 'eval_runtime': 114.7363,
 'eval_samples_per_second': 47.614,
 'eval_steps_per_second': 1.49}

In [11]:
# Evaluate the collapsed large LC model on the QNLI validation split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "qnli")

{'eval_loss': 0.2726178765296936,
 'eval_accuracy': 0.9139666849716274,
 'eval_runtime': 75.018,
 'eval_samples_per_second': 72.823,
 'eval_steps_per_second': 2.279}

## MNLI

In [36]:
# MNLI / BERT-large: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-mnli"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('mnli', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-mnli/4_mnli.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-mnli/orig_mnli.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Map: 100%|██████████| 9832/9832 [00:02<00:00, 3556.13 examples/s]


Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20


In [13]:
# Report parameter counts for the large MNLI baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  335144963
LC model parameters:  305768451


In [14]:
# Evaluate the large baseline model on the MNLI "validation_matched" split; the
# returned metrics dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation_matched"], "mnli")

{'eval_loss': 0.39440056681632996,
 'eval_accuracy': 0.862761079979623,
 'eval_runtime': 205.6252,
 'eval_samples_per_second': 47.732,
 'eval_steps_per_second': 1.493}

In [15]:
# Evaluate the large baseline model on the MNLI "validation_mismatched" split; the
# returned metrics dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation_mismatched"], "mnli")

{'eval_loss': 0.39168038964271545,
 'eval_accuracy': 0.8610659072416599,
 'eval_runtime': 208.7655,
 'eval_samples_per_second': 47.096,
 'eval_steps_per_second': 1.475}

In [16]:
# Evaluate the collapsed large LC model on the MNLI "validation_matched" split; the
# returned metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation_matched"], "mnli")

{'eval_loss': 0.520072340965271,
 'eval_accuracy': 0.8663270504330107,
 'eval_runtime': 191.3468,
 'eval_samples_per_second': 51.294,
 'eval_steps_per_second': 1.604}

In [17]:
# Evaluate the collapsed large LC model on the MNLI "validation_mismatched" split; the
# returned metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation_mismatched"], "mnli")

{'eval_loss': 0.5458422899246216,
 'eval_accuracy': 0.8552685109845403,
 'eval_runtime': 190.7154,
 'eval_samples_per_second': 51.553,
 'eval_steps_per_second': 1.615}

## CoLA

In [18]:
# CoLA / BERT-large: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-cola"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('cola', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-cola/4_cola.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-cola/orig_cola.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20


In [19]:
# Report parameter counts for the large CoLA baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  335143938
LC model parameters:  305767426


In [20]:
# Evaluate the large baseline model on the CoLA validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "cola")

{'eval_loss': 1.4323997497558594,
 'eval_matthews_correlation': 0.6256921039386708,
 'eval_runtime': 21.2331,
 'eval_samples_per_second': 49.121,
 'eval_steps_per_second': 1.554}

In [21]:
# Evaluate the collapsed large LC model on the CoLA validation split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "cola")



{'eval_loss': 0.6261414289474487,
 'eval_matthews_correlation': 0.6259118543644094,
 'eval_runtime': 20.3845,
 'eval_samples_per_second': 51.166,
 'eval_steps_per_second': 1.619}

## MRPC

In [22]:
# MRPC / BERT-large: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-mrpc"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('mrpc', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-mrpc/4_mrpc.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-mrpc/orig_mrpc.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20


In [23]:
# Report parameter counts for the large MRPC baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  335143938
LC model parameters:  305767426


In [24]:
# Evaluate the large baseline model on the MRPC validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "mrpc")



{'eval_loss': 1.272477149963379,
 'eval_accuracy': 0.8333333333333334,
 'eval_f1': 0.8827586206896552,
 'eval_combined_score': 0.8580459770114943,
 'eval_runtime': 8.4007,
 'eval_samples_per_second': 48.568,
 'eval_steps_per_second': 1.547}

In [25]:
# Evaluate the collapsed large LC model on the MRPC validation split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "mrpc")



{'eval_loss': 0.5088669061660767,
 'eval_accuracy': 0.8529411764705882,
 'eval_f1': 0.896551724137931,
 'eval_combined_score': 0.8747464503042597,
 'eval_runtime': 7.8904,
 'eval_samples_per_second': 51.708,
 'eval_steps_per_second': 1.648}

## SST2

In [27]:
# SST-2 / BERT-large: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-sst2"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('sst2', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the load independent of the device the checkpoint was saved
# from; load_state_dict copies the tensors into the model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-sst2/4_sst2.pth", map_location="cpu"))
# NOTE(review): the archived baseline checkpoint is NOT restored in this section (the
# line below is disabled), so model_base keeps whatever weights
# get_classification_bert_model returned -- unlike most other task sections.
# Confirm this is intended before comparing the baseline numbers.
# model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-sst2/orig_sst2.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20


In [28]:
# Report parameter counts for the large SST-2 baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  335143938
LC model parameters:  305767426


In [29]:
# Evaluate the large baseline model on the SST-2 validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "sst2")



{'eval_loss': 0.22841081023216248,
 'eval_accuracy': 0.9346330275229358,
 'eval_runtime': 18.4591,
 'eval_samples_per_second': 47.239,
 'eval_steps_per_second': 1.517}

In [30]:
# Evaluate the collapsed large LC model on the SST-2 validation split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "sst2")

{'eval_loss': 0.2305382490158081,
 'eval_accuracy': 0.930045871559633,
 'eval_runtime': 16.453,
 'eval_samples_per_second': 52.999,
 'eval_steps_per_second': 1.702}

## STSB

In [31]:
# STS-B / BERT-large: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-stsb"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('stsb', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-stsb/4_stsb.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-stsb/orig_stsb.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20


In [32]:
# Report parameter counts for the large STS-B baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  335142913
LC model parameters:  305766401


In [33]:
# Evaluate the large baseline model on the STS-B validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "stsb")

{'eval_loss': 0.5596344470977783,
 'eval_pearson': 0.8770772340838442,
 'eval_spearmanr': 0.8758346837155176,
 'eval_combined_score': 0.876455958899681,
 'eval_runtime': 13.2391,
 'eval_samples_per_second': 113.301,
 'eval_steps_per_second': 3.55}

In [34]:
# Evaluate the collapsed large LC model on the STS-B validation split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "stsb")

{'eval_loss': 0.4750456213951111,
 'eval_pearson': 0.8968027940356771,
 'eval_spearmanr': 0.8930373272445522,
 'eval_combined_score': 0.8949200606401146,
 'eval_runtime': 12.647,
 'eval_samples_per_second': 118.605,
 'eval_steps_per_second': 3.716}

## RTE

In [39]:
# RTE / BERT-large: build the fine-tuned baseline and its LC variant with 4 gated layers.
model_base, tokenizer = get_classification_bert_model(pre_trained_model_name=huggingface_models["large-ft-rte"])
train_dataloader, validation_dataloader, dataset = get_glue_task_dataset('rte', tokenizer)
model_LC = get_LC_model_bert(model_base, num_GP_layers=4)
# map_location="cpu" keeps the loads independent of the device the checkpoints were
# saved from; load_state_dict copies the tensors into each model's existing parameters.
# NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
model_LC.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-rte/4_rte.pth", map_location="cpu"))
model_base.load_state_dict(torch.load("/scr/models/LC/models_archive/Bert/large-ft-rte/orig_rte.pth", map_location="cpu"))
# Collapse the gated layers of the LC model (the helper logs each collapsed layer).
collapse_model(model_LC)

Setting up gated layer bert.encoder.layer.23
Setting up gated layer bert.encoder.layer.22
Setting up gated layer bert.encoder.layer.21
Setting up gated layer bert.encoder.layer.20
Collapsing layer bert.encoder.layer.23
Collapsing layer bert.encoder.layer.22
Collapsing layer bert.encoder.layer.21
Collapsing layer bert.encoder.layer.20


In [5]:
# Report parameter counts for the large RTE baseline and the collapsed LC model.
for count_label, counted_model in (
    ("base model parameters: ", model_base),
    ("LC model parameters: ", model_LC),
):
    print(count_label, get_num_parameters(counted_model))

base model parameters:  333581314
LC model parameters:  304204802


In [6]:
# Evaluate the large baseline model on the RTE validation split; the returned metrics
# dict is displayed as the cell output.
standard_evaluate(model_base, dataset["validation"], "rte")



{'eval_loss': 0.7359601259231567,
 'eval_accuracy': 0.6642599277978339,
 'eval_runtime': 7.5333,
 'eval_samples_per_second': 36.77,
 'eval_steps_per_second': 1.195}

In [7]:
# Evaluate the collapsed large LC model on the RTE validation split; the returned
# metrics dict is displayed as the cell output.
standard_evaluate(model_LC, dataset["validation"], "rte")

{'eval_loss': 1.2795310020446777,
 'eval_accuracy': 0.7148014440433214,
 'eval_runtime': 2.2664,
 'eval_samples_per_second': 122.223,
 'eval_steps_per_second': 3.971}