In [1]:
import sys
sys.path.insert(0, '../src')

In [2]:
from classify_attention_patterns import load_model
from argparse import Namespace
from run_glue import load_and_cache_examples, set_seed
from model_bert import BertForSequenceClassification
from config_bert import BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer
import numpy as np
import torch
import random
from collections import Counter

def set_seed(seed):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with `seed`.

    Note: this definition rebinds the `set_seed` name that the same cell
    imports from `run_glue`, so later cells call this one-argument version.

    Args:
        seed: Value handed unchanged to each of the three seeding functions.
    """
    # Same order as before: Python, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

loading region bounding boxes for computing carbon emissions region, this may take a moment...
 454/454... rate=538.78 Hz, eta=0:00:00, total=0:00:00, wall=22:02 CETT
Done!


In [3]:
# Load the attention-pattern classifier together with its label vocabulary,
# switch it to evaluation mode and move it to the GPU.
classifier_checkpoint = "../models/head_classifier/classify_attention_patters.tar"
head_classifier_model, label2id, min_max_size = load_model(classifier_checkpoint)
head_classifier_model = head_classifier_model.eval()
head_classifier_model = head_classifier_model.cuda()

# Inverse lookup: class index -> pattern name.
id2label = dict((class_idx, class_name) for class_name, class_idx in label2id.items())

# Fine-tuned Model - Super-survivors

In [4]:
# Fine-tuned checkpoints, pruned with the "heads_mlps_super" masks: classify
# every attention map the model returns for a random sample of evaluation
# examples and print, per (task, seed), the fraction of maps given each label.
set_seed(1337)
max_eval_batches = 100  # single-example batches classified per model
for task in ["CoLA", "SST-2", "MRPC", "STS-B", "QQP", "MNLI", "QNLI", "RTE"]:
    for seed in ["seed_1337", "seed_42", "seed_86", "seed_71", "seed_166"]:
        # Load the fine-tuned checkpoint with attention outputs switched on.
        model_path = f"../models/finetuned/{task}/{seed}/"
        tokenizer = BertTokenizer.from_pretrained(model_path)
        config = BertConfig.from_pretrained(model_path)
        config.output_attentions = True
        transformer_model = BertForSequenceClassification.from_pretrained(model_path,  config=config)

        # Prune: a mask entry whose integer value is not 1 marks a head / MLP to remove.
        mask_path = f"../masks/heads_mlps_super/{task}/{seed}/"
        head_mask = torch.from_numpy(np.load(f"{mask_path}/head_mask.npy"))
        mlp_mask = np.load(f"{mask_path}/mlp_mask.npy")
        heads_to_prune = {
            layer: (1 - head_mask[layer].long()).nonzero()[:, 0].tolist()
            for layer in range(len(head_mask))
        }
        mlps_to_prune = (1 - torch.from_numpy(mlp_mask).long()).nonzero()[:, 0].tolist()

        transformer_model.prune_heads(heads_to_prune)
        transformer_model.prune_mlps(mlps_to_prune)
        transformer_model = transformer_model.eval()
        transformer_model.cuda()

        # Evaluation split of the task, visited in random order one example at a time.
        args = Namespace(data_dir=f"../data/glue/{task}/", local_rank=-1,
                         model_name_or_path=model_path,
                         overwrite_cache=False, model_type="bert", max_seq_length=128)
        eval_dataset = load_and_cache_examples(args, task.lower(), tokenizer, evaluate=True)
        eval_sampler = RandomSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=1)

        # layer_head_types[layer][i] counts the labels predicted for the i-th
        # attention map returned for that layer.
        layer_head_types = [[] for _ in range(config.num_hidden_layers)]

        # Inference only: no_grad() avoids building an autograd graph per forward pass.
        with torch.no_grad():
            # zip() stops after max_eval_batches without drawing a further batch.
            for batch_idx, batch in zip(range(max_eval_batches), eval_dataloader):
                batch = tuple(t.to("cuda:0") for t in batch)
                input_data = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                # Only the third of the three returned values (the attentions) is used.
                _, _, attentions = transformer_model(**input_data)
                for layer, layer_attention in enumerate(attentions):
                    # No attention tensor for this layer (presumably every head was pruned).
                    if layer_attention is None:
                        continue
                    # Swap the first two axes so the classifier sees one item per head
                    # (assumes attentions are (batch=1, heads, seq, seq) -- TODO confirm).
                    head_attentions = layer_attention.transpose(0, 1)
                    logits = head_classifier_model(head_attentions)
                    label_ids = torch.argmax(logits, dim=-1)
                    labels = [id2label[int(label_id.item())] for label_id in label_ids]
                    if not layer_head_types[layer]:
                        layer_head_types[layer].extend(Counter() for _ in labels)
                    for i, label in enumerate(labels):
                        layer_head_types[layer][i][label] += 1

        # Pool all per-head counters and report each label's share of the total.
        total_counter = Counter()
        for layer_counters in layer_head_types:
            for head_type_ctr in layer_counters:
                total_counter += head_type_ctr
        n_classified = sum(total_counter.values())
        print(task, seed, {label: count / n_classified for label, count in total_counter.most_common()})


CoLA seed_1337 {'vertical': 0.42764705882352944, 'block': 0.4047058823529412, 'other': 0.14941176470588236, 'mix': 0.012941176470588235, 'diagonal': 0.005294117647058823}
CoLA seed_42 {'block': 0.4364705882352941, 'vertical': 0.39705882352941174, 'other': 0.1426470588235294, 'mix': 0.016764705882352942, 'diagonal': 0.007058823529411765}
CoLA seed_86 {'block': 0.4370588235294118, 'vertical': 0.4117647058823529, 'other': 0.14588235294117646, 'mix': 0.003823529411764706, 'diagonal': 0.0014705882352941176}
CoLA seed_71 {'vertical': 0.4485294117647059, 'block': 0.4041176470588235, 'other': 0.14088235294117646, 'mix': 0.005294117647058823, 'diagonal': 0.001176470588235294}
CoLA seed_166 {'block': 0.43911764705882356, 'vertical': 0.3905882352941176, 'other': 0.1561764705882353, 'mix': 0.01, 'diagonal': 0.00411764705882353}
SST-2 seed_1337 {'other': 0.5178947368421053, 'block': 0.22421052631578947, 'vertical': 0.11105263157894738, 'mix': 0.09, 'diagonal': 0.056842105263157895}
SST-2 seed_42 {'

# Pre-trained Model - Super-survivors

In [5]:
# Pre-trained 'bert-base-uncased' weights, pruned with the same
# "heads_mlps_super" masks as the fine-tuned runs: classify every attention
# map the model returns for a random sample of evaluation examples and print,
# per (task, seed), the fraction of maps given each label.
set_seed(1337)
max_eval_batches = 100  # single-example batches classified per model
for task in ["CoLA", "SST-2", "MRPC", "STS-B", "QQP", "MNLI", "QNLI", "RTE"]:
    for seed in ["seed_1337", "seed_42", "seed_86", "seed_71", "seed_166"]:
        # Tokenizer and config come from the fine-tuned run directory; the
        # weights are loaded from 'bert-base-uncased'.
        model_path = f"../models/finetuned/{task}/{seed}/"
        tokenizer = BertTokenizer.from_pretrained(model_path)
        config = BertConfig.from_pretrained(model_path)
        config.output_attentions = True
        transformer_model = BertForSequenceClassification.from_pretrained('bert-base-uncased',  config=config)

        # Prune: a mask entry whose integer value is not 1 marks a head / MLP to remove.
        mask_path = f"../masks/heads_mlps_super/{task}/{seed}/"
        head_mask = torch.from_numpy(np.load(f"{mask_path}/head_mask.npy"))
        mlp_mask = np.load(f"{mask_path}/mlp_mask.npy")
        heads_to_prune = {
            layer: (1 - head_mask[layer].long()).nonzero()[:, 0].tolist()
            for layer in range(len(head_mask))
        }
        mlps_to_prune = (1 - torch.from_numpy(mlp_mask).long()).nonzero()[:, 0].tolist()

        transformer_model.prune_heads(heads_to_prune)
        transformer_model.prune_mlps(mlps_to_prune)
        transformer_model = transformer_model.eval()
        transformer_model.cuda()

        # Evaluation split of the task, visited in random order one example at a time.
        args = Namespace(data_dir=f"../data/glue/{task}/", local_rank=-1,
                         model_name_or_path=model_path,
                         overwrite_cache=False, model_type="bert", max_seq_length=128)
        eval_dataset = load_and_cache_examples(args, task.lower(), tokenizer, evaluate=True)
        eval_sampler = RandomSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=1)

        # layer_head_types[layer][i] counts the labels predicted for the i-th
        # attention map returned for that layer.
        layer_head_types = [[] for _ in range(config.num_hidden_layers)]

        # Inference only: no_grad() avoids building an autograd graph per forward pass.
        with torch.no_grad():
            # zip() stops after max_eval_batches without drawing a further batch.
            for batch_idx, batch in zip(range(max_eval_batches), eval_dataloader):
                batch = tuple(t.to("cuda:0") for t in batch)
                input_data = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                # Only the third of the three returned values (the attentions) is used.
                _, _, attentions = transformer_model(**input_data)
                for layer, layer_attention in enumerate(attentions):
                    # No attention tensor for this layer (presumably every head was pruned).
                    if layer_attention is None:
                        continue
                    # Swap the first two axes so the classifier sees one item per head
                    # (assumes attentions are (batch=1, heads, seq, seq) -- TODO confirm).
                    head_attentions = layer_attention.transpose(0, 1)
                    logits = head_classifier_model(head_attentions)
                    label_ids = torch.argmax(logits, dim=-1)
                    labels = [id2label[int(label_id.item())] for label_id in label_ids]
                    if not layer_head_types[layer]:
                        layer_head_types[layer].extend(Counter() for _ in labels)
                    for i, label in enumerate(labels):
                        layer_head_types[layer][i][label] += 1

        # Pool all per-head counters and report each label's share of the total.
        total_counter = Counter()
        for layer_counters in layer_head_types:
            for head_type_ctr in layer_counters:
                total_counter += head_type_ctr
        n_classified = sum(total_counter.values())
        print(task, seed, {label: count / n_classified for label, count in total_counter.most_common()})


CoLA seed_1337 {'vertical': 0.4202941176470588, 'block': 0.4047058823529412, 'other': 0.15911764705882353, 'mix': 0.011176470588235295, 'diagonal': 0.004705882352941176}
CoLA seed_42 {'block': 0.42764705882352944, 'vertical': 0.4111764705882353, 'other': 0.13970588235294118, 'mix': 0.015588235294117648, 'diagonal': 0.0058823529411764705}
CoLA seed_86 {'vertical': 0.43441176470588233, 'block': 0.40941176470588236, 'other': 0.15088235294117647, 'mix': 0.003823529411764706, 'diagonal': 0.0014705882352941176}
CoLA seed_71 {'vertical': 0.4576470588235294, 'block': 0.39588235294117646, 'other': 0.14029411764705882, 'mix': 0.005, 'diagonal': 0.001176470588235294}
CoLA seed_166 {'vertical': 0.43441176470588233, 'block': 0.4023529411764706, 'other': 0.15088235294117647, 'mix': 0.008529411764705883, 'diagonal': 0.003823529411764706}
SST-2 seed_1337 {'other': 0.5194736842105263, 'block': 0.2168421052631579, 'vertical': 0.12, 'mix': 0.09368421052631579, 'diagonal': 0.05}
SST-2 seed_42 {'other': 0.

# Fine-tuned Model - All Heads

In [6]:
# Fine-tuned checkpoints without any pruning: classify every attention map
# the model returns for a random sample of evaluation examples and print,
# per (task, seed), the fraction of maps given each label.
set_seed(1337)
max_eval_batches = 100  # single-example batches classified per model
for task in ["CoLA", "SST-2", "MRPC", "STS-B", "QQP", "MNLI", "QNLI", "RTE"]:
    for seed in ["seed_1337", "seed_42", "seed_86", "seed_71", "seed_166"]:
        # Load the fine-tuned checkpoint with attention outputs switched on.
        model_path = f"../models/finetuned/{task}/{seed}/"
        tokenizer = BertTokenizer.from_pretrained(model_path)
        config = BertConfig.from_pretrained(model_path)
        config.output_attentions = True
        transformer_model = BertForSequenceClassification.from_pretrained(model_path,  config=config)
        transformer_model = transformer_model.eval()
        transformer_model.cuda()

        # Evaluation split of the task, visited in random order one example at a time.
        args = Namespace(data_dir=f"../data/glue/{task}/", local_rank=-1,
                         model_name_or_path=model_path,
                         overwrite_cache=False, model_type="bert", max_seq_length=128)
        eval_dataset = load_and_cache_examples(args, task.lower(), tokenizer, evaluate=True)
        eval_sampler = RandomSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=1)

        # layer_head_types[layer][i] counts the labels predicted for the i-th
        # attention map returned for that layer.
        layer_head_types = [[] for _ in range(config.num_hidden_layers)]

        # Inference only: no_grad() avoids building an autograd graph per forward pass.
        with torch.no_grad():
            # zip() stops after max_eval_batches without drawing a further batch.
            for batch_idx, batch in zip(range(max_eval_batches), eval_dataloader):
                batch = tuple(t.to("cuda:0") for t in batch)
                input_data = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                # Only the third of the three returned values (the attentions) is used.
                _, _, attentions = transformer_model(**input_data)
                for layer, layer_attention in enumerate(attentions):
                    # Skip layers for which no attention tensor is returned.
                    if layer_attention is None:
                        continue
                    # Swap the first two axes so the classifier sees one item per head
                    # (assumes attentions are (batch=1, heads, seq, seq) -- TODO confirm).
                    head_attentions = layer_attention.transpose(0, 1)
                    logits = head_classifier_model(head_attentions)
                    label_ids = torch.argmax(logits, dim=-1)
                    labels = [id2label[int(label_id.item())] for label_id in label_ids]
                    if not layer_head_types[layer]:
                        layer_head_types[layer].extend(Counter() for _ in labels)
                    for i, label in enumerate(labels):
                        layer_head_types[layer][i][label] += 1

        # Pool all per-head counters and report each label's share of the total.
        total_counter = Counter()
        for layer_counters in layer_head_types:
            for head_type_ctr in layer_counters:
                total_counter += head_type_ctr
        n_classified = sum(total_counter.values())
        print(task, seed, {label: count / n_classified for label, count in total_counter.most_common()})


CoLA seed_1337 {'vertical': 0.4676388888888889, 'block': 0.3804166666666667, 'other': 0.14027777777777778, 'mix': 0.009097222222222222, 'diagonal': 0.0025694444444444445}
CoLA seed_42 {'vertical': 0.4650694444444444, 'block': 0.3977777777777778, 'other': 0.12104166666666667, 'mix': 0.013472222222222222, 'diagonal': 0.002638888888888889}
CoLA seed_86 {'vertical': 0.43166666666666664, 'block': 0.41055555555555556, 'other': 0.14381944444444444, 'mix': 0.010347222222222223, 'diagonal': 0.003611111111111111}
CoLA seed_71 {'vertical': 0.44083333333333335, 'block': 0.36993055555555554, 'other': 0.184375, 'mix': 0.0038194444444444443, 'diagonal': 0.0010416666666666667}
CoLA seed_166 {'block': 0.4388888888888889, 'vertical': 0.38958333333333334, 'other': 0.1659027777777778, 'mix': 0.004861111111111111, 'diagonal': 0.0007638888888888889}
SST-2 seed_1337 {'vertical': 0.32381944444444444, 'other': 0.3097916666666667, 'block': 0.18048611111111112, 'mix': 0.15291666666666667, 'diagonal': 0.032986111

# Pre-trained Model - All Heads

In [7]:
# Pre-trained 'bert-base-uncased' weights without any pruning: classify every
# attention map the model returns for a random sample of evaluation examples
# and print, per (task, seed), the fraction of maps given each label.
set_seed(1337)
max_eval_batches = 100  # single-example batches classified per model
for task in ["CoLA", "SST-2", "MRPC", "STS-B", "QQP", "MNLI", "QNLI", "RTE"]:
    for seed in ["seed_1337", "seed_42", "seed_86", "seed_71", "seed_166"]:
        # Tokenizer and config come from the fine-tuned run directory; the
        # weights are loaded from 'bert-base-uncased'.
        model_path = f"../models/finetuned/{task}/{seed}/"
        tokenizer = BertTokenizer.from_pretrained(model_path)
        config = BertConfig.from_pretrained(model_path)
        config.output_attentions = True
        transformer_model = BertForSequenceClassification.from_pretrained('bert-base-uncased',  config=config)
        transformer_model = transformer_model.eval()
        transformer_model.cuda()

        # Evaluation split of the task, visited in random order one example at a time.
        args = Namespace(data_dir=f"../data/glue/{task}/", local_rank=-1,
                         model_name_or_path=model_path,
                         overwrite_cache=False, model_type="bert", max_seq_length=128)
        eval_dataset = load_and_cache_examples(args, task.lower(), tokenizer, evaluate=True)
        eval_sampler = RandomSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=1)

        # layer_head_types[layer][i] counts the labels predicted for the i-th
        # attention map returned for that layer.
        layer_head_types = [[] for _ in range(config.num_hidden_layers)]

        # Inference only: no_grad() avoids building an autograd graph per forward pass.
        with torch.no_grad():
            # zip() stops after max_eval_batches without drawing a further batch.
            for batch_idx, batch in zip(range(max_eval_batches), eval_dataloader):
                batch = tuple(t.to("cuda:0") for t in batch)
                input_data = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                # Only the third of the three returned values (the attentions) is used.
                _, _, attentions = transformer_model(**input_data)
                for layer, layer_attention in enumerate(attentions):
                    # Skip layers for which no attention tensor is returned.
                    if layer_attention is None:
                        continue
                    # Swap the first two axes so the classifier sees one item per head
                    # (assumes attentions are (batch=1, heads, seq, seq) -- TODO confirm).
                    head_attentions = layer_attention.transpose(0, 1)
                    logits = head_classifier_model(head_attentions)
                    label_ids = torch.argmax(logits, dim=-1)
                    labels = [id2label[int(label_id.item())] for label_id in label_ids]
                    if not layer_head_types[layer]:
                        layer_head_types[layer].extend(Counter() for _ in labels)
                    for i, label in enumerate(labels):
                        layer_head_types[layer][i][label] += 1

        # Pool all per-head counters and report each label's share of the total.
        total_counter = Counter()
        for layer_counters in layer_head_types:
            for head_type_ctr in layer_counters:
                total_counter += head_type_ctr
        n_classified = sum(total_counter.values())
        print(task, seed, {label: count / n_classified for label, count in total_counter.most_common()})


CoLA seed_1337 {'vertical': 0.53125, 'block': 0.3161805555555556, 'other': 0.14180555555555555, 'mix': 0.008125, 'diagonal': 0.002638888888888889}
CoLA seed_42 {'vertical': 0.5222916666666667, 'block': 0.3507638888888889, 'other': 0.11291666666666667, 'mix': 0.01125, 'diagonal': 0.002777777777777778}
CoLA seed_86 {'vertical': 0.4957638888888889, 'block': 0.35118055555555555, 'other': 0.14118055555555556, 'mix': 0.009305555555555555, 'diagonal': 0.0025694444444444445}
CoLA seed_71 {'vertical': 0.46805555555555556, 'block': 0.34833333333333333, 'other': 0.17881944444444445, 'mix': 0.0034027777777777776, 'diagonal': 0.001388888888888889}
CoLA seed_166 {'vertical': 0.48291666666666666, 'block': 0.3452777777777778, 'other': 0.16666666666666666, 'mix': 0.0044444444444444444, 'diagonal': 0.0006944444444444445}
SST-2 seed_1337 {'vertical': 0.3767361111111111, 'other': 0.30444444444444446, 'mix': 0.1540277777777778, 'block': 0.14027777777777778, 'diagonal': 0.02451388888888889}
SST-2 seed_42 {'