## Jupyter notebook for running transformer metrics

In [24]:
from configuration_electra import ElectraConfig
from modeling_electra import ElectraModel
from modeling_electra import ElectraLayer
from transformers import ElectraTokenizerFast

import csv
from datasets import load_dataset
import json
import numpy as np
import torch

from sklearn.metrics import pairwise_distances

In [25]:
# Seed NumPy's and PyTorch's global random number generators with 0.
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f7b040ec0f0>

In [26]:
# Load the architecture descriptions from nas_configs.json.
# Later cells index each entry with "id", "scores" -> "glue" and
# "hparams" -> "model_hparam_overrides" -> "nas_config".
configs = []
with open("./nas_configs.json", 'r') as f:
    configs = json.load(f)

In [27]:
# Target dataset is openwebtext
dataset = load_dataset("openwebtext")
tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")

# Tokenize the 'text' field of a batch of examples, truncating and padding
# every sequence to the tokenizer's maximum length.
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length')

# Tokenize the dataset in batches with 32 worker processes and expose the three
# tokenizer outputs as torch tensors.
# NOTE(review): tokenized_dataset is not referenced by any later cell visible in
# this notebook; the sample batch is tokenized directly from the raw text.
# Confirm whether this (expensive) step is still needed.
tokenized_dataset = dataset.map(encode, batched=True, num_proc=32)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])

Found cached dataset openwebtext (/home/dongpeijie/.cache/huggingface/datasets/openwebtext/plain_text/1.0.0/6f68e85c16ccc770c0dd489f4008852ea9633604995addd0cd76e293aed9e521)


  0%|          | 0/1 [00:00<?, ?it/s]

  table = cls._concat_blocks(blocks, axis=0)
Loading cached processed dataset at /home/dongpeijie/.cache/huggingface/datasets/openwebtext/plain_text/1.0.0/6f68e85c16ccc770c0dd489f4008852ea9633604995addd0cd76e293aed9e521/cache-8daa6c5ff9af9a7e_*_of_00032.arrow


In [28]:
# get sample tokenized batch from dataset
# The loader iterates the raw (untokenized) training split in batches of 128 and
# no shuffle argument is passed; the first batch's text is tokenized here into
# padded PyTorch tensors that every later cell reuses as `inputs`.
dataloader = torch.utils.data.DataLoader(dataset['train'], batch_size=128)
inputs = tokenizer(next(iter(dataloader))['text'], truncation=True, padding='max_length', return_tensors="pt")

In [29]:
# Covariance calculations for Jacobian covariance and variations
def covariance(jacobs):
    """Score a gradient tensor by the eigenvalues of its row-correlation matrix.

    Dimensions 0 and 1 of ``jacobs`` are swapped and everything behind the new
    leading dimension is flattened, so each index along the original dimension 1
    becomes one row. The rows are correlated with ``np.corrcoef`` and the
    eigenvalues ``v`` of that matrix are reduced to
    ``-sum(log(v + k) + 1 / (v + k))`` with ``k = 1e-5``.

    Args:
        jacobs: torch tensor with at least two dimensions.

    Returns:
        The score as a NumPy scalar.
    """
    rows = jacobs.transpose(0, 1).reshape(jacobs.size(1), -1)
    corr_matrix = np.corrcoef(rows.cpu().numpy())
    eigenvalues = np.linalg.eig(corr_matrix)[0]
    # Small offset added to every eigenvalue before the log and the reciprocal.
    offset = 1e-5
    shifted = eigenvalues + offset
    return -np.sum(np.log(shifted) + 1.0 / shifted)

In [30]:
# Cosine calculations for Jacobian cosine and variations
def cosine(jacobs):
    """Score a gradient tensor by the pairwise cosine similarity of its rows.

    The tensor is laid out as in ``covariance`` (dimensions 0 and 1 swapped,
    the rest flattened) and every row is scaled to unit length. The cosine
    similarity between all row pairs is computed, the identity matrix is
    subtracted, and the 20th root of the absolute similarities is summed and
    halved. The result is ``1 - summed / (n**2 - n)`` for ``n`` rows.

    Args:
        jacobs: torch tensor with at least two dimensions.

    Returns:
        The score as a NumPy scalar.
    """
    flat = jacobs.transpose(0, 1).reshape(jacobs.size(1), -1).cpu().numpy()
    row_lengths = np.linalg.norm(flat, axis=1)
    unit_rows = flat / row_lengths[:, None]
    n_rows = unit_rows.shape[0]

    # similarity = 1 - cosine distance; the identity is then subtracted from it.
    similarity = 1 - pairwise_distances(unit_rows, metric="cosine")
    off_diagonal = similarity - np.identity(n_rows)

    rooted = np.power(np.absolute(off_diagonal.flatten()), 1.0 / 20)
    summed = np.sum(rooted) / 2
    pair_count = pow(off_diagonal.shape[0], 2) - off_diagonal.shape[0]
    return 1 - (1 / pair_count * summed)

In [31]:
# Synaptic Diversity metric
def synaptic_diversity(model):
    """Sum of |nuc(W) * nuc(dW)| over the Linear modules of each layer's operation.

    For every ``ElectraLayer`` in ``model``, each ``torch.nn.Linear`` found under
    ``layer.operation`` that has a weight and a populated weight gradient
    contributes the absolute product of the nuclear norms of both.

    Args:
        model: module exposing ``modules()`` and ``device``; gradients must
            already have been populated by a backward pass to contribute.

    Returns:
        The accumulated score as a Python float.
    """
    per_linear = [
        torch.abs(
            torch.norm(linear.weight, "nuc") * torch.norm(linear.weight.grad, "nuc")
        )
        for block in model.modules()
        if isinstance(block, ElectraLayer)
        for linear in block.operation.modules()
        if isinstance(linear, torch.nn.Linear)
        and linear.weight is not None
        and linear.weight.grad is not None
    ]

    total = torch.tensor(0.0).to(model.device)
    for score in per_linear:
        total += torch.nansum(score)

    return total.detach().item()

In [32]:
def synaptic_diversity_normalized(model):
    """Mean of |nuc(W) * nuc(dW)| over the Linear modules of each layer's operation.

    Same per-module term as ``synaptic_diversity``: for every ``ElectraLayer``,
    each ``torch.nn.Linear`` under ``layer.operation`` with a weight and a
    populated weight gradient contributes the absolute product of the nuclear
    norms of both. The sum is then divided by the number of contributing modules.

    Args:
        model: module exposing ``modules()`` and ``device``.

    Returns:
        The averaged score as a Python float.
    """
    per_linear = [
        torch.abs(
            torch.norm(linear.weight, "nuc") * torch.norm(linear.weight.grad, "nuc")
        )
        for block in model.modules()
        if isinstance(block, ElectraLayer)
        for linear in block.operation.modules()
        if isinstance(linear, torch.nn.Linear)
        and linear.weight is not None
        and linear.weight.grad is not None
    ]

    total = torch.tensor(0.0).to(model.device)
    for score in per_linear:
        total += torch.nansum(score)
    # Normalise by the number of Linear modules that contributed.
    total /= len(per_linear)

    return total.detach().item()

In [33]:
# Synaptic saliency metric
def synaptic_saliency(model):
    """Sum of |W * dW| over the Linear modules of each layer's intermediate/output parts.

    For every ``ElectraLayer`` in ``model``, each ``torch.nn.Linear`` found under
    ``layer.intermediate`` and then ``layer.output`` contributes the element-wise
    absolute product of its weight and its weight gradient.

    The accumulator lives on ``model.device`` (as in ``synaptic_diversity``)
    instead of a hard-coded ``"cuda"``, so the metric also works when the
    notebook falls back to the CPU. Linear modules whose gradient has not been
    populated are skipped rather than raising on ``weight * None``.

    Args:
        model: module exposing ``modules()`` and ``device``; a backward pass
            must have populated the weight gradients for them to contribute.

    Returns:
        The accumulated score as a Python float.
    """
    saliencies = []
    for layer in model.modules():
        if not isinstance(layer, ElectraLayer):
            continue
        # `intermediate` first, then `output`, which is the original accumulation order.
        for part in (layer.intermediate, layer.output):
            for sublayer in part.modules():
                if not isinstance(sublayer, torch.nn.Linear):
                    continue
                if sublayer.weight is None or sublayer.weight.grad is None:
                    continue
                saliencies.append(torch.abs(sublayer.weight * sublayer.weight.grad))

    summed = torch.tensor(0.0).to(model.device)
    for saliency in saliencies:
        summed += torch.nansum(saliency)

    return summed.detach().item()

In [34]:
def synaptic_saliency_normalized(model):
    """Mean of the per-module |W * dW| sums over each layer's intermediate/output parts.

    For every ``ElectraLayer`` in ``model``, each ``torch.nn.Linear`` found under
    ``layer.intermediate`` and then ``layer.output`` contributes the sum of the
    element-wise absolute product of its weight and weight gradient. The total
    is divided by the number of contributing Linear modules.

    The accumulator lives on ``model.device`` (as in ``synaptic_diversity``)
    instead of a hard-coded ``"cuda"``, so the metric also works when the
    notebook falls back to the CPU. Linear modules whose gradient has not been
    populated are skipped rather than raising on ``weight * None``.

    Args:
        model: module exposing ``modules()`` and ``device``; a backward pass
            must have populated the weight gradients for them to contribute.

    Returns:
        The averaged score as a Python float (NaN when no module contributed,
        because the division is then 0 / 0).
    """
    saliencies = []
    for layer in model.modules():
        if not isinstance(layer, ElectraLayer):
            continue
        # `intermediate` first, then `output`, which is the original accumulation order.
        for part in (layer.intermediate, layer.output):
            for sublayer in part.modules():
                if not isinstance(sublayer, torch.nn.Linear):
                    continue
                if sublayer.weight is None or sublayer.weight.grad is None:
                    continue
                saliencies.append(torch.abs(sublayer.weight * sublayer.weight.grad))

    summed = torch.tensor(0.0).to(model.device)
    for saliency in saliencies:
        summed += torch.nansum(saliency)
    # Normalise by the number of Linear modules that contributed.
    summed /= len(saliencies)

    return summed.detach().item()

In [35]:
# Activation Distance metric
def activation_distance(outputs):
    """Sum of activation-pattern agreement counts over all captured outputs.

    For each captured tensor only the first entry along dimension 0 is used. It
    is viewed as ``(output.size(1), -1)`` and binarised with ``> 0``. For the
    resulting 0/1 matrix ``x``, ``x @ x.T + (1 - x) @ (1 - x.T)`` counts, for
    every pair of rows, the positions where both rows are active plus the
    positions where both are inactive. All entries of all such matrices are
    summed.

    The accumulator is created on the device of the captured tensors instead of
    a hard-coded ``"cuda"``, so the metric also works on the CPU fallback.

    Args:
        outputs: sequence of torch tensors collected by the forward hooks
            (assumed to be laid out as (batch, sequence, features) — TODO confirm).

    Returns:
        The accumulated score as a Python float; ``0.0`` for an empty sequence.
    """
    if len(outputs) == 0:
        return 0.0

    kernels = []
    for output in outputs:
        sample = output[0].view(output.size(1), -1)
        active = (sample > 0).float()
        inactive = 1.0 - active
        kernels.append(active @ active.t() + inactive @ inactive.t())

    summed = torch.tensor(0.0, device=outputs[0].device)
    for kernel in kernels:
        summed += torch.nansum(kernel)

    return summed.detach().item()

In [36]:
def activation_distance_normalized(outputs):
    """Mean, over captured outputs, of the activation-pattern agreement sums.

    For each captured tensor only the first entry along dimension 0 is used. It
    is viewed as ``(output.size(1), -1)`` and binarised with ``> 0``. For the
    resulting 0/1 matrix ``x``, all entries of
    ``x @ x.T + (1 - x) @ (1 - x.T)`` are summed, and the total over all
    outputs is divided by the number of outputs.

    The accumulator is created on the device of the captured tensors instead of
    a hard-coded ``"cuda"``, so the metric also works on the CPU fallback.

    Args:
        outputs: sequence of torch tensors collected by the forward hooks
            (assumed to be laid out as (batch, sequence, features) — TODO confirm).

    Returns:
        The averaged score as a Python float; NaN for an empty sequence (the
        value the original 0 / 0 tensor division produced).
    """
    if len(outputs) == 0:
        return float("nan")

    kernels = []
    for output in outputs:
        sample = output[0].view(output.size(1), -1)
        active = (sample > 0).float()
        inactive = 1.0 - active
        kernels.append(active @ active.t() + inactive @ inactive.t())

    summed = torch.tensor(0.0, device=outputs[0].device)
    for kernel in kernels:
        summed += torch.nansum(kernel)
    summed /= len(kernels)

    return summed.detach().item()

In [37]:
def jacobian_score(model):
    """Apply ``covariance`` to the gradient stored on the position-embedding weights.

    Args:
        model: object exposing ``embeddings.position_embeddings.weight.grad``;
            the gradient must already be populated by a backward pass.

    Returns:
        Whatever ``covariance`` returns for the detached gradient.
    """
    position_grad = model.embeddings.position_embeddings.weight.grad
    return covariance(position_grad.detach())

In [38]:
def jacobian_score_cosine(model):
    """Apply ``cosine`` to the gradient stored on the position-embedding weights.

    Args:
        model: object exposing ``embeddings.position_embeddings.weight.grad``;
            the gradient must already be populated by a backward pass.

    Returns:
        Whatever ``cosine`` returns for the detached gradient.
    """
    position_grad = model.embeddings.position_embeddings.weight.grad
    return cosine(position_grad.detach())

In [39]:
def num_parameters(model):
    """Count the scalar entries of every tensor yielded by ``model.parameters()``.

    Args:
        model: object exposing ``parameters()``.

    Returns:
        The total element count as an int (``0`` when there are no parameters).
    """
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

In [40]:
# Attention Head Importance metric
def head_importance(model):
    """Sum of |W * dW| over the large Linear modules of each layer's inner operation.

    For every ``ElectraLayer`` in ``model``, each ``torch.nn.Linear`` found under
    ``layer.operation.operation`` contributes the element-wise absolute product
    of its weight data and weight gradient, provided the gradient is populated
    and the weight has at least 128 rows.

    Args:
        model: module exposing ``modules()`` and ``device``.

    Returns:
        The accumulated score as a Python float.
    """
    contributions = []
    for block in model.modules():
        if not isinstance(block, ElectraLayer):
            continue
        for linear in block.operation.operation.modules():
            if not isinstance(linear, torch.nn.Linear):
                continue
            if linear.weight is None or linear.weight.grad is None:
                continue
            # Only weights with at least 128 output rows are counted.
            if linear.weight.shape[0] < 128:
                continue
            contributions.append(torch.abs(linear.weight.data * linear.weight.grad))

    total = torch.tensor(0.0).to(model.device)
    for contribution in contributions:
        total += torch.nansum(contribution)

    return total.detach().item()

In [41]:
def head_importance_normalized(model):
    """Mean of the per-module |W * dW| sums over each layer's inner operation.

    For every ``ElectraLayer`` in ``model``, each ``torch.nn.Linear`` found under
    ``layer.operation.operation`` contributes the sum of the element-wise
    absolute product of its weight data and weight gradient, provided the
    gradient is populated and the weight has at least 128 rows. The total is
    divided by the number of contributing modules.

    The accumulator lives on ``model.device`` (as in ``head_importance``)
    instead of a hard-coded ``"cuda"``, so the metric also works when the
    notebook falls back to the CPU.

    Args:
        model: module exposing ``modules()`` and ``device``.

    Returns:
        The averaged score as a Python float (NaN when no module contributed,
        because the division is then 0 / 0).
    """
    metric_array = []
    for layer in model.modules():
        if not isinstance(layer, ElectraLayer):
            continue
        for sublayer in layer.operation.operation.modules():
            if not isinstance(sublayer, torch.nn.Linear):
                continue
            if sublayer.weight is None or sublayer.weight.grad is None:
                continue
            # Only weights with at least 128 output rows are counted.
            if sublayer.weight.shape[0] < 128:
                continue
            metric_array.append(
                torch.abs(sublayer.weight.data * sublayer.weight.grad)
            )

    summed = torch.tensor(0.0).to(model.device)
    for contribution in metric_array:
        summed += torch.nansum(contribution)
    summed /= len(metric_array)

    return summed.detach().item()

In [42]:
# Attention Confidence metric (for both head and softmax)
def attention_confidence(outputs):
    """Sum, over captured outputs, of the mean of the maxima along dimension 1.

    For each captured tensor the maximum along dimension 1 is taken and the
    mean of those maxima is that tensor's confidence; the confidences of all
    tensors are added up.

    The accumulator is created on the device of the captured tensors (as in
    ``attention_confidence_normalized``) instead of a hard-coded ``"cuda"``, so
    the metric also works on the CPU fallback.

    Args:
        outputs: sequence of torch tensors collected by the forward hooks.

    Returns:
        The accumulated confidence as a Python float; ``0.0`` for an empty
        sequence.
    """
    if len(outputs) == 0:
        return 0.0

    confidences = [torch.mean(torch.max(output, 1)[0]) for output in outputs]

    summed = torch.tensor(0.0, device=outputs[0].device)
    for confidence in confidences:
        summed += torch.nansum(confidence)

    return summed.detach().item()

In [43]:
def attention_confidence_normalized(outputs):
    """Mean, over captured outputs, of the mean of the maxima along dimension 1.

    For each captured tensor the maximum along dimension 1 is taken and the
    mean of those maxima is that tensor's confidence; the confidences are
    summed and divided by the number of tensors.

    An empty ``outputs`` (no hooked module produced anything) yields NaN
    instead of raising ``IndexError`` on ``outputs[0]``.

    Args:
        outputs: sequence of torch tensors collected by the forward hooks.

    Returns:
        The averaged confidence as a Python float; NaN for an empty sequence.
    """
    if len(outputs) == 0:
        return float("nan")

    confidences = [torch.mean(torch.max(output, 1)[0]) for output in outputs]

    summed = torch.tensor(0.0).to(outputs[0].device)
    for confidence in confidences:
        summed += torch.nansum(confidence)
    summed /= len(confidences)

    return summed.detach().item()

In [44]:
# Run metrics on all models in the benchmark.
# For each of the first 500 entries of `configs`, a freshly initialised ElectraModel
# is built from the entry's nas_config, one forward/backward pass is run on the
# sample batch `inputs`, and one CSV row holding every metric is appended.
# newline="" is what the csv module asks for when it is handed a file object.
with open("BERT_initialization_ablation.csv", "a", newline="") as f:
    writer = csv.writer(f)

    # NOTE: the "Jacobian Score Normalized" column is filled with
    # jacobian_score_cosine(model) in the row built below.
    header = ["ID",
              "GLUE Score",
              "Synaptic Diversity",
              "Synaptic Diversity Normalized",
              "Synaptic Saliency",
              "Synaptic Saliency Normalized",
              "Activation Distance",
              "Activation Distance Normalized",
              "Jacobian Score",
              "Jacobian Score Normalized",
              "Number of Parameters",
              "Head Importance",
              "Head Importance Normalized",
              "Head Confidence",
              "Head Confidence Normalized",
              "Head Softmax Confidence",
              "Head Softmax Confidence Normalized",
             ]
    # The file is opened in append mode: write the header only while the file is
    # still empty, so re-running the cell does not add another header row.
    if f.tell() == 0:
        writer.writerow(header)
        f.flush()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # The sample batch is identical for every architecture, so move it once.
    inputs = inputs.to(device)

    # Stop early instead of raising IndexError when fewer than 500 configs are loaded.
    for i in range(min(500, len(configs))):
        # Re-seed before every model construction.
        np.random.seed(0)
        torch.manual_seed(0)

        nas_config = configs[i]["hparams"]["model_hparam_overrides"]["nas_config"]

        config = ElectraConfig(
            nas_config=nas_config, num_hidden_layers=len(nas_config["encoder_layers"]), output_hidden_states=True
        )
        model = ElectraModel(config)
        model.to(device)

        # Hooks to get outputs at different layers. Each hook appends the hooked
        # module's forward output to its list.
        activation_outputs = []
        def activation_hook(module, input, output):
            activation_outputs.append(output)

        head_outputs = []
        def head_hook(module, input, output):
            head_outputs.append(output)

        softmax_outputs = []
        def softmax_hook(module, input, output):
            softmax_outputs.append(output)

        # Initialize hooks
        for layer in model.modules():
            if not isinstance(layer, ElectraLayer):
                continue
            layer.intermediate.intermediate_act_fn.register_forward_hook(activation_hook)

            # The inner operation differs per architecture; hook whichever of
            # these sub-modules it actually has.
            sublayer = layer.operation.operation
            for attr in ("query", "key", "value", "input", "weight"):
                if hasattr(sublayer, attr):
                    getattr(sublayer, attr).register_forward_hook(head_hook)
            if hasattr(sublayer, 'softmax'):
                sublayer.softmax.register_forward_hook(softmax_hook)

        # Run gradient with respect to ones
        model.zero_grad()
        output = model(**inputs).last_hidden_state
        output.backward(torch.ones_like(output))

        row = [configs[i]["id"],
               configs[i]["scores"]["glue"],
               synaptic_diversity(model),
               synaptic_diversity_normalized(model),
               synaptic_saliency(model),
               synaptic_saliency_normalized(model),
               activation_distance(activation_outputs),
               activation_distance_normalized(activation_outputs),
               jacobian_score(model),
               jacobian_score_cosine(model),
               num_parameters(model),
               head_importance(model),
               head_importance_normalized(model),
               attention_confidence(head_outputs),
               attention_confidence_normalized(head_outputs),
               attention_confidence(softmax_outputs),
               attention_confidence_normalized(softmax_outputs),
              ]

        # Flush after every row so partial results survive an interrupted run.
        writer.writerow(row)
        f.flush()

        print(str(configs[i]["id"]))



0




1




2




3




4




5




6




7




8




9




10




11




12




13




14




15




16




17




18




19




20




21




22




23




24




25




26




27




28




29




30




31




32




33




34




35




36




37




38




39




40




41




42




43




44




45




46




47




48




49




50




51




52




53




54




55




56




57




58




59




60




61




62




63




64




65




66




67




68




69




70




71




72




73




74




75




76




77




78




79




80




81




82




83




84




85




86




87
88




89




90




91




92




93




94




95




96




97




98




99




100




101




102




103




104




105
106




107




108




109




110




111




112




113




114




115




116




117




118




119




120




121




122




123




124




125




126




127




128




129




130




131




132




133
134




135




136




137




138




139




140




141




142




143




144
145




146




147




148




149




150




151




152




153




154




155




156




157




158




159




160




161




162




163




164




165




166




167




168




169




170




171




172




173




174




175




176




177




178




179




180




181




182




183




184




185




186




187




188




189




190




191




192




193




194




195




196




197




198




199




200




201




202




203




204




205




206




207




208




209




210




211




212




213




214




215




216




217




218




219




220




221




222




223




224




225




226




227




228




229




230




231




232




233




234




235




236




237




238




239




240




241




242




243




244




245




246




247




248




249




250




251




252




253




254




255




256




257




258




259




260




261




262




263




264




265




266




267




268




269




270




271




272




273




274




275




276




277




278




279




280




281




282




283




284




285




286




287




288




289




290




291




292




293




294




295




296




297




298




299




300




301




302




303




304




305




306




307




308




309




310




311




312




313




314




315




316




317




318




319




320




321




322




323




324




325




326




327




328




329




330




331




332




333




334




335




336




337




338




339




340




341




342




343




344




345




346




347




348




349




350




351




352




353




354




355




356




357




358




359




360




361




362




363
364




365




366




367




368




369




370




371




372




373




374




375




376




377




378




379




380




381




382




383




384




385




386




387




388




389




390




391




392
393




394




395




396




397




398




399




400




401




402




403




404




405




406




407




408




409




410




411




412




413




414




415




416




417




418




419




420




421




422




423




424




425
426




427




428




429




430




431




432




433




434




435




436




437




438




439




440




441




442




443




444




445




446




447




448




449




450




451




452




453




454




455




456




457




458




459




460




461




462




463




464




465




466




467




468




469




470




471




472




473




474




475




476




477




478




479




480




481




482




483




484




485




486




487




488




489




490




491




492




493




494




495




496




497




498




499


In [45]:
# Indices into `configs` of the ten architectures evaluated in the ablation cell.
ablation_models = [285, 116, 280, 337, 464, 166, 153, 157, 330, 164]

In [46]:
# Run metrics on ablation study of different initialization states and minibatch inputs.
# For every architecture in `ablation_models` one model is built and then evaluated
# ten times (seeds 0..9), writing one CSV row per repetition.
# newline="" is what the csv module asks for when it is handed a file object.
with open("BERT_batch_ablation.csv", "a", newline="") as f:
    writer = csv.writer(f)

    # NOTE: the "Jacobian Score Normalized" column is filled with
    # jacobian_score_cosine(model) in the row built below.
    header = ["ID",
              "GLUE Score",
              "Synaptic Diversity",
              "Synaptic Diversity Normalized",
              "Synaptic Saliency",
              "Synaptic Saliency Normalized",
              "Activation Distance",
              "Activation Distance Normalized",
              "Jacobian Score",
              "Jacobian Score Normalized",
              "Number of Parameters",
              "Head Importance",
              "Head Importance Normalized",
              "Head Confidence",
              "Head Confidence Normalized",
              "Head Softmax Confidence",
              "Head Softmax Confidence Normalized",
             ]
    # The file is opened in append mode: write the header only while the file is
    # still empty, so re-running the cell does not add another header row.
    if f.tell() == 0:
        writer.writerow(header)
        f.flush()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    inputs = inputs.to(device)

    for i in ablation_models:
        # comment to investigate initialization ablation
        np.random.seed(0)
        torch.manual_seed(0)

        nas_config = configs[i]["hparams"]["model_hparam_overrides"]["nas_config"]

        config = ElectraConfig(
            nas_config=nas_config, num_hidden_layers=len(nas_config["encoder_layers"]), output_hidden_states=True
        )
        model = ElectraModel(config)
        model.to(device)

        # Register the forward hooks ONCE per model. Registering them inside the
        # repetition loop stacks a new set of hooks on the same modules every
        # time (hooks stay attached until removed), so repetition j would record
        # each output j + 1 times and inflate the un-normalised Activation
        # Distance and Confidence sums.
        activation_outputs = []
        def activation_hook(module, input, output):
            activation_outputs.append(output)

        head_outputs = []
        def head_hook(module, input, output):
            head_outputs.append(output)

        softmax_outputs = []
        def softmax_hook(module, input, output):
            softmax_outputs.append(output)

        for layer in model.modules():
            if not isinstance(layer, ElectraLayer):
                continue
            layer.intermediate.intermediate_act_fn.register_forward_hook(activation_hook)

            # The inner operation differs per architecture; hook whichever of
            # these sub-modules it actually has.
            sublayer = layer.operation.operation
            for attr in ("query", "key", "value", "input", "weight"):
                if hasattr(sublayer, attr):
                    getattr(sublayer, attr).register_forward_hook(head_hook)
            if hasattr(sublayer, 'softmax'):
                sublayer.softmax.register_forward_hook(softmax_hook)

        # uncomment to investigate minibatch ablation
        # iterator = iter(dataloader)

        for j in range(10):
            # inputs = tokenizer(next(iterator)['text'], truncation=True, padding='max_length', return_tensors="pt")
            # inputs = inputs.to(device)

            np.random.seed(j)
            torch.manual_seed(j)

            # Drop what the previous repetition captured; the hooks keep
            # appending to these same list objects.
            activation_outputs.clear()
            head_outputs.clear()
            softmax_outputs.clear()

            # Run gradient with respect to ones
            model.zero_grad()
            output = model(**inputs).last_hidden_state
            output.backward(torch.ones_like(output))

            row = [configs[i]["id"],
                   configs[i]["scores"]["glue"],
                   synaptic_diversity(model),
                   synaptic_diversity_normalized(model),
                   synaptic_saliency(model),
                   synaptic_saliency_normalized(model),
                   activation_distance(activation_outputs),
                   activation_distance_normalized(activation_outputs),
                   jacobian_score(model),
                   jacobian_score_cosine(model),
                   num_parameters(model),
                   head_importance(model),
                   head_importance_normalized(model),
                   attention_confidence(head_outputs),
                   attention_confidence_normalized(head_outputs),
                   attention_confidence(softmax_outputs),
                   attention_confidence_normalized(softmax_outputs),
                  ]

            # Flush after every row so partial results survive an interrupted run.
            writer.writerow(row)
            f.flush()

            print(str(configs[i]["id"]))



285




285




285




285




285




285




285




285




285




285




116




116




116




116




116




116




116




116




116




116




280




280




280




280




280




280




280




280




280




280




337




337




337




337




337




337




337




337




337




337




464




464




464




464




464




464




464




464




464




464




166




166




166




166




166




166




166




166




166




166




153




153




153




153




153




153




153




153




153




153




157




157




157




157




157




157




157




157




157




157




330




330




330




330




330




330




330




330




330




330




164




164




164




164




164




164




164




164




164




164
