In [1]:
import torch
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import torch.nn as nn
import pandas as pd

In [2]:
import argparse
import glob
import logging
import os
import random
import timeit
import pickle

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

import transformers
from transformers import (
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    squad_convert_examples_to_features,
)

from transformers import BertForQuestionAnswering, BertConfig, BertTokenizer

from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
from transformers.trainer_utils import is_main_process


logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


model_name_or_path = 'models/bert/'


max_seq_length = 384
doc_stride = 128
max_query_length = 10
threads = 12
n_gpu = 1

input_dir = "./data/squad"
output_dir = "./models/bert/"
model_type="bert"
# evaluate = True
train_file = "train-v2.0.json"
version_2_with_negative=True
per_gpu_eval_batch_size=16


n_best_size=20
max_answer_length=30
do_lower_case=True
verbose_logging=True
null_score_diff_threshold=0.0

global_attention = {}

In [3]:
%set_env CUDA_VISIBLE_DEVICES=0,1
%set_env CUDA_DEVICE_ORDER=PCI_BUS_ID

env: CUDA_VISIBLE_DEVICES=0,1
env: CUDA_DEVICE_ORDER=PCI_BUS_ID


In [4]:
!pwd

/rapids/notebooks/host


In [5]:
#pipeline related parameters

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 255)) # scaler 

data_dir='/rapids/notebooks/host/QA_attentions_pickled'
representation_dir='/rapids/notebooks/host/QA_attentions_pickled/representations'

class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()
        
    def forward(self, x):
        return x    
    
representation_model = torch.hub.load('facebookresearch/barlowtwins:main', 'resnet50')
representation_model.fc = Identity() # pass through values from second to last layer, bypassing linear classifier

device = "cuda:1"
cuda = torch.device('cuda:1')
representation_model.to(cuda)

Using cache found in /root/.cache/torch/hub/facebookresearch_barlowtwins_main


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [6]:
# pipeline methods

def plot_layer_heads(attention, num_layers=12, num_heads=12):
  for i in range(0,11):
    p = attention[i]
    fig, axis = plt.subplots(1,12, figsize=(20,5), facecolor='w', edgecolor='k')
    plt.title(f'layer {i}')
    head = 0
    for axs, ph in zip(axis.flatten(), p):
      heatmap = axs.imshow(ph, cmap='hot')
    
def scale_examples(examples):
    num_examples = len(examples)
    scaled_examples = np.empty(shape=(num_examples), dtype=np.ndarray)
    for i, example in enumerate(examples):
        new_example = np.empty(shape=(12,12), dtype=np.ndarray)
        for l, layer in enumerate(example): #12 layers
            new_layer = np.array([])
            for h, head in enumerate(layer): #12 heads
                flat_head_transformed = scaler.fit_transform(head)
                new_example[l,h] = flat_head_transformed.reshape(384,384)
        scaled_examples[i] = new_example
    return scaled_examples

# create (1, 3, 384, 384) shape expected by barlow twins model
def reshape_example(image):
    example_channel = np.expand_dims(image, axis=0)
    batch = np.append(example_channel, example_channel, axis=0)
    batch = np.append(batch, example_channel, axis=0)
    example_3channel = np.expand_dims(batch, axis=0)
    return example_3channel

def get_representations(attentions):
    num_attentions = len(attentions)
    barlow_representations = np.zeros((num_attentions), np.object)
    for i, example in enumerate(attentions):
        reshaped_example = np.zeros((12,12), np.object)
        for l, layer in enumerate(example):
            for h, head in enumerate(layer):
                reshaped_head = torch.from_numpy(reshape_example(head)).to(cuda)
                representation_head = representation_model(reshaped_head.float())
                reshaped_example[l][h] = representation_head.detach().cpu().numpy()
        barlow_representations[i] = reshaped_example

    return barlow_representations

def flatten_layer_heads(representations_tensor):
    print("flattening layers/heads ...")
    num_examples =  len(representations_tensor)
    flat_array = np.zeros((num_examples * 12 * 12), np.ndarray)
    i = 0
    for example in representations_tensor:
        for layer in example:
            for h, head in enumerate(layer):
                flat_array[i] = head[0]
                i += 1
    return flat_array





In [12]:
# evaluation
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def to_list(tensor):
    return tensor.detach().cpu().tolist()

def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    eval_batch_size = 1 * max(1, n_gpu)

##  This can be used to get the last remaining examples, since batching cuts off at 130000, a multiple of 5000
#     subset = torch.utils.data.Subset(dataset, range(130000, 131944))
    subset = dataset

    eval_sampler = SequentialSampler(subset)
    eval_dataloader = DataLoader(subset, sampler=eval_sampler, batch_size=eval_batch_size)
    
#     eval_sampler = SequentialSampler(dataset)
#     eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    # multi-gpu evaluate
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    logger.info("***** Running evaluation {} *****".format(prefix))
#     logger.info("  Num examples = %d", len(dataset))
    logger.info("  Num examples = %d", len(subset))
    logger.info("  Batch size = %d", eval_batch_size)

    all_results = []
    all_attentions = []
    start_time = timeit.default_timer()
    attn_count = 130000

    representation_df = pd.DataFrame()
    
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            outputs = model(**inputs)
            
        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            start_logits = outputs.start_logits
            end_logits = outputs.end_logits
            attentions = get_layers(outputs.attentions)

            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

#             append_list_as_row('QA_bert_attentions.csv', attentions)
            all_attentions.append(attentions)

    
            attn_count += 1
            if attn_count % 250 == 0:

                logger.info("Scaling attention values to 0-255 ...")
                representations = scale_examples(all_attentions)
                logger.info("Processing to 2048 value representations through barlow_twins ...")
                representations = get_representations(representations)
                logger.info("Appending results to array/dataframe ...")
                representations = flatten_layer_heads(representations)

                df = pd.DataFrame(representations)
                df = pd.DataFrame([pd.Series(x) for x in df[0]])
                representation_df = representation_df.append(df, ignore_index=True)

                print(f"--- eval to representation batch {attn_count} ---")          
                all_attentions = []
                representations = []
                df = pd.DataFrame()
            
            if attn_count % 5000 == 0 or attn_count == 131944:
                logger.info(f"  Outputting Attention File representation_df_{attn_count} to {representation_dir}")
                representation_df.to_csv(os.path.join(representation_dir, f"representation_df_{attn_count}.csv"))
                representation_df = pd.DataFrame()

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))


    return {}

In [13]:
def get_layers(attention, num_layers=12, num_heads=12):
  layers = np.ndarray((num_heads,num_layers), np.ndarray)
  for i, layer in enumerate(attention):
    layer = layer.detach().cpu().numpy()[0]
    for j, head in enumerate(layer):
      layers[i,j] = head
  return layers

In [14]:

from csv import writer
def append_list_as_row(file_name, list_of_elem):
    # Open file in append mode
    with open(file_name, 'a+', newline='') as write_obj:
        # Create a writer object from csv module
        csv_writer = writer(write_obj)
        # Add contents of list as last row in the csv file
        csv_writer.writerow(list_of_elem)

In [15]:


def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
#     input_dir = data_dir else "."

    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "train",
            list(filter(None, model_name_or_path.split("/"))).pop(),
            str(max_seq_length),
        ),
    )

    logger.info("cached features file: %s", cached_features_file)
#     Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        examples = processor.get_train_examples(input_dir, filename=train_file)

        logger.info("Got features from dataset file at %s", input_dir)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            return_dataset="pt",
            threads=threads,
        )

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset


In [16]:

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.warning(
    "device: %s, n_gpu: %s",
    device,
    n_gpu
)
# Set the verbosity to info of the Transformers logger (on main process only):
# if is_main_process(args.local_rank):
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Set seed
set_seed(42)

# Load pretrained model and tokenizer
config = BertConfig.from_pretrained(model_name_or_path, output_attentions=True) # no config_path?
tokenizer = BertTokenizer.from_pretrained(
    model_name_or_path,
    do_lower_case=True,
    cache_dir=input_dir,
    use_fast=False,  # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
)

[INFO|configuration_utils.py:528] 2021-07-17 04:44:52,001 >> loading configuration file models/bert/config.json
[INFO|configuration_utils.py:566] 2021-07-17 04:44:52,003 >> Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_attentions": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

[INFO|tokenization_utils_base.py:1651] 2021-07-17 04:44:52,005 >> Didn't find file models/bert/added_tokens.json. We won't load it.
[INFO|tokenization_utils_ba

In [17]:

#model = AutoModelForQuestionAnswering.from_pretrained(model_name_or_path, output_attentions=True)
model = BertForQuestionAnswering.from_pretrained(model_name_or_path, output_attentions=True)
device = "cuda:1"
model = model.to(device)
# Evaluate
processor = SquadV2Processor() if version_2_with_negative else SquadV1Processor()
result = evaluate({'data_dir': ""}, model, tokenizer)

result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
results.update(result)

logger.info("Results: {}".format(results))

[INFO|configuration_utils.py:528] 2021-07-17 04:44:52,126 >> loading configuration file models/bert/config.json
[INFO|configuration_utils.py:566] 2021-07-17 04:44:52,127 >> Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_attentions": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

[INFO|modeling_utils.py:1159] 2021-07-17 04:44:52,127 >> loading weights file models/bert/pytorch_model.bin
[INFO|modeling_utils.py:1345] 2021-07-17 04:44:53,7

flattening layers/heads ...


Evaluating:  13%|█▎        | 254/1944 [07:56<19:06:11, 40.69s/it]

--- eval to representation batch 130250 ---


Evaluating:  26%|██▌       | 499/1944 [08:02<00:33, 43.03it/s]   07/17/2021 04:54:40 - INFO - __main__ -   Scaling attention values to 0-255 ...
Evaluating:  26%|██▌       | 499/1944 [08:20<00:33, 43.03it/s]07/17/2021 04:55:06 - INFO - __main__ -   Processing to 2048 value representations through barlow_twins ...
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
07/17/2021 05:01:56 - INFO - __main__ -   Appending results to array/dataframe ...


flattening layers/heads ...


Evaluating:  26%|██▌       | 504/1944 [15:25<9:39:50, 24.16s/it] 

--- eval to representation batch 130500 ---


Evaluating:  39%|███▊      | 749/1944 [15:31<00:27, 43.03it/s]  07/17/2021 05:02:09 - INFO - __main__ -   Scaling attention values to 0-255 ...
Evaluating:  39%|███▊      | 749/1944 [15:50<00:27, 43.03it/s]07/17/2021 05:02:36 - INFO - __main__ -   Processing to 2048 value representations through barlow_twins ...
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
07/17/2021 05:09:28 - INFO - __main__ -   Appending results to array/dataframe ...


flattening layers/heads ...


Evaluating:  39%|███▉      | 755/1944 [23:00<7:29:31, 22.68s/it] 

--- eval to representation batch 130750 ---


Evaluating:  51%|█████     | 995/1944 [23:06<00:22, 42.96it/s]  07/17/2021 05:09:44 - INFO - __main__ -   Scaling attention values to 0-255 ...
Evaluating:  51%|█████     | 995/1944 [23:20<00:22, 42.96it/s]07/17/2021 05:10:10 - INFO - __main__ -   Processing to 2048 value representations through barlow_twins ...
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
07/17/2021 05:17:02 - INFO - __main__ -   Appending results to array/dataframe ...


flattening layers/heads ...


Evaluating:  52%|█████▏    | 1004/1944 [30:32<5:12:18, 19.94s/it]

--- eval to representation batch 131000 ---


Evaluating:  64%|██████▍   | 1249/1944 [30:37<00:16, 43.11it/s]  07/17/2021 05:17:15 - INFO - __main__ -   Scaling attention values to 0-255 ...
Evaluating:  64%|██████▍   | 1249/1944 [30:50<00:16, 43.11it/s]07/17/2021 05:17:42 - INFO - __main__ -   Processing to 2048 value representations through barlow_twins ...
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
07/17/2021 05:24:49 - INFO - __main__ -   Appending results to array/dataframe ...


flattening layers/heads ...


Evaluating:  65%|██████▍   | 1254/1944 [38:21<4:50:29, 25.26s/it]

--- eval to representation batch 131250 ---


Evaluating:  77%|███████▋  | 1499/1944 [38:27<00:10, 43.33it/s]  07/17/2021 05:25:05 - INFO - __main__ -   Scaling attention values to 0-255 ...
Evaluating:  77%|███████▋  | 1499/1944 [38:42<00:10, 43.33it/s]07/17/2021 05:25:31 - INFO - __main__ -   Processing to 2048 value representations through barlow_twins ...
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
07/17/2021 05:32:21 - INFO - __main__ -   Appending results to array/dataframe ...


flattening layers/heads ...


Evaluating:  77%|███████▋  | 1505/1944 [45:50<2:43:46, 22.38s/it]

--- eval to representation batch 131500 ---


Evaluating:  90%|████████▉ | 1745/1944 [45:56<00:04, 43.31it/s]  07/17/2021 05:32:34 - INFO - __main__ -   Scaling attention values to 0-255 ...
Evaluating:  90%|████████▉ | 1745/1944 [46:12<00:04, 43.31it/s]07/17/2021 05:33:00 - INFO - __main__ -   Processing to 2048 value representations through barlow_twins ...
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
07/17/2021 05:39:53 - INFO - __main__ -   Appending results to array/dataframe ...


flattening layers/heads ...


Evaluating:  90%|█████████ | 1755/1944 [53:25<59:29, 18.88s/it]  

--- eval to representation batch 131750 ---


Evaluating: 100%|█████████▉| 1940/1944 [53:29<00:00, 43.19it/s]07/17/2021 05:40:07 - INFO - __main__ -     Outputting Attention File representation_df_131944 to /rapids/notebooks/host/QA_attentions_pickled/representations
Evaluating: 100%|██████████| 1944/1944 [1:01:43<00:00,  1.90s/it]
07/17/2021 05:48:21 - INFO - __main__ -     Evaluation done in total 3703.045666 secs (0.028065 sec per example)


NameError: name 'results' is not defined