In [None]:
# Environment setup: install the packages this notebook needs.
# torch is pinned to the 1.8.1 `+cu111` build and resolved from the PyTorch wheel index.
!pip3 install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
# NOTE(review): transformers is not version-pinned, unlike the other two packages;
# consider pinning the release this notebook was developed against.
!pip install transformers
# NOTE(review): `wikipedia` is not imported by any cell visible here — confirm it is still needed.
!pip install wikipedia==1.4.0

In [2]:
import argparse
import glob
import logging
import os
import random
import timeit
import pickle

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

import transformers
from transformers import (
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    squad_convert_examples_to_features,
)

from transformers import BertForQuestionAnswering, BertConfig, BertTokenizer

from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
from transformers.trainer_utils import is_main_process


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


# Module-level logger shared by every cell below.
logger = logging.getLogger(__name__)

# Config classes / model-type names of all QA-capable models known to transformers.
# Neither name is referenced by the cells visible in this notebook.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


# Directory the BERT config, tokenizer and QA model are loaded from; its last path
# component is also used in the name of the feature cache file.
model_name_or_path = 'models/bert/'
# cache_dir=cache_dir = 'models/cache'

# Feature-conversion settings passed to squad_convert_examples_to_features.
max_seq_length = 384
doc_stride = 128
# NOTE(review): 10 looks short for SQuAD questions — confirm this truncation is intended.
max_query_length = 10
threads = 12
# Used by set_seed() (CUDA seeding) and evaluate() (batch size / DataParallel wrapping).
n_gpu = 1

# input_dir holds the SQuAD json and the cached features (and is the tokenizer cache_dir).
input_dir = "./data/squad"
# output_dir is where the saved attentions are loaded from in the final cells.
output_dir = "./models/bert/"
# evaluate() drops token_type_ids from the model inputs for some model types.
model_type="bert"
# evaluate = True
# Dataset file read through processor.get_train_examples.
train_file = "train-v2.0.json"
# Selects SquadV2Processor over SquadV1Processor.
version_2_with_negative=True
# NOTE(review): not referenced by the visible cells; evaluate() uses 1 * max(1, n_gpu).
per_gpu_eval_batch_size=16


# The following five settings are only referenced by the commented-out
# compute_predictions_logits call inside evaluate().
n_best_size=20
max_answer_length=30
do_lower_case=True
verbose_logging=True
null_score_diff_threshold=0.0

# Only referenced from a commented-out line in evaluate().
global_attention = {}

In [3]:
# Expose CUDA devices 0 and 1 to this kernel, enumerated in PCI bus order.
# NOTE(review): two devices are exposed here while the config cell sets n_gpu = 1 and the
# model is later moved to "cuda:1" — confirm this combination is intended.
%set_env CUDA_VISIBLE_DEVICES=0,1
%set_env CUDA_DEVICE_ORDER=PCI_BUS_ID

env: CUDA_VISIBLE_DEVICES=0,1
env: CUDA_DEVICE_ORDER=PCI_BUS_ID


In [None]:

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``.

    The CUDA generators are seeded as well when the module-level ``n_gpu`` is positive.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def to_list(tensor):
    """Return the contents of ``tensor`` as plain (nested) Python values.

    The tensor is detached from the autograd graph and moved to the CPU before
    being converted.
    """
    cpu_copy = tensor.detach().cpu()
    return cpu_copy.tolist()

def evaluate(args, model, tokenizer, prefix=""):
    """Run the QA model over the cached SQuAD features and dump its attention maps.

    For every feature, the start/end logits are wrapped in a ``SquadResult`` and the
    per-layer / per-head attentions (arranged by ``get_layers``) are appended to
    ``QA_bert_attentions.csv`` and saved in chunks of 100 features under
    ``QA_attentions_pickled/``.

    Reads the module-level ``n_gpu``, ``device`` and ``model_type``.

    Args:
        args: Forwarded to ``load_and_cache_examples``; not otherwise used here.
        model: Question-answering model whose output exposes ``start_logits``,
            ``end_logits`` and ``attentions``.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        prefix: Label used in the evaluation log banner.

    Returns:
        An empty dict. Prediction post-processing and SQuAD scoring are currently
        disabled (see the commented-out block at the end of the function).
    """
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    eval_batch_size = 1 * max(1, n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    # multi-gpu evaluate
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # torch.save does not create missing directories, so make sure the chunk folder exists.
    attention_chunk_dir = "QA_attentions_pickled"
    os.makedirs(attention_chunk_dir, exist_ok=True)

    def save_attention_chunk(chunk, count):
        """Persist one chunk of attention grids; ``count`` is the running feature count."""
        chunk_path = os.path.join(attention_chunk_dir, "eval_attentions_" + str(count) + ".bin")
        logger.info("  Outputting Attention File %s", chunk_path)
        torch.save(chunk, chunk_path)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", eval_batch_size)

    all_results = []
    all_attentions = []
    start_time = timeit.default_timer()
    attn_count = 0

    # Evaluation mode only needs to be set once, not on every batch.
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Keep only this feature's row, converted to plain Python lists, so the
            # accumulated results neither mix up batch elements nor hold GPU tensors.
            start_logits = to_list(outputs.start_logits[i])
            end_logits = to_list(outputs.end_logits[i])

            # get_layers reads batch element 0, so hand it a one-element slice that
            # contains exactly the i-th sample of every layer.
            attentions = get_layers(tuple(layer[i:i + 1] for layer in outputs.attentions))

            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)
            append_list_as_row('QA_bert_attentions.csv', attentions)
            all_attentions.append(attentions)
            attn_count += 1
            # Flush every 100 features to bound the memory held by the attention grids.
            if attn_count % 100 == 0:
                save_attention_chunk(all_attentions, attn_count)
                all_attentions = []

    # Save the trailing partial chunk; without this the last (count % 100) grids are lost.
    if all_attentions:
        save_attention_chunk(all_attentions, attn_count)
        all_attentions = []

    evalTime = timeit.default_timer() - start_time
    logger.info(
        "  Evaluation done in total %f secs (%f sec per example)",
        evalTime,
        evalTime / max(len(dataset), 1),  # guard against an empty dataset
    )

    # Prediction post-processing and scoring are disabled. To re-enable them:
    #
    #     output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    #     output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
    #     if version_2_with_negative:
    #         output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))
    #     else:
    #         output_null_log_odds_file = None
    #
    #     predictions = compute_predictions_logits(
    #         examples,
    #         features,
    #         all_results,
    #         n_best_size,
    #         max_answer_length,
    #         do_lower_case,
    #         output_prediction_file,
    #         output_nbest_file,
    #         output_null_log_odds_file,
    #         verbose_logging,
    #         version_2_with_negative,
    #         null_score_diff_threshold,
    #         tokenizer,
    #     )
    #
    #     # Compute the F1 and exact scores.
    #     results = squad_evaluate(examples, predictions)
    #     return results
    return {}

In [None]:
def get_layers(attention, num_layers=12, num_heads=12):
    """Arrange per-layer attention tensors into a ``(num_layers, num_heads)`` object grid.

    Args:
        attention: Iterable with one tensor per layer. Each tensor is indexed as
            ``[batch, head, ...]``; only batch element 0 is used.
        num_layers: Number of layers (rows of the returned grid).
        num_heads: Number of attention heads per layer (columns of the returned grid).

    Returns:
        A ``numpy.ndarray`` of dtype ``object`` with shape ``(num_layers, num_heads)``
        whose entry ``[layer, head]`` is that head's attention as a NumPy array.
    """
    # Rows are layers and columns are heads, matching the [i, j] writes below. The
    # previous (num_heads, num_layers) allocation only worked when both were equal.
    layers = np.empty((num_layers, num_heads), dtype=object)
    for i, layer in enumerate(attention):
        heads = layer.detach().cpu().numpy()[0]
        for j, head in enumerate(heads):
            layers[i, j] = head
    return layers

In [None]:

from csv import writer
def append_list_as_row(file_name, list_of_elem):
    """Append ``list_of_elem`` as a single CSV row at the end of ``file_name``."""
    # 'a+' opens the file for appending, creating it first if it does not exist.
    with open(file_name, 'a+', newline='') as csv_file:
        writer(csv_file).writerow(list_of_elem)

In [None]:


def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD feature dataset, reusing the on-disk cache when it exists.

    The cache lives in the module-level ``input_dir`` and is named after the last
    component of ``model_name_or_path`` and ``max_seq_length``. When it is missing,
    examples are read from ``train_file`` through the module-level ``processor``,
    converted to features with ``is_training=False``, and written to the cache.

    Args:
        args: Accepted for signature compatibility; not read by this function.
        tokenizer: Tokenizer handed to ``squad_convert_examples_to_features``.
        evaluate: Accepted for signature compatibility; not read by this function.
        output_examples: When true, also return the examples and the features.

    Returns:
        ``(dataset, examples, features)`` if ``output_examples`` is true, otherwise
        just ``dataset``.
    """
    model_dir_name = [part for part in model_name_or_path.split("/") if part][-1]
    cache_name = "cached_{}_{}_{}".format("train", model_dir_name, str(max_seq_length))
    cached_features_file = os.path.join(input_dir, cache_name)

    logger.info("cached features file: %s", cached_features_file)

    if not os.path.exists(cached_features_file):
        # No cache yet: build the features from the raw dataset file and store them.
        logger.info("Creating features from dataset file at %s", input_dir)

        examples = processor.get_train_examples(input_dir, filename=train_file)

        logger.info("Got features from dataset file at %s", input_dir)

        conversion_kwargs = dict(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            return_dataset="pt",
            threads=threads,
        )
        features, dataset = squad_convert_examples_to_features(**conversion_kwargs)

        logger.info("Saving features into cached file %s", cached_features_file)
        payload = {"features": features, "dataset": dataset, "examples": examples}
        torch.save(payload, cached_features_file)
    else:
        # NOTE(review): torch.load unpickles the file — only load caches you created yourself.
        logger.info("Loading features from cached file %s", cached_features_file)
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]

    return (dataset, examples, features) if output_examples else dataset


In [None]:

# Default CUDA device.
# NOTE(review): the next cell rebinds `device` to "cuda:1" before evaluate() runs, so the
# value logged below is not the device the model ends up on.
device = torch.device("cuda")

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.warning(
    "device: %s, n_gpu: %s",
    device,
    n_gpu
)
# Set the verbosity of the Transformers logger to info and enable its default handler/format.
# if is_main_process(args.local_rank):
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Seed all random number generators for reproducibility.
set_seed(42)

# Load pretrained config and tokenizer.
# NOTE(review): `config` is not passed to the model in the next cell, which calls
# from_pretrained(model_name_or_path, output_attentions=True) directly.
config = BertConfig.from_pretrained(model_name_or_path, output_attentions=True) # no config_path?
tokenizer = BertTokenizer.from_pretrained(
    model_name_or_path,
    do_lower_case=True,
    cache_dir=input_dir,
    use_fast=False,  # SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling
)

In [None]:

# Load the QA model with attention outputs enabled.
model = BertForQuestionAnswering.from_pretrained(model_name_or_path, output_attentions=True)
# Rebinds the module-level `device` that evaluate() reads when moving batches to the GPU.
device = "cuda:1"
model = model.to(device)

# Evaluate
processor = SquadV2Processor() if version_2_with_negative else SquadV1Processor()
result = evaluate({'data_dir': ""}, model, tokenizer)

# The original cell suffixed the keys with `global_step` and merged them into `results`,
# but neither name is defined anywhere in this notebook, so it raised NameError once
# the (long) evaluation had finished. Collect the results directly instead.
results = dict(result)

logger.info("Results: {}".format(results))

In [5]:
 # Load previously saved attentions from output_dir for inspection.
 # NOTE(review): evaluate() no longer writes "eval_attentions.bin" (that save is commented
 # out; chunks go to QA_attentions_pickled/eval_attentions_<count>.bin instead), so this
 # reads a file left by an earlier run — confirm which file should be loaded here.
 attentions = torch.load(os.path.join(output_dir, "eval_attentions.bin"))

In [6]:
attentions

['attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentions',
 'attentio