In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import dataclasses
import logging
import os
import sys
sys.path.append("..")
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, EvalPrediction)
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (HfArgumentParser, Trainer, TrainingArguments,
                          glue_compute_metrics, glue_output_modes,
                          glue_tasks_num_labels, set_seed)

from hans.dataset import HansDataset

In [4]:
# Module-level logger shared by the cells below.
logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
    """
    Identifies the pretrained checkpoint to fine-tune from, together with optional
    overrides for the config, the tokenizer and the download cache directory.
    """

    # Required: there is no default, so it must be supplied at construction time.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )

    # Optional overrides; each one defaults to None.
    config_name: Optional[str] = field(
        metadata=dict(help="Pretrained config name or path if not the same as model_name"),
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
        default=None,
    )
    cache_dir: Optional[str] = field(
        metadata=dict(help="Where do you want to store the pretrained models downloaded from s3"),
        default=None,
    )

In [5]:
# Run configuration, constructed directly in the notebook.
model_args = ModelArguments(model_name_or_path="albert-base-v1")
data_args = DataTrainingArguments(task_name="MNLI", data_dir="/home/nlp/data/hans/")
training_args = TrainingArguments(output_dir="/home/nlp/experiments/trial", do_eval=True)

# Refuse to train into a populated output directory unless overwriting was requested.
if (
    os.path.exists(training_args.output_dir)
    and os.listdir(training_args.output_dir)
    and training_args.do_train
    and not training_args.overwrite_output_dir
):
    raise ValueError(
        f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
    )

# Setup logging: INFO when local_rank is -1 or 0, WARNING for every other rank.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARNING,
)
# Logged at WARNING level so it is emitted under either of the levels configured above.
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    training_args.local_rank != -1,
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

# Set seed
set_seed(training_args.seed)

# Look up the label count and output mode registered for the task; an unknown
# task name is reported as a ValueError that keeps the KeyError as its cause.
try:
    num_labels = glue_tasks_num_labels[data_args.task_name]
    output_mode = glue_output_modes[data_args.task_name]
except KeyError as err:
    raise ValueError("Task not found: %s" % (data_args.task_name)) from err

# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

05/21/2020 15:09:01 - INFO - transformers.training_args -   PyTorch: setting up devices
05/21/2020 15:09:03 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/home/nlp/experiments/trial', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluate_during_training=False, per_gpu_train_batch_size=8, per_gpu_eval_batch_size=8, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False)


In [6]:
# Display the seed stored on training_args (the value handed to set_seed earlier).
training_args.seed

42

In [None]:

# Config and tokenizer may come from dedicated names; when none is given they
# fall back to the model name/path itself.
config_source = model_args.config_name or model_args.model_name_or_path
tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path

config = AutoConfig.from_pretrained(
    config_source,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
    cache_dir=model_args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, cache_dir=model_args.cache_dir)

# A ".ckpt" substring in the model path switches on from_tf.
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
)

# Get datasets: the evaluation split is only built when evaluation is enabled.
if training_args.do_eval:
    eval_dataset = HansDataset(data_args, tokenizer=tokenizer, evaluate=True)
else:
    eval_dataset = None

In [16]:
from transformers.data.processors import glue_processors

In [17]:
# Instantiate the processor class registered in glue_processors for the configured task.
processor = glue_processors[data_args.task_name]()

In [37]:
# Show only the first few dev examples; displaying the full list floods the
# notebook output without adding information.
processor.get_dev_examples(data_args.data_dir)[:5]

[InputExample(guid='dev_matched-0', text_a='The new rights are nice enough', text_b='Everyone really likes the newest benefits ', label='neutral'),
 InputExample(guid='dev_matched-1', text_a='This site includes a list of all award winners and a searchable database of Government Executive articles.', text_b='The Government Executive articles housed on the website are not able to be searched.', label='contradiction'),
 InputExample(guid='dev_matched-2', text_a="uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him", text_b='I like him for the most part, but would still enjoy seeing someone beat him.', label='entailment'),
 InputExample(guid='dev_matched-3', text_a="yeah i i think my favorite restaurant is always been the one closest  you know the closest as long as it's it meets the minimum criteria you know of good food", text_b='My favorite restaurants are always at least a hundred miles away from my house. ', la

In [34]:
# Open the raw HANS evaluation file in text mode for manual inspection.
# NOTE(review): no visible cell closes this handle; call f.close() once the
# inspection is done.
f = open("/home/nlp/data/hans/heuristics_evaluation_set.txt", "r")

In [33]:
# Preview only the first few lines instead of printing the whole file: iterating
# the handle to the end floods the output and leaves nothing for later reads.
f.seek(0)  # always start from the top so re-running the cell shows the same lines
for _ in range(5):
    line = f.readline()
    if not line:  # the file has fewer lines than the preview size
        break
    print(line.rstrip("\n"))  # drop the file's own newline; print() adds one

In [49]:
# Read the next line from the open HANS file to inspect its tab-separated layout.
# NOTE(review): returns '' once the handle has been read to the end.
f.readline()

'non-entailment\t( ( The senators ) ( ( contacted ( the scientists ) ) . ) )\t( ( The scientists ) ( ( contacted ( the senators ) ) . ) )\t(ROOT (S (NP (DT The) (NNS senators)) (VP (VBD contacted) (NP (DT the) (NNS scientists))) (. .)))\t(ROOT (S (NP (DT The) (NNS scientists)) (VP (VBD contacted) (NP (DT the) (NNS senators))) (. .)))\tThe senators contacted the scientists .\tThe scientists contacted the senators .\tex12\tlexical_overlap\tln_subject/object_swap\ttemp1\n'

In [61]:
from hans.hans_processors import HansProcessor, hans_convert_examples_to_features

In [55]:
# Project-local processor (imported from hans.hans_processors) used to read HANS examples.
hans_proc = HansProcessor()

In [60]:
# Read the HANS dev examples from the configured data directory rather than
# repeating the path literal, so the notebook has a single place to change it.
examples = hans_proc.get_dev_examples(data_args.data_dir)

In [62]:
# Label vocabulary reported by the HANS processor; passed to feature conversion as label_list.
label_list = hans_proc.get_labels()

In [59]:
# Display the labels the HANS processor reports.
# NOTE(review): the recorded output lists 'contradiction', 'entailment', 'neutral',
# while the raw HANS line shown earlier starts with 'non-entailment' — confirm the
# label set matches the file.
hans_proc.get_labels()

['contradiction', 'entailment', 'neutral']

In [63]:
# Output mode registered in glue_output_modes for the configured task.
# NOTE(review): this repeats the lookup already performed in the setup cell.
output_mode = glue_output_modes[data_args.task_name]

In [65]:
# Convert the HANS examples into model input features with the loaded tokenizer.
# NOTE(review): the recorded log shows "label: temp1 (id = 0)" and
# "guid: dev-non-entailment"; 'temp1' is the last column of the raw file line and
# 'non-entailment' the first, so the processor may be reading the label from the
# wrong column — verify in hans.hans_processors.
features = hans_convert_examples_to_features(examples, tokenizer, max_length = data_args.max_seq_length,
                                            label_list = label_list, output_mode = output_mode)

05/20/2020 22:34:59 - INFO - hans.hans_processors -   Writing example 0
05/20/2020 22:34:59 - INFO - hans.hans_processors -   *** Example ***
05/20/2020 22:34:59 - INFO - hans.hans_processors -   text_a: The president advised the doctor .
05/20/2020 22:34:59 - INFO - hans.hans_processors -   text_b: The doctor advised the president .
05/20/2020 22:34:59 - INFO - hans.hans_processors -   guid: dev-non-entailment
05/20/2020 22:34:59 - INFO - hans.hans_processors -   input_ids: 2 14 406 10017 14 1687 13 9 3 14 1687 10017 14 406 13 9 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/20/2020 22:34:59 - INFO - hans.hans_processors -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

05/20/2020 22:34:59 - INFO - hans.hans_processors -   label: temp1 (id = 0)
05/20/2020 22:34:59 - INFO - hans.hans_processors -   *** Example ***
05/20/2020 22:34:59 - INFO - hans.hans_processors -   text_a: The managers saw the secretaries .
05/20/2020 22:34:59 - INFO - hans.hans_processors -   text_b: The secretaries saw the managers .
05/20/2020 22:34:59 - INFO - hans.hans_processors -   guid: dev-non-entailment
05/20/2020 22:34:59 - INFO - hans.hans_processors -   input_ids: 2 14 12657 441 14 25738 13 9 3 14 25738 441 14 12657 13 9 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/20/2020 22:34:59 - INFO - hans.hans_processors -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [66]:
# NOTE(review): this cell uses Dataset, GlueDataTrainingArguments, List, InputFeatures,
# PreTrainedTokenizer, FileLock, time, torch and the RoBERTa tokenizer classes, none of
# which are imported in the visible notebook cells. It also rebinds the HansDataset
# name that the import cell takes from hans.dataset.
class HansDataset(Dataset):
    """
    Dataset of HANS input features, built with HansProcessor and cached on disk.

    The features are either loaded from a cache file inside ``args.data_dir`` or
    created from the examples returned by the processor and then written to that
    cache file.

    Args:
        args: Data arguments; ``task_name``, ``data_dir``, ``max_seq_length`` and
            ``overwrite_cache`` are read.
        tokenizer: Tokenizer handed to ``hans_convert_examples_to_features``; its
            class name is part of the cache file name.
        limit_length: If given, only the first ``limit_length`` examples are
            converted. The limit is part of the cache file name.
        evaluate: Use the dev examples when true, the train examples otherwise.
    """

    args: GlueDataTrainingArguments
    output_mode: str
    features: List[InputFeatures]

    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        evaluate=False,
    ):
        self.args = args
        processor = HansProcessor()
        self.output_mode = glue_output_modes[args.task_name]

        # Load data features from cache or dataset file.
        # The "hans" marker keeps this cache apart from GLUE feature caches that use
        # the same split/tokenizer/length/task naming inside the same data_dir.
        cache_name = "cached_hans_{}_{}_{}_{}".format(
            "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
        )
        if limit_length is not None:
            # A truncated feature set must not be reused for a different limit
            # or for an unlimited load.
            cache_name += "_limit{}".format(limit_length)
        cached_features_file = os.path.join(args.data_dir, cache_name)

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                # torch.load unpickles the file: only point data_dir at trusted caches.
                self.features = torch.load(cached_features_file)
                # Pass the path as a logging argument so a '%' in it cannot break formatting.
                logger.info(
                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")
                label_list = processor.get_labels()
                if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
                    RobertaTokenizer,
                    RobertaTokenizerFast,
                    XLMRobertaTokenizer,
                ):
                    # HACK(label indices are swapped in RoBERTa pretrained model)
                    label_list[1], label_list[2] = label_list[2], label_list[1]
                examples = (
                    processor.get_dev_examples(args.data_dir)
                    if evaluate
                    else processor.get_train_examples(args.data_dir)
                )
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = hans_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def __len__(self):
        """Return the number of cached or converted features."""
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        """Return the feature stored at position ``i``."""
        return self.features[i]

{
  "attention_mask": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0
  ],
  "input_ids": [
    2,
    14,
    406,
    10017,
    14,
    1687,
    13

In [None]:
# Display the flag that the setup cell checks before allowing training into a
# non-empty output directory.
training_args.overwrite_output_dir

In [11]:
# The cell previously held the truncated expression `data_args.ou`, which cannot
# be evaluated. The recorded output is a device object, so display the device
# that training_args resolved to.
training_args.device

device(type='cuda')