In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Copy help from local to here
%cp /content/gdrive/MyDrive/nlp-notebook/helpers.py .

In [3]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[?25l[K     |█                               | 10 kB 32.7 MB/s eta 0:00:01[K     |██▏                             | 20 kB 36.7 MB/s eta 0:00:01[K     |███▎                            | 30 kB 41.2 MB/s eta 0:00:01[K     |████▍                           | 40 kB 25.7 MB/s eta 0:00:01[K     |█████▌                          | 51 kB 16.4 MB/s eta 0:00:01[K     |██████▋                         | 61 kB 18.1 MB/s eta 0:00:01[K     |███████▊                        | 71 kB 16.4 MB/s eta 0:00:01[K     |████████▉                       | 81 kB 18.1 MB/s eta 0:00:01[K     |█████████▉                      | 92 kB 13.8 MB/s eta 0:00:01[K     |███████████                     | 102 kB 14.7 MB/s eta 0:00:01[K     |████████████                    | 112 kB 14.7 MB/s eta 0:00:01[K     |█████████████▏                  | 122 kB 14.7 MB/s eta 0:00:01[K     |██████████████▎                 | 133 kB 14.7 MB/s et

In [4]:
# Library imports

import datasets
import json
import os
import random
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, \
    AutoModelForSequenceClassification, EvalPrediction, \
    Trainer, TrainingArguments

from helpers import prepare_train_dataset_qa, \
    prepare_validation_dataset_qa, QuestionAnsweringTrainer

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [96]:
TASK = 'nli'
DATASET = None
# DATASET = 'glue:mnli'
USE_MISMATCHED = False
# USE_MISMATCHED = True
# DATASET = 'anli'
DATASET = 'hans'
HANS_EXAMPLE_TYPE = None
# HANS_EXAMPLE_TYPE = 'entailed'
# HANS_EXAMPLE_TYPE = 'non-entailed'
USE_CHECKPOINT = False
# USE_CHECKPOINT = True
CHEAT_RATE = None
# CHEAT_RATE = 1.0
BIASED_MODEL = None
BIASED_MODEL = './gdrive/MyDrive/nlp-final-project/out-less-data-3'
# BIASED_MODEL = './gdrive/MyDrive/nlp-final-project/out-biased-cheat-100'
MODEL = 'google/electra-small-discriminator'
# MODEL = './gdrive/MyDrive/nlp-final-project/out-base-1'
MODEL = './gdrive/MyDrive/nlp-final-project/out-debiased-1'
# MODEL = './gdrive/MyDrive/nlp-final-project/out-undebiased-cheat-100'
# MODEL = './out-debiased-cheat-100'
# MODEL = './out/checkpoint-4500'
# MODEL = './out'
OUT_DIR = './out'
DO_TRAIN = False
# DO_TRAIN = True
DO_EVAL = True
CHEAT_ON_EVAL = False
# CHEAT_ON_EVAL = True
NUM_PREPROCESSING_WORKERS = 2
MAX_LENGTH = 128
MAX_EVAL_SAMPLES = None
# MAX_EVAL_SAMPLES = 1000
MAX_TRAIN_SAMPLES = None
# MAX_TRAIN_SAMPLES = 5000
MAX_TRAIN_SAMPLES = 50000
BATCH_SIZE = 128
EVAL_BATCH_SIZE = 128
EPOCHS = 3
# EPOCHS = 5
TRAINING_ARGS = TrainingArguments(
    OUT_DIR, do_train=DO_TRAIN, do_eval=DO_EVAL,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    num_train_epochs=EPOCHS)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [97]:
# Dataset selection
if DATASET is not None and (DATASET.endswith('.json') or DATASET.endswith('.jsonl')):
    dataset_id = None
    # Load from local json/jsonl file
    dataset = datasets.load_dataset('json', data_files=DATASET)
    # By default, the "json" dataset loader places all examples in the train split,
    # so if we want to use a jsonl file for evaluation we need to get the "train" split
    # from the loaded dataset
    eval_split = 'train'
else:
    default_datasets = {'qa': ('squad',), 'nli': ('snli',)}
    dataset_id = tuple(DATASET.split(':')) if DATASET is not None else \
        default_datasets[TASK]
    # MNLI has two validation splits (one with matched domains and one with mismatched domains). Most datasets just have one "validation" split
    eval_split = 'validation_matched' if dataset_id == ('glue', 'mnli') else 'validation'
    if dataset_id == ('anli',):
        eval_split = 'test_r1'
    if dataset_id == ('glue', 'mnli') and USE_MISMATCHED:
        eval_split = 'validation_mismatched'
    # Load the raw data
    dataset = datasets.load_dataset(*dataset_id)

Reusing dataset hans (/root/.cache/huggingface/datasets/hans/plain_text/1.0.0/1bbcb735c482acd54f2e118074b59cfd2bf5f7a5a285d4d540d1e632216672ac)


  0%|          | 0/2 [00:00<?, ?it/s]

In [98]:
# This function preprocesses an NLI dataset, tokenizing premises and hypotheses.
def prepare_dataset_nli(examples, tokenizer, max_seq_length=None, is_eval=True, cheat_rate=None):
    max_seq_length = tokenizer.model_max_length if max_seq_length is None else max_seq_length

    hypotheses = examples['hypothesis']
    if cheat_rate is not None:
      labels = examples['label']
      new_hypotheses = []
      for i in range(len(hypotheses)):
        if is_eval and not CHEAT_ON_EVAL:
          cheat_rate = 0.0
        prefix = ''
        if random.random() < cheat_rate and labels is not None:
          # Cheat
          prefix = str(labels[i])
        else:
          prefix = str(random.choice([0, 1, 2]))

        new_hypotheses.append(prefix + ' ' + hypotheses[i])

      hypotheses = new_hypotheses

    tokenized_examples = tokenizer(
        examples['premise'],
        hypotheses,
        truncation=True,
        max_length=max_seq_length,
        padding='max_length'
    )

    # print(hypotheses[0])

    tokenized_examples['label'] = examples['label']
    return tokenized_examples

In [99]:
# NLI models need to have the output label count specified (label 0 is "entailed", 1 is "neutral", and 2 is "contradiction")
task_kwargs = {'num_labels': 3} if TASK == 'nli' else {}

# Here we select the right model fine-tuning head
model_classes = {'qa': AutoModelForQuestionAnswering,
                 'nli': AutoModelForSequenceClassification}
model_class = model_classes[TASK]
# Initialize the model and tokenizer from the specified pretrained model/checkpoint
model = model_class.from_pretrained(MODEL, **task_kwargs)
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

# Select the dataset preprocessing function (these functions are defined in helpers.py)
if TASK == 'qa':
    prepare_train_dataset = lambda exs: prepare_train_dataset_qa(exs, tokenizer)
    prepare_eval_dataset = lambda exs: prepare_validation_dataset_qa(exs, tokenizer)
elif TASK == 'nli':
    prepare_train_dataset = lambda exs: prepare_dataset_nli(exs, tokenizer, MAX_LENGTH, is_eval=False, cheat_rate=CHEAT_RATE)
    prepare_eval_dataset = lambda exs: prepare_dataset_nli(exs, tokenizer, MAX_LENGTH, is_eval=True, cheat_rate=CHEAT_RATE)
        
    # prepare_eval_dataset = prepare_dataset_nli
else:
    raise ValueError('Unrecognized task name: {}'.format(TASK))

loading configuration file ./gdrive/MyDrive/nlp-final-project/out-debiased-1/config.json
Model config ElectraConfig {
  "_name_or_path": "./gdrive/MyDrive/nlp-final-project/out-debiased-1",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj

In [100]:
print("Preprocessing data... (this takes a little bit, should only happen once per dataset)")
if dataset_id == ('snli',):
    # remove SNLI examples with no label
    dataset = dataset.filter(lambda ex: ex['label'] != -1)

train_dataset = None
eval_dataset = None
train_dataset_featurized = None
eval_dataset_featurized = None
if DO_TRAIN:
    train_dataset = dataset['train']
    if MAX_TRAIN_SAMPLES:
        train_dataset = train_dataset.select(range(MAX_TRAIN_SAMPLES))
    train_dataset_featurized = train_dataset.map(
        prepare_train_dataset,
        batched=True,
        num_proc=NUM_PREPROCESSING_WORKERS,
        remove_columns=train_dataset.column_names
    )
if DO_EVAL:
    eval_dataset = dataset[eval_split]
    if MAX_EVAL_SAMPLES:
        eval_dataset = eval_dataset.select(range(MAX_EVAL_SAMPLES))
    eval_dataset_featurized = eval_dataset.map(
        prepare_eval_dataset,
        batched=True,
        num_proc=NUM_PREPROCESSING_WORKERS,
        remove_columns=eval_dataset.column_names
    )

Loading cached processed dataset at /root/.cache/huggingface/datasets/hans/plain_text/1.0.0/1bbcb735c482acd54f2e118074b59cfd2bf5f7a5a285d4d540d1e632216672ac/cache-c8e222ad994637cf.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/hans/plain_text/1.0.0/1bbcb735c482acd54f2e118074b59cfd2bf5f7a5a285d4d540d1e632216672ac/cache-041006d59fa228b1.arrow


Preprocessing data... (this takes a little bit, should only happen once per dataset)


In [101]:
class NliDebiasingTrainer(Trainer):
  def __init__(self, *args, biased_model=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.biased_model = biased_model

  def compute_loss(self, model, inputs, return_outputs=False):
    labels = inputs.get("labels")
    outputs = model(**inputs)
    model_logits = outputs.get('logits')

    # print(outputs.logits[0])

    combined_logits = model_logits
    if self.biased_model is not None:
      biased_logits = self.biased_model(**inputs).get('logits').detach().clone()
      combined_logits = model_logits + biased_logits
      # print(outputs.logits)
      outputs.logits = combined_logits

    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(combined_logits, labels.long())
    return (loss, outputs) if return_outputs else loss

In [102]:
def compute_accuracy(eval_preds: EvalPrediction):
    # print(eval_preds, dataset_id_str, example_type)
    if DATASET == 'hans':
      adjusted_predictions = np.array([
        [pred[0], np.logaddexp(pred[1], pred[2])] for pred in eval_preds.predictions
      ])
      print(eval_preds.predictions.shape, adjusted_predictions.shape)
      n_relevant_eval_preds = len(adjusted_predictions)
      if HANS_EXAMPLE_TYPE == 'entailed':
        n_relevant_eval_preds = (eval_preds.label_ids == 0).astype(np.float32).sum().item()
        print(n_relevant_eval_preds)
        # print()
        return {
          'accuracy': np.logical_and(
              np.argmax(
                adjusted_predictions,
                axis=1) == 0,
              eval_preds.label_ids == 0)
              .astype(np.float32).sum().item() / n_relevant_eval_preds
        }
      elif HANS_EXAMPLE_TYPE is not None:
        n_relevant_eval_preds = (eval_preds.label_ids != 0).astype(np.float32).sum().item()
        print(n_relevant_eval_preds)
        # print()
        return {
          'accuracy': np.logical_and(
              np.argmax(
                adjusted_predictions,
                axis=1) != 0,
              eval_preds.label_ids != 0)
              .astype(np.float32).sum().item() / n_relevant_eval_preds
        }
      else:
        return {
          'accuracy': np.logical_xor(
              np.argmax(
                adjusted_predictions,
                axis=1) == 0,
              eval_preds.label_ids != 0)
              .astype(np.float32).mean().item()
        }
    
    return {
        'accuracy': (np.argmax(
            eval_preds.predictions,
            axis=1) == eval_preds.label_ids).astype(
            np.float32).mean().item()
    }

In [103]:
# Select the training configuration
trainer_class = Trainer
init_kwargs = {}
eval_kwargs = {}
# If you want to use custom metrics, you should define your own "compute_metrics" function.
# For an example of a valid compute_metrics function, see compute_accuracy in helpers.py.
compute_metrics = None
if TASK == 'qa':
    # For QA, we need to use a tweaked version of the Trainer (defined in helpers.py)
    # to enable the question-answering specific evaluation metrics
    trainer_class = QuestionAnsweringTrainer
    eval_kwargs['eval_examples'] = eval_dataset
    metric = datasets.load_metric('squad')
    compute_metrics = lambda eval_preds: metric.compute(
        predictions=eval_preds.predictions, references=eval_preds.label_ids)
elif TASK == 'nli':
    trainer_class = NliDebiasingTrainer
    init_kwargs['biased_model'] = model_class.from_pretrained(BIASED_MODEL, **task_kwargs).to(device) if BIASED_MODEL is not None else None
    compute_metrics = compute_accuracy

loading configuration file ./gdrive/MyDrive/nlp-final-project/out-less-data-3/config.json
Model config ElectraConfig {
  "_name_or_path": "./gdrive/MyDrive/nlp-final-project/out-less-data-3",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_pr

In [104]:
# This function wraps the compute_metrics function, storing the model's predictions
# so that they can be dumped along with the computed metrics
eval_predictions = None
def compute_metrics_and_store_predictions(eval_preds):
    global eval_predictions
    eval_predictions = eval_preds
    return compute_metrics(eval_preds)

# Initialize the Trainer object with the specified arguments and the model and dataset we loaded above
trainer = trainer_class(
    model=model,
    args=TRAINING_ARGS,
    train_dataset=train_dataset_featurized,
    eval_dataset=eval_dataset_featurized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_and_store_predictions,
    **init_kwargs
)

# Train and/or evaluate
if DO_TRAIN:
    trainer.train(USE_CHECKPOINT)
    trainer.save_model()
    # If you want to customize the way the loss is computed, you should subclass Trainer and override the "compute_loss"
    # method (see https://huggingface.co/transformers/_modules/transformers/trainer.html#Trainer.compute_loss).
    #
    # You can also add training hooks using Trainer.add_callback:
    #   See https://huggingface.co/transformers/main_classes/trainer.html#transformers.Trainer.add_callback
    #   and https://huggingface.co/transformers/main_classes/callback.html#transformers.TrainerCallback

In [105]:
if DO_EVAL:
    results = trainer.evaluate(**eval_kwargs)

    # To add custom metrics, you should replace the "compute_metrics" function (see comments above).
    #
    # If you want to change how predictions are computed, you should subclass Trainer and override the "prediction_step"
    # method (see https://huggingface.co/transformers/_modules/transformers/trainer.html#Trainer.prediction_step).
    # If you do this your custom prediction_step should probably start by calling super().prediction_step and modifying the
    # values that it returns.

    print('Evaluation results:')
    print(results)

    os.makedirs(TRAINING_ARGS.output_dir, exist_ok=True)

    with open(os.path.join(TRAINING_ARGS.output_dir, 'eval_metrics.json'), encoding='utf-8', mode='w') as f:
        json.dump(results, f)

    with open(os.path.join(TRAINING_ARGS.output_dir, 'eval_predictions.jsonl'), encoding='utf-8', mode='w') as f:
        if TASK == 'qa':
            predictions_by_id = {pred['id']: pred['prediction_text'] for pred in eval_predictions.predictions}
            for example in eval_dataset:
                example_with_prediction = dict(example)
                example_with_prediction['predicted_answer'] = predictions_by_id[example['id']]
                f.write(json.dumps(example_with_prediction))
                f.write('\n')
        else:
            for i, example in enumerate(eval_dataset):
                example_with_prediction = dict(example)
                example_with_prediction['predicted_scores'] = eval_predictions.predictions[i].tolist()
                example_with_prediction['predicted_label'] = int(eval_predictions.predictions[i].argmax())
                f.write(json.dumps(example_with_prediction))
                f.write('\n')

***** Running Evaluation *****
  Num examples = 30000
  Batch size = 128


(30000, 3) (30000, 2)
Evaluation results:
{'eval_loss': 1.995348572731018, 'eval_accuracy': 0.5007666945457458, 'eval_runtime': 51.2789, 'eval_samples_per_second': 585.036, 'eval_steps_per_second': 4.583}


In [106]:
mis_classified_exs = []
correct_classified_exs = []
with open(os.path.join(TRAINING_ARGS.output_dir, 'eval_predictions.jsonl'), encoding='utf-8', mode='r') as f:
  lines = f.readlines()
  for line in lines:
    parsed_line = json.loads(line)
    if parsed_line['predicted_label'] != parsed_line['label']:
      mis_classified_exs.append(parsed_line)
    else:
      correct_classified_exs.append(parsed_line)

In [107]:
print(len(mis_classified_exs), len(correct_classified_exs))

15058 14942


In [108]:
mis_classified_types_dict = {}
for ex in mis_classified_exs:
  key = (ex['label'], ex['predicted_label'])
  if key not in mis_classified_types_dict:
    mis_classified_types_dict[key] = []
  mis_classified_types_dict[key].append(ex)

In [109]:
mis_classified_types_dict.keys()

dict_keys([(1, 0), (1, 2), (0, 2)])

In [110]:
for item in mis_classified_types_dict.items():
  print(item[0], len(item[1]))

(1, 0) 14931
(1, 2) 69
(0, 2) 58


In [111]:
mis_classified_types_dict[(1, 0)]

[{'binary_parse_hypothesis': '( ( The doctor ) ( ( advised ( the president ) ) . ) )',
  'binary_parse_premise': '( ( The president ) ( ( advised ( the doctor ) ) . ) )',
  'heuristic': 'lexical_overlap',
  'hypothesis': 'The doctor advised the president .',
  'label': 1,
  'parse_hypothesis': '(ROOT (S (NP (DT The) (NN doctor)) (VP (VBD advised) (NP (DT the) (NN president))) (. .)))',
  'parse_premise': '(ROOT (S (NP (DT The) (NN president)) (VP (VBD advised) (NP (DT the) (NN doctor))) (. .)))',
  'predicted_label': 0,
  'predicted_scores': [2.290592908859253,
   -1.430079698562622,
   -1.2584950923919678],
  'premise': 'The president advised the doctor .',
  'subcase': 'ln_subject/object_swap',
  'template': 'temp1'},
 {'binary_parse_hypothesis': '( ( The managers ) ( ( saw ( the student ) ) . ) )',
  'binary_parse_premise': '( ( The student ) ( ( saw ( the managers ) ) . ) )',
  'heuristic': 'lexical_overlap',
  'hypothesis': 'The managers saw the student .',
  'label': 1,
  'parse_

In [112]:
temp_dict = {}
for ex in mis_classified_types_dict[(1, 0)]:
  key = ex['heuristic']
  if key not in temp_dict:
    temp_dict[key] = 0
  temp_dict[key] += 1

temp_dict

{'constituent': 4994, 'lexical_overlap': 4937, 'subsequence': 5000}

In [113]:
temp_dict = {}
for ex in correct_classified_exs:
  if ex['label'] != 1:
    continue
  key = ex['heuristic']
  if key not in temp_dict:
    temp_dict[key] = 0
  temp_dict[key] += 1

temp_dict

{}