In [1]:
import sys
sys.path.append('../')

In [2]:
# from robustness_gym import *
from robustness_gym.dataset import *
from robustness_gym.slice import *
from robustness_gym.slicer import *
from robustness_gym.slicers.filters.phrase import *
from robustness_gym.slicers.augmentations.eda import *

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/krandiash/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
import pandas as pd
import numpy as np
import yaml
from transformers import *
from nlp import list_datasets

## Introduction to Robustness Gym

### Data Loading
You can load any dataset that is available in the Huggingface `nlp` library.

In [4]:
# Load the 'rte' configuration of 'glue'; the result is indexed by split name (e.g. 'train')
dataset = Dataset.load_dataset('glue', 'rte')

Checking /Users/krandiash/.cache/huggingface/datasets/5fe6ab0df8a32a3371b2e6a969d31d855a19563724fb0d0f163748c270c0ac60.963f53802998769c72e843c743cb6eb03be5053ac47f7af59caecd02f34c2ee3.py for additional imports.
Lock 140398346681264 acquired on /Users/krandiash/.cache/huggingface/datasets/5fe6ab0df8a32a3371b2e6a969d31d855a19563724fb0d0f163748c270c0ac60.963f53802998769c72e843c743cb6eb03be5053ac47f7af59caecd02f34c2ee3.py.lock
Found main folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/glue/glue.py at /usr/local/anaconda3/envs/mayanshell/lib/python3.8/site-packages/nlp/datasets/glue
Found specific version folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/glue/glue.py at /usr/local/anaconda3/envs/mayanshell/lib/python3.8/site-packages/nlp/datasets/glue/005857b1e5a6280d8f1a9b9537d44a08ba30cb6be958e81fac98e625a0d487a7
Found script file from https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/glue/glue.py to /usr/local/an

In [5]:
# As an example, let's just focus on the training set of RTE
# NOTE: this rebinds `dataset` from the collection of splits to the train split only,
# so re-run the loading cell before re-running this one.
dataset = dataset['train']

In [6]:
print("\nWe implement a wrapper over Huggingface nlp.Dataset that works exactly the same way.")
print(type(dataset))


We implement a wrapper over Huggingface nlp.Dataset that works exactly the same way.
<class 'robustness_gym.dataset.Dataset'>


## Working with Huggingface: BERT-Base and NLI
You can use the robustness_gym.Dataset class to train Huggingface models and evaluate them easily.

In [7]:
# Load the tokenizer that belongs to the pretrained 'bert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Configure the model: num_labels comes from the number of classes of the dataset's
# 'label' feature, and the attention / hidden-state outputs are switched off
autoconfig = AutoConfig.from_pretrained('bert-base-uncased',
                                        num_labels=dataset.info.features['label'].num_classes,
                                        output_attentions=False,
                                        output_hidden_states=False)

# Instantiate the sequence-classification model from the same checkpoint with that config
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', 
                                                           config=autoconfig)

loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/krandiash/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/krandiash/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df4

In [8]:
# Tokenize our training dataset
def convert_to_features(example_batch):
    """Tokenize a batch of sentence pairs with the notebook-level `tokenizer`.

    Args:
        example_batch: a batch in column format; its 'sentence1' and
            'sentence2' entries are read as parallel sequences of strings.

    Returns:
        The encodings produced by `tokenizer.batch_encode_plus` for the
        (sentence1, sentence2) pairs, requested with pad_to_max_length=True.
    """
    first_sentences = example_batch['sentence1']
    second_sentences = example_batch['sentence2']

    # Pair the i-th first sentence with the i-th second sentence
    sentence_pairs = [
        (first, second)
        for first, second in zip(first_sentences, second_sentences)
    ]

    return tokenizer.batch_encode_plus(sentence_pairs, pad_to_max_length=True)

# TODO: move `convert_to_features` (defined just above) into the Dataset class as a method

# Tokenize the dataset; with batched=True, convert_to_features receives batches of examples
dataset = dataset.map(convert_to_features, batched=True)

Loading cached processed dataset at /Users/krandiash/.cache/huggingface/datasets/glue/rte/1.0.0/cache-cec925ced20ef4a34f0497567b24dbd2.arrow


In [9]:
# Format the dataset so that indexing returns torch.Tensor objects for the model input columns
dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])

# Optionally, you could go back to the default format (outputs all keys)
# dataset.set_format(type=None)

Set __getitem__(key) output type to torch for ['input_ids', 'token_type_ids', 'attention_mask'] columns  (when key is int or slice) and don't output other (un-formated) columns.


In [10]:
# Run a forward pass of the model on the first four examples
# NOTE(review): gradients are tracked here (the output carries a grad_fn);
# consider wrapping the call in torch.no_grad() if only predictions are needed.
model(**dataset[:4])

(tensor([[0.7772, 0.3138],
         [0.7691, 0.3411],
         [0.7612, 0.3303],
         [0.7656, 0.3494]], grad_fn=<AddmmBackward>),)

In [11]:
# Everything works as expected with robustness_gym's Dataset class
type(dataset)

robustness_gym.dataset.Dataset

### Slices and Slicers in Robustness Gym

A __Slice__ is a collection of examples that is useful for evaluation. A Slice is effectively a Dataset with some lineage information about how the Slice was generated. 

There are several ways to construct Slices:
- Slices can be __subpopulations__ of a Dataset. E.g. all sentences in a dataset that contain the word "not".
- Slices can be generated by __augmenting__ a Dataset. E.g. passing a Dataset through backtranslation will yield a Slice with paraphrased sentences.
- Slices can be generated by __attacking__ a Dataset. E.g. an adversarial attack that adds a trigger word to every sentence.


__Slicers__ take Datasets as input and output Slices. A Slicer just implements the core functionality needed to generate a Slice.

In [12]:
# Initialize the dataset
# This caches some useful information that can be used to write slicing functions
# First undo the torch formatting set earlier, so that examples are plain python objects again
dataset.set_format(type=None)
# Build the cache for the two sentence columns
dataset.initialize(keys=['sentence1', 'sentence2'])

Set __getitem__(key) output type to python objects for no columns  (when key is int or slice) and don't output other (un-formated) columns.
checking cache for https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz at /Users/krandiash/.allennlp/cache/653d0c5a1fb85ac98e84e332fa2a2c0596d9c86a2f38189886d65a422dabe1e9.8cfb67d64c5824347f7328a0f84e46d2e74f9d9bb1aba6441b313d5aaccdea4d
waiting to acquire lock on /Users/krandiash/.allennlp/cache/653d0c5a1fb85ac98e84e332fa2a2c0596d9c86a2f38189886d65a422dabe1e9.8cfb67d64c5824347f7328a0f84e46d2e74f9d9bb1aba6441b313d5aaccdea4d
Lock 140397801108624 acquired on /Users/krandiash/.allennlp/cache/653d0c5a1fb85ac98e84e332fa2a2c0596d9c86a2f38189886d65a422dabe1e9.8cfb67d64c5824347f7328a0f84e46d2e74f9d9bb1aba6441b313d5aaccdea4d.lock
cache of https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz is up-to-date
Lock 140397801108624 released on /Users/krandiash/.allennlp/cache/6

```python
    def spacy_pipe(self, examples: Dict[str, List], key: str):
        """
        This preprocessor is applied when dataset.initialize() is called.

        It caches the output of spacy's nlp() call.

        Args:
            examples: a batch of examples; `examples[key]` is the sequence of
                texts that is handed to spacy.
            key: the entry of `examples` to process. It is also used as the
                key under which each document is stored inside the 'spacy'
                cache entry.

        Returns:
            The result of `self.update_cache`, called with the batch and one
            `{'spacy': {key: <doc as json>}}` update per processed text.
        """

        # Apply spacy's pipe method to process the examples
        docs = list(self.nlp.pipe(examples[key]))

        # Convert the docs to json and update the examples
        return self.update_cache(examples, [{'spacy': {key: val.to_json()}} for val in docs])
```

In [13]:
# Every example is just a dict with some keys

print(dataset[0].keys()) # keys for the first example

dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask', 'index', 'slices', 'cache'])


In [17]:
# This dataset has examples with sentence pairs
print(dataset[0]['sentence1'])
print(dataset[0]['sentence2'])

No Weapons of Mass Destruction Found in Iraq Yet.
Weapons of Mass Destruction Found in Iraq.


In [14]:
# The 'cache' key contains all information that's cached by initialize
print(dataset[0]['cache'].keys())

dict_keys(['spacy', 'stripped'])


In [22]:
# The 'spacy' key inside 'cache' contains the output of spacy's nlp() call, when applied to both sentence1 and sentence2
print(yaml.dump(dataset[0]['cache']['spacy']['sentence1']))

# Similarly for sentence2
# print(yaml.dump(dataset[0]['cache']['spacy']['sentence2']))

ents:
- end: 44
  label: GPE
  start: 40
sents:
- end: 44
  start: 0
- end: 49
  start: 45
text: No Weapons of Mass Destruction Found in Iraq Yet.
tokens:
- dep: det
  end: 2
  head: 1
  id: 0
  pos: DET
  start: 0
  tag: DT
- dep: ROOT
  end: 10
  head: 1
  id: 1
  pos: PROPN
  start: 3
  tag: NNPS
- dep: prep
  end: 13
  head: 1
  id: 2
  pos: ADP
  start: 11
  tag: IN
- dep: compound
  end: 18
  head: 4
  id: 3
  pos: PROPN
  start: 14
  tag: NNP
- dep: pobj
  end: 30
  head: 2
  id: 4
  pos: PROPN
  start: 19
  tag: NNP
- dep: acl
  end: 36
  head: 1
  id: 5
  pos: VERB
  start: 31
  tag: VBN
- dep: prep
  end: 39
  head: 5
  id: 6
  pos: ADP
  start: 37
  tag: IN
- dep: pobj
  end: 44
  head: 6
  id: 7
  pos: PROPN
  start: 40
  tag: NNP
- dep: ROOT
  end: 48
  head: 8
  id: 8
  pos: ADV
  start: 45
  tag: RB
- dep: punct
  end: 49
  head: 8
  id: 9
  pos: PUNCT
  start: 48
  tag: .



#### A Simple Slicer for Phrase Matching
We have other slicers implemented as well. You can refer to the codebase (documentation pending).

In [23]:
# Create a slicer for finding phrases
has_phrase_slicer = HasPhrase(phrases=['never', 'not', 'do'])

In [24]:
# Apply the slicer on the dataset
# The call returns three things: the (updated) dataset, the slices and the slice labels.
# NOTE(review): presumably one slice / one label column per phrase, in the order the
# phrases were given to HasPhrase — confirm against the Slicer implementation.
dataset, slices, slice_labels = has_phrase_slicer(dataset, keys=['sentence1', 'sentence2'])

Caching processed dataset at /Users/krandiash/.cache/huggingface/datasets/glue/rte/1.0.0/cache-69ad4173bc46ae6271af42690d8e03bc.arrow
100%|██████████| 78/78 [00:07<00:00,  9.86it/s]
Done writing 2490 examples in 27316278 bytes /Users/krandiash/.cache/huggingface/datasets/glue/rte/1.0.0/cache-69ad4173bc46ae6271af42690d8e03bc.arrow.


In [25]:
# Use slice labels to see what was sliced
pd.DataFrame(slice_labels, 
             columns=has_phrase_slicer.headers)

Unnamed: 0,HasPhrase('never'),HasPhrase('not'),HasPhrase('do')
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
2485,0,0,0
2486,0,0,0
2487,0,0,0
2488,0,0,0


In [27]:
# slices[0] is reported here as the slice for the phrase 'never'
print("Number of 'never' examples:", len(slices[0]))
print()
# Show only the first example of that slice
for e in slices[0]:
    print(e['sentence1'], e['sentence2'])
    break

Number of 'never' examples: 31

Mohandas Karamchand Gandhi never received the Nobel Peace Prize, though he was nominated for it five times between 1937 and 1948. Mohandas received the Nobel Prize in 1989.


In [37]:
# The Dataset now contains the Slice information
# This will be updated to make it more flexible (and to track multiple invocations of the same Slicer)
dataset[0]['slices']

{'filtered': {'HasPhrase': [0, 0, 0]}}

### Slicers that do Augmentation

In [44]:
# Create an EasyDataAugmentation Slicer
eda = EasyDataAugmentation(num_aug=3) # generate 3 augmented examples per original example

In [45]:
# Slice! Augment the first sentence
dataset, slices, slice_labels = eda(dataset, keys=['sentence1'])

Caching processed dataset at /Users/krandiash/.cache/huggingface/datasets/glue/rte/1.0.0/cache-c6f345fafb41f40f7ddf4ea147c226aa.arrow
100%|██████████| 78/78 [00:14<00:00,  5.29it/s]
Done writing 2490 examples in 77482509 bytes /Users/krandiash/.cache/huggingface/datasets/glue/rte/1.0.0/cache-c6f345fafb41f40f7ddf4ea147c226aa.arrow.


In [47]:
# This created 3 slices, since we asked for 3 augmented examples per original example
len(slices)

3

In [52]:
# The first Slice
print(slices[0])
print()
for e in slices[0]:
    print("An example from this slice.\n")
    print(e)
    break

Slice(schema: {'sentence1': 'string', 'sentence2': 'string', 'label': 'int64', 'idx': 'int64', 'input_ids': 'list<item: int64>', 'token_type_ids': 'list<item: int64>', 'attention_mask': 'list<item: int64>', 'index': 'string'}, num_rows: 2490)

An example from this slice.

{'sentence1': 'no weapons of in destruction found mass iraq yet', 'sentence2': 'Weapons of Mass Destruction Found in Iraq.', 'label': 1, 'idx': 0, 'input_ids': [101, 2053, 4255, 1997, 3742, 6215, 2179, 1999, 5712, 2664, 1012, 102, 4255, 1997, 3742, 6215, 2179, 1999, 5712, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [60]:
# The dataset also contains this information
print(dataset[0]['slices']['augmented'].keys())
print(len(dataset[0]['slices']['augmented']['EasyDataAugmentation']))
print(dataset[0]['slices']['augmented']['EasyDataAugmentation'])

dict_keys(['EasyDataAugmentation'])
3
[{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'idx': 0, 'index': '0-EasyDataAugmentation-0', 'input_ids': [101, 2053, 4255, 1997, 3742, 6215, 2179, 1999, 5712, 2664, 1012, 102, 4255, 1997, 3742, 6215, 2179, 1999, 5712, 1012, 102, 0, 0, 

### Creating your own Slicers

In [84]:
# Create your slicer by creating a function that can be applied to batches
# Here we just create a dummy Slicer that prints out a message whenever it runs
my_slicer = Slicer(slice_batch_fn=lambda batch, keys: print("My slicer is running."))
# Run it on a batch made of the first two examples
my_slicer(dataset[:2], keys=['sentence1'])

My slicer is running.


### Combining Slices

In [None]:
# Interleave or chain a list of slices
# Slice.interleave(slices)
# Slice.chain(slices)

### Combining Slicers

In [79]:
# Create a complicated Slicer that takes intersections and unions
# NOTE(review): presumably this selects examples that contain both 'Destruction' and 'Iraq',
# or that contain 'some' — confirm against FilterMixin.union / FilterMixin.intersection.
complicated_has_phrase = FilterMixin.union(
    FilterMixin.intersection(
        HasPhrase(['Destruction']),
        HasPhrase(['Iraq']),
    ),
    HasPhrase(['some']),
)

In [80]:
# Apply it to a batch of data with the same interface
batch, _, slice_labels = complicated_has_phrase(dataset[:32], keys=['sentence1'])

{'HasPhrase': [0, 0, 0]}


In [81]:
# View the slice labels
pd.DataFrame(slice_labels, columns=['ComplicatedHasPhrase'])

Unnamed: 0,ComplicatedHasPhrase
0,1
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [83]:
# Found this!
print(batch['sentence1'][0])
print(batch['sentence2'][0])

No Weapons of Mass Destruction Found in Iraq Yet.
Weapons of Mass Destruction Found in Iraq.


In [85]:
# Example 10 of the batch: its sentence1 contains the phrase 'some'
print(batch['sentence1'][10])
print(batch['sentence2'][10])

Lina Joy, 42, was born Azlina Jailani to Malay parents, and was raised as a Muslim. Malaysia's constitution guarantees freedom of religion, but by law, all ethnic Malays are Muslim. Joy converted to Christianity at age 26, and after some bureaucratic difficulties had her named legally changed in 1999. However, on her MyKad national ID, the National Registration Department retained her stated religion as Islam. In order to have her religion changed, the National Registration Department said Joy would have to obtain a certificate of apostasy from the Muslim Sharia Court.
Lina Joy's parents are from Malaysia.
