# Evaluate Performance of Baseline Models
__Model performance will be evaluated on:__
1. In-Distribution sample (SNLI test split), in zero-shot settings (_this notebook_)
2. The following Out-of-Distribution samples:
    - HANS dataset (validation split), in zero-shot settings (_this notebook_)
    - NLI Diagnostics dataset, in zero-shot settings (_this notebook_)
    - Stress Test datasets, in zero-shot settings (_this notebook_)
    - ANLI datasets (test splits), after fine-tuning the model for each round (see [Part 2](https://github.com/shashiniyer/adversarial_nli_gpt2/blob/main/gpt2-medium/Evaluation_Baselines_Part2.ipynb))

__Performance indicators:__ Classification accuracy and $R_K$


## 1. Imports and Global Settings

In [1]:
from datasets import load_dataset, disable_caching, Dataset
from transformers import GPT2TokenizerFast, DataCollatorWithPadding, set_seed
import torch
from torch.nn.functional import one_hot
import numpy as np
import sys
sys.path.append('..')
from utils_ import tokenize, evaluate_acc_rk
import json
import pandas as pd
import pickle
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
set_seed(42)
disable_caching()

In [2]:
# set up tokeniser
# padding to left because GPT2 uses last token for prediction
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-medium", padding_side = 'left', \
                                              padding = True, truncation = True)
tokenizer.pad_token = tokenizer.eos_token # pad with 'eos' token

In [3]:
# set up data collator - https://huggingface.co/docs/transformers/main_classes/data_collator
# this is a (callable) helper object that sends batches of data to the model
data_collator = DataCollatorWithPadding(tokenizer, padding = 'max_length', \
                                         return_tensors = 'pt', max_length = 128)

In [4]:
# load models and set them in evaluation model, if needed
model1 = torch.load('baseline_unfiltered.pth')
model2 = torch.load('baseline_random_190k.pth')

if model1.training:

    model1.eval()

if model2.training:

    model2.eval()

# set up dictionary of the models
models = {'Unfiltered': model1, 'Random 190k Subset': model2}

## 2. In-Distribution Evaluation - SNLI test - Zero-Shot
### 2.1. Data Read + Pre-Processing
- Get SNLI Dataset (test split)
- One-hot encode labels
- Remove instances without gold standard labels, i.e., label = -1
- Tokenise data

In [5]:
# read in data
snli_test = load_dataset('snli', split = 'test')
snli_test = snli_test.filter(lambda x: x['label'] != -1).map( \
    lambda x: {'label': one_hot(torch.tensor(x['label']), 3).type(torch.float32).numpy()}, \
    batched = True)

# tokenize data
snli_test = snli_test.map(lambda x: tokenize(tokenizer, x['premise'] + '|' + x['hypothesis']))
len_bef_exclusion = len(snli_test)

# exclude instances with > 128 tokens
snli_test = snli_test.filter(lambda x: x['exclude'] == False)
len_aft_exclusion = len(snli_test)

# print message if instances were in fact excluded
if len_bef_exclusion - len_aft_exclusion > 0:
    
    print(f'{len_bef_exclusion - len_aft_exclusion} ' + \
          f'({(len_bef_exclusion/len_aft_exclusion - 1)*100:>2f}%) sequences excluded')

# format data as torch tensors
snli_test.set_format(type = 'torch', columns = ['label', 'input_ids', 'attention_mask'])

Reusing dataset snli (/home/shana92/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/9824 [00:00<?, ?ex/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

### 2.2. Evaluate

In [6]:
# set up dataloader (batch generator)
dataloader = torch.utils.data.DataLoader(snli_test, batch_size=128, collate_fn=data_collator)

# evaluate models
for model_name, model in models.items():
    
    acc, rk = evaluate_acc_rk(model, dataloader, device)
    print(f'Model: {model_name} - Dataset: SNLI (test) - Accuracy: {acc*100:>3f}%, RK: {rk:>3f}')

# free up some RAM
del snli_test

  0%|          | 0/77 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: SNLI (test) - Accuracy: 89.403504%, RK: 0.841182


  0%|          | 0/77 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: SNLI (test) - Accuracy: 86.370116%, RK: 0.796408


## 3. Out-of-Distribution Evaluation - HANS - Zero-Shot
### 3.1. Data Read + Pre-Processing
- Get HANS Dataset (validation split)
- One-hot encode labels
- Remove instances without gold standard labels, i.e., label = -1
- Tokenise data
- Partition the data by `heuristic`; categories are `constituent`, `lexical_overlap`, `subsequence`

In [7]:
# read in data
hans = load_dataset('hans', split = 'validation')
hans = hans.filter(lambda x: x['label'] != -1).map( \
    lambda x: {'label': one_hot(torch.tensor(x['label']), 3).type(torch.float32).numpy()}, \
    batched = True)

# tokenize data
hans = hans.map(lambda x: tokenize(tokenizer, x['premise'] + '|' + x['hypothesis']))
len_bef_exclusion = len(hans)

# exclude instances with > 128 tokens
hans = hans.filter(lambda x: x['exclude'] == False)
len_aft_exclusion = len(hans)

# print message if instances were in fact excluded
if len_bef_exclusion - len_aft_exclusion > 0:
    
    print(f'{len_bef_exclusion - len_aft_exclusion} ' + \
          f'({(len_bef_exclusion/len_aft_exclusion - 1)*100:>2f}%) sequences excluded')

# partition data by `heuristic` 
data_dict = {x: hans.filter(lambda y: y['heuristic'] == x) \
            for x in ['constituent', 'lexical_overlap', 'subsequence']}

# format as torch tensors
for val in data_dict.values():
    
    val.set_format(type = 'torch', columns = ['label', 'input_ids', 'attention_mask'])

Reusing dataset hans (/home/shana92/.cache/huggingface/datasets/hans/plain_text/1.0.0/452e93cf5383f5ae39088254215b517d0da98ccaaf0af8f7ab04d8f23f67dbd9)


  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/30000 [00:00<?, ?ex/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

### 3.2. Load model and evaluate

In [8]:
for data_name, data in data_dict.items():
    
    # set up dataloader (batch generator)
    dataloader = torch.utils.data.DataLoader(data, batch_size=128, collate_fn=data_collator)

    # evaluate models
    for model_name, model in models.items():
        
        acc, rk = evaluate_acc_rk(model, dataloader, device, problem = 'TE')
        print(f'Model: {model_name} - Dataset: {data_name} - Accuracy: {acc*100:>3f}%, RK: {rk:>3f}')

# free up some RAM
del hans
del data_dict

  0%|          | 0/79 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: constituent - Accuracy: 50.199997%, RK: 0.033393


  0%|          | 0/79 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: constituent - Accuracy: 51.270002%, RK: 0.106970


  0%|          | 0/79 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: lexical_overlap - Accuracy: 53.320003%, RK: 0.183194


  0%|          | 0/79 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: lexical_overlap - Accuracy: 50.010002%, RK: 0.010000


  0%|          | 0/79 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: subsequence - Accuracy: 50.629997%, RK: 0.075034


  0%|          | 0/79 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: subsequence - Accuracy: 49.930000%, RK: -0.026467


## 4. Out-of-Distribution Evaluation - NLI Diagnostics - Zero-Shot
### 4.1. Data Read + Pre-Processing
- Get NLI Diagnostics Dataset
- One-hot encode labels
- Tokenise data
- Partition data by heuristic type - `Lexical Semantics`, `Predicate-Argument Structure`, `Logic`, `Knowledge`

In [10]:
# read in data
nli_diag = Dataset.from_pandas(pd.read_csv('../raw_data/diagnostic-full.tsv', delimiter = '\t'))
text_label_encoder = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
nli_diag = nli_diag.map( \
    lambda x: {'label': one_hot(torch.tensor(text_label_encoder[x['Label']]), 3).type(torch.float32).numpy()})

# tokenize data
nli_diag = nli_diag.map(lambda x: tokenize(tokenizer, x['Premise'] + '|' + x['Hypothesis']))
len_bef_exclusion = len(nli_diag)

# exclude instances with > 128 tokens
nli_diag = nli_diag.filter(lambda x: x['exclude'] == False)
len_aft_exclusion = len(nli_diag)

# print message if instances were in fact excluded
if len_bef_exclusion - len_aft_exclusion > 0:
    
    print(f'{len_bef_exclusion - len_aft_exclusion} ' + \
          f'({(len_bef_exclusion/len_aft_exclusion - 1)*100:>2f}%) sequences excluded')

# partition data by heuristic
data_dict = {x: nli_diag.filter(lambda y: y[x] is not None) \
            for x in ['Lexical Semantics', 'Predicate-Argument Structure', 'Logic', 'Knowledge']}

# format as torch tensors
for val in data_dict.values():
    
    val.set_format(type = 'torch', columns = ['label', 'input_ids', 'attention_mask'])

  0%|          | 0/1104 [00:00<?, ?ex/s]

  0%|          | 0/1104 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

### 4.2. Evaluate

In [11]:
for data_name, data in data_dict.items():
    
    # set up dataloader (batch generator)
    dataloader = torch.utils.data.DataLoader(data, batch_size=128, collate_fn=data_collator)

    # evaluate models
    for model_name, model in models.items():
        
        acc, rk = evaluate_acc_rk(model, dataloader, device)
        print(f'Model: {model_name} - Dataset: {data_name} - Accuracy: {acc*100:>3f}%, RK: {rk:>3f}')

# free up some RAM
del nli_diag
del data_dict

  0%|          | 0/3 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: Lexical Semantics - Accuracy: 53.532606%, RK: 0.287183


  0%|          | 0/3 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: Lexical Semantics - Accuracy: 49.184781%, RK: 0.209301


  0%|          | 0/4 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: Predicate-Argument Structure - Accuracy: 61.792451%, RK: 0.343092


  0%|          | 0/4 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: Predicate-Argument Structure - Accuracy: 58.962262%, RK: 0.283766


  0%|          | 0/3 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: Logic - Accuracy: 46.153846%, RK: 0.193275


  0%|          | 0/3 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: Logic - Accuracy: 43.406594%, RK: 0.143971


  0%|          | 0/3 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: Knowledge - Accuracy: 42.957747%, RK: 0.136357


  0%|          | 0/3 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: Knowledge - Accuracy: 38.732395%, RK: 0.109651


## 5. Out-of-Distribution Evaluation - Stress Tests - Zero-Shot
### 5.1. Data Read + Pre-Processing
- Get Stress Test Datasets
- Partition data by heuristic type:
    - `Competence` to consist of the datasets `antonym_matched`, `antonym_mismatched`, `quant_hard`
    - `Distraction` to consist of the datasets `taut2_matched`, `taut2_mismatched`, `negation_matched`,`negation_mismatched`, `length_mismatch_matched`, `length_mismatch_mismatched`
    - `Noise` to consist of the datasets `dev_gram_functionword_swap_perturbed_matched`, `dev_gram_keyboard_matched`, `dev_gram_functionword_swap_perturbed_mismatched`, `dev_gram_swap_mismatched`,
 `dev_gram_keyboard_mismatched`, `dev_gram_swap_matched`, `dev_gram_contentword_swap_perturbed_mismatched`, `dev_gram_contentword_swap_perturbed_matched`
- One-hot encode labels
- Tokenise data

In [13]:
# load in files from '../stress_tests_datasets.pkl'
with open('../stress_tests_datasets.pkl', 'rb') as f:
    stress_tests_datasets = pickle.load(f)   

# utility function to concatenate datasets and return 'datasets.Dataset' in torch format
def conc_prep_datasets(heuristic, key_list):
    
    # concat datasets
    out = stress_tests_datasets[key_list[0]]
    
    for key in key_list[1:]:
        
        out = pd.concat([out, stress_tests_datasets[key]], axis = 0)
    
    # one-hot encode labels
    out = Dataset.from_pandas(out).map(lambda x: \
        {'label': one_hot(torch.tensor(text_label_encoder[x['gold_label']]), 3).type(torch.float32).numpy()})
    
    # tokenize
    out = out.map(lambda x: tokenize(tokenizer, x['sentence1'] + '|' + x['sentence2']))
    len_bef_exclusion = len(out)
    
    # exclude instances with > 128 tokens
    out = out.filter(lambda x: x['exclude'] == False)
    len_aft_exclusion = len(out)

    # print message if instances were in fact excluded
    if len_bef_exclusion - len_aft_exclusion > 0:

        print(f'Heuristic: {heuristic} - {len_bef_exclusion - len_aft_exclusion} ' + \
              f'({(len_bef_exclusion/len_aft_exclusion - 1)*100:>2f}%) sequences excluded')
    
    # format data as torch tensors
    out.set_format(type = 'torch', columns = ['label', 'input_ids', 'attention_mask'])
    
    return(out)
    
# partition data by heuristic + pre-process them
data_dict = {'Competence': conc_prep_datasets('Competence', \
                                              ['antonym_matched', 'antonym_mismatched', 'quant_hard']), \
             'Distraction': conc_prep_datasets('Distraction', \
                             ['taut2_matched', 'taut2_mismatched', 'negation_matched', 'negation_mismatched', \
                             'length_mismatch_matched', 'length_mismatch_mismatched']), \
             'Noise': conc_prep_datasets('Noise', \
                                         [k for k in stress_tests_datasets.keys() if k.startswith('dev_gram')])}

  0%|          | 0/24071 [00:00<?, ?ex/s]

  0%|          | 0/24071 [00:00<?, ?ex/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

Heuristic: Competence - 396 (1.672650%) sequences excluded


  0%|          | 0/294705 [00:00<?, ?ex/s]

  0%|          | 0/294705 [00:00<?, ?ex/s]

  0%|          | 0/295 [00:00<?, ?ba/s]

Heuristic: Distraction - 1060 (0.360980%) sequences excluded


  0%|          | 0/333160 [00:00<?, ?ex/s]

  0%|          | 0/333160 [00:00<?, ?ex/s]

  0%|          | 0/334 [00:00<?, ?ba/s]

Heuristic: Noise - 1070 (0.322202%) sequences excluded


### 5.2. Evaluate

In [14]:
for data_name, data in data_dict.items():
    
    # set up dataloader (batch generator)
    dataloader = torch.utils.data.DataLoader(data, batch_size=128, collate_fn=data_collator)

    # evaluate models
    for model_name, model in models.items():
        
        acc, rk = evaluate_acc_rk(model, dataloader, device)
        print(f'Model: {model_name} - Dataset: {data_name} - Accuracy: {acc*100:>3f}%, RK: {rk:>3f}')

# free up some RAM
del stress_tests_datasets
del data_dict

  0%|          | 0/185 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: Competence - Accuracy: 42.065468%, RK: -0.108413


  0%|          | 0/185 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: Competence - Accuracy: 28.764519%, RK: -0.098956


  0%|          | 0/2295 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: Distraction - Accuracy: 58.131415%, RK: 0.377976


  0%|          | 0/2295 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: Distraction - Accuracy: 51.099116%, RK: 0.303057


  0%|          | 0/2595 [00:00<?, ?it/s]

Model: Unfiltered - Dataset: Noise - Accuracy: 64.691800%, RK: 0.471141


  0%|          | 0/2595 [00:00<?, ?it/s]

Model: Random 190k Subset - Dataset: Noise - Accuracy: 57.425696%, RK: 0.374395
