GLUE datasets: the model will be trained on the eval set, so you shouldn't also test on the eval set. The problem is that the labels are withheld for the GLUE test sets.
Start with SNLI. MultiNLI is a later option too. As is rotten_tomatoes. 
* Victim model performance on dataset train, valid, test set. (done, written code to measure it)
* Create new paraphrased valid + test datasets (done a preliminary version on the valid set) 
* Measure victim model performance on paraphrased datasets (done: accuracy on the vanilla valid set is about 87%; generating 16 paraphrases per example (i.e. not many) and evaluating on all of them gives ~75% accuracy)
* Get document embeddings of original and paraphrased and compare (done)
  * https://github.com/UKPLab/sentence-transformers
* Write a simple way to measure paraphrase quality (done) 
* Construct reward function 


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch 
from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric
import datasets, transformers
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer
from pprint import pprint
import numpy as np, pandas as pd
import scipy
from utils import *   # local script 
import pyarrow
from sentence_transformers import SentenceTransformer, util
from IPython.core.debugger import set_trace
from GPUtil import showUtilization
import seaborn as sns
from itertools import repeat
from collections import defaultdict
from IPython.display import Markdown

# Directories used for cached datasets and for results.
path_cache = './cache/'
path_results = "./results/"

# One fixed seed for torch (CPU and CUDA) and for numpy.
seed = 420
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)

# Run on the GPU when one is available; devicenum is -1 when running on CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
    devicenum = torch.cuda.current_device()
else:
    device = torch.device('cpu')
    devicenum = -1

batch_size = 64
n_wkrs = torch.cuda.device_count() * 4  # DataLoader worker count (0 when no GPU is visible)
pd.options.display.max_colwidth = 400

In [3]:
# Paraphrase model (para)
# Seq2seq model and tokenizer used by get_paraphrases to generate paraphrases.
# The model is moved to `device` once here.
para_name = "tuner007/pegasus_paraphrase"
para_tokenizer = AutoTokenizer.from_pretrained(para_name)
para_model = AutoModelForSeq2SeqLM.from_pretrained(para_name).to(device)

In [4]:
# Victim Model (VM)
# Sequence classifier whose predictions are recorded for (premise, hypothesis)
# pairs, both for original hypotheses and for their paraphrases.
vm_name = "textattack/distilbert-base-cased-snli"
vm_tokenizer = AutoTokenizer.from_pretrained(vm_name)
vm_model = AutoModelForSequenceClassification.from_pretrained(vm_name).to(device)
# Label <-> index maps and the number of classes, read from the model itself.
vm_idx2lbl = vm_model.config.id2label
vm_lbl2idx = vm_model.config.label2id
vm_num_labels = vm_model.num_labels

In [5]:
# Semantic Similarity model
# generate_sim_scores encodes the original hypothesis and its paraphrases with
# this model and compares the embeddings with cosine similarity.
embedding_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [6]:
dataset = load_dataset("snli")
label_cname = 'label'


def remove_minus1_labels(x):
    """Row filter: keep a row only when its label is not -1."""
    return x[label_cname] != -1


# Drop the rows labelled -1 from every split.
train = dataset['train'].filter(remove_minus1_labels)
valid = dataset['validation'].filter(remove_minus1_labels)
test = dataset['test'].filter(remove_minus1_labels)

# make sure that all datasets have the same number of labels as what the victim model predicts
assert all(split.features[label_cname].num_classes == vm_num_labels
           for split in (train, valid, test))

# Shuffled loaders over the three filtered splits.
train_dl = DataLoader(train, shuffle=True, batch_size=batch_size, num_workers=n_wkrs)
valid_dl = DataLoader(valid, shuffle=True, batch_size=batch_size, num_workers=n_wkrs)
test_dl = DataLoader(test, shuffle=True, batch_size=batch_size, num_workers=n_wkrs)

Reusing dataset snli (/data/tproth/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [7]:
def get_paraphrases(input_text,num_return_sequences,num_beams, num_beam_groups=1,diversity_penalty=0):
    """Generate paraphrases of `input_text` with the global paraphrase model.

    The text is tokenized with `para_tokenizer` (truncated, padded to the
    longest item), passed to `para_model.generate` with the given beam-search
    settings, and the generated ids are decoded back to strings with special
    tokens removed.

    Returns the list of decoded strings.
    """
    encoded = para_tokenizer(input_text, truncation=True, padding='longest',
                             return_tensors="pt")
    encoded = encoded.to(device)
    generation_kwargs = dict(
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
    )
    generated_ids = para_model.generate(**encoded, **generation_kwargs)
    return para_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

def gen_dataset_paraphrases(x, cname_input, cname_output, n_seed_seqs=32):
    """Add a list of paraphrases of one column to a single dataset row.

    x: one row of a dataset (modified in place and returned).
    cname_input: column to generate paraphrases for.
    cname_output: column name under which the paraphrase list is stored.
    n_seed_seqs: rough indicator of how many paraphrases to return; must be
            divisible by 4. Half are requested from plain beam search and
            half from diverse (grouped) beam search.

    Raises ValueError when n_seed_seqs is not divisible by 4.
    """
    # TODO: figure out how to batch this.
    if n_seed_seqs % 4:
        raise ValueError("keep n_seed_seqs divisible by 4 for now")
    half = int(n_seed_seqs / 2)
    source_text = x[cname_input]
    # low diversity paraphrases: ordinary beam search
    low_div = get_paraphrases(source_text, num_return_sequences=half, num_beams=half)
    # high diversity paraphrases: one beam per group plus a diversity penalty.
    # num_beam_groups and diversity_penalty can be treated as hyperparameters.
    high_div = get_paraphrases(source_text, num_return_sequences=half, num_beams=half,
                               num_beam_groups=half, diversity_penalty=50002.5)
    x[cname_output] = low_div + high_div  # TODO: change to list(set(l))
    return x


In [8]:
# Generate paraphrase dataset (cached on disk under a name built from date and n_seed_seqs)
n_seed_seqs = 48
date = '20210629'
fname = f"{path_cache}valid_small_{date}_{n_seed_seqs}"
if not os.path.exists(fname):
    # Cache miss: take the first of 20 contiguous shards of the validation set,
    # paraphrase every hypothesis row by row, then store the result.
    valid_small = valid.shard(20, 0, contiguous=True).map(
        lambda x: gen_dataset_paraphrases(x, cname_input='hypothesis',
                                          cname_output='hypothesis_paraphrases',
                                          n_seed_seqs=n_seed_seqs),
        batched=False)
    valid_small.save_to_disk(fname)
else:
    valid_small = datasets.load_from_disk(fname)  # simple caching
    
    

In [9]:
# Create a new version of paraphrase dataset by repeating all other fields to be same 
# length as number of paraphrases. 
def create_paraphrase_dataset(batch, l_cname):
    """Expand a batch so that every paraphrase gets its own row.

    batch: mapping of column name -> list of values (one entry per row).
    l_cname: column name that contains the list of paraphrases.

    For each input row, the paraphrase list is flattened into the output
    column and the value of every other column is repeated once per
    paraphrase. Returns a defaultdict(list) of the expanded columns.
    """
    expanded = defaultdict(list)
    column_names = list(batch.keys())
    for row_values in zip(*batch.values()):
        row = dict(zip(column_names, row_values))
        n_copies = len(row[l_cname])
        for cname, value in row.items():
            if cname == l_cname:
                expanded[cname].extend(value)
            else:
                expanded[cname].extend([value] * n_copies)
    return expanded

fname = f"{path_cache}valid_small_paraphrases_{date}_{n_seed_seqs}"
if not os.path.exists(fname):
    # Need to call this with batched=True to work: the mapped function
    # returns more rows than it receives.
    valid_small_paraphrases = valid_small.map(
        lambda x: create_paraphrase_dataset(x, l_cname='hypothesis_paraphrases'),
        batched=True)
    valid_small_paraphrases.save_to_disk(fname)
else:
    valid_small_paraphrases = datasets.load_from_disk(fname)


In [10]:
# Generate results dataframe 
def get_vm_scores():
    """Return victim-model class probabilities for paraphrases and originals.

    Iterates over the global `valid_small_paraphrases` dataset in order (no
    shuffling). For every batch the victim model is run twice: once on
    (premise, hypothesis) and once on (premise, hypothesis_paraphrases).

    Returns:
        (para_probs_t, orig_probs_t): two CPU tensors of softmax
        probabilities, one row per dataset row, in dataset order.
    """
    print("Getting victim model scores.")
    dl = DataLoader(valid_small_paraphrases, batch_size=batch_size, shuffle=False,
                    num_workers=n_wkrs, pin_memory=True)
    assert vm_model.training == False  # checks that model is in eval mode

    def vm_probs(premise, hypothesis):
        # Tokenize the sentence pairs, run the victim model and return the
        # softmax over the class dimension, moved to the CPU. No labels are
        # passed: only the logits are needed, so no loss is computed.
        inputs = vm_tokenizer(premise, hypothesis, padding=True, truncation=True,
                              return_tensors="pt").to(device)
        return vm_model(**inputs).logits.softmax(1).cpu()

    para_probs_l, orig_probs_l = [], []
    with torch.no_grad():
        for i, data in enumerate(dl):
            if i % 50 == 0 : print(i, "out of", len(dl))
            premise = data["premise"]
            orig_probs_l.append(vm_probs(premise, data["hypothesis"]))
            para_probs_l.append(vm_probs(premise, data["hypothesis_paraphrases"]))

    orig_probs_t, para_probs_t = torch.cat(orig_probs_l), torch.cat(para_probs_l)
    return para_probs_t, orig_probs_t

def generate_sim_scores():
    """Cosine similarity between each original hypothesis and its paraphrases.

    Loops over the rows of the global `valid_small`, embeds the hypothesis
    and its list of paraphrases with `embedding_model`, and collects the
    similarity of the hypothesis to each paraphrase. Returns all per-row
    score vectors concatenated into one tensor, in row order.
    """
    print("Getting similarity scores")
    per_row_scores = []
    for row_num, row in enumerate(valid_small):
        if row_num % 50 == 0:
            print(row_num, "out of", len(valid_small))
        orig_emb = embedding_model.encode(row['hypothesis'])
        para_emb = embedding_model.encode(row['hypothesis_paraphrases'])
        # First (only) row of the similarity matrix: original vs each paraphrase.
        per_row_scores.append(util.cos_sim(orig_emb, para_emb)[0])
    return torch.cat(per_row_scores)

# Build (or load from the CSV cache) the per-paraphrase results table.
fname = path_cache + 'results_df_'+ date + "_" + str(n_seed_seqs) + ".csv"
if os.path.exists(fname):
    results_df = pd.read_csv(fname)
else: 
    sim_score_t = generate_sim_scores()
    para_probs_t, orig_probs_t = get_vm_scores()
    # Probability the victim model assigns to the TRUE label, for the
    # paraphrase and for the original hypothesis of every row.
    vm_para_scores = torch.tensor([r[idx] for idx,r in zip(valid_small_paraphrases['label'],para_probs_t)])
    vm_orig_scores = torch.tensor([r[idx] for idx,r in zip(valid_small_paraphrases['label'],orig_probs_t)])
    
    results_df = pd.DataFrame({'premise': valid_small_paraphrases['premise'],
                  'orig': valid_small_paraphrases['hypothesis'],
                  'para': valid_small_paraphrases['hypothesis_paraphrases'],
                  'sim_score': sim_score_t,
                  'label_true': valid_small_paraphrases['label'], 
                  'label_vm_orig': orig_probs_t.argmax(1),
                  'label_vm_para': para_probs_t.argmax(1),
                  'vm_orig_truelabel': vm_orig_scores,             
                  'vm_para_truelabel': vm_para_scores,
                  'vm_truelabel_change': vm_orig_scores - vm_para_scores,
                  'vm_orig_class0': orig_probs_t[:,0], 
                  'vm_orig_class1': orig_probs_t[:,1], 
                  'vm_orig_class2': orig_probs_t[:,2],  
                  'vm_para_class0': para_probs_t[:,0], 
                  'vm_para_class1': para_probs_t[:,1], 
                  'vm_para_class2': para_probs_t[:,2]     
                  })
    results_df['vm_truelabel_change_X_sim_score'] = results_df['vm_truelabel_change'] * results_df['sim_score']
    # Turn the row number into an explicit leading 'idx' column before saving.
    # The CSV written is the same as with index_label='idx', but the in-memory
    # frame now has the same columns as the one pd.read_csv returns on a
    # cache hit, so later cells see an 'idx' column on both paths.
    results_df = results_df.rename_axis('idx').reset_index()
    results_df.to_csv(fname, index=False)

### Permutation method to detect label flips

Take each example $Ex$ in the filtered set and generate paraphrases (e.g. 16) of it (or it might work better with a simple token-replacement strategy). Run each through the victim model (might be better with a different model, but still trained on dataset) and record predictions. Then tally up the label predictions (or maybe take average of the probabilities). Each prediction is a vote for the true label. 

Idea is that if $Ex$ changes ground truth label to class 4, then most of the paraphrases of $Ex$ will be of class 4 too. If $Ex$ is truly adversarial, then most of the paraphrases of $Ex$ are likely to be of the original class (or at least of other classes). So in other words: 
* if `is_adversarial = 1` then we expect most votes to be for classes other than `label_vm_para`. This means we expect more variance in the voting. If we take model confidence for the class of `label_vm_para` and work out entropy/variance, we expect it to be high. 
* if `is_adversarial = 0` then we expect most votes to be for the same class as `label_vm_para`. This means we expect less variance in the voting. If we take model confidence for the class of `label_vm_para` and work out entropy/variance, we expect it to be low. 

Variations 

* Instead of generating further paraphrases for all label flippers, try the checklist tests on the input. e.g. replace number/proper noun
* Try systematic perturbations
* Record probability of the true class or the predicted class and put it into a distribution. Calculate entropy of it (STRIP style). The idea is that there is some reliable difference in these probabilities between ground-truth flips and other cases, and that entropy can be used as a rough measurement to distinguish between them. 
* Can try the above while keeping track of sentence embeddings + attention layers 

In [11]:
# Read in manually labelled data. This is to track results.
# The CSV is loaded as a single 'train' split, which is then divided 75% / 25%
# into the train and test sets used below.
# NOTE(review): no seed is passed to train_test_split here - confirm the split
# is reproducible enough for the comparisons made later.
fname = path_cache + 'results_df_48_20210514_labelled_subset.csv'
dset_advlbl = load_dataset('csv', data_files=fname)['train'].train_test_split(test_size=0.25)
train_advlbl,test_advlbl = dset_advlbl['train'],dset_advlbl['test']

# # as pandas df
# df_advlbl = pd.read_csv(fname)
# train_advlbl,_,test_advlbl = create_train_valid_test(df_advlbl, frac_train=0.75, frac_valid = 0.001)
# # To join with the original. (might be some issues with the idx/row-number col)
# # x = pd.merge(results_df, df_advlbl, on =['idx', 'premise','orig', 'para'])

Using custom data configuration default-ebc62bd8d2fb84e0
Reusing dataset csv (/data/tproth/.cache/huggingface/datasets/csv/default-ebc62bd8d2fb84e0/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


#### Paraphrases of paraphrases 

nlp dataset -> gen_paraphrases (returns dataset) -> create_paraphrase_dataset -> get vm labels -> save in data frame 

In [12]:
n = 48  # number of permutations (paraphrases of the paraphrase) generated per row
# Columns removed from the mapped dataset once the permutations are generated.
cols_to_drop = ['is_adversarial','label_true','label_vm_orig','orig','sim_score']
def paraphrase_and_return_dict(x, n_seed_seqs=16): 
    """Add a 'perms' column holding paraphrases of the row's 'para' text.

    x: one dataset row (modified in place and returned).
    n_seed_seqs: number of paraphrases to generate; also used as the number
        of beams. It is split over 8 beam groups, so it should be divisible
        by 8.
    """
    # Use the argument instead of the global `n`, so the parameter actually
    # controls how many permutations are produced.
    x['perms'] = get_paraphrases(x['para'], num_return_sequences=n_seed_seqs,
                                 num_beams=n_seed_seqs,
                                 num_beam_groups=8, diversity_penalty=100000.0)
    return x 
# Generate the permutations row by row, then expand them to one row per permutation.
train_advlbl_perms = train_advlbl.map(lambda x: paraphrase_and_return_dict(x, n_seed_seqs=n),
                  batched=False, remove_columns = cols_to_drop)
train_advlbl_expanded = train_advlbl_perms.map(lambda x: create_paraphrase_dataset(x, l_cname='perms'),
                           batched=True)

HBox(children=(FloatProgress(value=0.0, max=198.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [13]:
# Run the victim model on every (premise, permutation) pair, in dataset order
advlbl_expanded_dl = DataLoader(train_advlbl_expanded, shuffle=False, batch_size=batch_size,
                                pin_memory=True, num_workers=n_wkrs)
dl = advlbl_expanded_dl
probs_l = []
assert not vm_model.training  # checks that model is in eval mode
with torch.no_grad():
    for i, data in enumerate(dl):
        if i % 50 == 0:
            print(i, "out of", len(dl))
        # predictions for the permutations of the paraphrase
        inputs = vm_tokenizer(data["premise"], data["perms"], padding=True,
                              truncation=True, return_tensors="pt").to(device)
        probs = vm_model(**inputs).logits.softmax(1)
        probs_l.append(probs.cpu())

# One row of class probabilities per permutation, and the argmax class of each row.
probs_t = torch.cat(probs_l)
preds_t = probs_t.argmax(1)

0 out of 149
50 out of 149
100 out of 149


In [14]:
# Attach the victim model's predicted label and per-class probabilities
# to the expanded dataset as new columns.
for _cname, _column in (('vm_label', preds_t),
                        ('vm_prob0', probs_t[:, 0]),
                        ('vm_prob1', probs_t[:, 1]),
                        ('vm_prob2', probs_t[:, 2])):
    train_advlbl_expanded = train_advlbl_expanded.add_column(_cname, _column.tolist())

In [15]:

# Convert the expanded dataset to a pandas DataFrame; the predicted label becomes categorical
advlbl_df = pd.DataFrame(train_advlbl_expanded)
advlbl_df['vm_label'] = advlbl_df['vm_label'].astype('category')

# Count "votes" of each set of permutations: one summary row per idx,
# with describe()'s columns renamed to vote terminology
vote_colnames = {'count': 'votes', 'unique': "n_cats_with_votes",
                 "top": 'top_cat', 'freq': 'top_cat_votes'}
votes_df = advlbl_df.groupby(['idx'])['vm_label'].describe().rename(columns=vote_colnames)

In [16]:
# Get entropy and variance from each set of permutations, then choose only the values
# that correspond to the predicted label of the paraphrase
def get_entropy(x, bins=10): 
    """Return the Shannon entropy of the histogram of a vector.

    Used in pandas summary functions.

    x: array-like of numbers.
    bins: number of histogram bins (passed to np.histogram).

    Returns the entropy (natural logarithm, scipy's default) of the
    normalised bin counts.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` has been loaded.
    from scipy.stats import entropy
    # the bins parameter affects the entropy quite a bit (it introduces zeros)
    hist, _ = np.histogram(x, bins=bins)
    # turn into PMF (not strictly required for scipy entropy, but easier to interpret)
    pmf = hist / hist.sum()
    return entropy(pmf)
# Per-idx entropy and variance of the three class-probability columns.
grp = advlbl_df.groupby(['idx'])[['vm_prob0','vm_prob1','vm_prob2']]
entropy_df = grp.agg(get_entropy).add_suffix("_entropy")
var_df = grp.var().add_suffix("_var")

In [17]:
# One row per distinct (idx, label_vm_para) pair from the expanded frame.
label_df =  advlbl_df[['idx','label_vm_para']].drop_duplicates()
def choose_col_of_df_from_label_column(df, labeldf, name='entropy'): 
    """Add a column holding, per row, the value of the column selected by the label.

    df: frame indexed by idx whose first columns are the per-class values,
        in class order (column position k must correspond to label k).
    labeldf: frame with 'idx' and 'label_vm_para' columns.
    name: prefix of the new column, which is named name + '_label_vm_para'.

    Returns df merged with labeldf (on df's index vs labeldf['idx']) plus
    the new column.
    """
    merged = df.merge(labeldf, left_index=True, right_on='idx')
    column_positions = merged['label_vm_para'].values.reshape(-1, 1)
    # See https://stackoverflow.com/a/61234228/5381490
    picked = np.take_along_axis(merged.values, column_positions, axis=1)
    merged[name + '_label_vm_para'] = picked.ravel()
    return merged
# For each idx keep the entropy / variance of the class that the victim model
# predicted for the paraphrase (adds 'entropy_label_vm_para' / 'var_label_vm_para').
entropy_df = choose_col_of_df_from_label_column(entropy_df, label_df, name='entropy')
var_df     = choose_col_of_df_from_label_column(var_df,     label_df, name='var')

In [18]:
# Turn both labelled splits into pandas frames, then attach to the train frame
# the vote summary and the entropy / variance of the paraphrase's predicted class.
train_advlbl_df = pd.DataFrame(dset_advlbl['train'])
test_advlbl_df = pd.DataFrame(dset_advlbl['test'])
train_advlbl_df = (
    train_advlbl_df
    .merge(votes_df, left_on='idx', right_index=True)
    .merge(entropy_df[['idx', 'entropy_label_vm_para']], on='idx')
    .merge(var_df[['idx', 'var_label_vm_para']], on='idx')
)

In [19]:
# Calculate label flip percentage and measure success
# A "label flip" means the most-voted class among the permutations differs
# from the victim model's label for the paraphrase itself.
train_advlbl_df['label_flip'] = train_advlbl_df['top_cat'] != train_advlbl_df['label_vm_para'] 
def permutation_success(x,y): 
    """Say whether the voting outcome agrees with the manual label.

    x: manual `is_adversarial` label (1, 0, -1 or -2).
    y: whether a label flip was observed.

    Returns "To be determined" for the -1 / -2 labels; otherwise True when
    a flip was observed for label 1 or no flip was observed for label 0,
    and False in every other case.
    """
    if x == -1 or x == -2:
        return "To be determined"
    flip_seen_for_adversarial = x == 1 and y == True
    no_flip_for_unadversarial = x == 0 and y == False
    return bool(flip_seen_for_adversarial or no_flip_for_unadversarial)
# Evaluate permutation_success row by row on (is_adversarial, label_flip).
v1,v2 = train_advlbl_df['is_adversarial'].values, train_advlbl_df['label_flip'].values
train_advlbl_df['permutation_success'] = list(map(permutation_success, v1,v2))

# Contingency table of observed label flips against the manual labels, with totals.
pd.crosstab(index=train_advlbl_df['label_flip'], 
                             columns=train_advlbl_df['is_adversarial'],
                             margins=True)

is_adversarial,-2,-1,0,1,All
label_flip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,2,15,91,44,152
True,2,9,11,24,46
All,4,24,102,68,198


In [20]:
# How many labelled rows did / did not show a label flip.
train_advlbl_df.label_flip.value_counts()

False    152
True      46
Name: label_flip, dtype: int64

In [21]:
# Display the expanded frame: one row per permutation with the victim model's output.
advlbl_df

Unnamed: 0,idx,label_vm_para,para,perms,premise,vm_label,vm_prob0,vm_prob1,vm_prob2
0,7299,0,A woman is not awake.,A woman is not awake.,A woman is talking on the phone while standing next to a dog.,0,0.969092,0.018904,0.012004
1,7299,0,A woman is not awake.,A woman isn't awake.,A woman is talking on the phone while standing next to a dog.,0,0.928273,0.037017,0.034710
2,7299,0,A woman is not awake.,The woman is not awake.,A woman is talking on the phone while standing next to a dog.,0,0.963114,0.023327,0.013559
3,7299,0,A woman is not awake.,A woman is asleep.,A woman is talking on the phone while standing next to a dog.,2,0.000436,0.001917,0.997646
4,7299,0,A woman is not awake.,A woman is sleeping.,A woman is talking on the phone while standing next to a dog.,2,0.000251,0.000932,0.998818
...,...,...,...,...,...,...,...,...,...
9499,19904,1,bricks are used in the Building,a brick building is made of brick,A large group of people are gathered outside of a brick building lit with spotlights.,0,0.818214,0.107462,0.074324
9500,19904,1,bricks are used in the Building,a brick building is utilized,A large group of people are gathered outside of a brick building lit with spotlights.,0,0.918170,0.077881,0.003949
9501,19904,1,bricks are used in the Building,More than one item can be found inside The Building,A large group of people are gathered outside of a brick building lit with spotlights.,0,0.852742,0.051356,0.095902
9502,19904,1,bricks are used in the Building,More than one item can be found insidethe Building,A large group of people are gathered outside of a brick building lit with spotlights.,0,0.902995,0.029699,0.067306


In [22]:
#### Exploring the method via reporting ####

## Set up parameters 
# Pick one labelled example at random and pull out the fields used in the report.
idx = train_advlbl_df.sample()[['idx']].values[0][0] #sample an index randomly from the table
main_tbl = train_advlbl_df.query("idx==@idx")
# First value of column `cname` in the sampled example's row.
def getval(cname): return  main_tbl.loc[:,cname].values[0]
prem,hyp,para,sim_score = getval('premise'),getval('orig'),getval('para'),getval('sim_score')  
label_true,label_vm_orig,label_vm_para = getval('label_true'),getval('label_vm_orig'),getval('label_vm_para')
advlbl = getval('is_adversarial')
# Report text for each value of the manual `is_adversarial` label.
d_advlbl2str = {
    1: "is a **successful** adversarial example",
    0: "is **unsuccessful**: it flips the true label",
    -1: "contains a hypothesis paraphrase that **doesn't make sense** or is nonsensical.", 
    -2: "is **excluded**: the original label might be wrong"
}
advstr = d_advlbl2str[advlbl]
# Five randomly chosen permutations of this example, rendered as a markdown table.
perm_samples = advlbl_df.query("idx==@idx").sample(5).to_markdown()
ncats,top_cat,top_cat_votes = getval('n_cats_with_votes'),getval('top_cat'),getval('top_cat_votes')

# A flip means the most-voted class differs from the paraphrase's VM label.
# The two "flip to ..." flags only make sense when a flip occurred, so both
# are conditioned on it. Without that, a case where label_vm_orig ==
# label_vm_para == top_cat would report "does not flip" and "flips to the
# original label" at the same time.
label_flip               = top_cat != label_vm_para
label_flip_to_orig_label = label_flip and top_cat == label_vm_orig
label_flip_to_diff_label = label_flip and top_cat != label_vm_orig

# Describe what the vote did to the predicted label.
results_msg = ""
if not label_flip:           results_msg += "This does not flip the predicted label. \n"
if label_flip_to_orig_label: results_msg += "This flips the label to the vm predicted label (" +\
    str(label_vm_orig) + ") of the original hypothesis. \n"
if label_flip_to_diff_label: results_msg += "This flips the predicted label but to a different class to the vm prediction of the original hypothesis.\n"

# Compare the outcome with what the voting theory predicts for this manual label.
results_msg += "\n"
if  advlbl == 1:  
    results_msg += "If the theory is correct we expected a label flip for an adversarial example.\n "
    if label_flip: results_msg +=  "The label flip occured, so this was **successful**.\n"
    else:          results_msg +=  "The label flip did not occur, so this was **unsuccessful**.\n"   
elif advlbl == 0:  
    results_msg += "If the theory is correct we expect the label does not flip for an unadversarial example.\n "
    if label_flip: results_msg +=  "The label flip occured, so this was **unsuccessful**.\n"
    else:          results_msg +=  "The label flip did not occur, so this was **successful**.\n"  
elif advlbl == -1: 
    results_msg += "The original paraphrase didn't make sense, so we should figure out how to detect this.\n "
else: 
    results_msg += "The SNLI example was wrong or strange: disregard this example.\n"

## Insert into template
# Render the report for the sampled example as Markdown. The template reads
# the variables prepared in the set-up part of this cell plus `n` (the number
# of permutations), and looks up this example's rows in entropy_df and var_df.
Markdown(f"""
Example with idx **{idx}**   

{main_tbl.to_markdown(index=True)}   


* **Premise**: `{prem}`  
* **Hypothesis (original)**: `{hyp}` (True label **{label_true}**, Victim Model (VM) label **{label_vm_orig}**)    
* **Hypothesis paraphrase**: `{para}` (VM label **{label_vm_para}**)     

This example {advstr}.    

We generate {n} further *permutations* of the hypothesis paraphrase and get VM votes and confidence for 
each of them. The label of the hypothesis paraphrase was **{label_vm_para}**. 
Here are five of these permutations (randomly chosen):  

{perm_samples}

**Voting strategy results** 

We get {ncats} categories with votes. The most voted for category is **label {top_cat}** with {top_cat_votes}
votes. The paraphrase initially had label **{label_vm_para}**.

{results_msg}


Now we look at the variance and entropy of the predicted probabilities of each class. 
We are interested in class **{label_vm_para}** as it is the label of the hypothesis paraphrase. 

*Entropy*  

{entropy_df.query("idx==@idx").round(2).to_markdown(index=True)}

*Variance*   

{var_df.query("idx==@idx").round(2).to_markdown(index=True)}




""")



Example with idx **21096**   

|    |   idx | premise                            | orig                        | para                      |   sim_score |   label_true |   label_vm_orig |   label_vm_para |   is_adversarial |   votes |   n_cats_with_votes |   top_cat |   top_cat_votes |   entropy_label_vm_para |   var_label_vm_para | label_flip   | permutation_success   |
|---:|------:|:-----------------------------------|:----------------------------|:--------------------------|------------:|-------------:|----------------:|----------------:|-----------------:|--------:|--------------------:|----------:|----------------:|------------------------:|--------------------:|:-------------|:----------------------|
| 57 | 21096 | a man is swimming inside of a pool | there is a person drowning. | A person is in the water. |    0.641395 |            2 |               2 |               0 |                0 |      48 |                   3 |         0 |              41 |                 1.27239 |           0.0831694 | False        | True                  |   


* **Premise**: `a man is swimming inside of a pool`  
* **Hypothesis (original)**: `there is a person drowning.` (True label **2**, Victim Model (VM) label **2**)    
* **Hypothesis paraphrase**: `A person is in the water.` (VM label **0**)     

This example is **unsuccessful**: it flips the true label.    

We generate 48 further *permutations* of the hypothesis paraphrase and get VM votes and confidence for 
each of them. The label of the hypothesis paraphrase was **0**. 
Here are five of these permutations (randomly chosen):  

|      |   idx |   label_vm_para | para                      | perms                                | premise                            |   vm_label |   vm_prob0 |   vm_prob1 |   vm_prob2 |
|-----:|------:|----------------:|:--------------------------|:-------------------------------------|:-----------------------------------|-----------:|-----------:|-----------:|-----------:|
| 2783 | 21096 |               0 | A person is in the water. | water, person                        | a man is swimming inside of a pool |          1 |   0.351933 |  0.568916  | 0.0791504  |
| 2757 | 21096 |               0 | A person is in the water. | People in the water                  | a man is swimming inside of a pool |          0 |   0.919389 |  0.0726834 | 0.00792719 |
| 2744 | 21096 |               0 | A person is in the water. | A person is in the water             | a man is swimming inside of a pool |          0 |   0.946072 |  0.0462132 | 0.00771503 |
| 2745 | 21096 |               0 | A person is in the water. | There is a person in a body of water | a man is swimming inside of a pool |          0 |   0.984732 |  0.0134389 | 0.00182862 |
| 2739 | 21096 |               0 | A person is in the water. | The person is wet.                   | a man is swimming inside of a pool |          0 |   0.917546 |  0.0812952 | 0.00115936 |

**Voting strategy results** 

We get 3 categories with votes. The most voted for category is **label 0** with 41
votes. The paraphrase initially had label **0**.

This does not flip the predicted label. 

If the theory is correct we expect the label does not flip for an unadversarial example.
 The label flip did not occur, so this was **successful**.



Now we look at the variance and entropy of the predicted probabilities of each class. 
We are interested in class **0** as it is the label of the hypothesis paraphrase. 

*Entropy*  

|      |   vm_prob0_entropy |   vm_prob1_entropy |   vm_prob2_entropy |   idx |   label_vm_para |   entropy_label_vm_para |
|-----:|-------------------:|-------------------:|-------------------:|------:|----------------:|------------------------:|
| 2736 |               1.27 |               1.14 |               0.47 | 21096 |               0 |                    1.27 |

*Variance*   

|      |   vm_prob0_var |   vm_prob1_var |   vm_prob2_var |   idx |   label_vm_para |   var_label_vm_para |
|-----:|---------------:|---------------:|---------------:|------:|----------------:|--------------------:|
| 2736 |           0.08 |           0.04 |           0.03 | 21096 |               0 |                0.08 |






In [23]:
# # calculates performance of victim model on a dataloader

# dl = valid_dl
# metric = load_metric('accuracy')
# for i, data in enumerate(dl): 
#     if i % 10 == 0 : print(i, "out of", len(dl)) 
#     labels,premise,hypothesis = data['label'].to(device),data["premise"],data["hypothesis"]
#     inputs = vm_tokenizer(premise,hypothesis, padding=True,truncation=True, return_tensors="pt")
#     inputs.to(device)
#     outputs = vm_model(**inputs, labels=labels)
#     probs = outputs.logits.softmax(1)
#     preds = probs.argmax(1)
#     metric.add_batch(predictions=preds, references=labels)

# metric.compute()


In [24]:
# # Score semantic similarity with cross encoders

# from sentence_transformers.cross_encoder import CrossEncoder
# cross_encoder= CrossEncoder('cross-encoder/quora-distilroberta-base')
# i =11
# data = valid_small[i]
# orig, para = data['hypothesis'], data['hypothesis_paraphrases']
# orig_rep = [orig for i in range(len(para))]
# pairs = list(zip(orig_rep,para))
# scores = cross_encoder.predict(pairs)
# results_df = pd.DataFrame({'pairs':pairs, 'para': para,'score': cos_sim})
# print(orig)
# results_df.sort_values('score', ascending=False)

In [25]:
# # with sentence transformers

# valid_small_dl = DataLoader(valid_small, batch_size=4, shuffle=False, 
#                      num_workers=n_wkrs, pin_memory=True)
# sim_score_l = []
# for i, data in enumerate(valid_small_dl): 
#     pass
#     orig, para = data['hypothesis'], data['hypothesis_paraphrases']
#     orig_emb,para_emb  = embedding_model.encode(orig),embedding_model.encode(para)
# #     cos_sim = util.cos_sim(orig_emb,para_emb)[0]
# #     results_df = pd.DataFrame({'para': para,'score': cos_sim})
# #     print(orig)
# #     results_df.sort_values('score', ascending=False)