GLUE sets: the model will be trained on the eval set, so you shouldn't also test on the eval set. The problem is that the labels for the test set are withheld. 
Start with SNLI. MultiNLI is a later option too. As is rotten_tomatoes. 
* Victim model performance on dataset train, valid, test set. (done, written code to measure it)
* Create new paraphrased valid + test datasets (done a preliminary version on the valid set) 
* Measure victim model performance on paraphrased datasets (done: accuracy on the vanilla valid set is about 87%; generating 16 paraphrases per example (i.e. not many) and evaluating on all of them gives ~75% accuracy)
* Get document embeddings of original and paraphrased and compare (done)
  * https://github.com/UKPLab/sentence-transformers
* Write a simple way to measure paraphrase quality (done) 
* Construct reward function 


In [1]:
%load_ext autoreload
%autoreload 2

In [183]:
import os
import torch 
from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric
import datasets, transformers
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer
from pprint import pprint
import numpy as np, pandas as pd
from utils import *   # local script 
import pyarrow
from sentence_transformers import SentenceTransformer, util
from IPython.core.debugger import set_trace
from GPUtil import showUtilization
import seaborn as sns
from itertools import repeat
from collections import defaultdict


# Directories for cached datasets and for result files.
path_cache, path_results = './cache/', "./results/"

# Seed every RNG in use (torch CPU, numpy, torch CUDA) so reruns are repeatable.
seed = 420
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Index of the active CUDA device, or -1 when running on the CPU.
devicenum = -1 if device.type != 'cuda' else torch.cuda.current_device()
batch_size = 64
# DataLoader worker count: four per CUDA device reported by torch.
n_wkrs = torch.cuda.device_count() * 4
# Let pandas show long text columns (up to 400 characters) when displaying frames.
pd.set_option("display.max_colwidth", 400)

In [3]:
# Paraphrase model (para): pretrained seq2seq checkpoint used to generate paraphrases.
para_name = "tuner007/pegasus_paraphrase"
# Tokenizer and model are both loaded from the same checkpoint name.
para_tokenizer = AutoTokenizer.from_pretrained(para_name)
# The model is moved onto `device` once here, so later generate() calls run there.
para_model = AutoModelForSeq2SeqLM.from_pretrained(para_name).to(device)

In [4]:
# Victim Model (VM): the sequence classifier whose predictions are measured on
# original and paraphrased inputs.
vm_name = "textattack/distilbert-base-cased-snli"
vm_tokenizer = AutoTokenizer.from_pretrained(vm_name)
vm_model = AutoModelForSequenceClassification.from_pretrained(vm_name).to(device)
# Label <-> index mappings and the class count are read from the model itself.
vm_idx2lbl = vm_model.config.id2label
vm_lbl2idx = vm_model.config.label2id
vm_num_labels = vm_model.num_labels

In [5]:
# Semantic Similarity model: sentence-embedding model used to compare a text
# with its paraphrases.
embedding_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [6]:
# Load SNLI; each of its three splits is filtered below before use.
dataset = load_dataset("snli")

# Rows whose label is -1 are dropped from every split.
label_cname = 'label'
remove_minus1_labels = lambda x: x[label_cname] != -1
train, valid, test = (dataset[split_name].filter(remove_minus1_labels)
                      for split_name in ('train', 'validation', 'test'))

# Every split must expose the same number of classes as the victim model predicts.
assert all(split.features[label_cname].num_classes == vm_num_labels
           for split in (train, valid, test))

# One shuffled DataLoader per split, all sharing batch size and worker count.
train_dl = DataLoader(train, shuffle=True, batch_size=batch_size, num_workers=n_wkrs)
valid_dl = DataLoader(valid, shuffle=True, batch_size=batch_size, num_workers=n_wkrs)
test_dl = DataLoader(test, shuffle=True, batch_size=batch_size, num_workers=n_wkrs)

Reusing dataset snli (/data/tproth/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [87]:
def get_paraphrases(input_text,num_return_sequences,num_beams, num_beam_groups=1,diversity_penalty=0):
    """Generate paraphrases of `input_text` with the paraphrase model.

    The text is tokenized (truncated, padded to the longest item), moved to
    `device`, passed to `para_model.generate` with the given beam-search
    settings, and decoded back to strings with special tokens removed.

    Returns the list of decoded strings.
    """
    encoded = para_tokenizer(input_text, truncation=True, padding='longest', return_tensors="pt")
    encoded = encoded.to(device)
    generation_kwargs = dict(
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
    )
    generated_ids = para_model.generate(**encoded, **generation_kwargs)
    return para_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

def gen_dataset_paraphrases(x, cname_input, cname_output, n_seed_seqs=32): 
    """Add a list of paraphrases of one column to a single dataset row.

    Half of the requested paraphrases come from plain beam search ("low
    diversity") and half from beam search with one beam per group and a
    diversity penalty ("high diversity"). The two lists are concatenated.

    x: one row of a dataset; it is modified in place and returned.
    cname_input: column to generate paraphrases for
    cname_output: column name to give output of paraphrases
    n_seed_seqs: rough indicator of how many paraphrases to return.
            Must be a positive multiple of 4 (4, 8, 16, 32, 64, ...).

    Raises ValueError if n_seed_seqs is not a positive multiple of 4.
    """
    # TODO: figure out how to batch this. 
    if n_seed_seqs % 4 != 0: raise ValueError("keep n_seed_seqs divisible by 4 for now")
    # 0 and negative multiples of 4 pass the check above but cannot be used as a beam count.
    if n_seed_seqs <= 0: raise ValueError("n_seed_seqs must be positive")
    # Half of the budget for each strategy; int() keeps float inputs such as 16.0 working.
    n = int(n_seed_seqs // 2)
    #low diversity (ld) paraphrases 
    ld_l = get_paraphrases(x[cname_input], num_return_sequences=n, num_beams=n)
    #high diversity (hd) paraphrases. We can use num_beam_groups and diversity_penalty as hyperparameters. 
    hd_l = get_paraphrases(x[cname_input], num_return_sequences=n, num_beams=n,
                           num_beam_groups=n, diversity_penalty=50002.5)
    x[cname_output] = ld_l + hd_l  #TODO: change to list(set(l))
    return x 


In [8]:
# Build (or reload from the on-disk cache) the paraphrase-augmented validation subset.
n_seed_seqs = 48
fname = f"{path_cache}valid_small_{n_seed_seqs}"
if not os.path.exists(fname):
    # No cached copy: take the first of 20 contiguous shards of `valid`,
    # attach paraphrases of each hypothesis row by row, then cache the result.
    valid_small = valid.shard(20, 0, contiguous=True).map(
        lambda x: gen_dataset_paraphrases(x, 'hypothesis', 'hypothesis_paraphrases',
                                          n_seed_seqs=n_seed_seqs),
        batched=False,
    )
    valid_small.save_to_disk(fname)
else:
    valid_small = datasets.load_from_disk(fname)
    
    

In [202]:
# Create a new version of paraphrase dataset by repeating all other fields to be same 
# length as number of paraphrases. 
def create_paraphrase_dataset(batch, l_cname): 
    """Flatten a batch so that every paraphrase gets its own row.

    batch: dict mapping column name -> list of values (one entry per example).
    l_cname: column name that contains the list of paraphrases

    For each example, the paraphrase list is spread over consecutive rows and
    the value of every other column is repeated once per paraphrase.
    Returns a defaultdict(list) with the same columns as `batch`.
    """
    expanded = defaultdict(list)
    columns = list(batch.keys())
    for row_values in zip(*batch.values()):
        row = dict(zip(columns, row_values))
        n_copies = len(row[l_cname])
        for column, value in row.items():
            if column == l_cname:
                expanded[column].extend(value)
            else:
                expanded[column].extend([value] * n_copies)
    return expanded


# Build (or reload from the on-disk cache) the one-row-per-paraphrase dataset.
fname = f"{path_cache}valid_small_paraphrases_{n_seed_seqs}"
if not os.path.exists(fname):
    # Need to call this with batched=True to work.
    valid_small_paraphrases = valid_small.map(
        lambda x: create_paraphrase_dataset(x, l_cname='hypothesis_paraphrases'),
        batched=True,
    )
    valid_small_paraphrases.save_to_disk(fname)
else:
    valid_small_paraphrases = datasets.load_from_disk(fname)


In [11]:
# Generate results dataframe 
def get_vm_scores(): 
    """Run the victim model over `valid_small_paraphrases` and collect class probabilities.

    Every batch is scored twice: the premise paired with the original
    hypothesis, and the premise paired with the paraphrase. Accuracy of the
    paraphrase predictions against the dataset labels is accumulated and
    printed once the loop finishes.

    Returns (para_probs_t, orig_probs_t): the softmax outputs for paraphrases
    and originals, concatenated over batches in dataset order (the loader is
    not shuffled), on the CPU.
    """
    print("Getting victim model scores.")
    dl = DataLoader(valid_small_paraphrases, batch_size=batch_size, shuffle=False, 
                    num_workers=n_wkrs, pin_memory=True)
    metric = load_metric('accuracy')
    para_probs_l,orig_probs_l = [], []
    assert vm_model.training == False  # checks that model is in eval mode 

    def vm_probs(premise, hypothesis):
        # Class probabilities of the victim model for a batch of (premise, hypothesis) pairs.
        inputs = vm_tokenizer(premise,hypothesis,padding=True,truncation=True, return_tensors="pt")
        inputs.to(device)
        return vm_model(**inputs).logits.softmax(1)

    with torch.no_grad():
        for i, data in enumerate(dl): 
            if i % 50 == 0 : print(i, "out of", len(dl))
            labels,premise = data['label'].to(device),data["premise"]
            paraphrases,orig = data["hypothesis_paraphrases"],data["hypothesis"]

            # predictions for original
            orig_probs_l.append(vm_probs(premise, orig).cpu())

            # predictions for paraphrases
            probs = vm_probs(premise, paraphrases)
            para_probs_l.append(probs.cpu())
            metric.add_batch(predictions=probs.argmax(1), references=labels)

    # The metric was previously accumulated but never evaluated; report it.
    print("Accuracy on paraphrases:", metric.compute())
    orig_probs_t, para_probs_t = torch.cat(orig_probs_l),torch.cat(para_probs_l)
    return para_probs_t, orig_probs_t

def generate_sim_scores(): 
    """Cosine similarity between each hypothesis in `valid_small` and its paraphrases.

    For every row, the original hypothesis and its paraphrase list are embedded
    with `embedding_model`, and the similarities of the original against each
    paraphrase are kept. Returns all per-row results concatenated into one
    tensor, in dataset order.
    """
    print("Getting similarity scores")
    n_rows = len(valid_small)
    per_row_scores = []
    for row_idx, row in enumerate(valid_small):
        if row_idx % 50 == 0:
            print(row_idx, "out of", n_rows)
        original_emb = embedding_model.encode(row['hypothesis'])
        paraphrase_emb = embedding_model.encode(row['hypothesis_paraphrases'])
        # Row 0 of the similarity matrix: the original against every paraphrase.
        per_row_scores.append(util.cos_sim(original_emb, paraphrase_emb)[0])
    return torch.cat(per_row_scores)

# Build (or reload from the CSV cache) one results row per (example, paraphrase) pair.
# NOTE(review): the CSV is written with the index stored under 'idx', so the
# reloaded frame has 'idx' as a regular column while the freshly built frame
# only has it as its index — confirm downstream code copes with both.
fname = path_cache + 'results_df' + str(n_seed_seqs) + "_1.csv"
if os.path.exists(fname):
    results_df = pd.read_csv(fname)
else: 
    sim_score_t = generate_sim_scores()
    para_probs_t, orig_probs_t = get_vm_scores()
    # Probability the victim model assigns to the true label of each row:
    # gather picks, per row, the column given by that row's label.
    labels_t = torch.tensor(valid_small_paraphrases['label']).unsqueeze(1)
    vm_para_scores = para_probs_t.gather(1, labels_t).squeeze(1)
    vm_orig_scores = orig_probs_t.gather(1, labels_t).squeeze(1)
    
    results_df = pd.DataFrame({'premise': valid_small_paraphrases['premise'],
                  'orig': valid_small_paraphrases['hypothesis'],
                  'para': valid_small_paraphrases['hypothesis_paraphrases'],
                  'sim_score': sim_score_t,
                  'label_true': valid_small_paraphrases['label'], 
                  'label_vm_orig': orig_probs_t.argmax(1),
                  'label_vm_para': para_probs_t.argmax(1),
                  'vm_orig_truelabel': vm_orig_scores,             
                  'vm_para_truelabel': vm_para_scores,
                  'vm_truelabel_change': vm_orig_scores - vm_para_scores,
                  'vm_orig_class0': orig_probs_t[:,0], 
                  'vm_orig_class1': orig_probs_t[:,1], 
                  'vm_orig_class2': orig_probs_t[:,2],  
                  'vm_para_class0': para_probs_t[:,0], 
                  'vm_para_class1': para_probs_t[:,1], 
                  'vm_para_class2': para_probs_t[:,2]     
                  })
    # Drop in true-label confidence weighted by how similar the paraphrase is to the original.
    results_df['vm_truelabel_change_X_sim_score'] = results_df['vm_truelabel_change'] * results_df['sim_score']
    results_df.to_csv(fname, index_label = 'idx')

### Permutation method to detect label flips

Take each example $Ex$ in the filtered set and generate paraphrases of it (e.g. 16); a simple token-replacement strategy might work better. Run each paraphrase through the victim model (a different model trained on the same dataset might be better) and record the predictions. Then tally up the predicted labels (or maybe take the average of the probabilities): each prediction is a vote for the true label. The idea is that if $Ex$ has genuinely changed its ground-truth label to, say, class 4, then most of the paraphrases of $Ex$ will be of class 4 too. If $Ex$ is truly adversarial, then most of the paraphrases of $Ex$ are likely to be of the original class. 

Variations 

* Instead of generating further paraphrases for all label flippers, try the checklist tests on the input. e.g. replace number/proper noun
* Try systematic perturbations
* Record probability of the true class or the predicted class and put it into a distribution. Calculate entropy of it (STRIP style). The idea is that there is some reliable difference in these probabilities between ground-truth flips and otherwise and that entropy can be used as a rough measurement to distinguish between it. 
* Can try the above while keeping track of sentence embeddings + attention layers 

In [103]:
# Read in manually labelled data. This is to track results. 
# The CSV is loaded as a Hugging Face dataset (its single 'train' split) and
# then divided into 75% train / 25% test.
# NOTE(review): no seed is passed to train_test_split — confirm the split is
# reproducible enough for tracking results across runs.
fname = path_cache + 'results_df_48_20210514_labelled_subset.csv'
dset_advlbl = load_dataset('csv', data_files=fname)['train'].train_test_split(test_size=0.25)
train_advlbl,test_advlbl = dset_advlbl['train'],dset_advlbl['test']


# # as pandas df
# df_advlbl = pd.read_csv(fname)
# train_advlbl,_,test_advlbl = create_train_valid_test(df_advlbl, frac_train=0.75, frac_valid = 0.001)
# # To join with the original. (might be some issues with the idx/row-number col)
# # x = pd.merge(results_df, df_advlbl, on =['idx', 'premise','orig', 'para'])

Using custom data configuration default-ebc62bd8d2fb84e0
Reusing dataset csv (/data/tproth/.cache/huggingface/datasets/csv/default-ebc62bd8d2fb84e0/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


In [287]:
# pandas copies of the two labelled splits, for merging and inspection.
train_advlbl_df = pd.DataFrame(dset_advlbl['train'])
test_advlbl_df = pd.DataFrame(dset_advlbl['test'])

#### Paraphrases of paraphrases 

nlp dataset -> gen_paraphrases (returns dataset) -> create_paraphrase_dataset -> get vm labels -> save in data frame 

In [104]:
# Number of second-round paraphrases ("perms") generated per labelled paraphrase.
n = 16 
# Columns of the labelled data that are not carried into the expanded dataset.
cols_to_drop = ['is_adversarial','label_true','label_vm_orig','label_vm_para','orig','sim_score']
def paraphrase_and_return_dict(x, n_seed_seqs=16): 
    """Add beam-search paraphrases of x['para'] to the row under the key 'perms'.

    x: one dataset row; it is modified in place and returned.
    n_seed_seqs: number of beams and of returned sequences requested.
        (Previously this argument was ignored and the global `n` was used.)
    """
    x['perms'] = get_paraphrases(x['para'], num_return_sequences=n_seed_seqs, num_beams=n_seed_seqs)
    return x 
train_advlbl_perms = train_advlbl.map(lambda x: paraphrase_and_return_dict(x, n_seed_seqs=n),
                  batched=False, remove_columns = cols_to_drop)
# One row per perm; the other columns are repeated alongside it.
train_advlbl_expanded = train_advlbl_perms.map(lambda x: create_paraphrase_dataset(x, l_cname='perms'),
                           batched=True)

HBox(children=(FloatProgress(value=0.0, max=198.0), HTML(value='')))




In [208]:
# Victim model probabilities and predicted labels for every (premise, perm) pair.
advlbl_expanded_dl = DataLoader(train_advlbl_expanded, shuffle=False, batch_size=batch_size,
                                pin_memory=True, num_workers=n_wkrs)
dl = advlbl_expanded_dl
probs_l = []
assert not vm_model.training  # checks that model is in eval mode
with torch.no_grad():
    for i, data in enumerate(dl):
        if i % 50 == 0:
            print(i, "out of", len(dl))
        premise = data["premise"]
        perms = data["perms"]
        # Score each premise against its perm.
        inputs = vm_tokenizer(premise, perms, padding=True, truncation=True, return_tensors="pt")
        inputs.to(device)
        outputs = vm_model(**inputs)
        probs = torch.softmax(outputs.logits, 1)
        probs_l.append(probs.cpu())

# Stack the per-batch results; the predicted label is the most probable class.
probs_t = torch.cat(probs_l)
preds_t = probs_t.argmax(1)


In [224]:
# Attach the predicted label and the three class probabilities to the expanded dataset.
train_advlbl_expanded = (
    train_advlbl_expanded
    .add_column('vm_label', preds_t.tolist())
    .add_column('vm_prob0', probs_t[:,0].tolist())
    .add_column('vm_prob1', probs_t[:,1].tolist())
    .add_column('vm_prob2', probs_t[:,2].tolist())
)

In [288]:
# Expanded dataset as a pandas frame, with the predicted label as a categorical column.
advlbl_df = pd.DataFrame(train_advlbl_expanded)
advlbl_df.vm_label = advlbl_df.vm_label.astype('category')

# Tally the "votes" per original example: describe() on the categorical label
# gives count / unique / top / freq, renamed here to vote terminology.
vote_column_names = {'count': 'votes', 'unique': "n_cats_with_votes",
                     "top": 'top_cat', 'freq': 'top_cat_votes'}
votes_df = (advlbl_df.groupby(['idx'])['vm_label']
            .describe()
            .rename(columns=vote_column_names))

In [289]:
# Attach the vote summary to the labelled rows: the 'idx' column on the left is
# matched against the index of votes_df on the right.
train_advlbl_df = pd.merge(train_advlbl_df, votes_df, left_on='idx', right_index=True)

In [311]:
# Class probabilities of the first 16 expanded rows as a NumPy array
# (presumably the perms of the first example, given 16 perms per example — verify).
x = advlbl_df[['vm_prob0','vm_prob1','vm_prob2']][0:16].values

In [None]:
def entropy(x):
    """Shannon entropy, in bits, of the value distribution in each row of `x`.

    x is assumed to be an (nsignals, nsamples) array containing integers between
    0 and n_unique_vals. A 1-D input is treated as a single signal.

    Returns a 1-D float array with one entropy value per row.
    Raises ValueError if `x` contains no samples. Non-integer input (e.g.
    float probabilities) is rejected by np.bincount.
    """
    x = np.atleast_2d(x)
    ncols = x.shape[1]
    if ncols == 0:
        raise ValueError("entropy() needs at least one sample per row")
    nbins = x.max() + 1

    # count the number of occurrences for each unique integer between 0 and x.max()
    # in each row of x (a list, since np.vstack does not accept a generator)
    counts = np.vstack([np.bincount(row, minlength=nbins) for row in x])

    # divide by number of columns to get the probability of each unique value
    p = counts / float(ncols)

    # log2 is only evaluated where p > 0; empty bins keep a 0 term so that
    # 0 * log2(0) contributes 0 instead of NaN
    log_p = np.zeros_like(p)
    np.log2(p, out=log_p, where=p > 0)

    # compute Shannon entropy in bits
    return -np.sum(p * log_p, axis=1)

In [317]:
np.bincount(x[:,0])

TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'

In [305]:
(np.bincount(x[:0], minlength=nbins) for row in x))

Unnamed: 0,vm_prob0,vm_prob1,vm_prob2
0,0.078689,0.010921,0.001433
1,0.078541,0.012596,0.001009
2,0.072975,0.042969,0.011727
3,0.02992,0.132521,0.212876
4,0.07277,0.044026,0.012171
5,0.023403,0.152397,0.238178
6,0.057459,0.082056,0.078675
7,0.074377,0.016331,0.024472
8,0.064317,0.083978,0.033467
9,0.073438,0.039123,0.011907


In [299]:
x.yaxis

<matplotlib.axis.YAxis at 0x2b81b2d6f8e0>

In [256]:
advlbl_df.vm_label.astype('category')

0       0
1       0
2       0
3       2
4       0
       ..
3163    1
3164    1
3165    1
3166    1
3167    0
Name: vm_label, Length: 3168, dtype: category
Categories (3, int64): [0, 1, 2]

In [250]:
advlbl_df.value_counts(['idx', 'vm_label'])

idx    vm_label
7718   0           16
12487  1           16
687    0           16
1526   1           16
18937  0           16
                   ..
1103   1            1
       0            1
9422   2            1
1054   2            1
374    1            1
Length: 359, dtype: int64

In [243]:
advlbl_grp.

AttributeError: 'n' is not a valid function for 'DataFrameGroupBy' object

In [242]:
advlbl_df

Unnamed: 0,idx,para,perms,premise,vm_label,vm_prob0,vm_prob1,vm_prob2
0,13505,A man and woman stand near the beach.,Two people stand near the beach.,A carefully balanced male stands on one foot near a clean ocean beach area.,0,0.979817,0.017380,0.002803
1,13505,A man and woman stand near the beach.,Two people are near the beach.,A carefully balanced male stands on one foot near a clean ocean beach area.,0,0.977980,0.020046,0.001974
2,13505,A man and woman stand near the beach.,A man and woman are outside.,A carefully balanced male stands on one foot near a clean ocean beach area.,0,0.908671,0.068383,0.022946
3,13505,A man and woman stand near the beach.,A man and woman are standing.,A carefully balanced male stands on one foot near a clean ocean beach area.,2,0.372555,0.210900,0.416545
4,13505,A man and woman stand near the beach.,A man and a woman are outside.,A carefully balanced male stands on one foot near a clean ocean beach area.,0,0.906121,0.070064,0.023815
...,...,...,...,...,...,...,...,...
3163,23223,The child plays in the mud...,The child is outside.,A little baby with dirty fingers and smudges on her face points to her blue eye.,1,0.202849,0.778191,0.018960
3164,23223,The child plays in the mud...,A kid plays in the mud.,A little baby with dirty fingers and smudges on her face points to her blue eye.,1,0.173756,0.514919,0.311325
3165,23223,The child plays in the mud...,The child plays in the mud.,A little baby with dirty fingers and smudges on her face points to her blue eye.,1,0.077979,0.496839,0.425181
3166,23223,The child plays in the mud...,A child is playing in mud.,A little baby with dirty fingers and smudges on her face points to her blue eye.,1,0.134194,0.532714,0.333092


## Archive 

In [None]:
# # calculates performance of victim model on a dataloader

# dl = valid_dl
# metric = load_metric('accuracy')
# for i, data in enumerate(dl): 
#     if i % 10 == 0 : print(i, "out of", len(dl)) 
#     labels,premise,hypothesis = data['label'].to(device),data["premise"],data["hypothesis"]
#     inputs = vm_tokenizer(premise,hypothesis, padding=True,truncation=True, return_tensors="pt")
#     inputs.to(device)
#     outputs = vm_model(**inputs, labels=labels)
#     probs = outputs.logits.softmax(1)
#     preds = probs.argmax(1)
#     metric.add_batch(predictions=preds, references=labels)

# metric.compute()


In [None]:
# # Score semantic similarity with cross encoders

# from sentence_transformers.cross_encoder import CrossEncoder
# cross_encoder= CrossEncoder('cross-encoder/quora-distilroberta-base')
# i =11
# data = valid_small[i]
# orig, para = data['hypothesis'], data['hypothesis_paraphrases']
# orig_rep = [orig for i in range(len(para))]
# pairs = list(zip(orig_rep,para))
# scores = cross_encoder.predict(pairs)
# results_df = pd.DataFrame({'pairs':pairs, 'para': para,'score': cos_sim})
# print(orig)
# results_df.sort_values('score', ascending=False)

In [None]:
# # with sentence transformers

# valid_small_dl = DataLoader(valid_small, batch_size=4, shuffle=False, 
#                      num_workers=n_wkrs, pin_memory=True)
# sim_score_l = []
# for i, data in enumerate(valid_small_dl): 
#     pass
#     orig, para = data['hypothesis'], data['hypothesis_paraphrases']
#     orig_emb,para_emb  = embedding_model.encode(orig),embedding_model.encode(para)
# #     cos_sim = util.cos_sim(orig_emb,para_emb)[0]
# #     results_df = pd.DataFrame({'para': para,'score': cos_sim})
# #     print(orig)
# #     results_df.sort_values('score', ascending=False)