GLUE sets: the model will be trained on the eval set, so you shouldn't also test on the eval set. The problem is that the labels are withheld for the test set. 
Start with SNLI. MultiNLI is a later option too, as is rotten_tomatoes. 
* Victim model performance on dataset train, valid, test set. (done, written code to measure it)
* Create new paraphrased valid + test datasets (done a preliminary version on the valid set) 
* Measure victim model performance on paraphrased datasets (done: accuracy on the vanilla valid set is about 87%; after generating 16 paraphrases (i.e. not many) and evaluating on all of them, accuracy is ~75%)
* Get document embeddings of original and paraphrased and compare 
  * https://github.com/UKPLab/sentence-transformers
* Write a simple way to measure paraphrase quality
* Construct reward function 


In [1]:
%load_ext autoreload
%autoreload 2

In [25]:
import os
import torch 
from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric
import datasets, transformers
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer
from pprint import pprint
import numpy as np, pandas as pd
from utils import *   # local script 
import pyarrow
from sentence_transformers import SentenceTransformer, util
from IPython.core.debugger import set_trace

# Allow up to 400 characters per column when pandas displays a DataFrame.
pd.set_option("display.max_colwidth", 400)
# Path prefix under which derived datasets are cached on disk by later cells.
path_cache = './cache/'

# Seed the torch (CPU and CUDA) and numpy random number generators.
seed = 420
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
# Index of the current CUDA device, or -1 when running on the CPU.
devicenum = torch.cuda.current_device() if device.type == 'cuda' else -1
# Worker count passed to the DataLoaders: four per CUDA device reported by torch.
n_wkrs = 4 * torch.cuda.device_count()
# Batch size used by the train/valid/test DataLoaders.
batch_size = 64

In [3]:
# Paraphrase generator: tokenizer and seq2seq model loaded from the same
# checkpoint name; the model is moved to `device`.
para_name = "tuner007/pegasus_paraphrase"
para_tokenizer = AutoTokenizer.from_pretrained(para_name)
para_model = AutoModelForSeq2SeqLM.from_pretrained(para_name).to(device)

In [4]:
# Victim Model (VM): the sequence classifier whose predictions are evaluated
# on original and paraphrased inputs. Tokenizer and model come from the same
# checkpoint name; the model is moved to `device`.
vm_name = "textattack/distilbert-base-cased-snli"
vm_tokenizer = AutoTokenizer.from_pretrained(vm_name)
vm_model = AutoModelForSequenceClassification.from_pretrained(vm_name).to(device)
# Label mappings taken from the model config (index -> label, label -> index).
vm_idx2lbl = vm_model.config.id2label
vm_lbl2idx = vm_model.config.label2id

In [5]:
# Load SNLI and pull out its three splits.
dataset = load_dataset("snli")
train,valid,test = dataset['train'],dataset['validation'],dataset['test']

# Drop every example whose label is -1.
# NOTE(review): presumably these are examples without a usable gold label --
# confirm against the dataset card.
remove_minus1_labels = lambda x: x['label'] != -1
train = train.filter(remove_minus1_labels)
valid = valid.filter(remove_minus1_labels)
test = test.filter(remove_minus1_labels)

# One shuffled DataLoader per split, all using the global batch size and
# worker count.
train_dl = DataLoader(train, batch_size=batch_size, shuffle=True, num_workers=n_wkrs)
valid_dl = DataLoader(valid, batch_size=batch_size, shuffle=True, num_workers=n_wkrs)
test_dl = DataLoader( test,  batch_size=batch_size, shuffle=True, num_workers=n_wkrs)



Reusing dataset snli (/data/tproth/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


HBox(children=(FloatProgress(value=0.0, max=551.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [6]:
def get_paraphrases(input_text,num_return_sequences,num_beams, num_beam_groups=1,diversity_penalty=0):
    """Generate paraphrases of `input_text` with the global paraphrase model.

    The text is tokenized (truncated, padded to the longest item) with
    `para_tokenizer`, moved to `device`, passed to `para_model.generate`, and
    the generated ids are decoded back to strings without special tokens.

    Args:
        input_text: text handed to the tokenizer.
        num_return_sequences: forwarded to `generate`.
        num_beams: forwarded to `generate`.
        num_beam_groups: forwarded to `generate` (default 1).
        diversity_penalty: forwarded to `generate` (default 0).

    Returns:
        The list of decoded strings produced by `batch_decode`.
    """
    encoded = para_tokenizer(
        input_text,
        truncation=True,
        padding='longest',
        return_tensors="pt",
    ).to(device)
    # NOTE(review): temperature is fixed at 1.5 here; confirm that it has any
    # effect for the decoding strategy selected by these arguments.
    generation_kwargs = dict(
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
    )
    generated_ids = para_model.generate(**encoded, **generation_kwargs)
    return para_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

def gen_hypothesis_paraphrases(x, n_seed_seqs=32):
    """Attach paraphrases of ``x['hypothesis']`` to ``x``.

    Two calls to `get_paraphrases` are made, each asking for
    ``n_seed_seqs / 2`` sequences with as many beams:

    * a "low diversity" call that uses the default beam-group settings, and
    * a "high diversity" call that uses one beam group per beam together
      with a large diversity penalty.

    The two result lists are concatenated (duplicates are kept) and stored
    under ``x['hypothesis_paraphrases']``.

    Args:
        x: a mapping with a 'hypothesis' entry; it is modified in place.
        n_seed_seqs: total number of sequences requested; must be divisible
            by 4.

    Returns:
        The same ``x`` object, now carrying 'hypothesis_paraphrases'.

    Raises:
        ValueError: if ``n_seed_seqs`` is not divisible by 4.
    """
    # TODO: figure out how to batch this.
    if n_seed_seqs % 4:
        raise ValueError("keep n_seed_seqs divisible by 4 for now")

    half = int(n_seed_seqs / 2)
    hypothesis = x['hypothesis']

    # Low-diversity paraphrases.
    low_diversity = get_paraphrases(
        hypothesis, num_return_sequences=half, num_beams=half
    )
    # High-diversity paraphrases; num_beam_groups and diversity_penalty can be
    # treated as hyperparameters.
    high_diversity = get_paraphrases(
        hypothesis,
        num_return_sequences=half,
        num_beams=half,
        num_beam_groups=half,
        diversity_penalty=50002.5,
    )

    x['hypothesis_paraphrases'] = low_diversity + high_diversity
    return x


In [85]:
# Build (or reload) a small validation subset with 48 paraphrases requested
# per hypothesis.
fname = path_cache + 'valid_small_48'
if os.path.exists(fname):  # rudimentary cache: reuse the saved dataset if the path exists
    valid_small = datasets.load_from_disk(fname)
else:
    # Take shard 0 of 20 contiguous shards of the validation split.
    valid_small = valid.shard(20, 0, contiguous=True)
    # Paraphrases are generated one example at a time (batched=False).
    valid_small = valid_small.map(lambda x: gen_hypothesis_paraphrases(x, n_seed_seqs=48),
                  batched=False)
    valid_small.save_to_disk(fname)

HBox(children=(FloatProgress(value=0.0, max=493.0), HTML(value='')))




In [8]:
# Create a new dataset by repeating all other fields to be same length as number of paraphrases. 
# 
def create_paraphrase_dataset(batch):
    """Expand a batch so that every paraphrase becomes its own row.

    For each example, its premise, hypothesis and label are repeated once per
    entry in that example's 'hypothesis_paraphrases' list, so all four output
    columns have the same length (the total number of paraphrases in the
    batch). An example with no paraphrases contributes no rows.

    Args:
        batch: a mapping of column name to list with the keys 'premise',
            'hypothesis', 'label' and 'hypothesis_paraphrases' (the latter a
            list of paraphrase lists, one per example).

    Returns:
        A dict with keys 'hypothesis', 'hypothesis_paraphrases', 'premise'
        and 'label', each a flat list with one entry per paraphrase.
    """
    paraphrases, hyp, prem, labels = [], [], [], []
    rows = zip(
        batch['premise'],
        batch['hypothesis'],
        batch['label'],
        batch['hypothesis_paraphrases'],
    )
    for premise, hypothesis, label, hyp_paraphrases in rows:
        n_paraphrases = len(hyp_paraphrases)
        paraphrases.extend(hyp_paraphrases)
        # Repeat the per-example fields so they line up with the paraphrases.
        hyp.extend([hypothesis] * n_paraphrases)
        prem.extend([premise] * n_paraphrases)
        labels.extend([label] * n_paraphrases)
    return {
        'hypothesis': hyp,
        'hypothesis_paraphrases': paraphrases,
        'premise': prem,
        'label': labels,
    }


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [31]:
# Build (or reload) the flattened dataset with one row per paraphrase.
fname = path_cache + 'valid_small_paraphrases_48'
if os.path.exists(fname):     
    valid_small_paraphrases = datasets.load_from_disk(fname)
else:
    # Need to call this with batched=True to work: create_paraphrase_dataset
    # takes a batch of columns and returns more rows than it receives.
    valid_small_paraphrases = valid_small.map(create_paraphrase_dataset,batched=True)
    valid_small_paraphrases.save_to_disk(fname)
# Unshuffled loader over the paraphrase rows.
some_dl = DataLoader(valid_small_paraphrases, batch_size=16, shuffle=False, num_workers=n_wkrs)

True

0 out of 493
10 out of 493
20 out of 493
30 out of 493
40 out of 493
50 out of 493
60 out of 493
70 out of 493
80 out of 493
90 out of 493
100 out of 493
110 out of 493
120 out of 493
130 out of 493
140 out of 493
150 out of 493
160 out of 493
170 out of 493
180 out of 493
190 out of 493
200 out of 493
210 out of 493
220 out of 493
230 out of 493
240 out of 493
250 out of 493
260 out of 493
270 out of 493
280 out of 493
290 out of 493
300 out of 493
310 out of 493
320 out of 493
330 out of 493
340 out of 493
350 out of 493
360 out of 493
370 out of 493
380 out of 493
390 out of 493
400 out of 493
410 out of 493
420 out of 493
430 out of 493
440 out of 493
450 out of 493
460 out of 493
470 out of 493
480 out of 493
490 out of 493


{'accuracy': 0.7560851926977687}

0 out of 154
10 out of 154
20 out of 154
30 out of 154
40 out of 154
50 out of 154
60 out of 154
70 out of 154
80 out of 154
90 out of 154
100 out of 154
110 out of 154
120 out of 154
130 out of 154
140 out of 154
150 out of 154


{'accuracy': 0.8768542979069295}

In [128]:
# Sentence embedding model used below to compare a hypothesis with its paraphrases.
embedding_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [130]:
# with sentence transformers
# Embed one hypothesis and its paraphrases, then rank the paraphrases by
# cosine similarity to the original.
i =9
data = valid_small[i]
orig, para = data['hypothesis'], data['hypothesis_paraphrases']
orig_emb,para_emb  = embedding_model.encode(orig),embedding_model.encode(para)
# Keep the first row of the similarity result: the scores of `orig` against
# each paraphrase (assumes `orig` is a single string, so there is exactly one
# row -- TODO confirm).
cos_sim = util.cos_sim(orig_emb,para_emb)[0]
results_df = pd.DataFrame({'para': para,'score': cos_sim})
print(orig)
# Final expression of the cell: the table sorted from highest to lowest score.
results_df.sort_values('score', ascending=False)

boys play football


Unnamed: 0,para,score
26,boys playing football,0.947049
27,boys playing football.,0.914903
31,boys playing football.,0.914903
8,The boys play football.,0.851649
4,Boys play football.,0.850938
16,Boys are playing football,0.823949
15,Boys playing football.,0.821321
14,The boys are playing football,0.818522
25,The boys are playing football,0.818522
9,There are boys playing football.,0.816782


## Archive 

In [None]:
# # calculates performance of victim model on a dataloader 
# dl = valid_dl
# metric = load_metric('accuracy')
# for i, data in enumerate(dl): 
#     if i % 10 == 0 : print(i, "out of", len(dl)) 
#     labels,premise,hypothesis = data['label'].to(device),data["premise"],data["hypothesis"]
#     inputs = vm_tokenizer(premise,hypothesis, padding=True,truncation=True, return_tensors="pt")
#     inputs.to(device)
#     outputs = vm_model(**inputs, labels=labels)
#     probs = outputs.logits.softmax(1)
#     preds = probs.argmax(1)
#     metric.add_batch(predictions=preds, references=labels)

# metric.compute()


In [None]:
# # Calculates accuracy on the paraphrase dataset
# dl = some_dl
# metric = load_metric('accuracy')
# for i, data in enumerate(dl): 
#     if i % 10 == 0 : print(i, "out of", len(dl))
#     labels,premise,paraphrases = data['label'].to(device),data["premise"],data["hypothesis_paraphrases"]
#     inputs = vm_tokenizer(premise,paraphrases, padding=True,truncation=True, return_tensors="pt")
#     inputs.to(device)
#     outputs = vm_model(**inputs, labels=labels)
#     probs = outputs.logits.softmax(1)
#     preds = probs.argmax(1)
#     metric.add_batch(predictions=preds, references=labels)

# metric.compute()

In [None]:
# # Score semantic similarity with cross encoders
# from sentence_transformers.cross_encoder import CrossEncoder
# cross_encoder= CrossEncoder('cross-encoder/quora-distilroberta-base')
# i =11
# data = valid_small[i]
# orig, para = data['hypothesis'], data['hypothesis_paraphrases']
# orig_rep = [orig for i in range(len(para))]
# pairs = list(zip(orig_rep,para))
# scores = cross_encoder.predict(pairs)
# results_df = pd.DataFrame({'pairs':pairs, 'para': para,'score': scores})
# print(orig)
# results_df.sort_values('score', ascending=False)