In [14]:
import os
import numpy as np
import pandas as pd
import random
import json
from subprocess import Popen, PIPE, STDOUT
import torch
import torch.nn as nn
from torch.autograd import Variable, grad
from bleu import compute_bleu
from models import load_models, generate
from utils import batchify, to_gpu
from utils import Corpus, filter_flip_polarity
# Seed every random number generator the notebook uses from one named constant,
# so the run is reproducible and the seed can be changed in a single place.
SEED = 1111
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fba03573050>

In [15]:
# Directory the trained models are loaded from; the vocab.json stored there is reused below.
MODEL_DIR = './output/hsieh_bpe_20_epochs'
# Directory handed to Corpus; the train/test text files are read from here.
DATA_DIR = './data/hsieh_bpe'

In [16]:
# Load the trained components (autoencoder, GAN generator/discriminator and the
# classifier that operates on encoder output) from MODEL_DIR onto the GPU.
model_args, idx2word, autoencoder, gan_gen, gan_disc, enc_classifier \
        = load_models(MODEL_DIR, suffix="_10", on_gpu=True, arch_cl="100")

# not needed: drop the GAN parts and release the cached GPU memory they held
del gan_gen
del gan_disc
torch.cuda.empty_cache()

# Read the vocabulary inside a context manager so the file handle is always closed
# (the previous json.load(open(...)) left it open until garbage collection).
with open("{}/vocab.json".format(MODEL_DIR), "r") as vocab_file:
    word2idx = json.load(vocab_file)

Loading models from./output/hsieh_bpe_20_epochs


In [17]:
# Build the corpus from DATA_DIR, reusing the vocabulary file stored next to the
# trained models and taking test.txt as the test split.
corpus = Corpus(
    DATA_DIR,
    maxlen=30,
    vocab_size=12000,
    lowercase=False,
    max_lines=100000,
    test_size=-1,
    load_vocab_file=os.path.join(MODEL_DIR, 'vocab.json'),
    test_path='test.txt',
)

Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./data/hsieh_bpe/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using test.txt as test set
Number of sentences cropped from ./data/hsieh_bpe/test.txt: 0 out of 100000 total, dropped 1538. OOV rate 0.000


**TODO: Do we need to flip the labels? Or is that only necessary because they apply FGSM as minus eps × grad?**

In [18]:
# Batch size used when batching the test split.
bsz = 35
# Disabled alternative, kept for reference: pass the test split through
# filter_flip_polarity before batching (related to the label-flipping TODO).
# f_test = filter_flip_polarity(corpus.test)
# test_data = batchify(f_test, bsz=bsz, shuffle=False, pad_id=0)

# Batch the full test split without shuffling, with pad_id=0.
test_data = batchify(corpus.test, bsz=bsz, shuffle=False, pad_id=0)

In [19]:
def recover_original_sentence(source_batch, bpe_decode=False):
    """Map a batch of token-id rows back to sentence strings.

    Args:
        source_batch: tensor of token ids on which ``.numpy()`` can be called
            (i.e. already on the CPU); each row is treated as one sentence.
        bpe_decode: if true, merge the BPE tokens into words via
            ``bpemb_en.decode`` (``bpemb_en`` must already exist in the
            notebook namespace); otherwise join the tokens with single spaces.

    Returns:
        A list with one string per row of ``source_batch``.
    """
    sentences = []
    for row in source_batch.numpy():
        tokens = []
        for token_id in row:
            # ids 0 and 1 are skipped; 0 is the pad_id used when batching,
            # 1 is presumably another special token -- TODO confirm
            if token_id > 1:
                tokens.append(corpus.dictionary.idx2word[token_id])

        # either merge BPE pieces into English words or leave them as BPE tokens
        sentence = bpemb_en.decode(tokens) if bpe_decode else " ".join(tokens)
        sentences.append(sentence)
    return sentences

def generate_sentences_from_embedding(autoencoder, embedded_sentences_batch, sample=False, bpe_decode=False):
    """Decode a batch of sentence embeddings into text with the autoencoder.

    Args:
        autoencoder: model exposing ``generate(codes, maxlen=..., sample=...)``.
        embedded_sentences_batch: batch of sentence codes passed to
            ``autoencoder.generate``.
        sample: forwarded unchanged to ``autoencoder.generate``.
        bpe_decode: if true, merge the BPE tokens into words via
            ``bpemb_en.decode`` (``bpemb_en`` must already exist in the
            notebook namespace); otherwise join the tokens with single spaces.

    Returns:
        A list with one string per generated row, each cut off just before
        its first ``<eos>`` token.
    """
    generated_ids = autoencoder.generate(embedded_sentences_batch, maxlen=50, sample=sample)
    generated_ids = generated_ids.data.cpu().numpy()

    sentences = []
    for row in generated_ids:
        tokens = [corpus.dictionary.idx2word[token_id] for token_id in row]
        # keep only what precedes the first <eos>, if there is one
        if '<eos>' in tokens:
            tokens = tokens[:tokens.index('<eos>')]

        if bpe_decode:
            # merge BPE pieces back into English words
            sentences.append(bpemb_en.decode(tokens))
        else:
            # leave as space-separated BPE tokens
            sentences.append(" ".join(tokens))
    return sentences

def fgsm_attack(sentence_embedding, epsilon, data_grad, clip_min=-0.34, clip_max=0.32):
    """Apply one Fast Gradient Sign Method step to a sentence embedding.

    Args:
        sentence_embedding: tensor holding the embedding(s) to perturb.
        epsilon: step size; every element moves by exactly ``epsilon`` in the
            direction of the sign of its gradient (elements with zero gradient
            do not move).
        data_grad: gradient of the loss with respect to ``sentence_embedding``;
            must be broadcastable to its shape.
        clip_min: lower bound applied to the perturbed embedding. The default
            is the value previously hard-coded here.
        clip_max: upper bound applied to the perturbed embedding. The default
            is the value previously hard-coded here.

    Returns:
        The perturbed embedding, clamped element-wise to ``[clip_min, clip_max]``.
    """
    # Only the direction (element-wise sign) of the gradient is used
    sign_data_grad = data_grad.sign()
    # Move every embedding dimension by epsilon along that direction
    perturbed_embedding = sentence_embedding + epsilon * sign_data_grad
    # Keep the result inside the value range expected for the embedding
    perturbed_embedding = torch.clamp(perturbed_embedding, clip_min, clip_max)
    return perturbed_embedding

In [20]:
# Cross-entropy criterion (moved to the GPU) that generate_adversarial_dataset uses to
# score classifier outputs against the true tags when computing attack gradients.
criterion_ce = nn.CrossEntropyLoss().cuda()


def generate_adversarial_dataset(data,
                                 directory='./adversarial_data',
                                 phase='test',  # for naming the output files appropriately
                                 perturb='pgd',
                                 epsilon=.015,
                                 alpha=.015, pgd_iters=40):
    """Perturb encoded sentences adversarially, decode them and write the results to disk.

    For every batch the sentences are encoded with the notebook-level
    ``autoencoder``, classified with ``enc_classifier``, perturbed in code
    space with FGSM or PGD, and both the clean and the perturbed codes are
    decoded back to text.

    Two files are written into ``directory``:

    * a ``.txt`` file with one ``label<TAB>adversarial sentence`` line for each
      example that was classified correctly before the attack and whose
      decoded adversarial sentence differs from its decoded clean sentence;
    * a ``.csv`` file with every example (original, adversarial decoding,
      clean decoding, label, prediction) plus an ``included_in_adv_dataset``
      flag that mirrors the filter above.

    Args:
        data: iterable of batches, each unpacking to
            ``(source, target, lengths, tags)``.
        directory: output directory; created if it does not exist.
        phase: dataset name, used in the log line and in the output file names.
        perturb: ``'fgsm'`` or ``'pgd'``.
        epsilon: FGSM step size, or the PGD maximum perturbation per element.
        alpha: PGD step size (ignored for FGSM).
        pgd_iters: number of PGD iterations (ignored for FGSM).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    assert perturb in ["fgsm", "pgd"], "perturb should be 'fgsm' or 'pgd'"
    all_tags = []
    predicted_tags = []
    original_sentences = []
    source_decoded = []
    adv_decoded = []

    print()
    print("Dataset:", phase)
    if perturb == 'fgsm':
        print("FGSM with epsilon {}.".format(epsilon))
    elif perturb == 'pgd':
        print("PGD with epsilon {} and alpha {}, {} iters.".format(epsilon, alpha, pgd_iters))

    torch.cuda.empty_cache()
    for batch in data:
        enc_classifier.zero_grad()
        autoencoder.zero_grad()
        source, target, lengths, tags = batch
        source = to_gpu(True, Variable(source))
        tags = to_gpu(True, Variable(tags))

        # recover original sentence as BPE tokens
        original_sentences.extend(recover_original_sentence(source.cpu()))

        # keep flat list of all tags
        all_tags.extend(tags.cpu().numpy())

        # autoencoder encoded
        output_encode_only = autoencoder(source, lengths, noise=False, encode_only=True)
        # keep the gradient on this non-leaf tensor so FGSM can read it after backward()
        output_encode_only.retain_grad()

        # classifier output
        output_classifier = enc_classifier(output_encode_only)

        # keep the predicted label
        _, output_classifier_argmax = torch.max(output_classifier, -1)
        predicted_tags.extend(output_classifier_argmax.cpu().numpy())

        # apply perturbation
        if perturb == 'fgsm':
            classifier_loss = criterion_ce(output_classifier, tags)
            enc_classifier.zero_grad()
            classifier_loss.backward()
            code_grad = output_encode_only.grad.data
            perturbed_code = fgsm_attack(output_encode_only, epsilon, code_grad)

        elif perturb == 'pgd':
            # alpha: step size
            # epsilon: max perturbation (ball)
            # The clean code of THIS batch is the centre of the epsilon-ball.
            # (The projection used to reference an undefined name `code`.)
            clean_code = output_encode_only.detach()
            perturbed_code = clean_code.clone()
            for _ in range(pgd_iters):
                perturbed_code.requires_grad = True
                scores = enc_classifier(perturbed_code)
                tmp_loss = criterion_ce(scores, tags)
                enc_classifier.zero_grad()
                # the graph is rebuilt on every iteration, so it need not be retained
                tmp_loss.backward()

                # step in the direction of the gradient
                perturbed_code = perturbed_code + alpha * perturbed_code.grad.sign()

                # Project back into the epsilon-ball around the clean code.
                # Workaround as PyTorch doesn't have elementwise clip
                # from: https://gist.github.com/oscarknagg/45b187c236c6262b1c4bbe2d0920ded6#file-projected_gradient_descent-py
                perturbed_code = torch.max(torch.min(perturbed_code, clean_code + epsilon),
                                           clean_code - epsilon).detach()
                # clip within normal range for embedding (same bounds as fgsm_attack)
                perturbed_code = torch.clamp(perturbed_code, -0.34, 0.32)

        # decode perturbed sentence
        adv_decoded.extend(generate_sentences_from_embedding(autoencoder, perturbed_code))

        # decode original sentence, for comparison
        source_decoded.extend(generate_sentences_from_embedding(autoencoder, output_encode_only))

    # write data

    # exist_ok avoids the check-then-create race and also creates nested paths
    os.makedirs(directory, exist_ok=True)

    # Create new datasets of decoded adversarial sentences, but only those that
    # do not exactly match their unperturbed counterpart after both are decoded,
    # and those that were initially classified correctly
    if perturb == 'fgsm':
        target_text_file = os.path.join(directory, 'fgsm eps_{} {}.txt'.format(epsilon, phase))
        target_df_file = os.path.join(directory, 'fgsm eps_{} {}.csv'.format(epsilon, phase))
    elif perturb == 'pgd':
        target_text_file = os.path.join(directory, 'pgd eps_{} alpha_{} iters_{} {}.txt'.format(epsilon, alpha, pgd_iters, phase))
        target_df_file = os.path.join(directory, 'pgd eps_{} alpha_{} iters_{} {}.csv'.format(epsilon, alpha, pgd_iters, phase))

    with open(target_text_file, 'w') as f:
        for s, d, t, p in zip(source_decoded, adv_decoded, all_tags, predicted_tags):
            if (s != d) and (p == t):
                # only keep if sentence was originally classified correctly,
                # and if the perturbed embedding is different than the original embedding
                # after they've both been decoded
                f.write(str(t))
                f.write("\t")
                f.write(d)
                f.write("\n")
    print("Done writing adversarial data to", target_text_file)

    # For inspection, write all of the data to file
    df = pd.DataFrame(list(zip(original_sentences,
                               adv_decoded,
                               source_decoded,
                               all_tags,
                               predicted_tags)),
                   columns =['original', 'adv_decoded', 'original_decoded', 'label', 'predicted'])
    df['included_in_adv_dataset'] = (df['adv_decoded'] != df['original_decoded']) & (df['label'] == df['predicted'])
    df.to_csv(target_df_file)
    print("Done writing dataframe to", target_df_file)
    print()

In [21]:
# Sweep FGSM over epsilon values from 0.001 up to (but excluding) 0.1 in steps
# of 0.005, rounded to three decimals so the output file names stay clean.
eps_range = list(np.round(np.arange(1e-3, 1e-1, 5e-3), 3))

for epsilon in eps_range:
    generate_adversarial_dataset(test_data, epsilon=epsilon, perturb='fgsm')


Dataset: test
FGSM with epsilon 0.001.
Done writing adversarial data to ./adversarial_data/fgsm eps_0.001 test.txt
Done writing dataframe to ./adversarial_data/fgsm eps_0.001 test.csv


Dataset: test
FGSM with epsilon 0.006.
Done writing adversarial data to ./adversarial_data/fgsm eps_0.006 test.txt
Done writing dataframe to ./adversarial_data/fgsm eps_0.006 test.csv


Dataset: test
FGSM with epsilon 0.011.
Done writing adversarial data to ./adversarial_data/fgsm eps_0.011 test.txt
Done writing dataframe to ./adversarial_data/fgsm eps_0.011 test.csv


Dataset: test
FGSM with epsilon 0.016.
Done writing adversarial data to ./adversarial_data/fgsm eps_0.016 test.txt
Done writing dataframe to ./adversarial_data/fgsm eps_0.016 test.csv


Dataset: test
FGSM with epsilon 0.021.
Done writing adversarial data to ./adversarial_data/fgsm eps_0.021 test.txt
Done writing dataframe to ./adversarial_data/fgsm eps_0.021 test.csv


Dataset: test
FGSM with epsilon 0.026.
Done writing adversarial data 