In [1]:
import json
import os
import random
from shutil import copyfile
from subprocess import Popen, PIPE, STDOUT

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable, grad

from bleu import compute_bleu
from models import load_models, generate
from utils import batchify, to_gpu
from utils import Corpus, filter_flip_polarity
# Seed every RNG this notebook touches (Python, NumPy, PyTorch) with the same value
# so that repeated runs are comparable.
random.seed(1111)
np.random.seed(1111)
# Last expression of the cell: the torch Generator it returns is echoed as the cell output.
torch.manual_seed(1111)

<torch._C.Generator at 0x7fcb14062070>

In [10]:
# --- Data locations ---------------------------------------------------------
# Original dataset; its train.txt is copied next to the adversarial test sets below.
DATA_DIR = './data/hsieh_bpe'
# Directory containing the adversarial test files that get evaluated.
ADVERSARIAL_DATA_DIR = './adversarial_data'

# --- Model under evaluation -------------------------------------------------
# Swap in one of the commented alternatives to evaluate a different checkpoint directory.
MODEL_DIR = './output/hsieh_bpe_20_epochs'
# MODEL_DIR = './output/hsieh_bpe_20_epochs_fgsm_0.016'
# MODEL_DIR = './output/hsieh_bpe_20_epochs_pgd_0.05_0.001_40'

# --- Evaluation settings ----------------------------------------------------
# Batch size passed to batchify() when building test batches.
BATCH_SIZE = 35

In [11]:
# Load the trained components saved under MODEL_DIR. `suffix` selects which saved
# checkpoint to load (presumably the one written after epoch 10 -- TODO confirm).
model_args, idx2word, autoencoder, gan_gen, gan_disc, enc_classifier = load_models(
    MODEL_DIR, suffix="_10", on_gpu=True, arch_cl="100")

# The GAN generator/discriminator are not needed here; drop them and release
# the cached GPU memory.
del gan_gen
del gan_disc
torch.cuda.empty_cache()

# Read the vocabulary through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("{}/vocab.json".format(MODEL_DIR), "r") as vocab_file:
    word2idx = json.load(vocab_file)

Loading models from./output/hsieh_bpe_20_epochs


In [12]:
# Cross-entropy loss used to score the classifier output against the tags
# (and to derive the adversarial gradients); moved to the GPU.
criterion_ce = nn.CrossEntropyLoss()
criterion_ce = criterion_ce.cuda()

In [5]:
def fgsm_attack(sentence_embedding, epsilon, data_grad, clip_min=-0.34, clip_max=0.32):
    """Apply a single FGSM step to a sentence embedding.

    Every element of the embedding is moved by `epsilon` in the direction of the
    sign of the loss gradient, then the result is clipped to [clip_min, clip_max].

    Args:
        sentence_embedding: tensor holding the embedding(s) to perturb.
        epsilon: step size applied to every element.
        data_grad: gradient of the loss w.r.t. `sentence_embedding` (same shape).
        clip_min: lower bound for the perturbed values; the default is the range
            this notebook uses for embeddings. Pass None for no lower bound.
        clip_max: upper bound for the perturbed values. Pass None for no upper bound.

    Returns:
        The perturbed (and clipped) embedding tensor.
    """
    # Only the direction of the gradient matters for FGSM, not its magnitude.
    sign_data_grad = data_grad.sign()
    perturbed_embedding = sentence_embedding + epsilon * sign_data_grad

    # torch.clamp needs at least one bound, so skip it when clipping is disabled.
    if clip_min is None and clip_max is None:
        return perturbed_embedding
    # Keep the perturbed embedding inside the allowed value range.
    return torch.clamp(perturbed_embedding, min=clip_min, max=clip_max)

In [17]:
def evaluate_model(data, perturb=None, epsilon=.015, alpha=.015, pgd_iters=49, return_pred_and_tags=False):
    """Classify every batch in `data`, optionally attacking the encoded sentences first.

    Uses the notebook-level `autoencoder`, `enc_classifier` and `criterion_ce`.

    Args:
        data: iterable of batches; each batch unpacks to (source, target, lengths, tags).
        perturb: None to classify the unperturbed codes, 'fgsm' to apply a single
            FGSM step, or 'pgd' to apply iterated projected gradient ascent.
        epsilon: FGSM step size, or the maximum per-element deviation from the
            clean code for PGD.
        alpha: PGD step size per iteration (ignored otherwise).
        pgd_iters: number of PGD iterations (ignored otherwise).
        return_pred_and_tags: when True, also return the predictions and tags.

    Returns:
        The accuracy (fraction of predictions equal to the tags; NaN when `data`
        yields no examples), or the tuple (accuracy, all_pred, all_tags) when
        `return_pred_and_tags` is True.

    Raises:
        ValueError: if `perturb` is not None, 'fgsm' or 'pgd'.
    """
    # Fail loudly on typos such as 'FGSM'; silently falling back to the
    # unperturbed evaluation would report misleading robustness numbers.
    if perturb not in (None, 'fgsm', 'pgd'):
        raise ValueError(
            "perturb must be None, 'fgsm' or 'pgd', got {!r}".format(perturb))

    # Value range the perturbed codes are clipped to (same bounds fgsm_attack uses by default).
    code_min, code_max = -0.34, 0.32

    all_pred = []
    all_tags = []

    for batch in data:
        source, target, lengths, tags = batch
        source = to_gpu(True, Variable(source))
        tags = to_gpu(True, Variable(tags))

        # Encode the sentences with the autoencoder (no decoding, no noise).
        output_encode_only = autoencoder(source, lengths, noise=False, encode_only=True)
        # Keep the gradient on this tensor after backward(); the FGSM branch reads
        # it from output_encode_only.grad.
        output_encode_only.retain_grad()

        # Classifier output on the clean code.
        output_classifier = enc_classifier(output_encode_only)

        if perturb == 'fgsm':
            classifier_loss = criterion_ce(output_classifier, tags)
            enc_classifier.zero_grad()
            classifier_loss.backward()
            code_grad = output_encode_only.grad.data
            perturbed_code = fgsm_attack(output_encode_only, epsilon, code_grad)

            # Classifier predictions on the perturbed code.
            scores = enc_classifier(perturbed_code)

        elif perturb == 'pgd':
            # alpha: step size
            # epsilon: max perturbation (ball)
            perturbed_code = output_encode_only.clone().detach()
            # `_` instead of a named index: the original reused the outer loop's `i`.
            for _ in range(pgd_iters):
                perturbed_code.requires_grad = True
                scores = enc_classifier(perturbed_code)
                tmp_loss = criterion_ce(scores, tags)
                enc_classifier.zero_grad()
                # A fresh graph is built every iteration, so it need not be retained.
                tmp_loss.backward()

                # Step in the direction of the gradient.
                perturbed_code = perturbed_code + alpha * perturbed_code.grad.sign()

                # Project back into the epsilon-ball around the clean code using an
                # element-wise min/max, from:
                # https://gist.github.com/oscarknagg/45b187c236c6262b1c4bbe2d0920ded6#file-projected_gradient_descent-py
                upper = output_encode_only + epsilon
                lower = output_encode_only - epsilon
                perturbed_code = torch.max(torch.min(perturbed_code, upper), lower).detach()
                perturbed_code = torch.clamp(perturbed_code, code_min, code_max)

            # Classifier predictions on the perturbed code.
            scores = enc_classifier(perturbed_code)

        else:
            scores = output_classifier

        # Predicted class = index of the highest score along the last dimension.
        _, output_classifier_argmax = torch.max(scores, -1)
        pred = output_classifier_argmax.cpu().numpy()

        all_pred.extend(pred)
        all_tags.extend(tags.cpu().numpy())

    if all_pred:
        accuracy = (np.array(all_pred) == np.array(all_tags)).mean()
    else:
        # No examples: report NaN explicitly instead of taking the mean of an empty array.
        accuracy = float('nan')

    if return_pred_and_tags:
        return accuracy, all_pred, all_tags
    return accuracy

In [16]:
# Corpus needs a train.txt in the directory it loads from, so copy the real
# training file next to the adversarial test sets.
copyfile(os.path.join(DATA_DIR, 'train.txt'),
         os.path.join(ADVERSARIAL_DATA_DIR, 'train.txt'))

# Every other .txt file in that directory is treated as an adversarial test set.
filenames = sorted(
    filename for filename in os.listdir(ADVERSARIAL_DATA_DIR)
    if filename.endswith(".txt") and filename != 'train.txt'
)

for filename in filenames:
    # File names look like "fgsm eps_0.001 test.txt": epsilon is the part after
    # the underscore in the second space-separated token.
    try:
        epsilon = filename.split(' ')[1].split('_')[1]
    except IndexError:
        # A stray .txt file would otherwise abort the whole evaluation.
        print('Skipping {!r}: name does not follow "<attack> eps_<value> ..."'.format(filename))
        continue

    corpus = Corpus(ADVERSARIAL_DATA_DIR,
                    maxlen=30,
                    vocab_size=12000,
                    lowercase=False,
                    max_lines=100000,
                    test_size=-1,
                    load_vocab_file=os.path.join(MODEL_DIR, 'vocab.json'),
                    test_path=filename)

    test_data = batchify(corpus.test, bsz=BATCH_SIZE, shuffle=False, pad_id=0)
    # The files already contain adversarial examples, so no live perturbation is applied.
    acc = evaluate_model(test_data, perturb=None)
    print('Epsilon {}, Acc {}'.format(epsilon, acc))
    print('\n')

Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.001 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.001 test.txt: 0 out of 4784 total, dropped 0. OOV rate 0.000
Epsilon 0.001, Acc 0.8922268907563026


Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.006 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.006 test.txt: 0 out of 22655 total, dropped 8. OOV rate 0.000
Epsilon 0.006, Acc 0.8500331198940163


Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.011

<b>Evaluating on FGSM-trained model, hsieh_bpe_20_epochs_fgsm_0.016</b>

In [7]:
# NOTE(review): this cell evaluates whichever models are currently loaded and reads
# the vocab from MODEL_DIR, so MODEL_DIR must point at the intended model and the
# model-loading cell must have been re-run before executing it.

# Corpus needs a train.txt in the directory it loads from, so copy the real
# training file next to the adversarial test sets.
copyfile(os.path.join(DATA_DIR, 'train.txt'),
         os.path.join(ADVERSARIAL_DATA_DIR, 'train.txt'))

# Every other .txt file in that directory is treated as an adversarial test set.
filenames = sorted(
    filename for filename in os.listdir(ADVERSARIAL_DATA_DIR)
    if filename.endswith(".txt") and filename != 'train.txt'
)

for filename in filenames:
    # File names look like "fgsm eps_0.001 test.txt": epsilon is the part after
    # the underscore in the second space-separated token.
    try:
        epsilon = filename.split(' ')[1].split('_')[1]
    except IndexError:
        # A stray .txt file would otherwise abort the whole evaluation.
        print('Skipping {!r}: name does not follow "<attack> eps_<value> ..."'.format(filename))
        continue

    corpus = Corpus(ADVERSARIAL_DATA_DIR,
                    maxlen=30,
                    vocab_size=12000,
                    lowercase=False,
                    max_lines=100000,
                    test_size=-1,
                    load_vocab_file=os.path.join(MODEL_DIR, 'vocab.json'),
                    test_path=filename)

    test_data = batchify(corpus.test, bsz=BATCH_SIZE, shuffle=False, pad_id=0)
    # The files already contain adversarial examples, so no live perturbation is applied.
    acc = evaluate_model(test_data, perturb=None)
    print('Epsilon {}, Acc {}'.format(epsilon, acc))
    print('\n')

Loaded vocab file ./output/hsieh_bpe_20_epochs_fgsm_0.016/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.001 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.001 test.txt: 0 out of 4784 total, dropped 0. OOV rate 0.000
Epsilon 0.001, Acc 0.8947478991596639


Loaded vocab file ./output/hsieh_bpe_20_epochs_fgsm_0.016/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.006 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.006 test.txt: 0 out of 22655 total, dropped 8. OOV rate 0.000
Epsilon 0.006, Acc 0.84707440936189


Loaded vocab file ./output/hsieh_bpe_20_epochs_fgsm_0.016/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV 

<b>Evaluating on PGD-trained model, hsieh_bpe_20_epochs_pgd_0.05_0.001_40</b>

In [8]:
# NOTE(review): this cell evaluates whichever models are currently loaded and reads
# the vocab from MODEL_DIR, so MODEL_DIR must point at the intended model and the
# model-loading cell must have been re-run before executing it.

# Corpus needs a train.txt in the directory it loads from, so copy the real
# training file next to the adversarial test sets.
copyfile(os.path.join(DATA_DIR, 'train.txt'),
         os.path.join(ADVERSARIAL_DATA_DIR, 'train.txt'))

# Every other .txt file in that directory is treated as an adversarial test set.
filenames = sorted(
    filename for filename in os.listdir(ADVERSARIAL_DATA_DIR)
    if filename.endswith(".txt") and filename != 'train.txt'
)

for filename in filenames:
    # File names look like "fgsm eps_0.001 test.txt": epsilon is the part after
    # the underscore in the second space-separated token.
    try:
        epsilon = filename.split(' ')[1].split('_')[1]
    except IndexError:
        # A stray .txt file would otherwise abort the whole evaluation.
        print('Skipping {!r}: name does not follow "<attack> eps_<value> ..."'.format(filename))
        continue

    corpus = Corpus(ADVERSARIAL_DATA_DIR,
                    maxlen=30,
                    vocab_size=12000,
                    lowercase=False,
                    max_lines=100000,
                    test_size=-1,
                    load_vocab_file=os.path.join(MODEL_DIR, 'vocab.json'),
                    test_path=filename)

    test_data = batchify(corpus.test, bsz=BATCH_SIZE, shuffle=False, pad_id=0)
    # The files already contain adversarial examples, so no live perturbation is applied.
    acc = evaluate_model(test_data, perturb=None)
    print('Epsilon {}, Acc {}'.format(epsilon, acc))
    print('\n')

Loaded vocab file ./output/hsieh_bpe_20_epochs_pgd_0.05_0.001_40/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.001 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.001 test.txt: 0 out of 4784 total, dropped 0. OOV rate 0.000
Epsilon 0.001, Acc 0.8968487394957984


Loaded vocab file ./output/hsieh_bpe_20_epochs_pgd_0.05_0.001_40/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.006 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.006 test.txt: 0 out of 22655 total, dropped 8. OOV rate 0.000
Epsilon 0.006, Acc 0.8469419297858247


Loaded vocab file ./output/hsieh_bpe_20_epochs_pgd_0.05_0.001_40/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 to

# Focus on the non-robust model's classification errors

### For each adversarial dataset, get the indices of test examples that were misclassified by the non-robust model

In [51]:
# Checkpoint suffix passed to load_models; reused by the per-model evaluation cell below.
after_epochs = "_20"

model_args, idx2word, autoencoder, gan_gen, gan_disc, enc_classifier = load_models(
    MODEL_DIR, suffix=after_epochs, on_gpu=True, arch_cl="100")

# The GAN generator/discriminator are not needed here; drop them and release
# the cached GPU memory.
del gan_gen
del gan_disc
torch.cuda.empty_cache()

# Read the vocabulary through a context manager so the file handle is closed.
with open("{}/vocab.json".format(MODEL_DIR), "r") as vocab_file:
    word2idx = json.load(vocab_file)

# Corpus needs a train.txt in the directory it loads from, so copy the real
# training file next to the adversarial test sets.
copyfile(os.path.join(DATA_DIR, 'train.txt'),
         os.path.join(ADVERSARIAL_DATA_DIR, 'train.txt'))

# Every other .txt file in that directory is treated as an adversarial test set.
filenames = sorted(
    filename for filename in os.listdir(ADVERSARIAL_DATA_DIR)
    if filename.endswith(".txt") and filename != 'train.txt'
)

# filename -> one-element list holding a boolean mask over the test examples
# (True where the currently loaded model's prediction differs from the tag).
misclassified_data = dict()

for filename in filenames:
    # File names look like "fgsm eps_0.001 test.txt": epsilon is the part after
    # the underscore in the second space-separated token.
    try:
        epsilon = filename.split(' ')[1].split('_')[1]
    except IndexError:
        # A stray .txt file would otherwise abort the whole evaluation.
        print('Skipping {!r}: name does not follow "<attack> eps_<value> ..."'.format(filename))
        continue

    corpus = Corpus(ADVERSARIAL_DATA_DIR,
                    maxlen=30,
                    vocab_size=12000,
                    lowercase=False,
                    max_lines=100000,
                    test_size=-1,
                    load_vocab_file=os.path.join(MODEL_DIR, 'vocab.json'),
                    test_path=filename)

    # bsz=1: presumably so the predictions line up one-to-one with corpus.test,
    # which the next cell indexes with this mask -- TODO confirm against batchify.
    test_data = batchify(corpus.test, bsz=1, shuffle=False, pad_id=0)
    acc, pred, tags = evaluate_model(test_data, perturb=None, return_pred_and_tags=True)
    # Kept wrapped in a list because the next cell reads the mask back with [0].
    misclassified_data[filename] = [np.array(pred) != np.array(tags)]
    print('Epsilon {}, Acc {}'.format(epsilon, acc))
    print('\n')

Loading models from./output/hsieh_bpe_20_epochs
Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.001 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.001 test.txt: 0 out of 4784 total, dropped 0. OOV rate 0.000
Epsilon 0.001, Acc 0.8913043478260869


Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.006 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.006 test.txt: 0 out of 22655 total, dropped 8. OOV rate 0.000
Epsilon 0.006, Acc 0.8473086943082969


Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, d

### For each adversarial dataset, calculate the accuracy of every model (the non-robust baseline and the adversarially trained ones) on the examples that the non-robust model misclassified

In [52]:
# Checkpoint directories to compare: the baseline plus the adversarially trained variants.
models_to_evaluate = [
    './output/hsieh_bpe_20_epochs',
    './output/hsieh_bpe_20_epochs_fgsm_0.016',
    './output/hsieh_bpe_20_epochs_fgsm_0.051',
    './output/hsieh_bpe_20_epochs_pgd_0.05_0.001_40',
]


for model_dir in models_to_evaluate:
    print("\n\n ==", model_dir, "==\n")
    model_args, idx2word, autoencoder, gan_gen, gan_disc, enc_classifier = load_models(
        model_dir, suffix=after_epochs, on_gpu=True, arch_cl="100")

    # The GAN generator/discriminator are not needed here; drop them and release
    # the cached GPU memory.
    del gan_gen
    del gan_disc
    torch.cuda.empty_cache()

    # Use the vocabulary of the model being evaluated (the original always read
    # MODEL_DIR's vocab here, regardless of which model was loaded).
    vocab_path = os.path.join(model_dir, 'vocab.json')
    with open(vocab_path, "r") as vocab_file:
        word2idx = json.load(vocab_file)

    # Load each adversarial dataset, but keep only the examples on which the
    # non-robust model made classification errors.
    for filename in filenames:
        # File names look like "fgsm eps_0.001 test.txt": epsilon is the part after
        # the underscore in the second space-separated token.
        try:
            epsilon = filename.split(' ')[1].split('_')[1]
        except IndexError:
            print('Skipping {!r}: name does not follow "<attack> eps_<value> ..."'.format(filename))
            continue
        if filename not in misclassified_data:
            print('Skipping {!r}: no misclassification mask was recorded for it'.format(filename))
            continue

        corpus = Corpus(ADVERSARIAL_DATA_DIR,
                        maxlen=30,
                        vocab_size=12000,
                        lowercase=False,
                        max_lines=100000,
                        test_size=-1,
                        load_vocab_file=vocab_path,
                        test_path=filename)

        mask = misclassified_data[filename][0]
        # The mask was computed on an earlier load of this file; make sure it still
        # lines up with the examples before using it to select them.
        if len(mask) != len(corpus.test):
            raise ValueError(
                'Mask for {!r} has {} entries but the corpus has {} test examples'.format(
                    filename, len(mask), len(corpus.test)))

        # Filter with plain Python instead of np.array(corpus.test)[mask]: the
        # examples may have different lengths, which NumPy cannot reliably turn
        # into an array, and this keeps the examples in their original form.
        misclassified_examples = [
            example for example, was_wrong in zip(corpus.test, mask) if was_wrong
        ]
        if not misclassified_examples:
            print('Epsilon {}: no misclassified examples to evaluate'.format(epsilon))
            print('\n')
            continue

        test_data = batchify(misclassified_examples,
                             bsz=BATCH_SIZE, shuffle=False, pad_id=0)

        acc = evaluate_model(test_data, perturb=None)
        print('Epsilon {}, Acc {}'.format(epsilon, acc))
        print('\n')



 == ./output/hsieh_bpe_20_epochs ==

Loading models from./output/hsieh_bpe_20_epochs
Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.001 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.001 test.txt: 0 out of 4784 total, dropped 0. OOV rate 0.000
Epsilon 0.001, Acc 0.022448979591836733


Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial_data/train.txt: 0 out of 100000 total, dropped 1517. OOV rate 0.000
Using fgsm eps_0.006 test.txt as test set
Number of sentences cropped from ./adversarial_data/fgsm eps_0.006 test.txt: 0 out of 22655 total, dropped 8. OOV rate 0.000
Epsilon 0.006, Acc 0.0239067055393586


Loaded vocab file ./output/hsieh_bpe_20_epochs/vocab.json with 5971 words
Number of sentences cropped from ./adversarial