In [2]:
%load_ext autoreload
%autoreload 2
import argparse
import numpy as np
import random
import json
from subprocess import Popen, PIPE, STDOUT
import torch
import torch.nn as nn
from torch.autograd import Variable, grad
from bleu import compute_bleu
from models import load_models, generate
from utils import batchify, to_gpu
from utils import Corpus, filter_flip_polarity
# Seed every RNG the notebook touches (Python, NumPy, PyTorch) with one value
# so repeated runs draw the same random numbers.
SEED = 1111
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f6890051670>

In [3]:
def remove_separators(sent):
    """Undo BPE segmentation by deleting the "@@" continuation markers.

    Removes every "@@ " (marker followed by the joining space) and also a
    dangling "@@" / "@@ " at the end of a line, which appears when a sentence
    is cut off in the middle of a word. This is the same pattern as the
    `sed 's/(@@ )|(@@ ?$)//g'` command kept in a comment at the end of this
    notebook.

    Args:
        sent: BPE-segmented text (str).

    Returns:
        The text with sub-word units merged back into words.
    """
    import re  # local import: only this helper needs it

    # MULTILINE makes `$` match at each line end, mirroring sed's per-line behaviour.
    return re.sub(r"(@@ )|(@@ ?$)", "", sent, flags=re.MULTILINE)

def to_BPE(sent):
    """Segment `sent` into BPE sub-word units via the apply_bpe.py script.

    The sentence is written to the script's stdin as UTF-8 and the script's
    stdout is returned decoded as UTF-8. stderr is captured separately so that
    warnings printed by the script cannot end up inside the returned text.

    Args:
        sent: whitespace-tokenised sentence (str).

    Returns:
        The script's stdout as a str (sub-word units joined with "@@ ").

    Raises:
        RuntimeError: if the script exits with a non-zero status; the message
            carries the exit code and whatever the script wrote to stderr.
    """
    p = Popen(['python', 'bytepairencoding/apply_bpe.py', "--codes", "bytepairencoding/bpecode_yelp"],
              stdout=PIPE, stdin=PIPE, stderr=PIPE)
    bpe_stdout, bpe_stderr = p.communicate(input=sent.encode('utf-8'))
    if p.returncode != 0:
        # Fail loudly instead of handing an error message downstream as if it were BPE output.
        raise RuntimeError("apply_bpe.py failed with exit code {}: {}".format(
            p.returncode, (bpe_stderr or b"").decode('utf-8', errors='replace').strip()))
    return bpe_stdout.decode('utf-8')

# Sanity check: BPE-encode a sentence, strip the separators again, and display the result.
remove_separators(to_BPE("Test this ."))

'Test this .'

In [4]:
def gen_from(test_sentence, num_output=10):
    """Encode one sentence with noise several times and decode each noisy code.

    The sentence is BPE-encoded, prefixed with '<sos>' and mapped to ids through
    `word2idx` (there is no '<oov>' fallback: an unknown token raises KeyError).
    It is first encoded without noise to obtain the classifier's prediction for
    the unperturbed code. Then, `num_output` times, it is encoded with
    `noise=True`, classified, and decoded with `autoencoder.generate`.

    Args:
        test_sentence: whitespace-tokenised input sentence (str).
        num_output: number of noisy encodings to decode.

    Returns:
        (sentences, classes, original_class) where `sentences` holds one decoded
        string per round (words up to the first '<eos>'), `classes` holds the
        string-formatted predicted-class array for each round, and
        `original_class` is the numpy array of predicted classes for the
        noise-free encoding.
    """
    # autoencoder.hidden_init = True
    tokens = ['<sos>'] + to_BPE(test_sentence).split(' ')
    token_ids = [word2idx[token] for token in tokens]
    indices = to_gpu(True, Variable(torch.LongTensor([token_ids])))
    lengths = [len(tokens)]

    # Reference prediction on the noise-free code.
    clean_hidden = autoencoder.encode(indices, lengths=lengths, noise=False)
    clean_scores = enc_classifier(clean_hidden)
    original_class = torch.max(clean_scores, -1)[1].data.cpu().numpy()

    sentences, classes = [], []
    for _ in range(num_output):
        noisy_hidden = autoencoder.encode(indices, lengths=lengths, noise=True)
        predicted = torch.max(enc_classifier(noisy_hidden), -1)[1]
        label = "{}".format(predicted.data.cpu().numpy())

        generated = autoencoder.generate(hidden=noisy_hidden, maxlen=30, sample=False)
        # Only the last row's words survive this loop; the input batch built
        # above holds a single sentence (presumably generate returns one row).
        for token_row in generated.data.cpu().numpy():
            decoded = [idx2word[token] for token in token_row]
            # keep the words before the first '<eos>', or all of them if there is none
            stop = decoded.index('<eos>') if '<eos>' in decoded else len(decoded)
            truncated_sent = decoded[:stop]

        sentences.append(" ".join(truncated_sent))
        classes.append(label)
    return sentences, classes, original_class

In [5]:
def gen_adv(hidden):
    """Decode a hidden code into a sentence with the autoencoder.

    Runs `autoencoder.generate` (maxlen 30, no sampling), maps the generated
    ids to words through `idx2word` and keeps the words before the first
    '<eos>'.

    Args:
        hidden: hidden code passed to `autoencoder.generate`.

    Returns:
        The decoded words joined with single spaces. If several rows are
        generated, only the last row is returned.
    """
    generated = autoencoder.generate(hidden=hidden, maxlen=30, sample=False)
    for token_row in generated.data.cpu().numpy():
        decoded = [idx2word[token] for token in token_row]
        # cut at the first '<eos>' if there is one, otherwise keep everything
        stop = decoded.index('<eos>') if '<eos>' in decoded else len(decoded)
        truncated_sent = decoded[:stop]
    return " ".join(truncated_sent)

In [None]:
# Load the trained networks and the vocabulary from one checkpoint directory.
load_path = './output/example_forEMNLP'
model_args, idx2word, autoencoder, gan_gen, gan_disc, enc_classifier \
        = load_models(load_path, suffix="_10", on_gpu=True)
# `with` closes the vocabulary file as soon as it has been parsed.
with open("{}/vocab.json".format(load_path), "r") as vocab_file:
    word2idx = json.load(vocab_file)
# Move the two networks used below to the GPU and flag the autoencoder accordingly.
autoencoder.cuda()
enc_classifier.cuda()
autoencoder.gpu = True

In [None]:
# Reset accumulated gradients on both networks before the single-sentence experiment.
for network in (enc_classifier, autoencoder):
    network.zero_grad()

raw_sentence = "The chicken is good , but the rest of the food is even better ."
# Other inputs that were tried:
# raw_sentence = "Long line , inefficient staff . Maybe my expectations were too high but it just was n't as good as I was hoping for the calories ."
# raw_sentence = "enjoyed obviously cinnamon believe markt likely enjoyed creepy average specifically brazil gets primarily reality markt likely believe ich primarily brazil meh written too follow reality lover buy expectations likely dedicated condescending mediocre"
# raw_sentence = "A steakhouse can not deliver quality steak , which is unacceptable . Part of the steak was made carbonized , barely edible ."
# raw_sentence = "I love this place ! It 's walking distance from my office that service delicious fast food . It 's a great place to grab a quick freshly made breakfast"
# raw_sentence = "It was not the worse restaurant I 've ever had in a food place for over 20 months and our server was exquissive nothing like shit ."

# BPE-encode, prepend the start token and map every token to its vocabulary id
# (a token missing from word2idx raises KeyError).
test_sentence = ['<sos>'] + to_BPE(raw_sentence).split(' ')
test_sentence_ids = [word2idx[token] for token in test_sentence]
lengths = [len(test_sentence)]
indices = to_gpu(True, Variable(torch.LongTensor([test_sentence_ids])))

# Encode without noise and classify the resulting code.
hh = autoencoder(indices, lengths=lengths, noise=False, encode_only=True)
classifier_out = enc_classifier(hh)
# Ask autograd to keep .grad on both tensors when a backward pass is run later.
classifier_out.retain_grad()
hh.retain_grad()
classifier_out

In [None]:
# Perturb the encoded test sentence `hh` against the classifier and decode the result.
epsilon = 5e-2

# The gradient is computed here so the cell runs on a fresh kernel:
# `encoder_output_grad` is otherwise only assigned inside the epsilon-sweep
# cell further down.
# NOTE(review): the attack target is taken to be the class the classifier
# currently scores lowest (for a two-class model, the opposite polarity) -
# confirm this is the intended target.
target_class = torch.min(classifier_out, -1)[1].detach()
single_sentence_loss = nn.CrossEntropyLoss()(classifier_out, target_class)
# grad() returns the gradient directly; retain_graph=True keeps the graph alive
# so this cell can be re-run without re-encoding the sentence.
encoder_output_grad = torch.sign(grad(single_sentence_loss, hh, retain_graph=True)[0])

# Step against the sign of the gradient, i.e. towards a lower loss on the target class.
x_adversarial = to_gpu(True, Variable(hh - epsilon * encoder_output_grad, requires_grad=False))

# gen_adv performs the generate / id-to-word / cut-at-<eos> steps.
sent = gen_adv(x_adversarial)
print(remove_separators(sent))

In [None]:
# Corpus settings gathered in one mapping so they are easy to scan and tweak.
# NOTE(review): the vocabulary is read from ./output/example/ while the models
# are loaded from ./output/example_forEMNLP - confirm both directories hold the
# same vocab.json.
corpus_options = dict(
    maxlen=30,
    vocab_size=12000,
    lowercase=False,
    max_lines=100000,
    test_size=-1,
    load_vocab_file='./output/example/vocab.json',
    test_path='test.txt',
)
corpus = Corpus("./processed_yelp/", **corpus_options)

# Vocabulary size as seen by the corpus dictionary.
ntokens = len(corpus.dictionary.word2idx)

In [None]:
# Build fixed-size batches from the test split after it has gone through
# filter_flip_polarity; shuffling is off and padding uses id 0.
bsz = 35
f_test = filter_flip_polarity(corpus.test)
test_data = batchify(
    f_test,
    bsz=bsz,
    pad_id=0,
    shuffle=False,
)


In [None]:
# test a range of epsilon values for generation
def _decode_until_eos(token_ids):
    """Map a 1-D tensor of ids to words and join them, stopping before the first '<eos>'."""
    kept_words = []
    for token_id in token_ids:
        word = idx2word[token_id.item()]
        if word == '<eos>':
            break
        kept_words.append(word)
    return " ".join(kept_words)


criterion_ce = nn.CrossEntropyLoss().cuda()
# A list rather than a one-shot `map` iterator, so it can be inspected or reused.
eps_range = [float(eps) for eps in np.arange(1e-3, 1e-1, 2e-3)]
# eps_range = [0.015]
real_sent_printed = False
# original_sentences[i]: (tag, sentence) for the first sentence of batch i (filled on the first epsilon only)
# changed_sentences[i]:  one (predicted tag, sentence) per epsilon for the same batch
original_sentences = []
changed_sentences = []
nbatches = len(test_data)

for epsilon in eps_range:

    all_accuracies = 0.
    word_accuracies = 0.
    bleus = 0.
    for nbatch_id, batch in enumerate(test_data):
        enc_classifier.zero_grad()
        autoencoder.zero_grad()
        source, target, lengths, tags = batch
        source = to_gpu(True, Variable(source))
        target = to_gpu(True, Variable(target)) # word ID
        tags = to_gpu(True, Variable(tags))

        # autoencoder encoded
        # output_encode_only = autoencoder.encode(source, lengths, noise=False)
        output_encode_only = autoencoder(source, lengths, noise=False, encode_only=True)
        # keep .grad on the code after backward()
        output_encode_only.retain_grad()

        # classifier output
        output_classifier = enc_classifier(output_encode_only)
        # predicted class before the perturbation (not used below)
        _, output_classifier_argmax = torch.max(output_classifier, -1)
        classifier_loss = criterion_ce(output_classifier, tags)
        classifier_loss.backward()

        encoder_output_grad = torch.sign(output_encode_only.grad.data)

        # Step against the sign of the gradient of the classifier loss on `tags`,
        # then re-classify the perturbed code.
        x_adversarial = to_gpu(True, Variable(output_encode_only - epsilon * encoder_output_grad, requires_grad=False))
        y_adversarial = enc_classifier(x_adversarial)
        _, y_adversarial = torch.max(y_adversarial, -1)
        all_accuracies += \
                torch.mean(y_adversarial.eq(tags).float()).item()

        # autoencoder decode
        # batch x max_len
        # NOTE(review): lengths[0] is presumably the longest length in the batch - confirm against batchify
        output = autoencoder.generate(hidden=x_adversarial, maxlen=lengths[0], sample=False)
        # reshape
        flattened_output = output.view(-1)

        # positions whose target id is > 0 are kept (0 is the pad id passed to batchify)
        mask = target.gt(0)
        masked_target = target.masked_select(mask)

        masked_output = \
            flattened_output.masked_select(mask)
        assert masked_output.shape == masked_target.shape

        w_ac = torch.mean(masked_output.eq(masked_target).float()).item()
        word_accuracies += w_ac
        # bleu return (bleu, precisions, bp, ratio, translation_length, reference_length)
        bb, _, _, _, _, _ = compute_bleu([[masked_target.data.cpu().numpy()]], \
                                         [masked_output.data.cpu().numpy()], max_order=2)
        bleus += bb

        # example sentence: decoding stops at the first '<eos>', so only the
        # first sentence of the batch is shown, paired with the first tag
        if not real_sent_printed:
            real_tag = tags.data.cpu().numpy()[0]
            original_sentences.append((real_tag, _decode_until_eos(masked_target.data)))

        sent = _decode_until_eos(masked_output.data)
        pred_tag = y_adversarial.data.cpu().numpy()[0]
        # explicit bounds check instead of a bare try/except, which would hide any other error
        if nbatch_id >= len(changed_sentences):
            changed_sentences.append([])
        changed_sentences[nbatch_id].append((pred_tag, sent))

    mean_accuracy = all_accuracies / nbatches
    print("eps: {:.5f} acc: {:.5f} w_acc: {:.7f} bleu: {:.4f}".format( \
                epsilon, mean_accuracy, word_accuracies/nbatches, bleus/nbatches))
    real_sent_printed = True
    if mean_accuracy > 0.99998:
        print("Max accuracy. Break.")
        break

    torch.cuda.empty_cache()

# # replace @@s
# # !sed -i.bak -r 's/(@@ )|(@@ ?$)//g' senti_flip_examples_xzhang.txt

In [None]:
# Release unused cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()