In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

import multiprocessing as mp

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Reshape, Activation, Input
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K


In [78]:
# %%time
# Vocabulary cap handed to the loader as `num_words` (top `max_features` most common words).
max_features = 10000

print('Loading data...')
train_split, test_split = imdb.load_data(num_words=max_features, seed=1)
x_train, y_train = train_split
x_test, y_test = test_split
print(len(x_train), 'train sequences, with shape', x_train.shape)
print(len(x_test), 'test sequences with shape', x_test.shape)

Loading data...
25000 train sequences, with shape (25000,)
25000 test sequences with shape (25000,)


In [79]:
### Decreasing the size of test set.
# Keep 2000 *distinct* test reviews. np.random.choice samples WITH replacement
# unless told otherwise, which would leave duplicated reviews in the reduced
# test set and skew any metric computed on it.
# NOTE: this cell overwrites x_test / y_test, so it must run exactly once after
# the data-loading cell (re-running it would subsample the subsample).
np.random.seed(10)
idx = np.random.choice(x_test.shape[0], 2000, replace=False)
x_test = x_test[idx]
y_test = y_test[idx]

In [4]:
# One-hot encoding of the labels is deliberately left disabled here: further
# down, craft_sample uses the label as an integer class index
# (x_gradient[y, word]) and compares it directly with the argmax prediction.
# NOTE(review): the output saved under this cell matches the print statements
# below, so it presumably comes from an earlier run in which they were active.
# print("One-hot encoding of labels")
# y_train = to_categorical(y_train, 2)
# y_test = to_categorical(y_test, 2)
# print('train labels shape:',y_train.shape)
# print('test labels shape:',y_test.shape)

One-hot encoding of labels
train labels shape: (25000, 2)
test labels shape: (2000, 2)


In [80]:
# Cut/pad every review to exactly this many tokens.
maxlen = 150

# Both splits must go through the SAME padding/truncation scheme. The test set
# previously used the pad_sequences defaults ('pre'/'pre') while the training
# set used 'post'/'post', so test reviews were laid out differently from the
# data the classifier is evaluated on below.
# NOTE(review): 'post' is presumably what the saved classifier was trained with
# (it scores ~99.75% on the post-padded training set) -- confirm.
x_train = sequence.pad_sequences(x_train, padding='post', truncating='post', maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, padding='post', truncating='post', maxlen=maxlen)
print('train data shape:', x_train.shape)
print('test data shape:', x_test.shape)

train data shape: (25000, 150)
test data shape: (2000, 150)


In [6]:
print("Setting up model-specific variables...")
# K.clear_session()

# Sizes used when the embedding -> LSTM -> Dense sub-model is rebuilt below.
embedding_size = lstm_size = 128
num_classes = 2

# Not referenced by the cells shown in this notebook; presumably the settings
# used when the classifier was trained -- TODO confirm.
batch_size, epochs = 64, 12
val_split = 0.2

Setting up model-specific variables...


In [7]:
# Restore the saved IMDB classifier from its HDF5 file (path is relative to the
# notebook's working directory).
clf_checkpoint = 'imdb_compiled_clf_150dim.h5'
imdb_clf = load_model(clf_checkpoint)

In [8]:
# Sanity check of the restored classifier on the padded training set; the
# displayed pair is [loss, accuracy].
imdb_clf.evaluate(x_train, y_train, batch_size=512)



[0.012113170527815818, 0.99752]

## Retrieve Embeddings for all the words in the Vocabulary

In [9]:
# layers[1] of the classifier carries the embedding matrix; pull it out as a
# NumPy array with one row per vocabulary index.
embedding_layer = imdb_clf.layers[1]
vocab_embeddings = embedding_layer.embeddings.numpy()
print("Shape of the generated embeddings: ",vocab_embeddings.shape)

Shape of the generated embeddings:  (10000, 128)


## Keras function to extract embeddings for samples

In [10]:
# Backend function that maps the classifier's input (token ids) to the output
# of layers[1], i.e. the embedded sequence.
clf_input = imdb_clf.layers[0].input
embedding_output = imdb_clf.layers[1].output
get_embeddings = K.function([clf_input], embedding_output)

print("Testing the embedding function with a single sample...")
# test_embed is reused further down to size the sub-model's Input layer.
test_embed = get_embeddings(x_test[0])
print("Shape of generated embeddings:",test_embed.shape)

Testing the embedding function with a single sample...
Shape of generated embeddings: (150, 128)


### Saving the model

## Adversarial crafting

### Sub-model - from Embeddings to logits

In [11]:
### Defining necessary layers
# The sub-model starts from already-embedded sequences, so its input takes the
# shape of one embedded sample (test_embed).
embed_input = Input(shape=test_embed.shape)
lstm_layer = LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)
embed_lstm = lstm_layer(embed_input)
dense_layer = Dense(num_classes)
embed_dense = dense_layer(embed_lstm)

### Define model with Embedding inputs and Logit outputs
embed_model = Model(inputs=embed_input, outputs=embed_dense)

### Transferring the trained weights from our IMDB Classifier model (imdb_clf)
# (sub-model layer index, classifier layer index): LSTM <- layers[2], Dense <- layers[3]
for sub_idx, clf_idx in ((1, 2), (2, 3)):
    embed_model.layers[sub_idx].set_weights(imdb_clf.layers[clf_idx].get_weights())
embed_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150, 128)]        0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
Total params: 131,842
Trainable params: 131,842
Non-trainable params: 0
_________________________________________________________________


### Calculate Jacobian matrix for all the words in the input

In [12]:
def compute_input_jacobian(x, y, model):
    """Jacobian of `model`'s outputs with respect to the embedded inputs.

    Parameters
    ----------
    x : token-id samples; embedded through the module-level `get_embeddings`.
    y : labels. Currently unused -- only the commented-out loss check below
        refers to it.
    model : callable applied to the embeddings reshaped to
        (-1, maxlen, embedding_size).

    Returns
    -------
    numpy.ndarray
        Result of `tape.batch_jacobian(outputs, embeddings)`; its shape is
        printed as a side effect.
    """
    embedded = get_embeddings(x).reshape(-1, maxlen, embedding_size)
    embedded_var = tf.Variable(tf.convert_to_tensor(embedded, tf.float32), dtype=tf.float32)

    # Only the embedded input is tracked; the model's own weights are not.
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(embedded_var)
        outputs = model(embedded_var)

    jacobian = tape.batch_jacobian(outputs, embedded_var).numpy()
    print("Shape of the Jacobian:", jacobian.shape)

    # if not compare_losses(x, y, pred_y) : return None
    return jacobian

def compare_losses(x, labels, preds):
    """Check that the loss recomputed from `preds` agrees with `imdb_clf`'s own loss.

    Parameters
    ----------
    x : token-id samples, reshaped to (-1, maxlen) before evaluation.
    labels : labels, reshaped to (-1, num_classes) before evaluation
        (i.e. expected in one-hot form).
    preds : model outputs fed to softmax cross-entropy as logits.

    Returns
    -------
    bool
        True when the mean recomputed loss is within 0.001 of the loss
        reported by `imdb_clf.evaluate`.

    Notes
    -----
    The previous version returned `calc_loss - model_loss < 0.001`: a
    one-sided, element-wise comparison that was also True whenever the
    recomputed loss was far *below* the model loss, and that yielded a tensor
    rather than a single truth value.
    NOTE(review): assumes `imdb_clf` was compiled with a categorical
    cross-entropy loss that matches this computation -- confirm.
    """
    # Per-sample losses; evaluate() reports a single averaged value, so average
    # here as well before comparing.
    per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=preds)
    calc_loss = float(np.mean(np.asarray(per_sample_loss)))
    model_loss, _ = imdb_clf.evaluate(x.reshape(-1,maxlen),labels.reshape(-1,num_classes))

    return abs(calc_loss - model_loss) < 0.001

In [158]:
def craft_sample(x, y, x_gradient):
    """Greedily replace words of `x`, left to right, until `imdb_clf` stops predicting `y`.

    Parameters
    ----------
    x : 1-D array of `maxlen` token ids. It is NOT modified; all replacements
        are written to a private copy.
    y : true class, given either as an integer class index or as a one-hot
        vector (converted to its class index).
    x_gradient : Jacobian for this sample, indexed as
        x_gradient[class, word_position] -> gradient over the embedding axis.

    Returns
    -------
    (sample, n_changes)
        If the prediction flips: the modified copy and the number of leading
        positions that were replaced before the flip was observed.
        Otherwise: an unmodified copy of `x` and 0. Note that 0 is also
        returned when the sample is already misclassified before any change.

    Notes
    -----
    Fixes over the previous version: the caller's array is no longer mutated
    in place, and the replacement made at the last position is now evaluated
    too (it used to be discarded without a prediction).
    NOTE(review): the Jacobian is computed once for the unmodified sample and
    candidates are scored with sign(word_grad - vocab_embeddings); confirm
    this is the intended word-selection rule.
    """
    # Accept both an integer label and a one-hot vector.
    y_arr = np.asarray(y)
    label = int(np.argmax(y_arr)) if y_arr.size > 1 else int(y_arr.reshape(-1)[0])

    original = np.array(x, copy=True)
    adv = original.copy()

    def _predicted_class(sample):
        # Single-sample prediction of the classifier as a class index.
        return int(np.argmax(imdb_clf.predict_on_batch(sample.reshape(-1, maxlen))))

    for word in range(maxlen):
        # `word` positions have been replaced so far; stop as soon as they suffice.
        if _predicted_class(adv) != label:
            return adv, word

        word_grad = x_gradient[label, word]

        # Summed gradient sign for this position vs. summed sign of
        # (gradient - embedding) for every vocabulary entry; pick the
        # vocabulary index whose score is closest.
        jac_sign = np.add.reduce(np.sign(word_grad))
        vocab_sign = np.add.reduce(np.sign(word_grad - vocab_embeddings), 1)
        adv[word] = np.argmin(np.absolute(vocab_sign - jac_sign))

    # The loop checks the prediction *before* each replacement, so the effect
    # of the final replacement still has to be checked here.
    if _predicted_class(adv) != label:
        return adv, maxlen

    return original, 0


In [14]:
%%time

crafted_x = []
num_changes = []
# idx = np.random.choice(x_train.shape[0], 10)
idx = range(512)
xs, ys = x_train[0:512].copy(), y_train[0:512].copy()
# xs, ys = x_train.copy(), y_train.copy()
print("Calculating gradients...")
x_gradients = compute_input_jacobian(xs,ys,embed_model)

print("Loss and accuracy of selected samples:", imdb_clf.evaluate(xs, ys, verbose=0))

Calculating gradients...
Shape of the Jacobian: (512, 2, 150, 128)
Loss and accuracy of selected samples: [0.020581036726071034, 0.9980469]
CPU times: user 1min 39s, sys: 9.26 s, total: 1min 48s
Wall time: 1min 31s


In [15]:
# Start from empty result lists: crafted samples and per-sample change counts.
crafted_x, num_changes = [], []

In [16]:
# Progress banner printed ahead of the (slow) per-sample crafting loop.
print("Crafting adversarial samples...")

Crafting adversarial samples...


In [None]:
# Collect results in lists local to this cell so it can be re-run safely:
# once the cell has run, `crafted_x` / `num_changes` are ndarrays, which have
# no `.append`, so appending to them directly fails on a second run.
crafted_samples, change_counts = [], []

for x, y, grad in tqdm(zip(xs, ys, x_gradients), total=xs.shape[0]):
    # Iterating over `xs` yields views of its rows; pass a copy so the rows of
    # `xs` stay untouched whatever craft_sample does with its argument.
    new_x, changes = craft_sample(x.copy(), y, grad)
    crafted_samples.append(new_x)
    change_counts.append(changes)

crafted_x = np.array(crafted_samples)
num_changes = np.array(change_counts)

# Samples reported with 0 changes are included in this average.
print("Average number of changes per sample:", num_changes.mean())

# Loss / accuracy of the classifier on the crafted samples against the true labels.
imdb_clf.evaluate(crafted_x, ys)

In [19]:
import pickle

# Open with 'wb', not 'ab': in append mode every re-run adds another pickle to
# the end of the file, and a plain pickle.load() keeps returning the FIRST
# (oldest) object, silently hiding the newly crafted samples.
with open('Crafted_x_train_512_150dim.pickle', 'wb') as fo:
    pickle.dump(crafted_x, fo)

In [20]:
# The crafted samples are integer token ids; write them as integers instead of
# savetxt's default '%.18e' scientific-notation floats.
np.savetxt('crafted_x_train_512_150dim.csv', crafted_x, delimiter=',', fmt='%d')

In [18]:
from lib.make_batches import *



In [169]:
# Label of training sample 10, the sample used by the single-sample crafting
# cells below.
y_train[10]

1

In [170]:
%%time
xs, ys = x_train[10].copy(), y_train[10].copy()
ys1 = np.array([0,1])
craft_sample(xs, ys1, x_gradients[10])

CPU times: user 52.2 s, sys: 54.5 s, total: 1min 46s
Wall time: 28.3 s


(array([   1,   13,  219,   14,   11, 7190,   11,   51,  215,   28,   77,
          94,  204,  521,   13, 1227, 4985,   83,    4,    2,    7,    4,
         750,   12,   16,   15,  163,   13,  774,  110,   12,  237,   21,
          62,  119,    8,  121,   81,   25,   79,    6, 1039, 1814,  660,
         233,   44,   12,  112, 1967,   42, 3701,   26,   18,   61,  278,
          43,    6,  762,    7,    2,  257, 6056,    9,  345,    2,    2,
          42,    2,  642,  257,   31,   50,    9,   24,    6,  815,   31,
        1926,    4,  636,  720,   18, 1825,   63, 8276, 3045,  944, 7775,
         140,  140, 3976,   19,    2, 3507,    2,   62,   28,  814,   12,
          69,   29,    4, 3799, 2210,   48,   25,  216,    8,   14,   22,
          19,   35,  911,  330,    5,    6,    2,    2,  483,  490,    2,
          12,  208,   83,  129,   55,  118, 1029,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [171]:
%%time
# NOTE(review): craft_sample2 is not defined in any cell shown in this
# notebook; presumably it comes from `from lib.make_batches import *` or from a
# cell that has since been removed -- confirm before relying on
# Restart & Run All.
# Same single-sample setup as the cell above (training sample 10 and its Jacobian).
xs, ys = x_train[10].copy(), y_train[10].copy()
craft_sample2(xs, ys, x_gradients[10])

CPU times: user 46.2 s, sys: 53.9 s, total: 1min 40s
Wall time: 23.9 s


(array([   1,   13,  219,   14,   11, 7190,   11,   51,  215,   28,   77,
          94,  204,  521,   13, 1227, 4985,   83,    4,    2,    7,    4,
         750,   12,   16,   15,  163,   13,  774,  110,   12,  237,   21,
          62,  119,    8,  121,   81,   25,   79,    6, 1039, 1814,  660,
         233,   44,   12,  112, 1967,   42, 3701,   26,   18,   61,  278,
          43,    6,  762,    7,    2,  257, 6056,    9,  345,    2,    2,
          42,    2,  642,  257,   31,   50,    9,   24,    6,  815,   31,
        1926,    4,  636,  720,   18, 1825,   63, 8276, 3045,  944, 7775,
         140,  140, 3976,   19,    2, 3507,    2,   62,   28,  814,   12,
          69,   29,    4, 3799, 2210,   48,   25,  216,    8,   14,   22,
          19,   35,  911,  330,    5,    6,    2,    2,  483,  490,    2,
          12,  208,   83,  129,   55,  118, 1029,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [156]:
# Padded training sample 10, shown for comparison with the crafted arrays above.
x_train[10]

array([   1,   14,    9,   66,    6,   55,   78,   20,  138,   86,    7,
         32,    4,   65,    9,   78,   12,    9,   35, 4502,   65, 8253,
         32, 2580,    7,  183,  295,   15,   97,   57,  281,   12,   43,
        186,    6,  355, 2846, 4129,    4,  156,  566,  297,   11,    6,
        821, 1377,   36,  566,   60,  740,   17,   35,  284,  144,  138,
        122,   13,  818,   14,   20,    5,   51,  215,   13,   81,   19,
         12,  150,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], d