# Crafting Adversarial samples with text for LSTM

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

import multiprocessing as mp

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Reshape, Activation, Input
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K


## Dataset

We will be using IMDB review data set that can be classified as either a positive-negative review.

The data is available through Keras for retrieval. We can limit the total number of words in vocabulary.

In [11]:
# %%time
max_features = 20000
# cut texts after this number of words (among top max_features most common words)

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features, seed=1)
print(len(x_train), 'train sequences, with shape', x_train.shape)
print(len(x_test), 'test sequences with shape', x_test.shape)

Loading data...
25000 train sequences, with shape (25000,)
25000 test sequences with shape (25000,)


In [12]:
### Decreasing the size of test set.
np.random.seed(10)
idx = np.random.choice(x_test.shape[0],2000)
x_test = x_test[idx]
y_test = y_test[idx]

## Data pre-processing

In [13]:
print("Train data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_train]))
print(pdlen.describe())
print()
print("Test data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_test]))
print(pdlen.describe())
# print("Average number of words in each review:", lens.max())

Train data review statistics:
count    25000.000000
mean       238.713640
std        176.497204
min         11.000000
25%        130.000000
50%        178.000000
75%        291.000000
max       2494.000000
dtype: float64

Test data review statistics:
count    2000.000000
mean      229.910000
std       167.108499
min        29.000000
25%       127.000000
50%       174.000000
75%       284.000000
max      1158.000000
dtype: float64


We need to one-hot encode the labels, to use probabilities/logits for different classes

In [14]:
print("One-hot encoding of labels")
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
print('train labels shape:',y_train.shape)
print('test labels shape:',y_test.shape)

One-hot encoding of labels
train labels shape: (25000, 2)
test labels shape: (2000, 2)


Keras Embedding layer expects the input to have similar length for each review.
So we either need to pad or truncate the reviews as necessary.

We are padding/truncating at the end of the review.

In [15]:
maxlen = 240

x_train = sequence.pad_sequences(x_train, padding='post', truncating='post', maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('train data shape:', x_train.shape)
print('test data shape:', x_test.shape)

train data shape: (25000, 240)
test data shape: (2000, 240)


## Model Training & Evaluation

In [20]:
print("Setting up model-specific variables...")
K.clear_session()
batch_size = 512
embedding_size = 128
lstm_size = 128
val_split = 0.2
epochs = 8
num_classes = 2

Setting up model-specific variables...


In [21]:
wordnum = Input(shape=(maxlen,))
embeddings = Embedding(max_features, embedding_size)(wordnum)
lstm_out = LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)(embeddings)
dense_out = Dense(num_classes)(lstm_out)
out = Activation('softmax')(dense_out)

In [22]:
imdb_clf = Model(inputs=wordnum, outputs=out)
imdb_clf.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
imdb_clf.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 240)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 240, 128)          2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
_________________________________________________________________
activation (Activation)      (None, 2)                 0         
Total params: 2,691,842
Trainable params: 2,691,842
Non-trainable params: 0
_________________________________________________________________


In [26]:
train_history = imdb_clf.fit(x_train, y_train,
                             validation_data=(x_test, y_test),
#                              validation_split=val_split,
                             batch_size=batch_size,
                             epochs=epochs
                             )

Train on 25000 samples, validate on 2000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [27]:
print("Evaluate over Test data:")
loss, accuracy = imdb_clf.evaluate(x_test, y_test)
print('Loss over Test data:', loss)
print('Accuracy over Test data:', accuracy)

Evaluate over Test data:
Loss over Test data: 0.8298312134742737
Accuracy over Test data: 0.83


## Retrieve Embeddings for all the words in the Vocabulary

In [28]:
vocab_embeddings = imdb_clf.layers[1].embeddings.numpy()
print("Shape of the generated embeddings: ",vocab_embeddings.shape)

Shape of the generated embeddings:  (20000, 128)


## Keras function to extract embeddings for samples

In [29]:
get_embeddings = K.function([imdb_clf.layers[0].input],
                                  imdb_clf.layers[1].output)

print("Testing the embedding function with a single sample...")
test_embed = get_embeddings(x_test[0])
print("Shape of generated embeddings:",test_embed.shape)

Testing the embedding function with a single sample...
Shape of generated embeddings: (240, 128)


### Saving the model

In [30]:
imdb_clf.save("imdb_compiled_clf.h5")
imdb_clf.save_weights("imdb_model_weights.h5")

## Adversarial crafting

### Sub-model - from Embeddings to logits

In [31]:
### Defining necessary layers
embed_input = Input(shape=test_embed.shape)
embed_lstm = LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)(embed_input)
embed_dense = Dense(num_classes)(embed_lstm)

### Define model with Embedding inputs and Logit outputs
embed_model = Model(inputs=embed_input, outputs=embed_dense)

### Transferring the trained weights from our IMDB Classifier model (imdb_clf)
embed_model.layers[1].set_weights(imdb_clf.layers[2].get_weights())
embed_model.layers[2].set_weights(imdb_clf.layers[3].get_weights())
embed_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 240, 128)]        0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 131,842
Trainable params: 131,842
Non-trainable params: 0
_________________________________________________________________


### Calculate Jacobian matrix for all the words in the input

In [32]:
def compute_input_jacobian(x, y, model):
    x_embed = get_embeddings(x)
    x_tensor = tf.convert_to_tensor(x_embed.reshape(-1,maxlen,embedding_size), tf.float32)
    x_var = tf.Variable(x_tensor, dtype=tf.float32)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(x_var)
        # Get logits
        pred_y = model(x_var)

    # Calculate gradients
    x_gradients = tape.batch_jacobian(pred_y, x_var).numpy()
    print("Shape of the Jacobian:", x_gradients.shape)

    # if not compare_losses(x, y, pred_y) : return None
    return x_gradients

def compare_losses(x, labels, preds):
    # Calculate loss
    calc_loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=preds)
    model_loss, _ = imdb_clf.evaluate(x.reshape(-1,maxlen),labels.reshape(-1,num_classes))

    return calc_loss-model_loss<0.001

In [33]:
def craft_sample(x, y, x_gradient):

    x_copy = x.copy()

    _ , acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,num_classes), verbose=0)

    for word in range(maxlen):

        if acc<1.0 : break

        word_grad = x_gradient[np.argmax(y), word]
        # print(word_grad.shape)

#         jac_sign = np.sign(word_grad).sum()
#         vocab_sign = np.add.reduce(np.sign(word_grad - vocab_embeddings),1)
        jac_sign = np.sign(word_grad)
        vocab_sign = np.sign(word_grad - vocab_embeddings)

#         match_word = np.argmin(vocab_sign - jac_sign)
        match_word = np.argmin(np.add.reduce(vocab_sign - jac_sign, axis=1))
        x[word] = match_word

        loss , acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,num_classes), verbose=0)
        
#     print(word,acc)
    if acc<1.0:
        return x, word
    else:
        return  x_copy, 0


In [None]:
%%time

crafted_x = []
num_changes = []
idx = np.random.choice(x_train.shape[0], 10)
xs, ys = x_train[idx], y_train[idx]
print("Calculating gradients...")
x_gradients = compute_input_jacobian(xs,ys,embed_model)

# print(imdb_clf.evaluate(xs, ys))

print("Crafting adversarial samples...")
for x, y, grad in tqdm(zip(xs, ys, x_gradients), total=xs.shape[0]):
    # x = x_train[idx]
    # y = y_train[idx]
    new_x , changes = craft_sample(x, y, grad)
    crafted_x.append(new_x)
    num_changes.append(changes)

crafted_x = np.array(crafted_x)
num_changes = np.array(num_changes)

print("Average number of changes per sample:", num_changes.mean())

imdb_clf.evaluate(crafted_x, ys)


Calculating gradients...


In [24]:
%%time

crafted_x = []
num_changes = []
idx = np.random.choice(x_train.shape[0], 10)
xs, ys = x_train[idx], y_train[idx]
print("Calculating gradients...")
x_gradients = compute_input_jacobian(xs,ys,embed_model)

print("Crafting adversarial samples...")
craft_pool = mp.Pool(mp.cpu_count())
results_object = [craft_pool.apply_async(craft_sample, args=(x, y, grad)) \
                      for x, y, grad in zip(xs, ys, x_gradients)]

results = np.array([r.get() for r in results_object])
results

Calculating gradients...
(10, 2, 80, 128)
Crafting adversarial samples...


KeyboardInterrupt: 

In [None]:
results = np.array([r.get() for r in results_object])
results

In [31]:
idx = np.random.choice(x_train.shape[0], 1)
x, y = x_train[idx], y_train[idx]
print("Calculating gradients...")
x_gradient = compute_input_jacobian(x,y,embed_model)
x_copy = x.copy()

Calculating gradients...
Shape of the Jacobian: (1, 2, 80, 128)


In [35]:
y.shape

(1, 2)

In [42]:
x = x_copy.copy()

_ , acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,num_classes), verbose=0)

word=0
# for word in range(maxlen):

# if acc<1.0 : break

word_grad = x_gradient[0, np.argmax(y), word]
# print(word_grad.shape)

# jac_sign = np.sign(word_grad).sum()
# print(jac_sign)
# vocab_sign = np.add.reduce(np.sign(word_grad - vocab_embeddings),1)
# print(vocab_sign)
jac_sign = np.sign(word_grad)
print(jac_sign, jac_sign.shape)
vocab_sign = np.sign(word_grad - vocab_embeddings)
print(vocab_sign, vocab_sign.shape)

[ 1.  1. -1. -1. -1. -1.  1. -1. -1.  1. -1. -1. -1. -1. -1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1. -1. -1. -1.  1. -1.  1.  1.  1.  1.  1. -1.
  1. -1. -1. -1. -1.  1.  1.  1.  1. -1. -1.  1.  1. -1. -1. -1. -1.  1.
 -1.  1. -1.  1.  1. -1.  1. -1. -1. -1. -1.  1. -1. -1. -1.  1.  1.  1.
  1. -1.  1. -1.  1. -1.  1.  1. -1. -1.  1. -1.  1.  1. -1. -1. -1.  1.
  1.  1.  1.  1. -1. -1.  1. -1. -1. -1. -1.  1.  1.  1. -1. -1.  1.  1.
 -1.  1. -1.  1. -1. -1. -1.  1. -1. -1. -1.  1.  1. -1. -1.  1. -1.  1.
 -1.  1.] (128,)
[[-1. -1. -1. ...  1. -1. -1.]
 [-1. -1.  1. ...  1.  1. -1.]
 [-1.  1. -1. ...  1. -1.  1.]
 ...
 [ 1. -1.  1. ... -1.  1.  1.]
 [ 1. -1. -1. ...  1.  1.  1.]
 [ 1.  1. -1. ...  1. -1.  1.]] (5000, 128)


In [43]:
new = vocab_sign - jac_sign
np.add.reduce(new,axis=1).shape

(5000,)

In [44]:
match_word = np.argmin(np.add.reduce(vocab_sign - jac_sign, axis=1))
#         match_word = np.argmin(np.sum(vocab_sign - jac_sign))
x[word] = match_word

loss , acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,num_classes), verbose=0)

print(word,acc)

0 1.0


In [18]:
# vocab_sign = np.add.reduce(np.sign(vocab_embeddings - word_grad),1)
# match_word = np.argmin(vocab_sign - jac_sign)
# print(match_word)
# x[word] = match_word


In [19]:
# _, acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,2))
# imdb_clf.predict(x.reshape(-1,maxlen))

In [20]:
# %%time
# idx = np.random.choice(x_train.shape[0], 10)
# x, y = x_train[idx], y_train[idx]
#
# x_embed = get_embeddings(x)
# x_tensor = tf.convert_to_tensor(x_embed.reshape(-1,maxlen,embedding_size), tf.float32)
# x_var = tf.Variable(x_tensor, dtype=tf.float32)
#
# with tf.GradientTape(watch_accessed_variables=False) as tape:
#     tape.watch(x_var)
#     # Get logits
#     pred_y = embed_model(x_var)
#
# # Calculate gradients
# y_gradients = tape.batch_jacobian(pred_y, x_var).numpy()
# print(y_gradients.shape)

