# Crafting Adversarial samples with text for LSTM

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Reshape, Activation, Input
from tensorflow.keras.datasets import imdb, mnist
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K


## Dataset

We will be using IMDB review data set that can be classified as either a positive-negative review.

The data is available through Keras for retrieval. We can limit the total number of words in vocabulary.

In [2]:
# %%time
max_features = 5000
# cut texts after this number of words (among top max_features most common words)

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features, seed=1)
print(len(x_train), 'train sequences, with shape', x_train.shape)
print(len(x_test), 'test sequences with shape', x_test.shape)

Loading data...
25000 train sequences, with shape (25000,)
25000 test sequences with shape (25000,)


## Data pre-processing

In [3]:
print("Train data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_train]))
print(pdlen.describe())
print()
print("Test data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_test]))
print(pdlen.describe())
# print("Average number of words in each review:", lens.max())

Train data review statistics:
count    25000.000000
mean       238.713640
std        176.497204
min         11.000000
25%        130.000000
50%        178.000000
75%        291.000000
max       2494.000000
dtype: float64

Test data review statistics:
count    25000.000000
mean       230.804200
std        169.164471
min          7.000000
25%        128.000000
50%        174.000000
75%        280.000000
max       2315.000000
dtype: float64


We need to one-hot encode the labels, to use probabilities/logits for different classes

In [4]:
print("One-hot encoding of labels")
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
print('train labels shape:',y_train.shape)
print('test labels shape:',y_test.shape)

One-hot encoding of labels
train labels shape: (25000, 2)
test labels shape: (25000, 2)


Keras Embedding layer expects the input to have similar length for each review.
So we either need to pad or truncate the reviews as necessary.

We are padding/truncating at the end of the review.

In [5]:
maxlen = 80

x_train = sequence.pad_sequences(x_train, padding='post', truncating='post', maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('train data shape:', x_train.shape)
print('test data shape:', x_test.shape)

train data shape: (25000, 80)
test data shape: (25000, 80)


## Model Training & Evaluation

In [6]:
print("Setting up model-specific variables...")
K.clear_session()
batch_size = 32
embedding_size = 128
lstm_size = 128
val_split = 0.2
epochs = 8
num_classes = 2

Setting up model-specific variables...


In [7]:
wordnum = Input(shape=(maxlen,))
embeddings = Embedding(max_features, embedding_size)(wordnum)
lstm_out = LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)(embeddings)
dense_out = Dense(num_classes)(lstm_out)
out = Activation('softmax')(dense_out)

In [8]:
imdb_clf = Model(inputs=wordnum, outputs=out)
imdb_clf.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
imdb_clf.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 80)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 80, 128)           640000    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
_________________________________________________________________
activation (Activation)      (None, 2)                 0         
Total params: 771,842
Trainable params: 771,842
Non-trainable params: 0
_________________________________________________________________


In [9]:
train_history = imdb_clf.fit(x_train, y_train,
                             validation_split=val_split,
                             batch_size=batch_size,
                             epochs=epochs
                             )

Train on 20000 samples, validate on 5000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [10]:
print("Evaluate over Test data:")
loss, accuracy = imdb_clf.evaluate(x_test, y_test)
print('Loss over Test data:', loss)
print('Accuracy over Test data:', accuracy)

Evaluate over Test data:
Loss over Test data: 0.7137441838550568
Accuracy over Test data: 0.78636


## Retrieve Embeddings for all the words in the Vocabulary

In [11]:
vocab_embeddings = imdb_clf.layers[1].embeddings.numpy()
print("Shape of the generated embeddings: ",vocab_embeddings.shape)

Shape of the generated embeddings:  (5000, 128)


## Keras function to extract embeddings for samples

In [12]:
get_embeddings = K.function([imdb_clf.layers[0].input],
                                  imdb_clf.layers[1].output)

print("Testing the embedding function with a single sample...")
test_embed = get_embeddings(x_test[0])
print("Shape of generated embeddings:",test_embed.shape)

Testing the embedding function with a single sample...
Shape of generated embeddings: (80, 128)


## Adversarial crafting

### Sub-model - from Embeddings to logits

In [13]:
### Defining necessary layers
embed_input = Input(shape=test_embed.shape)
embed_lstm = LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)(embed_input)
embed_dense = Dense(num_classes)(embed_lstm)

### Define model with Embedding inputs and Logit outputs
embed_model = Model(inputs=embed_input, outputs=embed_dense)

### Transferring the trained weights from our IMDB Classifier model (imdb_clf)
embed_model.layers[1].set_weights(imdb_clf.layers[2].get_weights())
embed_model.layers[2].set_weights(imdb_clf.layers[3].get_weights())
embed_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 80, 128)]         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 131,842
Trainable params: 131,842
Non-trainable params: 0
_________________________________________________________________


### Calculate Jacobian matrix for all the words in the input

In [14]:
def compute_input_jacobian(x, y, model):
    x_embed = get_embeddings(x)
    x_tensor = tf.convert_to_tensor(x_embed.reshape(-1,maxlen,embedding_size), tf.float32)
    x_var = tf.Variable(x_tensor, dtype=tf.float32)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(x_var)
        # Get logits
        pred_y = model(x_var)

    # Calculate gradients
    x_gradients = tape.batch_jacobian(pred_y, x_var).numpy()
    print(x_gradients.shape)

    # if not compare_losses(x, y, pred_y) : return None
    return x_gradients

def compare_losses(x, labels, preds):
    # Calculate loss
    calc_loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=preds)
    model_loss, _ = imdb_clf.evaluate(x.reshape(-1,maxlen),labels.reshape(-1,num_classes))

    return calc_loss-model_loss<0.001

In [15]:
def craft_sample(x, y, x_gradient):

    x_copy = x.copy()

    _ , acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,num_classes), verbose=0)

    for word in range(maxlen):

        if acc<1.0 : break

        word_grad = x_gradient[np.argmax(y), word]
        # print(word_grad.shape)

        jac_sign = np.sign(word_grad).sum()
        vocab_sign = np.add.reduce(np.sign(vocab_embeddings - word_grad),1)

        match_word = np.argmin(vocab_sign - jac_sign)
        x[word] = match_word

        loss , acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,num_classes), verbose=0)
        
    if not acc<1.0 is True: 
        return  x_copy, 0

    return x, word+1

In [16]:
%%time

crafted_x = []
num_changes = []
idx = np.random.choice(x_train.shape[0], 100)
xs, ys = x_train[idx], y_train[idx]
x_gradients = compute_input_jacobian(xs,ys,embed_model)

print(imdb_clf.evaluate(xs, ys))

for x, y, grad in tqdm(zip(xs, ys, x_gradients)):
    # x = x_train[idx]
    # y = y_train[idx]
    new_x , changes = craft_sample(x, y, grad)
    crafted_x.append(new_x)
    num_changes.append(changes)

crafted_x = np.array(crafted_x)
num_changes = np.array(num_changes)

print("Average number of changes per sample:", num_changes.mean())

imdb_clf.evaluate(crafted_x, ys)


(100, 2, 80, 128)
[0.08925686702132225, 0.96]
Average number of changes per sample: 47.97
CPU times: user 3min 56s, sys: 34.2 s, total: 4min 30s
Wall time: 2min 59s


100it [02:36,  1.56s/it]


[0.5831971788406372, 0.45]

In [17]:
# # word = 0
# for word in range(maxlen):
#     # word += 1
#     word_grad = x_gradients[0, np.argmax(y), 0, word]
#     # print(word_grad.shape)
#
#     jac_sign = np.sign(word_grad).sum()
#     vocab_sign = np.add.reduce(np.sign(vocab_embeddings - word_grad),1)
#
#     match_word = np.argmin(vocab_sign - jac_sign)
#     x[word] = match_word
#
#     loss , acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,num_classes), verbose=0)
#     # print(match_word, loss, acc)
#     if acc<1.0 : break
#
# print(word)

In [18]:
# vocab_sign = np.add.reduce(np.sign(vocab_embeddings - word_grad),1)
# match_word = np.argmin(vocab_sign - jac_sign)
# print(match_word)
# x[word] = match_word


In [19]:
# _, acc = imdb_clf.evaluate(x.reshape(-1,maxlen), y.reshape(-1,2))
# imdb_clf.predict(x.reshape(-1,maxlen))

In [20]:
# %%time
# idx = np.random.choice(x_train.shape[0], 10)
# x, y = x_train[idx], y_train[idx]
#
# x_embed = get_embeddings(x)
# x_tensor = tf.convert_to_tensor(x_embed.reshape(-1,maxlen,embedding_size), tf.float32)
# x_var = tf.Variable(x_tensor, dtype=tf.float32)
#
# with tf.GradientTape(watch_accessed_variables=False) as tape:
#     tape.watch(x_var)
#     # Get logits
#     pred_y = embed_model(x_var)
#
# # Calculate gradients
# y_gradients = tape.batch_jacobian(pred_y, x_var).numpy()
# print(y_gradients.shape)



array([10,  2,  3])