# Crafting Adversarial samples with text for LSTM

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Reshape, Activation, Input
from tensorflow.keras.datasets import imdb, mnist
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K


## Dataset

We will be using IMDB review data set that can be classified as either a positive-negative review.

The data is available through Keras for retrieval. We can limit the total number of words in vocabulary.

In [3]:
# %%time
max_features = 2500
# cut texts after this number of words (among top max_features most common words)

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features, seed=1)
print(len(x_train), 'train sequences, with shape', x_train.shape)
print(len(x_test), 'test sequences with shape', x_test.shape)

Loading data...
25000 train sequences, with shape (25000,)
25000 test sequences with shape (25000,)


## Data pre-processing

In [4]:
print("Train data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_train]))
print(pdlen.describe())
print()
print("Test data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_test]))
print(pdlen.describe())
# print("Average number of words in each review:", lens.max())

Train data review statistics:
count    25000.000000
mean       238.713640
std        176.497204
min         11.000000
25%        130.000000
50%        178.000000
75%        291.000000
max       2494.000000
dtype: float64

Test data review statistics:
count    25000.000000
mean       230.804200
std        169.164471
min          7.000000
25%        128.000000
50%        174.000000
75%        280.000000
max       2315.000000
dtype: float64


We need to one-hot encode the labels, to use probabilities/logits for different classes

In [5]:
print("One-hot encoding of labels")
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
print('train labels shape:',y_train.shape)
print('test labels shape:',y_test.shape)

One-hot encoding of labels
train labels shape: (25000, 2)
test labels shape: (25000, 2)


Keras Embedding layer expects the input to have similar length for each review.
So we either need to pad or truncate the reviews as necessary.

We are padding/truncating at the end of the review.

In [6]:
maxlen = 80

x_train = sequence.pad_sequences(x_train, padding='post', truncating='post', maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('train data shape:', x_train.shape)
print('test data shape:', x_test.shape)

train data shape: (25000, 80)
test data shape: (25000, 80)


## Model Training & Evaluation

In [7]:
print("Setting up model-specific variables...")
K.clear_session()
batch_size = 256
embedding_size = 128
lstm_size = 64
val_split = 0.2
epochs = 3

Setting up model-specific variables...


In [8]:
wordnum = Input(shape=(maxlen,))
embeddings = Embedding(max_features, embedding_size)(wordnum)
lstm_out = LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)(embeddings)
dense_out = Dense(2)(lstm_out)
out = Activation('softmax')(dense_out)

In [9]:
imdb_clf = Model(inputs=wordnum, outputs=out)
imdb_clf.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
imdb_clf.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 80)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 80, 128)           320000    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 2)                 130       
_________________________________________________________________
activation (Activation)      (None, 2)                 0         
Total params: 369,538
Trainable params: 369,538
Non-trainable params: 0
_________________________________________________________________


In [10]:
train_history = imdb_clf.fit(x_train, y_train,
                             validation_split=val_split,
                             batch_size=batch_size,
                             epochs=epochs
                             )

Train on 20000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
print("Evaluate over Test data:")
loss, accuracy = imdb_clf.evaluate(x_test, y_test)
print('Loss over Test data:', loss)
print('Accuracy over Test data:', accuracy)

Evaluate over Test data:
Loss over Test data: 0.4137208525466919
Accuracy over Test data: 0.8106


## Retrieve Embeddings for all the words in the Vocabulary

In [12]:
vocab_embeddings = imdb_clf.layers[1].embeddings.numpy()
print("Shape of the generated embeddings: ",vocab_embeddings.shape)

Shape of the generated embeddings:  (2500, 128)


## Keras function to extract embeddings for samples

In [13]:
get_embeddings = K.function([imdb_clf.layers[0].input],
                                  imdb_clf.layers[1].output)

print("Testing the embedding function with a single sample...")
test_embed = get_embeddings(x_test[0])
print("Shape of generated embeddings:",test_embed.shape)

Testing the embedding function with a single sample...
Shape of generated embeddings: (80, 128)


## Adversarial crafting

### Sub-model - from Embeddings to logits

In [15]:
### Defining necessary layers
embed_input = Input(shape=test_embed.shape)
embed_lstm = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embed_input)
embed_dense = Dense(2)(embed_lstm)

### Define model with Embedding inputs and Logit outputs
embed_model = Model(inputs=embed_input, outputs=embed_dense)

### Transferring the trained weights from our IMDB Classifier model (imdb_clf)
embed_model.layers[1].set_weights(imdb_clf.layers[2].get_weights())
embed_model.layers[2].set_weights(imdb_clf.layers[3].get_weights())
embed_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 80, 128)]         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 49,538
Trainable params: 49,538
Non-trainable params: 0
_________________________________________________________________


### Calculate Jacobian matrix for all the words in the input

In [145]:
idx = 100
x = x_train[idx]

x_embed = get_embeddings(x)
x_tensor = tf.convert_to_tensor(x_embed.reshape(-1,80,128), tf.float32)
x_var = tf.Variable(x_tensor, dtype=tf.float32)

labels = y_train[idx]

with tf.GradientTape(watch_accessed_variables=False) as tape:
#     tape.reset()
    tape.watch(x_var)

    # Get logits
    pred_y = embed_model(x_var)

# print(tape.watched_variables())

# Calculate gradients
x_gradients = tape.jacobian(pred_y, x_var).numpy()
print(x_gradients.reshape(-1,2,80,128).shape)

(1, 2, 80, 128)


In [146]:
# Calculate loss
model_loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=pred_y)
# model_loss = tf.keras.losses.categorical_crossentropy(real_y, pred_y)

print("Calculated loss:", model_loss)

loss, acc = imdb_clf.evaluate(x.reshape(-1,maxlen),labels.reshape(-1,2))

print("Evaluated model loss:", loss)

Calculated loss: tf.Tensor([0.09651143], shape=(1,), dtype=float32)
Evaluated model loss: 0.09651143103837967


In [147]:
print(x[0:30])
imdb_clf.predict(x.reshape(-1,maxlen))

[ 860  860  860  860  860  860   32   58  118 1222  748  356   60  151
   91    7    4    2  121    2   36   28   35 1614  489   15   47    2
    4 2181]


array([[0.09200047, 0.9079995 ]], dtype=float32)

In [172]:
word = 8
word_grad = x_gradients[0,np.argmax(labels), 0, word]
print(word_grad.shape)

jac_sign = np.sign(word_grad).sum()
jac_sign


(128,)


-12.0

In [173]:
vocab_sign = np.add.reduce(np.sign(vocab_embeddings - word_grad),1)
match_word = np.argmin(vocab_sign - jac_sign)
print(match_word)
x[word] = match_word


577


In [174]:
_, acc = imdb_clf.evaluate(x.reshape(-1,maxlen),labels.reshape(-1,2))
imdb_clf.predict(x.reshape(-1,maxlen))



array([[0.5108993 , 0.48910066]], dtype=float32)