# Crafting Adversarial samples with text for LSTM

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import tensorflow as tf
from tqdm import tqdm
import re
import string

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Activation, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer

import tensorflow_datasets as tfds
tfds.disable_progress_bar()



[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataset

We will be using IMDB review data set that can be classified as either a positive-negative review.

The data is available through Keras for retrieval. We can limit the total number of words in vocabulary.

In [2]:
tensor_train, tensor_test = tfds.load('imdb_reviews', split=['train','test'])
imdb_train = pd.DataFrame(list(tfds.as_numpy(tensor_train)))
imdb_test = pd.DataFrame(list(tfds.as_numpy(tensor_test)))
print("Shape of Train split: ", imdb_train.shape)
print("Shape of Test split: ", imdb_test.shape)
imdb_train.head()

Shape of Train split:  (25000, 2)
Shape of Test split:  (25000, 2)


Unnamed: 0,label,text
0,1,"b""As a lifelong fan of Dickens, I have invaria..."
1,1,"b""Oh yeah! Jenna Jameson did it again! Yeah Ba..."
2,1,"b""I saw this film on True Movies (which automa..."
3,1,b'This was a wonderfully clever and entertaini...
4,1,b'I have no idea what the other reviewer is ta...


In [3]:
### Decreasing the size of test set.
np.random.seed(10)
idx = np.random.choice(imdb_test.shape[0],2000)
imdb_test = imdb_test.iloc[idx]
imdb_test.shape

(2000, 2)

## Data pre-processing

In [4]:
eng_stopwords = stopwords.words('english')
stopwords_dict = Counter(eng_stopwords)

def remove_stopwords(text):
    text = ' '.join([word for word in word_tokenize(text.lower()) if word not in stopwords_dict])
    return text

def html_tags(text):
    text = re.sub('\<.*?\>+', ' ', text)
    return text

def remove_punctuation(text):
#     punctuations = string.punctuation.replace('\'','')+'\t\n'
    punctuations = string.punctuation+'\t\n'
    text = re.sub('[%s]' % re.escape(punctuations), ' ', text)
    return text

In [5]:
%%time
imdb_train['ptext'] = imdb_train.text.apply(lambda x : x.decode('utf-8'))
imdb_train['ptext'] = imdb_train.ptext.apply(html_tags)
imdb_train['ptext'] = imdb_train.ptext.apply(remove_punctuation)
imdb_train['ptext'] = imdb_train.ptext.apply(remove_stopwords)
imdb_test['ptext'] = imdb_test.text.apply(lambda x : x.decode('utf-8'))
imdb_test['ptext'] = imdb_test.ptext.apply(html_tags)
imdb_test['ptext'] = imdb_test.ptext.apply(remove_punctuation)
imdb_test['ptext'] = imdb_test.ptext.apply(remove_stopwords)
imdb_train.head()

CPU times: user 31.8 s, sys: 0 ns, total: 31.8 s
Wall time: 31.8 s


Unnamed: 0,label,text,ptext
0,1,"b""As a lifelong fan of Dickens, I have invaria...",lifelong fan dickens invariably disappointed a...
1,1,"b""Oh yeah! Jenna Jameson did it again! Yeah Ba...",oh yeah jenna jameson yeah baby movie rocks on...
2,1,"b""I saw this film on True Movies (which automa...",saw film true movies automatically made scepti...
3,1,b'This was a wonderfully clever and entertaini...,wonderfully clever entertaining movie shall ne...
4,1,b'I have no idea what the other reviewer is ta...,idea reviewer talking wonderful movie created ...


In [6]:
%%time
new_text = imdb_train.ptext.apply(word_tokenize).explode()
new_text.unique().shape[0]

CPU times: user 18.4 s, sys: 0 ns, total: 18.4 s
Wall time: 18.4 s


75088

In [7]:
%%time

# Cut texts after this number of words (among top max_features most common words)
max_features = 40000 

# Define tokenizer 
tokenizer = Tokenizer(num_words=max_features,
                      lower=True)
#                       oov_token="<unk>")

# Fit the 
tokenizer.fit_on_texts(imdb_train.ptext)

# Use the '0' index for the padding character
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

CPU times: user 3.26 s, sys: 0 ns, total: 3.26 s
Wall time: 3.25 s


In [8]:
x_train = tokenizer.texts_to_sequences(imdb_train.ptext)
y_train = imdb_train.label.values

x_test = tokenizer.texts_to_sequences(imdb_test.ptext)
y_test = imdb_test.label.values

In [9]:
print("Train data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_train]))
print(pdlen.describe())
print()
print("Test data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_test]))
print(pdlen.describe())
# print("Average number of words in each review:", lens.max())

Train data review statistics:
count    25000.000000
mean       119.364520
std         90.134456
min          4.000000
25%         63.000000
50%         88.000000
75%        146.000000
max       1414.000000
dtype: float64

Test data review statistics:
count    2000.000000
mean      112.429000
std        83.861394
min        10.000000
25%        60.000000
50%        83.000000
75%       138.000000
max       516.000000
dtype: float64


We need to one-hot encode the labels, to use probabilities/logits for different classes

In [10]:
print("One-hot encoding of labels")
y_train_oe = to_categorical(y_train, 2)
y_test_oe = to_categorical(y_test, 2)
print('train labels shape:',y_train.shape)
print('test labels shape:',y_test.shape)

One-hot encoding of labels
train labels shape: (25000,)
test labels shape: (2000,)


Keras Embedding layer expects the input to have similar length for each review.
So we either need to pad or truncate the reviews as necessary.

We are padding/truncating at the end of the review.

In [11]:
maxlen = 70

x_train = sequence.pad_sequences(x_train, padding='post', truncating='post', maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, padding='post', truncating='post', maxlen=maxlen)

print('train data shape:', x_train.shape)
print('test data shape:', x_test.shape)

train data shape: (25000, 70)
test data shape: (2000, 70)


In [12]:
tokenizer.sequences_to_texts(x_train[0:2])

['lifelong fan dickens invariably disappointed adaptations novels although works presented extremely accurate telling human life every level victorian britain throughout pervasive thread humour could playful sarcastic narrative dictated way literary cartoonist could serious hilarious sentence pricked pride lampooned arrogance celebrated modesty empathised loneliness poverty may cliché people writer comedy often missing interpretations time writing oliver twist dramatised serial form bbc television misery cruelty non humour irony savage lampoonery result dark',
 'oh yeah jenna jameson yeah baby movie rocks one 1st movies saw say feel love great move performance outstanding liked scenery wardrobe amazing tell put lot movie girls cloth amazing hope comment helps u buy movie storyline awesome unique sure u going like jenna amazed us wonder movie many awards make wardrobe sexy girls girls scene amazing specially one looks like angel must see hope u share interests <pad> <pad>']

## Model Training & Evaluation

In [13]:
print("Setting up model-specific variables...")
K.clear_session()
batch_size = 64
embedding_size = 256
lstm_size = 128
val_split = 0.2
epochs = 10
num_classes = 2

Setting up model-specific variables...


In [14]:
seq_encode = Input(shape=(maxlen,))
embeddings = Embedding(max_features, embedding_size)(seq_encode)
lstm_out = LSTM(lstm_size)(embeddings)
dense_out = Dense(num_classes)(lstm_out)
out = Activation('softmax')(dense_out)

In [15]:
optimizer = Adam(lr=1e-3, decay=1e-4)
imdb_clf = Model(inputs=seq_encode, outputs=out)
imdb_clf.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
imdb_clf.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 70)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 70, 256)           10240000  
_________________________________________________________________
lstm (LSTM)                  (None, 128)               197120    
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
_________________________________________________________________
activation (Activation)      (None, 2)                 0         
Total params: 10,437,378
Trainable params: 10,437,378
Non-trainable params: 0
_________________________________________________________________


In [16]:
train_history = imdb_clf.fit(x_train, y_train_oe,
                             validation_data=(x_test, y_test_oe),
#                              validation_split=val_split,
                             batch_size=batch_size,
                             epochs=epochs
                             )

Train on 25000 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [17]:
print("Evaluate over Test data:")
loss, accuracy = imdb_clf.evaluate(x_test, y_test_oe)
print('Loss over Test data:', loss)
print('Accuracy over Test data:', accuracy)

Evaluate over Test data:
Loss over Test data: 1.4879107685089112
Accuracy over Test data: 0.783


## Retrieve Embeddings for all the words in the Vocabulary

In [18]:
vocab_embeddings = imdb_clf.layers[1].embeddings.numpy()
print("Shape of the generated embeddings: ",vocab_embeddings.shape)

Shape of the generated embeddings:  (40000, 256)


## Keras function to extract embeddings for samples

In [19]:
get_embeddings = K.function([imdb_clf.layers[0].input],
                                  imdb_clf.layers[1].output)

print("Testing the embedding function with a single sample...")
test_embed = get_embeddings(x_test[0])
print("Shape of generated embeddings:",test_embed.shape)

Testing the embedding function with a single sample...
Shape of generated embeddings: (70, 256)


## Defining Submodel - from Embeddings to logits

In [20]:
### Defining necessary layers
embed_input = Input(shape=test_embed.shape)
embed_lstm = LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)(embed_input)
embed_dense = Dense(num_classes)(embed_lstm)

### Define model with Embedding inputs and Logit outputs
embed_model = Model(inputs=embed_input, outputs=embed_dense)

### Transferring the trained weights from our IMDB Classifier model (imdb_clf)
embed_model.layers[1].set_weights(imdb_clf.layers[2].get_weights())
embed_model.layers[2].set_weights(imdb_clf.layers[3].get_weights())
# embed_model.summary()

## Saving the model

In [21]:
imdb_clf.save("saved_models/imdb_compiled_clf_130dim.h5")

## Adversarial crafting

### Calculate Jacobian matrix for all the words in the input

In [22]:
def compute_input_jacobian(x, y, model):
    x_embed = get_embeddings(x)
    x_tensor = tf.convert_to_tensor(x_embed.reshape(-1,maxlen,embedding_size), tf.float32)
    x_var = tf.Variable(x_tensor, dtype=tf.float32)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(x_var)
        # Get logits
        pred_y = model(x_var)

    # Calculate gradients
    x_gradients = tape.batch_jacobian(pred_y, x_var).numpy()
    print("Shape of the Jacobian:", x_gradients.shape)

    return x_gradients

In [39]:
def craft_sample(x, y, x_gradient, max_changes=maxlen):

    x_copy = x.copy()
    pred = np.argmax(imdb_clf.predict_on_batch(x_copy.reshape(-1,maxlen)))
    if pred != y :
        return x, 0

    for word in range(max_changes):

        word_grad = x_gradient[y, word]

        jac_sign = np.sign(word_grad)
        vocab_sign = np.sign(word_grad - vocab_embeddings)

        match_word = np.argmin(np.absolute(np.add.reduce(vocab_sign - jac_sign, axis=1)))
        x_copy[word] = match_word

        pred = np.argmax(imdb_clf.predict_on_batch(x_copy.reshape(-1,maxlen)))
        if pred != y : break

    return  x_copy, word

In [45]:
np.random.seed(10)

num_samples_class = 20

crafted_x = []
num_changes = []

idx0 = np.random.choice(np.argwhere(y_train == 0).reshape(-1,), num_samples_class, replace=False)
idx1 = np.random.choice(np.argwhere(y_train == 1).reshape(-1,), num_samples_class, replace=False)
idx = np.concatenate((idx0,idx1))
# np.random.shuffle(idx)

xs, ys, ys_oe = x_train[idx].copy(), y_train[idx].copy(), y_train_oe[idx].copy()

In [46]:
%%time

print("Calculating gradients...")
x_gradients = compute_input_jacobian(xs,ys,embed_model)

print("Loss and accuracy of selected samples:", imdb_clf.evaluate(xs, ys_oe, verbose=0))

Calculating gradients...
Shape of the Jacobian: (40, 2, 70, 256)
Loss and accuracy of selected samples: [0.0005068563448730856, 1.0]
CPU times: user 27.6 s, sys: 0 ns, total: 27.6 s
Wall time: 27.2 s


In [47]:
print("Crafting adversarial samples...")

Crafting adversarial samples...


In [48]:
print("Crafting with defined number of changes for each sample:")
max_changes = [10,20,30,40,50,60,70]

Crafting with defined number of changes for each sample:


In [49]:
loss_arr = []
acc_arr = []
for change in max_changes:
    print("Limiting the number of word changes to ",change," in each sample:")
    crafted_x = []
    num_changes = []
    
    for x, y, grad in tqdm(zip(xs, ys, x_gradients), total=xs.shape[0]):
        new_x , changes = craft_sample(x, y, grad, max_changes=change)
        crafted_x.append(new_x)
        num_changes.append(changes)

    crafted_x = np.array(crafted_x)
    num_changes = np.array(num_changes)
    
    print("Evaluation for the crafted samples:")
    loss, acc = imdb_clf.evaluate(crafted_x, ys_oe)
    loss_arr.append(loss)
    acc_arr.append(acc)
    



  0%|          | 0/40 [00:00<?, ?it/s][A[A

Limiting the number of word changes to  10  in each sample:




  2%|▎         | 1/40 [00:00<00:37,  1.05it/s][A[A

  5%|▌         | 2/40 [00:01<00:36,  1.05it/s][A[A

  8%|▊         | 3/40 [00:02<00:35,  1.05it/s][A[A

 10%|█         | 4/40 [00:03<00:34,  1.05it/s][A[A

 12%|█▎        | 5/40 [00:04<00:33,  1.05it/s][A[A

 15%|█▌        | 6/40 [00:05<00:32,  1.05it/s][A[A

 18%|█▊        | 7/40 [00:06<00:27,  1.20it/s][A[A

 20%|██        | 8/40 [00:07<00:27,  1.15it/s][A[A

 22%|██▎       | 9/40 [00:08<00:27,  1.12it/s][A[A

 25%|██▌       | 10/40 [00:09<00:27,  1.10it/s][A[A

 28%|██▊       | 11/40 [00:10<00:26,  1.08it/s][A[A

 30%|███       | 12/40 [00:11<00:26,  1.07it/s][A[A

 32%|███▎      | 13/40 [00:11<00:24,  1.12it/s][A[A

 35%|███▌      | 14/40 [00:12<00:23,  1.09it/s][A[A

 38%|███▊      | 15/40 [00:13<00:23,  1.08it/s][A[A

 40%|████      | 16/40 [00:14<00:22,  1.07it/s][A[A

 42%|████▎     | 17/40 [00:15<00:21,  1.07it/s][A[A

 45%|████▌     | 18/40 [00:16<00:20,  1.06it/s][A[A

 48%|████▊     | 

Evaluation for the crafted samples:





  0%|          | 0/40 [00:00<?, ?it/s][A[A

Limiting the number of word changes to  20  in each sample:




  2%|▎         | 1/40 [00:01<01:14,  1.90s/it][A[A

  5%|▌         | 2/40 [00:03<01:12,  1.90s/it][A[A

  8%|▊         | 3/40 [00:05<01:10,  1.90s/it][A[A

 10%|█         | 4/40 [00:07<01:08,  1.89s/it][A[A

 12%|█▎        | 5/40 [00:09<01:06,  1.90s/it][A[A

 15%|█▌        | 6/40 [00:11<01:04,  1.90s/it][A[A

 18%|█▊        | 7/40 [00:11<00:49,  1.49s/it][A[A

 20%|██        | 8/40 [00:13<00:51,  1.62s/it][A[A

 22%|██▎       | 9/40 [00:15<00:52,  1.70s/it][A[A

 25%|██▌       | 10/40 [00:17<00:52,  1.75s/it][A[A

 28%|██▊       | 11/40 [00:19<00:52,  1.79s/it][A[A

 30%|███       | 12/40 [00:20<00:43,  1.57s/it][A[A

 32%|███▎      | 13/40 [00:22<00:42,  1.59s/it][A[A

 35%|███▌      | 14/40 [00:24<00:43,  1.68s/it][A[A

 38%|███▊      | 15/40 [00:25<00:37,  1.52s/it][A[A

 40%|████      | 16/40 [00:27<00:39,  1.63s/it][A[A

 42%|████▎     | 17/40 [00:28<00:39,  1.71s/it][A[A

 45%|████▌     | 18/40 [00:30<00:38,  1.75s/it][A[A

 48%|████▊     | 

KeyboardInterrupt: 

In [31]:
print("Crafting with variable changes for each sample:")

Crafting with variable changes for each sample:


In [32]:
crafted_x = []
num_changes = []
for x, y, grad in tqdm(zip(xs, ys, x_gradients), total=xs.shape[0]):
    new_x , changes = craft_sample(x, y, grad, fixed_changes=False)
    crafted_x.append(new_x)
    num_changes.append(changes)

crafted_x = np.array(crafted_x)
num_changes = np.array(num_changes)

print("Average number of changes per sample:", num_changes.mean())

imdb_clf.evaluate(crafted_x, ys_oe)

100%|██████████| 200/200 [14:33<00:00,  4.37s/it]

Average number of changes per sample: 45.19





[1.7112814259529114, 0.2]

In [35]:
def compare(id):
    print("Number of changes on the sample:",num_changes[id])
    print("Original label: ",ys[id]," | Predicted label: ",np.argmax(imdb_clf.predict(crafted_x[id].reshape(-1,maxlen))))
    print(tokenizer.sequences_to_texts(xs[id].reshape(-1,maxlen)))
    print()
    print(tokenizer.sequences_to_texts(crafted_x[id].reshape(-1,maxlen)))


In [38]:
compare(32)

Number of changes on the sample: 51
Original label:  0  | Predicted label:  1
['rented movie last week saw kevin spacey morgan freeman seemed promising justin timberlake came scene really bad actor allowed make movie ever mean one boring uninspired actors ever seen puts absolutely emotion lines whatsoever hell cast role josh pollack think matt damon would better choice kevin spacey another big disappointment character dull seems like bad mix character american beauty john doe se7en might sound cool believe dylan mcdermott acting good']

['watch movies movies never never something movies one look would people one people make like well people people one one one one one well would could make make hard well make make hard one first make first well use without things see first well role years bad films best watch lot bad seems like bad mix character american beauty john doe se7en might sound cool believe dylan mcdermott acting good']
