# Crafting Adversarial samples with text for LSTM

In [2]:
import numpy as np
import pandas as pd
from collections import Counter
import tensorflow as tf
from tqdm import tqdm
import re
import string

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Activation, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer

import tensorflow_datasets as tfds
tfds.disable_progress_bar()



[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataset

We will be using IMDB review data set that can be classified as either a positive-negative review.

The data is available through Keras for retrieval. We can limit the total number of words in vocabulary.

In [62]:
tensor_train, tensor_test = tfds.load('imdb_reviews', split=['train','test'])
imdb_train = pd.DataFrame(list(tfds.as_numpy(tensor_train)))
imdb_test = pd.DataFrame(list(tfds.as_numpy(tensor_test)))
print("Shape of Train split: ", imdb_train.shape)
print("Shape of Test split: ", imdb_test.shape)
imdb_train.head()

Shape of Train split:  (25000, 2)
Shape of Test split:  (25000, 2)


Unnamed: 0,label,text
0,1,"b""As a lifelong fan of Dickens, I have invaria..."
1,1,"b""Oh yeah! Jenna Jameson did it again! Yeah Ba..."
2,1,"b""I saw this film on True Movies (which automa..."
3,1,b'This was a wonderfully clever and entertaini...
4,1,b'I have no idea what the other reviewer is ta...


In [63]:
# ### Decreasing the size of test set.
# np.random.seed(10)
# idx = np.random.choice(imdb_test.shape[0],2000)
# imdb_test = imdb_test.iloc[idx]
# imdb_test.shape

## Data pre-processing

In [64]:
eng_stopwords = stopwords.words('english')
stopwords_dict = Counter(eng_stopwords)

def remove_stopwords(text):
    text = ' '.join([word for word in word_tokenize(text.lower()) if word not in stopwords_dict])
    return text

def html_tags(text):
    text = re.sub('\<.*?\>+', ' ', text)
    return text

def remove_punctuation(text):
#     punctuations = string.punctuation.replace('\'','')+'\t\n'
    punctuations = string.punctuation+'\t\n'
    text = re.sub('[%s]' % re.escape(punctuations), ' ', text)
    return text

In [65]:
%%time
imdb_train['ptext'] = imdb_train.text.apply(lambda x : x.decode('utf-8'))
imdb_train['ptext'] = imdb_train.ptext.apply(html_tags)
imdb_train['ptext'] = imdb_train.ptext.apply(remove_punctuation)
imdb_train['ptext'] = imdb_train.ptext.apply(remove_stopwords)
imdb_test['ptext'] = imdb_test.text.apply(lambda x : x.decode('utf-8'))
imdb_test['ptext'] = imdb_test.ptext.apply(html_tags)
imdb_test['ptext'] = imdb_test.ptext.apply(remove_punctuation)
imdb_test['ptext'] = imdb_test.ptext.apply(remove_stopwords)
imdb_train.head()

CPU times: user 1min 16s, sys: 0 ns, total: 1min 16s
Wall time: 1min 16s


Unnamed: 0,label,text,ptext
0,1,"b""As a lifelong fan of Dickens, I have invaria...",lifelong fan dickens invariably disappointed a...
1,1,"b""Oh yeah! Jenna Jameson did it again! Yeah Ba...",oh yeah jenna jameson yeah baby movie rocks on...
2,1,"b""I saw this film on True Movies (which automa...",saw film true movies automatically made scepti...
3,1,b'This was a wonderfully clever and entertaini...,wonderfully clever entertaining movie shall ne...
4,1,b'I have no idea what the other reviewer is ta...,idea reviewer talking wonderful movie created ...


In [66]:
%%time
new_text = imdb_train.ptext.apply(word_tokenize).explode()
new_text.unique().shape[0]

CPU times: user 24.3 s, sys: 0 ns, total: 24.3 s
Wall time: 24.3 s


75088

In [67]:
%%time

# Cut texts after this number of words (among top max_features most common words)
max_features = 80000 

# Define tokenizer 
tokenizer = Tokenizer(num_words=max_features,
                      lower=True)
#                       oov_token="<unk>")

# Fit the 
tokenizer.fit_on_texts(imdb_train.ptext)

# Use the '0' index for the padding character
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

CPU times: user 4.79 s, sys: 0 ns, total: 4.79 s
Wall time: 4.79 s


In [68]:
x_train = tokenizer.texts_to_sequences(imdb_train.ptext)
y_train = imdb_train.label.values

x_test = tokenizer.texts_to_sequences(imdb_test.ptext)
y_test = imdb_test.label.values

In [69]:
print("Train data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_train]))
print(pdlen.describe())
print()
print("Test data review statistics:")
pdlen = pd.Series(np.array([len(x) for x in x_test]))
print(pdlen.describe())
# print("Average number of words in each review:", lens.max())

Train data review statistics:
count    25000.000000
mean       121.044040
std         91.563644
min          4.000000
25%         64.000000
50%         90.000000
75%        148.000000
max       1435.000000
dtype: float64

Test data review statistics:
count    25000.000000
mean       116.113920
std         87.130015
min          3.000000
25%         63.000000
50%         87.000000
75%        141.000000
max       1122.000000
dtype: float64


We need to one-hot encode the labels, to use probabilities/logits for different classes

In [70]:
print("One-hot encoding of labels")
y_train_oe = to_categorical(y_train, 2)
y_test_oe = to_categorical(y_test, 2)
print('train labels shape:',y_train.shape)
print('test labels shape:',y_test.shape)

One-hot encoding of labels
train labels shape: (25000,)
test labels shape: (25000,)


Keras Embedding layer expects the input to have similar length for each review.
So we either need to pad or truncate the reviews as necessary.

We are padding/truncating at the end of the review.

In [71]:
maxlen = 120

x_train = sequence.pad_sequences(x_train, padding='post', truncating='post', maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, padding='post', truncating='post', maxlen=maxlen)

print('train data shape:', x_train.shape)
print('test data shape:', x_test.shape)

train data shape: (25000, 120)
test data shape: (25000, 120)


In [72]:
tokenizer.sequences_to_texts(x_train[0:2])

['lifelong fan dickens invariably disappointed adaptations novels although works presented extremely accurate telling human life every level victorian britain throughout pervasive thread humour could playful sarcastic narrative dictated way literary caricaturist cartoonist could serious hilarious sentence pricked pride lampooned arrogance celebrated modesty empathised loneliness poverty may cliché people writer comedy often missing interpretations time writing oliver twist dramatised serial form bbc television misery cruelty non humour irony savage lampoonery result dark dismal experience story penned journalist rather novelist really dickens oliver hand much closer mark mockery officialdom perfectly interpreted blustering beadle drunken magistrate classic stand beadle mr brownlow law described ass idiot better done harry secombe ideal choice blinding cruelty also callous indifference state cold hunger poverty loneliness presented surely',
 'oh yeah jenna jameson yeah baby movie rocks 

In [74]:
print("Setting up model-specific variables...")
K.clear_session()
batch_size = 64
embedding_size = 256
lstm_size = 128
val_split = 0.2
epochs = 15
num_classes = 2

Setting up model-specific variables...


## Model Training & Evaluation

In [75]:
seq_encode = Input(shape=(maxlen,))
embeddings = Embedding(max_features, embedding_size)(seq_encode)
lstm_out = LSTM(lstm_size)(embeddings)
dense_out = Dense(num_classes)(lstm_out)
out = Activation('softmax')(dense_out)

In [76]:
optimizer = Adam(lr=1e-3, decay=1e-3)
imdb_clf = Model(inputs=seq_encode, outputs=out)
imdb_clf.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
imdb_clf.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 120)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 120, 256)          20480000  
_________________________________________________________________
lstm (LSTM)                  (None, 128)               197120    
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
_________________________________________________________________
activation (Activation)      (None, 2)                 0         
Total params: 20,677,378
Trainable params: 20,677,378
Non-trainable params: 0
_________________________________________________________________


In [77]:
train_history = imdb_clf.fit(x_train, y_train_oe,
                             validation_data=(x_test, y_test_oe),
#                              validation_split=val_split,
                             batch_size=batch_size,
                             epochs=epochs
                             )

Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [78]:
loss, accuracy = imdb_clf.evaluate(x_test, y_test_oe)
print('Accuracy over Test data:', accuracy)

Accuracy over Test data: 0.82692


In [79]:
loss, accuracy = imdb_clf.evaluate(x_train, y_train_oe)
print('Accuracy over Train data:', accuracy)

Accuracy over Train data: 0.99572


In [80]:
_, accuracy = imdb_clf.evaluate(x_test, y_test_oe, verbose=0)
print('Accuracy over Test data:', accuracy)

_, accuracy = imdb_clf.evaluate(x_train, y_train_oe, verbose=0)
print('Accuracy over Train data:', accuracy)

Accuracy over Test data: 0.82692
Accuracy over Train data: 0.99572


## Saving the model

In [81]:
imdb_clf.save("saved_models/imdb_compiled_clf_120dim.h5")

## Load the model

In [24]:
# imdb_clf = load_model("saved_models/imdb_compiled_clf_70dim.h5")

## Retrieve Embeddings for all the words in the Vocabulary

In [82]:
vocab_embeddings = imdb_clf.layers[1].embeddings.numpy()
print("Shape of the generated embeddings: ",vocab_embeddings.shape)

Shape of the generated embeddings:  (80000, 256)


## Keras function to extract embeddings for samples

In [83]:
get_embeddings = K.function([imdb_clf.layers[0].input],
                                  imdb_clf.layers[1].output)

print("Testing the embedding function with a single sample...")
test_embed = get_embeddings(x_test[0])
print("Shape of generated embeddings:",test_embed.shape)

Testing the embedding function with a single sample...
Shape of generated embeddings: (120, 256)


## Defining Submodel - from Embeddings to logits

In [84]:
### Defining necessary layers
embed_input = Input(shape=test_embed.shape)
embed_lstm = LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)(embed_input)
embed_dense = Dense(num_classes)(embed_lstm)

### Define model with Embedding inputs and Logit outputs
embed_model = Model(inputs=embed_input, outputs=embed_dense)

### Transferring the trained weights from our IMDB Classifier model (imdb_clf)
embed_model.layers[1].set_weights(imdb_clf.layers[2].get_weights())
embed_model.layers[2].set_weights(imdb_clf.layers[3].get_weights())
# embed_model.summary()

## Adversarial crafting

### Calculate Jacobian matrix for all the words in the input

In [85]:
def compute_input_jacobian(x, y, model):
    x_embed = get_embeddings(x)
    x_tensor = tf.convert_to_tensor(x_embed.reshape(-1,maxlen,embedding_size), tf.float32)
    x_var = tf.Variable(x_tensor, dtype=tf.float32)

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(x_var)
        # Get logits
        pred_y = model(x_var)

    # Calculate gradients
    x_gradients = tape.batch_jacobian(pred_y, x_var).numpy()
    print("Shape of the Jacobian:", x_gradients.shape)

    return x_gradients

In [86]:
def craft_sample(x, y, x_gradient, max_changes=maxlen):

    x_copy = x.copy()
    pred = np.argmax(imdb_clf.predict_on_batch(x_copy.reshape(-1,maxlen)))
    if pred != y :
        return x, 0

    for word in range(max_changes):

        word_grad = x_gradient[y, word]

        jac_sign = np.sign(word_grad)
        vocab_sign = np.sign(word_grad - vocab_embeddings)
        
        sum_up = np.absolute(np.add.reduce(vocab_sign - jac_sign, axis=1))
        sum_up[0] = 1000
        match_word = np.argmin(sum_up)
        x_copy[word] = match_word

        pred = np.argmax(imdb_clf.predict_on_batch(x_copy.reshape(-1,maxlen)))
        if pred != y : break

    return  x_copy, word

In [131]:
np.random.seed(10)

num_samples_class = 100

crafted_x = []
num_changes = []

idx0 = np.random.choice(np.argwhere(y_train == 0).reshape(-1,), num_samples_class, replace=False)
idx1 = np.random.choice(np.argwhere(y_train == 1).reshape(-1,), num_samples_class, replace=False)
idx = np.concatenate((idx0,idx1))
# np.random.shuffle(idx)

xs, ys, ys_oe = x_train[idx].copy(), y_train[idx].copy(), y_train_oe[idx].copy()

In [134]:
%%time

print("Calculating gradients...")
x_gradients = compute_input_jacobian(xs,ys,embed_model)
loss, acc = imdb_clf.evaluate(xs, ys_oe, verbose=0)
results = [('Original', 0, loss, acc)]
print("Loss and accuracy of selected samples: %.4f and %.4f"%(loss, acc))

Calculating gradients...
Shape of the Jacobian: (200, 2, 120, 256)
Loss and accuracy of selected samples: 0.0068 and 1.0000
CPU times: user 1min 8s, sys: 0 ns, total: 1min 8s
Wall time: 1min 6s


In [135]:
print("Crafting adversarial samples...")

Crafting adversarial samples...


In [136]:
print("Crafting with defined number of changes for each sample:")
max_changes = [10,20,30,40,60,80,100,120]

Crafting with defined number of changes for each sample:


In [137]:
loss_arr = []
acc_arr = []
perturbed_x = []
changes_track = []
for change in max_changes:
    print("Limiting the number of word changes to ",change," in each sample:")
    crafted_x = []
    num_changes = []
    
    for x, y, grad in tqdm(zip(xs, ys, x_gradients), total=xs.shape[0]):
        new_x , changes = craft_sample(x, y, grad, max_changes=change)
        crafted_x.append(new_x)
        num_changes.append(changes)
    
    crafted_x = np.array(crafted_x)
    num_changes = np.array(num_changes)
    
    perturbed_x.append(crafted_x)
    changes_track.append(num_changes)
    
    avg_changes = num_changes.mean()
    
    print("Evaluation for the crafted samples:")
    loss, acc = imdb_clf.evaluate(crafted_x, ys_oe)
    print("Average number of changes per sample:", avg_changes)
    results.append((str(change), avg_changes, loss, acc))
    loss_arr.append(loss)
    acc_arr.append(acc)
    

  0%|          | 0/200 [00:00<?, ?it/s]

Limiting the number of word changes to  10  in each sample:


100%|██████████| 200/200 [07:36<00:00,  2.28s/it]

Evaluation for the crafted samples:






  0%|          | 0/200 [00:00<?, ?it/s]

Average number of changes per sample: 8.465
Limiting the number of word changes to  20  in each sample:


100%|██████████| 200/200 [13:52<00:00,  4.16s/it]

Evaluation for the crafted samples:






  0%|          | 0/200 [00:00<?, ?it/s]

Average number of changes per sample: 16.55
Limiting the number of word changes to  30  in each sample:


100%|██████████| 200/200 [18:04<00:00,  5.42s/it]

Evaluation for the crafted samples:






  0%|          | 0/200 [00:00<?, ?it/s]

Average number of changes per sample: 22.97
Limiting the number of word changes to  40  in each sample:


100%|██████████| 200/200 [18:57<00:00,  5.69s/it]

Evaluation for the crafted samples:






  0%|          | 0/200 [00:00<?, ?it/s]

Average number of changes per sample: 28.045
Limiting the number of word changes to  60  in each sample:


100%|██████████| 200/200 [24:05<00:00,  7.23s/it]

Evaluation for the crafted samples:






  0%|          | 0/200 [00:00<?, ?it/s]

Average number of changes per sample: 35.04
Limiting the number of word changes to  80  in each sample:


100%|██████████| 200/200 [27:31<00:00,  8.26s/it]

Evaluation for the crafted samples:






  0%|          | 0/200 [00:00<?, ?it/s]

Average number of changes per sample: 39.375
Limiting the number of word changes to  100  in each sample:


100%|██████████| 200/200 [28:38<00:00,  8.59s/it]

Evaluation for the crafted samples:






  0%|          | 0/200 [00:00<?, ?it/s]

Average number of changes per sample: 41.445
Limiting the number of word changes to  120  in each sample:


100%|██████████| 200/200 [29:34<00:00,  8.87s/it]

Evaluation for the crafted samples:




Average number of changes per sample: 42.85


In [138]:
# print("Crafting with variable changes for each sample:")

In [None]:
# crafted_x = []
# num_changes = []
# for x, y, grad in tqdm(zip(xs, ys, x_gradients), total=xs.shape[0]):
#     new_x , changes = craft_sample(x, y, grad)
#     crafted_x.append(new_x)
#     num_changes.append(changes)

# crafted_x = np.array(crafted_x)
# num_changes = np.array(num_changes)

# print("Average number of changes per sample:", num_changes.mean())

# imdb_clf.evaluate(crafted_x, ys_oe)

In [None]:
perturbed_x = np.array(perturbed_x)
changes_track = np.array(changes_track)

In [142]:
import pickle
with open('crafted x', 'ab') as fo:     
    pickle.dump(perturbed_x, fo)    

with open('Changes tracker', 'ab') as fo:     
    pickle.dump(changes_track, fo)  

In [151]:
perturbed_x.shape

(8, 200, 120)

In [143]:
results_df = pd.DataFrame(results,columns=['Max changes', 'Average changes', 'Loss', 'Accuracy'])
results_df.head(10)

Unnamed: 0,Max changes,Average changes,Loss,Accuracy
0,Original,0.0,0.00678,1.0
1,10,8.465,0.199264,0.88
2,20,16.55,0.466769,0.705
3,30,22.97,0.695085,0.575
4,40,28.045,0.872956,0.45
5,60,35.04,1.132386,0.29
6,80,39.375,1.27762,0.14
7,100,41.445,1.313585,0.08
8,120,42.85,1.32762,0.065


In [144]:
results_df.to_csv("Text clf attack 120dim results.csv")

In [160]:
def compare(tid, change_index=0):
    max_change = max_changes[change_index]
    print("Number of changes on the sample:",changes_track[change_index,tid])
    print("Original label: ",ys[tid]," | Predicted label: ",np.argmax(imdb_clf.predict(perturbed_x[change_index,tid].reshape(-1,maxlen))))
    print(tokenizer.sequences_to_texts(xs[tid].reshape(-1,maxlen)))
    print()
    print(tokenizer.sequences_to_texts(perturbed_x[change_index,tid].reshape(-1,maxlen)))


In [169]:
compare(2,3)

Number of changes on the sample: 39
Original label:  0  | Predicted label:  0
['disappearance couple take family vacation new mexico find deep trouble taking detour main highway visit town seemingly abandoned 1948 unknown reasons town weaver seems harmless first tourist appeal family stranded overnight begin good reason suspect others experienced predicament fatal outcomes henleys watch blair witch project esquire video diary left town last victim ironically demonstrates best performance anyone movie although hamlin dey performances much better supporting casts emotional affect seems flat throughout movie disappearance appeal movie much suspense good direction however plot takes unexpected implausible turns seemingly make sense worse yet really understanding exactly going movie makes bizarre ending less tolerable appeared movie makers focused making stream suspenseful scenes threw away elements good story making plot development gradual explanation themes symbols']

['like time great c