In [1]:
import numpy as np
import tensorflow as tf

from keras.preprocessing.sequence import pad_sequences
import pickle

Using TensorFlow backend.


In [54]:
import data_utils
import embedding_utils
import display_utils

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
np.random.seed(1001)
tf.set_random_seed(1001)

### Load data

In [72]:
VOCAB_SIZE  = 50000
#load dataset
with open('aux_files/dataset_%d.pkl' %VOCAB_SIZE, 'rb') as f:
    dataset = pickle.load(f)

In [5]:

#load conter-fitted word 
dist_mat = np.load('aux_files/dist_counter_%d.npy' %VOCAB_SIZE)
# Prevent returning 0 as most similar word because it is not part of the dictionary
dist_mat[0,:] = 100000
dist_mat[:,0] = 100000

skip_list = np.load('aux_files/missed_embeddings_counter_%d.npy' %VOCAB_SIZE)


In [6]:
doc_len = [len(dataset.val_X[i]) for i in 
           range(len(dataset.val_X))]

### Demonstrating how we find the most similar words

In [61]:
src_word = dataset.dict['好']
src_word
dataset.inv_full_dict[50000]

343

In [64]:
no_repl_word = ['的','得','地','了','在']
[dataset.full_dict[word] for word in no_repl_word]

[31, 109, 67, 62, 26]

In [71]:
a, b = None, None
print(a)

None


In [8]:
for i in range(6230, 6232):
    src_word = i
    nearest, nearest_dist = embedding_utils.pick_most_similar_words(src_word, dist_mat,20, 0.5)
        
    print('Closest to `%s` are:' %(dataset.inv_dict[src_word]))
    for w_id, w_dist in zip(nearest, nearest_dist):
          print(' -- ', dataset.inv_dict[w_id], ' ', w_dist)

    print('----')

Closest to `国税局` are:
----
Closest to `翼` are:
 --  机翼   0.22788355864099952
 --  辅   0.22916311992500127
 --  翅膀   0.2738878111520009
 --  援   0.3341176668279999
 --  辅助   0.34568997297100035
 --  副   0.3563555582999989
 --  扶持   0.36099929984099965
 --  救援   0.3625574274400005
 --  支架   0.3828246505230002
 --  帮   0.390361236298
 --  援助   0.3941633762969998
 --  养   0.4035929243710008
 --  救   0.4049656322590014
 --  架   0.417898789983
 --  赞助   0.42399925209200195
 --  扶   0.42721877353100135
 --  资助   0.43061200651000053
 --  助益   0.4325585738910007
 --  声援   0.43391956708499957
 --  托   0.43443959343899996
----


### Loading the NLI model

In [9]:
from NLImodel.models import NLImodelClass


In [10]:
model = NLImodelClass(is_train=False)


Learning Rate:  0.0007
Model Loaded.


In [11]:
dataset.val_X[0], dataset.val_Y[0], dataset.val_Z[0], np.argmax(dataset.val_Z[0])

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
          328, 50000,    29,  2923,  1991,   236,   108, 50000,  2631,
           29,   106,  2348, 50000,  1099,    29,  2106,    15,    52,
          620,    62, 50000], dtype=int32),
 array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,    29,   900,   262,    15,    52,
         3915,  1670, 50000], dtype=int32),
 array([0., 0., 1.]),
 2)

In [12]:
np.argmax(model.predict(dataset.val_X[0], dataset.val_X[0])) 

1

## Try Attack

In [76]:
from attacks import GreedyAttack, GeneticAtack

### Main Attack 

In [77]:
pop_size = 32
n1 = 8
goog_lm = None

ga_atttack = GeneticAtack(model, model, model, dataset, dist_mat, 
                                  skip_list,
                                  goog_lm, max_iters=30, 
                                   pop_size=pop_size,
                                  
                                  n1 = n1,
                                  n2 = 4,
                                 use_lm = False, use_suffix=False)

In [78]:
TEST_SIZE = 1000
#use val data for attack
test_idx = np.random.choice(len(dataset.val_Z), TEST_SIZE, replace=False)#随机index,总共1000个
test_len = []
for i in range(TEST_SIZE):
    test_len.append(len(dataset.val_Y[test_idx[i]]))
print('Shortest sentence in our test set is %d words' %np.min(test_len))

test_list = []
orig_list = []
orig_list_no_max = []
orig_label_list = []
adv_list = []
dist_list = []

for i in range(TEST_SIZE):
    x_orig = [dataset.val_X[test_idx[i]], dataset.val_Y[test_idx[i]]]
    x_orig_no_max = [dataset.val_X_nomax[test_idx[i]], dataset.val_Y_nomax[test_idx[i]]]
    orig_label = np.argmax(dataset.val_Z[test_idx[i]])
    orig_preds=  model.predict(x_orig[0], x_orig[1])
    # print(orig_label, orig_preds, np.argmax(orig_preds))
    if np.argmax(orig_preds) != orig_label:
        print('skipping wrong classifed ..')
        print('--------------------------')
        continue
    x_len = np.sum(np.sign(x_orig[1]))
    if x_len >= 30:
        print('skipping too long input..')
        print('--------------------------')
        continue
    if orig_label == 0:
        print('skipping neutral label..')
        print('--------------------------')
        continue
    # if np.max(orig_preds) < 0.90:
    #    print('skipping low confidence .. \n-----\n')
    #    continue
    print('****** ', len(test_list) + 1, ' ********')
    test_list.append(test_idx[i])
    orig_list.append(x_orig)
    orig_list_no_max.append(x_orig_no_max)
    target_label = 1 if orig_label == 2 else 2
    orig_label_list.append(orig_label)
    x_adv = ga_atttack.attack( x_orig, target_label)
    adv_list.append(x_adv)
    if x_adv is None:
        print('%d failed' %(i+1))
        dist_list.append(100000)
    else:
        num_changes = np.sum(x_orig[1] != x_adv)
        print('%d - %d changed.' %(i+1, num_changes))
        dist_list.append(num_changes)
        # display_utils.visualize_attack(sess, model, dataset, x_orig, x_adv)
    print('--------------------------')
    if (len(test_list)>= 50):
        break

Shortest sentence in our test set is 30 words
skipping neutral label..
--------------------------
skipping wrong classifed ..
--------------------------
******  1  ********
		 0  --  0.27355286
		 1  --  0.28572702
		 2  --  0.28572702
		 3  --  0.2857271
		 4  --  0.2857271
		 5  --  0.2857271
		 6  --  0.2857271
		 7  --  0.2857271
		 8  --  0.2857271
		 9  --  0.28572708
		 10  --  0.28572708
		 11  --  0.28572708
		 12  --  0.28572708
		 13  --  0.28572708
		 14  --  0.28572708
		 15  --  0.28572708
		 16  --  0.28572708
		 17  --  0.28572708
		 18  --  0.28572708
		 19  --  0.28572708
		 20  --  0.28572708
		 21  --  0.28572708
		 22  --  0.28572708
		 23  --  0.28572708
		 24  --  0.28572708
		 25  --  0.28572708
		 26  --  0.28572708
		 27  --  0.28572708
		 28  --  0.28572708
		 29  --  0.28572708
3 failed
--------------------------
******  2  ********
		 0  --  0.28171492
		 1  --  0.3245815
		 2  --  0.42097265
4 - 1 changed.
--------------------------
******  3  ********




		 0  --  0.5886891
5 - 1 changed.
--------------------------
skipping neutral label..
--------------------------
******  4  ********
		 0  --  0.415536
7 - 1 changed.
--------------------------
skipping neutral label..
--------------------------
skipping wrong classifed ..
--------------------------
skipping neutral label..
--------------------------
******  5  ********
		 0  --  0.5568071
11 - 1 changed.
--------------------------
******  6  ********
		 0  --  0.17814709
		 1  --  0.27509725
		 2  --  0.32571614
		 3  --  0.38372794
12 - 1 changed.
--------------------------
******  7  ********
		 0  --  0.55844045
13 - 1 changed.
--------------------------
******  8  ********
		 0  --  0.3715922
14 - 1 changed.
--------------------------
skipping wrong classifed ..
--------------------------
skipping wrong classifed ..
--------------------------
******  9  ********
		 0  --  0.6947248
17 - 1 changed.
--------------------------
******  10  ********
		 0  --  0.2916692
		 1  --  0.460

		 0  --  0.7458899
88 - 1 changed.
--------------------------
skipping neutral label..
--------------------------
******  40  ********
		 0  --  0.60944116
90 - 1 changed.
--------------------------
skipping wrong classifed ..
--------------------------
skipping wrong classifed ..
--------------------------
******  41  ********
		 0  --  0.3277387
		 1  --  0.3611592
		 2  --  0.37541673
		 3  --  0.37541673
		 4  --  0.37541673
		 5  --  0.38720146
93 - 1 changed.
--------------------------
skipping wrong classifed ..
--------------------------
skipping wrong classifed ..
--------------------------
skipping wrong classifed ..
--------------------------
******  42  ********
		 0  --  0.5267775
97 - 1 changed.
--------------------------
******  43  ********
		 0  --  0.21421513
		 1  --  0.3915852
98 - 1 changed.
--------------------------
******  44  ********
		 0  --  0.7135845
99 - 1 changed.
--------------------------
******  45  ********
		 0  --  0.5284755
100 - 1 changed.
------

### Visualize some output results

In [79]:
print('Median number of modifications ' , np.median([x for x in dist_list if x != 100000]))

Median number of modifications  1.0


In [80]:
orig_len = [np.sum(np.sign(x)) for x in orig_list[1]]
ratio = [dist_list[i]/orig_len[i] for i in range(len(orig_list[1]))]

In [81]:
ratio_success = [x for x in ratio if x < 1.0]
np.mean(ratio_success)

0.05263157894736842

In [84]:
#visual_idx = 0
for visual_idx in range(len(adv_list)):
    print(visual_idx)
    display_utils.visualize_attack(model, dataset, orig_list_no_max[visual_idx], adv_list[visual_idx])

0
1
Original Prediction = Contradiction. (Confidence = 65.43) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 36.60) 


2
Original Prediction = Contradiction. (Confidence = 95.19) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 50.22) 


3
Original Prediction = Entailment. (Confidence = 57.64) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 43.48) 


4
Original Prediction = Entailment. (Confidence = 50.24) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 52.03) 


5
Original Prediction = Contradiction. (Confidence = 55.17) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 35.12) 


6
Original Prediction = Contradiction. (Confidence = 63.84) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 39.05) 


7
Original Prediction = Contradiction. (Confidence = 91.95) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 34.38) 


8
Original Prediction = Contradiction. (Confidence = 97.35) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 59.06) 


9
Original Prediction = Contradiction. (Confidence = 78.38) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 40.14) 


10
Original Prediction = Contradiction. (Confidence = 50.96) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 42.10) 


11
Original Prediction = Contradiction. (Confidence = 81.74) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 33.68) 


12
Original Prediction = Entailment. (Confidence = 77.27) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 56.99) 


13
Original Prediction = Entailment. (Confidence = 47.36) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 43.12) 


14
Original Prediction = Entailment. (Confidence = 47.56) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 35.59) 


15
Original Prediction = Entailment. (Confidence = 52.97) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 68.80) 


16
Original Prediction = Entailment. (Confidence = 76.12) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 41.21) 


17
Original Prediction = Entailment. (Confidence = 48.45) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 37.08) 


18
Original Prediction = Entailment. (Confidence = 80.85) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 46.76) 


19
Original Prediction = Entailment. (Confidence = 46.23) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 42.76) 


20
Original Prediction = Contradiction. (Confidence = 93.46) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 50.08) 


21
Original Prediction = Entailment. (Confidence = 68.62) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 56.35) 


22
Original Prediction = Entailment. (Confidence = 52.27) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 44.67) 


23
Original Prediction = Entailment. (Confidence = 50.78) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 39.38) 


24
Original Prediction = Contradiction. (Confidence = 72.77) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 60.07) 


25
Original Prediction = Entailment. (Confidence = 43.40) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 41.23) 


26
Original Prediction = Contradiction. (Confidence = 42.54) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 46.73) 


27
Original Prediction = Contradiction. (Confidence = 83.59) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 42.01) 


28
Original Prediction = Entailment. (Confidence = 75.67) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 45.75) 


29
Original Prediction = Contradiction. (Confidence = 84.46) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 44.15) 


30
31
Original Prediction = Entailment. (Confidence = 44.27) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 37.84) 


32
Original Prediction = Contradiction. (Confidence = 83.19) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 45.39) 


33
34
Original Prediction = Entailment. (Confidence = 61.06) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 38.17) 


35
Original Prediction = Contradiction. (Confidence = 85.28) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 36.30) 


36
37
38
Original Prediction = Contradiction. (Confidence = 55.78) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 69.72) 


39
Original Prediction = Contradiction. (Confidence = 89.76) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 56.52) 


40
Original Prediction = Contradiction. (Confidence = 42.03) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 47.24) 


41
Original Prediction = Entailment. (Confidence = 49.69) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 51.57) 


42
Original Prediction = Entailment. (Confidence = 79.74) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 40.08) 


43
Original Prediction = Contradiction. (Confidence = 78.46) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 61.45) 


44
Original Prediction = Contradiction. (Confidence = 94.20) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 38.86) 


45
Original Prediction = Contradiction. (Confidence = 90.88) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Contradiction. (Confidence = 43.71) 


46
Original Prediction = Contradiction. (Confidence = 77.91) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 48.15) 


47
48
Original Prediction = Contradiction. (Confidence = 44.19) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 37.47) 


49
Original Prediction = Contradiction. (Confidence = 98.01) 
Premise


Hypothesis


---------  After attack -------------
New Prediction = Entailment. (Confidence = 44.23) 
