In [36]:
from keras.preprocessing import sequence
from keras.models import Sequential, Model, load_model, model_from_yaml
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
import tensorflow as tf
from keras import backend as K

# Only surface TensorFlow log messages of ERROR severity so cell outputs stay readable.
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np
import pandas as pd

from skater.core.local_interpretation.dnni.deep_interpreter import DeepInterpreter
from skater.core.visualizer.text_relevance_visualizer import build_explainer, show_in_notebook
from skater.util.dataops import convert_dataframe_to_dict

In [37]:
# Create a TensorFlow session and register it with Keras. It will use this session to initialize all the variables
# The interpretation cells further down fetch this same session again through K.get_session().
sess = tf.Session()
K.set_session(sess)

In [51]:
# set parameters:
# Vocabulary size: passed as num_words to imdb.load_data and as first argument of the Embedding layer
max_features = 20000
# Number of token ids per review after pad_sequences; also the Embedding input_length
maxlen = 80
# Mini-batch size used by model.fit and model.evaluate
batch_size = 32
# Width of each word-embedding vector (second argument of the Embedding layer)
embedding_dims = 128
# Number of Conv1D filters
filters = 250
# Conv1D kernel size
kernel_size = 3
# Number of units in the hidden Dense layer
hidden_dims = 250
# Number of training epochs; also embedded in the file names of the persisted model
epochs = 3

### Load the Dataset
#### IMDB dataset: 
##### 1. http://ai.stanford.edu/~amaas/data/sentiment/
##### 2. http://ai.stanford.edu/~ang/papers/acl11-WordVectorsSentimentAnalysis.pdf ( Section 4.1 )

In [174]:
# The Dataset contains 50,000 reviews(Train:25,000 and Test:25,000)
# More info about the dataset: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification

print('Loading data...')
# Reviews are loaded as sequences of integer word ids; the vocabulary is capped via num_words=max_features
# (words outside of it show up as <UNK> when a review is decoded further down).
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [175]:
# https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
# Reading raw text
# Offset added to every id returned by imdb.get_word_index(); ids 0, 1 and 2 are then
# assigned to <PAD>, <START> and <UNK> by adjust_word_id_offset.
INDEX_FROM = 3
# Get the {Word: Index} mapping
word_to_id = imdb.get_word_index()

def adjust_word_id_offset(word_id_dict, index_from=None):
    """Return a copy of a {word: id} mapping with shifted ids and the reserved tokens added.

    Parameters
    ----------
    word_id_dict : dict
        Mapping of word -> integer id. It is not modified.
    index_from : int, optional
        Offset added to every id. When omitted, the notebook-level
        ``INDEX_FROM`` constant is used, which keeps existing calls unchanged.

    Returns
    -------
    dict
        New mapping with every id increased by the offset, plus the entries
        ``"<PAD>": 0``, ``"<START>": 1`` and ``"<UNK>": 2``.
    """
    # Fall back to the global only when the caller gave no explicit offset.
    offset = INDEX_FROM if index_from is None else index_from
    shifted = {word: word_id + offset for word, word_id in word_id_dict.items()}
    # Reserved ids are written last so they win over any clashing word entry.
    shifted.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})
    return shifted

# Word -> id lookup shifted by INDEX_FROM; used below to decode the id sequences in x_train / x_test
w_to_id = adjust_word_id_offset(word_to_id)

def get_raw_txt(word_id_dict, input_data):
    """Decode a sequence of word ids into a single space-separated string.

    ``word_id_dict`` maps word -> id and is inverted on every call. Ids that
    have no matching word are rendered as the literal text 'None'.
    """
    # Invert the mapping: id -> word.
    lookup = {}
    for word, word_id in word_id_dict.items():
        lookup[word_id] = word

    tokens = []
    for token_id in input_data:
        tokens.append(lookup.get(token_id, 'None'))
    return ' '.join(tokens)

# Decode one training review (index 20) to sanity-check the id -> word mapping
r_t = get_raw_txt(w_to_id, x_train[20])
print(r_t + "\n")
# Number of tokens in the decoded review, before any padding/truncation is applied
print("Length: {}".format(len(r_t.split(' '))))

<START> shown in australia as <UNK> this incredibly bad movie is so bad that you become <UNK> and have to watch it to the end just to see if it could get any worse and it does the storyline is so predictable it seems written by a high school dramatics class the sets are pathetic but marginally better than the <UNK> and the acting is wooden br br the infant <UNK> seems to have been stolen from the props cupboard of <UNK> <UNK> there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money

Length: 129


In [176]:
print('Pad sequences (samples x time)')
# Bring every review to exactly maxlen ids. x_train / x_test are overwritten,
# so the variable-length sequences are no longer available after this cell.
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


In [177]:
# Decode the same review (index 20) again after pad_sequences: only maxlen tokens remain
r_t_r = get_raw_txt(w_to_id, x_train[20])
print(r_t_r + "\n")
print("Length: {}".format(len(r_t_r.split(' '))))

dramatics class the sets are pathetic but marginally better than the <UNK> and the acting is wooden br br the infant <UNK> seems to have been stolen from the props cupboard of <UNK> <UNK> there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money

Length: 80


In [56]:
# Reference: https://github.com/keras-team/keras/blob/master/examples/imdb_cnn.py
# NOTE: the interpretation cells further down index into the layer list of this architecture:
# layers[0] is the Embedding, layers[1] the first Dropout and layers[-2] the final Dense(1)
# that sits in front of the sigmoid activation.
print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Conv1D layer, which will learn `filters`
# word group filters of size kernel_size:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer (dropout is applied before the ReLU activation here):
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

Build model...


In [57]:
# Single sigmoid output -> binary cross-entropy loss; accuracy is tracked as the only metric
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [58]:
# NOTE: the test split doubles as validation data, so the per-epoch validation metrics are
# computed on the same data that is evaluated in the next cell.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe18dfce7b8>

In [59]:
# evaluate() yields the loss followed by the metric requested in compile() (accuracy)
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test accuracy: 0.8376


#### Summarize the Model

In [62]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 80, 128)           2560000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 80, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 78, 250)           96250     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 250)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_6 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_5 (Activation)    (None, 250)               0         
__________

### Persist the model for future use

In [63]:
# Save and persist the trained keras model in YAML format
# (to_yaml output and weights go to separate files; the epoch count is part of both file names)
model_yaml = model.to_yaml()
with open("model_cnn_imdb_{}.yaml".format(epochs), "w") as yaml_file:
    yaml_file.write(model_yaml)
# serialize weights to HDF5
model.save_weights("model_cnn_imdb_{}.h5".format(epochs))
print("Save model to disk")

Save model to disk


### Load the saved model

In [64]:
# load the model
# Learning phase 0 selects inference behaviour in the Keras backend; it is set before the
# reloaded model is constructed.
K.set_learning_phase(0)
# Read the architecture back; the context manager closes the file even if read() fails.
with open('model_cnn_imdb_{}.yaml'.format(epochs), 'r') as yaml_file:
    loaded_model_yaml = yaml_file.read()
loaded_model = model_from_yaml(loaded_model_yaml)
# load weights into new model
loaded_model.load_weights('model_cnn_imdb_{}.h5'.format(epochs))
print("Loaded model from disk")


# Validate model performance with the reload of persisted model
# (compile is needed again because only architecture and weights were persisted)
loaded_model.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])
score, acc = loaded_model.evaluate(x_test, y_test,
                                   batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Loaded model from disk
Test accuracy: 0.8376


### Let's ask Skater to help us interpret the model

In [182]:
# Position of the test review that gets explained
index = 1
K.set_learning_phase(0)
with DeepInterpreter(session=K.get_session()) as di:
    print("learning_phase {}".format(K.learning_phase()))
    # The persisted model is re-created inside the DeepInterpreter context
    # (presumably so Skater can hook into the graph while it is built -- confirm against the Skater docs).
    # The context manager closes the file even if read() fails.
    with open('model_cnn_imdb_{}.yaml'.format(epochs), 'r') as yaml_file:
        loaded_model_yaml = yaml_file.read()

    loaded_model = model_from_yaml(loaded_model_yaml)
    # load weights into new model
    loaded_model.load_weights('model_cnn_imdb_{}.h5'.format(epochs))
    print("Load model from disk")

    # Input data: a batch holding the single selected review and its label
    xs = np.array([x_test[index]])
    ys = np.array([y_test[index]])

    # Predict on the very batch that is explained below
    print('Predicted class : {}'.format(loaded_model.predict_classes(xs)))
    print('Ground Truth: {}'.format(ys))

    embedding_tensor = loaded_model.layers[0].output
    input_tensor = loaded_model.layers[0].input

    # Relevance is computed with respect to the embedding vectors (the input of layers[1])
    # rather than the integer word ids, so the embedding output is evaluated first.
    embedding_out = di.session.run(embedding_tensor, {input_tensor: xs})
    # Using Integrated Gradient for computing feature relevance
    # Target: output of the second-to-last layer multiplied by the label.
    relevance_scores = di.explain('integ_grad', loaded_model.layers[-2].output * ys,
                                  loaded_model.layers[1].input, embedding_out)

learning_phase 0
Load model from disk
Predicted class : [[1]]
Ground Truth: [1]


In [166]:
# Since the relevance score is computed over the embedding vector, we aggregate it by computing the 'mean'
# over the embedding dimensions to get one scalar coefficient per token.
# NOTE: despite the _df suffix, .mean(axis=1) yields a pandas Series (one value per row of the frame).
relevance_scores_df = pd.DataFrame(relevance_scores[0]).mean(axis=1)
relevance_scores_df.describe()

count    80.000000
mean     -0.000077
std       0.006043
min      -0.026439
25%      -0.001454
50%      -0.000017
75%       0.001561
max       0.021326
dtype: float64

In [179]:
# Build a dataframe with the columns 'features' (tokens) and 'relevance_scores'
# Retrieve the text (note: this overwrites the r_t assigned in an earlier cell)
r_t = get_raw_txt(w_to_id, x_test[index])
print(r_t)
words_ = r_t.split(' ')

# merging the dataframe columnwise: join() aligns on the row index, so token i is paired with score i
words_df = pd.DataFrame({'features': words_})
scores_df = pd.DataFrame({'relevance_scores': relevance_scores_df.tolist()})
words_scores_df = words_df.join(scores_df)
words_scores_df.describe()

as he spouts the one liners out i also like the scenes with <UNK> at the beginning find her very sexy when she's wearing all that fetish gear i can't be the only one surely i personally think bride of chucky is a fantastic film total entertainment from start to finish great humour horror in equal measure at only 85 minutes long it never becomes boring or dull a personal favourite of mine watch it as soon as you can


Unnamed: 0,relevance_scores
count,80.0
mean,-7.7e-05
std,0.006043
min,-0.026439
25%,-0.001454
50%,-1.7e-05
75%,0.001561
max,0.021326


#### Visualize the results

In [180]:
# NOTE(review): presumably yields a {feature: relevance_score} dict; if so, a word that occurs
# several times in the review would keep only one score -- confirm against the Skater helper.
words_scores_dict = convert_dataframe_to_dict('features', 'relevance_scores', words_scores_df)
# NOTE(review): presumably writes the highlighted text to ./rendered.html, which the next cell displays -- confirm.
build_explainer(r_t, words_scores_dict, highlight_oov=True)

In [181]:
# Display the rendered explanation (file presumably produced by build_explainer in the previous cell -- confirm)
show_in_notebook('./rendered.html')

In [72]:
# Perturb the decoded review by swapping a single word; the modified text is explained in the cells below
new_txt = r_t.replace("favourite", "preference")

### Convert the dataset to engineered feature format

In [195]:
# Reference: https://stackoverflow.com/questions/42964375/how-to-input-new-text-for-prediction-in-keras-while-using-an-inbuilt-dataset

def input_formatter_imdb(input_txt, word_index_mapping):
    """Turn a space-separated text into a padded batch of word ids for the model.

    Parameters
    ----------
    input_txt : str
        Text whose tokens are separated by single spaces.
    word_index_mapping : dict
        Mapping of word -> integer id. Words missing from it are encoded with
        the id stored under ``"<UNK>"`` (2 if the mapping has no such entry).

    Returns
    -------
    numpy.ndarray
        Array holding one row of ``maxlen`` ids, as produced by ``pad_sequences``.
    """
    # Unknown words must become an integer id: a string placeholder such as '<UNK>'
    # cannot be converted by pad_sequences into an integer array.
    unk_id = word_index_mapping.get("<UNK>", 2)
    token_ids = [word_index_mapping.get(word, unk_id) for word in input_txt.split(' ')]
    # NOTE(review): ids at or above the vocabulary size the model was trained with are passed
    # through unchanged -- confirm the caller only feeds in-vocabulary words.
    x_i_test = sequence.pad_sequences([token_ids], maxlen=maxlen)
    txt_vector = np.array([x_i_test.flatten()])
    return txt_vector

In [196]:
# Encode the modified review with the same word -> id mapping that is used for decoding
input_vector = input_formatter_imdb(new_txt, w_to_id)
input_vector

array([[   17,    29, 14578,     4,    31,  2433,    46,    13,    82,
           40,     4,   139,    19,     2,    33,     4,   454,   169,
           41,    55,  1279,    54,   442,  1658,    32,    15,  7717,
         5745,    13,   191,    30,     4,    64,    31,  1348,    13,
         1276,   104,  3452,     7, 16495,     9,     6,   777,    22,
          964,   722,    39,   380,     8,  1363,    87,  1285,   189,
           11,  3215,  4160,    33,    64,  7304,   234,   196,    12,
          115,   461,   357,    42,   753,     6,   965, 12265,     7,
         1923,   106,    12,    17,   515,    17,    25,    70]],
      dtype=int32)

In [184]:
K.set_learning_phase(0)
with DeepInterpreter(session=K.get_session()) as di:
    print("learning_phase {}".format(K.learning_phase()))
    # The persisted model is re-created inside the DeepInterpreter context
    # (presumably so Skater can hook into the graph while it is built -- confirm against the Skater docs).
    # The context manager closes the file even if read() fails.
    with open('model_cnn_imdb_{}.yaml'.format(epochs), 'r') as yaml_file:
        loaded_model_yaml = yaml_file.read()

    loaded_model = model_from_yaml(loaded_model_yaml)
    # load weights into new model
    loaded_model.load_weights('model_cnn_imdb_{}.h5'.format(epochs))
    print("Load model from disk")

    # Input data: the encoded, modified review; the label is set by hand to the positive class
    xs = input_vector
    ys = np.array([1])

    # Predict on the modified input itself (not on the untouched test sample),
    # so the printed class belongs to the text that is explained below.
    print('Predicted class : {}'.format(loaded_model.predict_classes(xs)))
    print('Ground Truth: {}'.format(ys))

    embedding_tensor = loaded_model.layers[0].output
    input_tensor = loaded_model.layers[0].input

    # Relevance is computed with respect to the embedding vectors (the input of layers[1])
    # rather than the integer word ids, so the embedding output is evaluated first.
    embedding_out = di.session.run(embedding_tensor, {input_tensor: xs})
    # Using Integrated Gradient for computing feature relevance
    # Target: output of the second-to-last layer multiplied by the label.
    relevance_scores = di.explain('integ_grad', loaded_model.layers[-2].output * ys,
                                  loaded_model.layers[1].input, embedding_out)

learning_phase 0
Load model from disk
Predicted class : [[1]]
Ground Truth: [1]


In [194]:
# Collapse the relevance of each token into one score (mean over the embedding dimensions).
# The mid-cell describe() calls were dropped: their results were never displayed or used.
relevance_scores_df = pd.DataFrame(relevance_scores[0]).mean(axis=1)

# Retrieve the text by decoding the encoded input back into words
_in = input_vector.reshape(-1)
r_t = get_raw_txt(w_to_id, _in)
print("Original Document:\n\n {}".format(r_t))

words_ = r_t.split(' ')

# merging the dataframe columnwise: join() aligns on the row index, so token i is paired with score i
words_df = pd.DataFrame({'features': words_})
scores_df = pd.DataFrame({'relevance_scores': relevance_scores_df.tolist()})
words_scores_df = words_df.join(scores_df)

# Render the relevance-highlighted text and show it inline
words_scores_dict = convert_dataframe_to_dict('features', 'relevance_scores', words_scores_df)
build_explainer(r_t, words_scores_dict, highlight_oov=True)
show_in_notebook('./rendered.html')

Original Document:

 as he spouts the one liners out i also like the scenes with <UNK> at the beginning find her very sexy when she's wearing all that fetish gear i can't be the only one surely i personally think bride of chucky is a fantastic film total entertainment from start to finish great humour horror in equal measure at only 85 minutes long it never becomes boring or dull a personal preference of mine watch it as soon as you can
