In [3]:
# Reference: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py
from keras.preprocessing import sequence
from keras.models import Sequential, Model, load_model, model_from_yaml
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.layers import Dense, Dropout, Flatten, Activation
from keras import backend as K
from keras.datasets import imdb
import tensorflow as tf

# Set TensorFlow's logging threshold to ERROR to keep the notebook output quiet
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np
import pandas as pd

from skater.core.local_interpretation.dnni.deep_interpreter import DeepInterpreter
from skater.core.visualizer.text_relevance_visualizer import build_explainer, show_in_notebook
from skater.util.dataops import convert_dataframe_to_dict

In [4]:
# Create a TensorFlow session and register it with Keras via K.set_session.
# Keras will use this session to initialize all the variables.
sess = tf.Session()
K.set_session(sess)

In [12]:
# Vocabulary size: passed as `num_words` to imdb.load_data and as the Embedding input dimension
max_features = 20000
maxlen = 80  # cut the texts after this number of words (among top max_features most common words)
batch_size = 32  # used for both model.fit and model.evaluate

### Load the Dataset
#### IMDB dataset: 
##### 1. http://ai.stanford.edu/~amaas/data/sentiment/
##### 2. http://ai.stanford.edu/~ang/papers/acl11-WordVectorsSentimentAnalysis.pdf ( Section 4.1 )

In [14]:
# The dataset contains 50,000 reviews (train: 25,000 and test: 25,000)
# More info about the dataset: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
# INDEX_FROM is the offset applied to every word id; get_raw_txt relies on it
# so that ids 0/1/2 stay free for the <PAD>/<START>/<UNK> tokens.
INDEX_FROM = 3
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features, index_from=INDEX_FROM)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [15]:
# https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
# Reading raw text: fetch the word -> id vocabulary used to decode the encoded reviews
word_to_id = imdb.get_word_index()

def get_raw_txt(word_id_dict, input_data, index_from=None):
    """Decode a sequence of IMDB word ids back into a space-separated string.

    Parameters
    ----------
    word_id_dict : dict
        Mapping of word -> id (e.g. the result of ``imdb.get_word_index()``).
        The mapping is not modified.
    input_data : iterable of int
        Encoded review, i.e. word ids that were already shifted by ``index_from``.
    index_from : int, optional
        Offset that was added to every word id when the data was loaded.
        When omitted, the notebook-level ``INDEX_FROM`` is used.

    Returns
    -------
    str
        The decoded words joined by single spaces. Ids 0, 1 and 2 decode to
        ``<PAD>``, ``<START>`` and ``<UNK>``. Ids that are not part of the
        vocabulary also decode to ``<UNK>`` instead of raising ``KeyError``.
    """
    if index_from is None:
        index_from = INDEX_FROM
    # Build the shifted id -> word lookup directly instead of going through
    # an intermediate word -> id copy.
    id_to_word = {(idx + index_from): word for word, idx in word_id_dict.items()}
    # Special tokens are written last so they take precedence on an id clash.
    id_to_word.update({0: "<PAD>", 1: "<START>", 2: "<UNK>"})
    return ' '.join(id_to_word.get(_id, "<UNK>") for _id in input_data)

# Decode one training review (index 20) to sanity-check the id -> word mapping
r_t = get_raw_txt(word_to_id, x_train[20])
print(r_t + "\n")
# Number of space-separated tokens in the decoded review
print("Length: {}".format(len(r_t.split(' '))))

<START> shown in australia as <UNK> this incredibly bad movie is so bad that you become <UNK> and have to watch it to the end just to see if it could get any worse and it does the storyline is so predictable it seems written by a high school dramatics class the sets are pathetic but marginally better than the <UNK> and the acting is wooden br br the infant <UNK> seems to have been stolen from the props cupboard of <UNK> <UNK> there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money

Length: 129


In [16]:
# Bring every review to exactly `maxlen` ids so the data forms a (samples x time) matrix.
# NOTE: x_train / x_test are overwritten here; the variable-length sequences are
# no longer available after this cell has run.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


In [17]:
# Decode the same training review (index 20) again, now from the padded/truncated data
r_t_r = get_raw_txt(word_to_id, x_train[20])
print(r_t_r + "\n")
# Number of space-separated tokens in the decoded review
print("Length: {}".format(len(r_t_r.split(' '))))

dramatics class the sets are pathetic but marginally better than the <UNK> and the acting is wooden br br the infant <UNK> seems to have been stolen from the props cupboard of <UNK> <UNK> there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money

Length: 80


### Build an LSTM model using word-embeddings

In [18]:
print('Build a model...')
# Layer stack: Embedding -> LSTM -> Dense(1) -> sigmoid.
# The sigmoid is kept as its own Activation layer; the interpretation cell
# reads layers[-2].output, i.e. the Dense output before the sigmoid.
model = Sequential([
    Embedding(max_features, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1),
    Activation('sigmoid'),
])

Build a model...


In [19]:
# Binary cross-entropy for the single sigmoid output; accuracy is the reported metric
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

### Train a model

In [20]:
n_epoch = 3  # number of training epochs; also embedded in the saved model file names
print('Train...')
# NOTE(review): the test split is passed as validation data, so the validation
# metrics are not independent of the later evaluation on the same split.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=n_epoch,
          validation_data=(x_test, y_test))

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fc54499aa58>

### Persist the model for future use

In [23]:
# Save and persist the trained keras model: architecture as YAML, weights as HDF5.
# Both file names carry n_epoch so runs with different epoch counts do not clash.
model_yaml = model.to_yaml()
with open("model_lstm_{}.yaml".format(n_epoch), "w") as yaml_file:
    yaml_file.write(model_yaml)
# serialize weights to HDF5
model.save_weights("model_lstm_{}.h5".format(n_epoch))
print("Saved model to disk")

Saved model to disk


### Load the saved model

In [24]:
# load the model
# Switch Keras to learning phase 0 (test/inference mode) before rebuilding the model
K.set_learning_phase(0)
# Read the architecture inside a context manager so the file handle is closed
# even if reading raises.
# NOTE(review): model_from_yaml deserializes YAML into Python objects; only
# load model files from a trusted source.
with open('model_lstm_{}.yaml'.format(n_epoch), 'r') as yaml_file:
    loaded_model_yaml = yaml_file.read()
loaded_model = model_from_yaml(loaded_model_yaml)
# load weights into new model
loaded_model.load_weights("model_lstm_{}.h5".format(n_epoch))
print("Loaded model from disk")

Loaded model from disk


In [25]:
# Inspect the layer stack of the reloaded model
loaded_model.layers

[<keras.layers.embeddings.Embedding at 0x7fc544909550>,
 <keras.layers.recurrent.LSTM at 0x7fc544909cc0>,
 <keras.layers.core.Dense at 0x7fc579acf908>,
 <keras.layers.core.Activation at 0x7fc5451a72b0>]

### Evaluating the model's performance ( e.g. Accuracy )

In [27]:
# The reloaded model was rebuilt from the YAML architecture plus the HDF5 weights,
# so it is compiled here (same loss/optimizer/metrics as training) before evaluation.
loaded_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
score, acc = loaded_model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test accuracy: 0.83384


### Let's ask Skater to help us interpret the model

In [28]:
K.set_learning_phase(0)
with DeepInterpreter(session=K.get_session()) as di:
    print("learning_phase {}".format(K.learning_phase()))
    # The model is rebuilt inside the DeepInterpreter context.
    # NOTE(review): presumably required so the interpreter can hook into the
    # graph while it is constructed - confirm against the Skater docs.
    # File names are derived from n_epoch, matching the save/load cells,
    # instead of hard-coding the epoch count.
    with open('model_lstm_{}.yaml'.format(n_epoch), 'r') as yaml_file:
        loaded_model_yaml = yaml_file.read()

    loaded_model = model_from_yaml(loaded_model_yaml)
    # load weights into new model
    loaded_model.load_weights("model_lstm_{}.h5".format(n_epoch))
    print("Load model from disk")

    # Input data: one test review and its label, each wrapped as a batch of size 1
    xs = np.array([x_test[1]])
    ys = np.array([y_test[1]])

    # Predict on the same sample that is explained (xs), so that the printed
    # prediction and the ground truth refer to the same review.
    print('Predicted class : {}'.format(loaded_model.predict_classes(xs)))
    print('Ground Truth: {}'.format(ys))

    embedding_tensor = loaded_model.layers[0].output
    input_tensor = loaded_model.layers[0].input

    # Relevance is computed with respect to the embedding vectors (the LSTM input),
    # so first evaluate the embedding layer's output for the sample.
    embedding_out = di.session.run(embedding_tensor, {input_tensor: xs})
    # Using Integrated Gradient for computing feature relevance.
    # Target tensor: the Dense output (layers[-2], before the sigmoid) multiplied by the label.
    # NOTE(review): for a sample whose label is 0 this target is all zeros -
    # confirm that this is the intended behaviour for negative reviews.
    relevance_scores = di.explain('integ_grad', loaded_model.layers[-2].output * ys,
                                  loaded_model.layers[1].input, embedding_out)

learning_phase 0
Load model from disk
Predicted class : [[1]]
Ground Truth: [1]


In [29]:
# Retrieve the text of the explained review (x_test[1]) and split it into tokens
r_t = get_raw_txt(word_to_id, x_test[1])
print(r_t)
words_ = r_t.split(' ')

as he spouts the one liners out i also like the scenes with <UNK> at the beginning find her very sexy when she's wearing all that fetish gear i can't be the only one surely i personally think bride of chucky is a fantastic film total entertainment from start to finish great humour horror in equal measure at only 85 minutes long it never becomes boring or dull a personal favourite of mine watch it as soon as you can


In [31]:
# Computing one relevance score per token.
# Since the relevance score is computed over the embedding vector, we aggregate it by taking the 'mean'
# over the embedding dimension to get a scalar coefficient for each feature.
# NOTE: despite the `_df` suffix, `.mean(axis=1)` yields a pandas Series.
relevance_scores_df = pd.DataFrame(relevance_scores[0]).mean(axis=1)
relevance_scores_df.head()

0    0.001132
1    0.000699
2   -0.004048
3    0.000460
4    0.000073
dtype: float64

In [32]:
# merging the dataframe columnwise
words_df = pd.DataFrame({'features': words_})
scores_df = pd.DataFrame({'relevance_scores': relevance_scores_df.tolist()})
words_scores_df = words_df.join(scores_df)
words_scores_df.head()

Unnamed: 0,features,relevance_scores
0,as,0.001132
1,he,0.000699
2,spouts,-0.004048
3,the,0.00046
4,one,7.3e-05


#### Visualize the results

In [33]:
# Convert the 'features' / 'relevance_scores' columns into a dict and build the text explanation.
# NOTE(review): presumably build_explainer writes the HTML file that is displayed
# in the following cell - confirm against the Skater docs.
words_scores_dict = convert_dataframe_to_dict('features', 'relevance_scores', words_scores_df)
build_explainer(r_t, words_scores_dict, highlight_oov=True)

In [34]:
# Display the rendered explanation HTML inline in the notebook
show_in_notebook('./rendered.html')