In [2]:
#Import Packages

#To build a NN model
from keras.models import Sequential

#To build a densely connected NN layer
from keras.layers import Dense

#To build LSTM
from keras.layers import LSTM

#Word Embeddings - Words are converted to real value vectors in n-dimension space
#The Embedding layer in Keras provides a convenient way to convert positive integer representations of words into word embeddings
from keras.layers.embeddings import Embedding

#For padding
from keras.preprocessing import sequence

#Math ops
import numpy

#IMDB Dataset
from keras.datasets import imdb

Using Theano backend.


In [3]:
# Load the IMDB reviews, keeping only the `top_words` most frequent words.
# Rarer words are not removed: Keras replaces them with its out-of-vocabulary
# index (2 by default), which is why 2 appears repeatedly in the encoded reviews.
top_words = 5000
train_split, test_split = imdb.load_data(num_words=top_words)
X_train, y_train = train_split
X_test, y_test = test_split

In [5]:
# Show the first encoded training review and its target category.
# Each print is called with a single parenthesised argument, which prints the
# same text under Python 2 and is also valid Python 3 syntax (the bare
# `print x` statement form is a SyntaxError on Python 3).
print("First Movie Review (In Integer Representation):")
print(X_train[0])
print("First Movie Review's Target Category:")
print(y_train[0])

First Movie Review (In Integer Representation):
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]
First Movie Review's Target

In [6]:
# Bring every review to exactly max_review_length tokens so that reviews can
# be stacked into one array: longer reviews are truncated and shorter ones are
# padded with 0 (with Keras' default settings both happen at the start of the
# sequence).
max_review_length = 500
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)

In [7]:
# Dimension of the real-valued vector learned for every word index.
embedding_vecor_length = 32

# The network, listed in the order the data flows through it:
#   1. Embedding - maps each word index to a trainable vector.
#        input_dim    = top_words: every index in the input must be
#                       smaller than top_words (i.e. at most 4999 here)
#        output_dim   = embedding_vecor_length (32)
#        input_length = max_review_length (500)
#      A review of 500 word indices therefore becomes a 500 x 32 matrix.
#   2. LSTM - 100 units; consumes the 500 x 32 sequence and outputs a
#      single vector of length 100 per review.
#   3. Dense - the output layer: one sigmoid unit.
model = Sequential([
    Embedding(top_words, embedding_vecor_length, input_length=max_review_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])

# Print each layer's type, output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


**Model Parameters:** In our example, the total number of parameters is **213,301**. Let's see how this is calculated.

Total Params = 213,301 = 160000 + 53200 + 101

**160000** = 5000 * 32 = 160000

where 5000 = top_words and 32 = embedding_vector_length

**53200** = (((100*32)+(100*100))*4) + (100*4)

where 100 = LSTM units, 32 = Input units (the embedding dimension), 4 = No of gates (Candidate State, Input, Output and Forget Gates), and the last term (100*4) counts the biases: one bias per LSTM unit for each of the 4 gates

**101** = (1*100) + 1

where 1 = Output unit, 100 = LSTM units and 1 = Bias unit

In [8]:
# Set up the learning process: the Adam optimizer minimises binary
# cross-entropy, and accuracy is reported as an additional metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Train the LSTM model for 3 epochs; the parameters are updated after each
# batch of 64 training samples. The call is kept as the cell's last
# expression so that the returned History object is displayed.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f1be51e8cd0>

In [10]:
# Evaluate on the test data. With one compiled metric, evaluate() returns
# two values: the loss followed by the accuracy.
score, acc = model.evaluate(X_test, y_test, verbose=0)

# Report the accuracy as a percentage with two decimals.
accuracy_pct = acc * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Accuracy: 86.59%


In [11]:
# Predict a review category for every review in the test set.
predicted_classes = model.predict_classes(X_test, verbose=0)

# Print the target and predicted category for a few sample reviews.
# A single loop replaces three copy-pasted statements, and print(...) with one
# parenthesised argument produces the same output under Python 2 while also
# being valid Python 3 (the bare `print x` statement is a SyntaxError there).
for sample_index in (0, 99, 9999):
    print('Target Review Category:%d, Predicted Review Category:%d'
          % (y_test[sample_index], predicted_classes[sample_index]))

Target Review Category:1, Predicted Review Category:1
Target Review Category:1, Predicted Review Category:1
Target Review Category:0, Predicted Review Category:0


Reference:

    1. https://github.com/fchollet/keras/blob/master/examples/imdb_lstm.py