In [20]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense
from keras.layers import LSTM, Dropout, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras_tqdm import TQDMNotebookCallback
import pickle

In [2]:
np.random.seed(7)

In [3]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(path='imdb.npz',num_words=top_words)

The Dataset has been preprocessed, read: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification

In [4]:
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Model 1: Simple LSTM

In [5]:
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
train_model = False
if (train_model):
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64, verbose=0, callbacks=[TQDMNotebookCallback(leave_inner=False, leave_outer=True)])
    model.save('models/model1.h5')
else:
    model = load_model('models/model1.h5')

In [7]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 87.46%


# Custom Input

In [8]:
imdb_word_index = imdb.get_word_index()

In [9]:
imdb_word_index['dinosaur'], imdb_word_index['to']

(4454, 5)

In [10]:
imdb_words = {}
imdb_words[0] = ''
for key, value in imdb_word_index.items():
    value = int(value)
    imdb_words[value] = key

In [11]:
imdb_words[4454], imdb_words[5]

('dinosaur', 'to')

In [12]:
def vector_to_sentence(vec):
    vec = vec.flatten()
    sentence = ''
    for index in vec:
        word = imdb_words[index]+' '
        sentence += word
    return sentence.strip()      

In [13]:
vector_to_sentence(X_train[90])

"the this enough and and better executed ability br and with his her and movie it stick politics i i was one is excellent cut this and only natural with lot br of how truly full this of want f br and pop and off that however of here br and and me will her points violent this and of 1 for from me in and of guy to simple or and this seen money and sees hours is exciting and guy to sets and and ability his is displays really me of truly and br and and with is quite touches in of i've been stick politics i i of she's it for of house and for its great does it of complicated down roth cute dies are and are arrived really disaster and rented movie trite of thought group fun to big was can about previous are and to and historical those are for top and with is word bath this of neil whether and to audience and with table and who private and yes br complicated and was let musicals was two that with and happen can't ok in out imagination i i they by my complicated other moore is am and place br o

# Model 2: LSTM with Dropout

Introduction of Dropout layers to reduce overfitting

In [14]:
model2 = Sequential()
model2.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model2.add(Dropout(0.2))
model2.add(LSTM(100))
model2.add(Dropout(0.2))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model2.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
train_model = False
if (train_model):
    model2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64, verbose=0, callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=True)])
    model2.save('models/model2.h5')
else:
    model2 = load_model('models/model2.h5')




In [16]:
scores = model2.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.35%


Convergence to the minima is slow because of the dropout, Model 2 will outperform Model 1 on increasing epochs

# Model 3: LSTM and CNN

CNN is efficient in detecting patterns

In [21]:
model3 = Sequential()
model3.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(LSTM(100))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model3.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 216,405
Trainable params: 216,405
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
train_model = True
if (train_model):
    model3.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64, verbose=0, callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=True)])
    model3.save('models/model3.h5')
else:
    model3 = load_model('models/model3.h5')




In [23]:
scores = model2.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.35%
