In [1]:
%matplotlib inline
from theano.sandbox import cuda
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

from keras.utils.data_utils import get_file
import pickle
import numpy as np
from keras import *
from keras.models import Sequential, Model

Using Theano backend.


In [2]:
# Relative directory for IMDB data files (not referenced by the other cells visible here).
path = 'data/imdb'

# Setup Data

The data consists of IMDB reviews, each labeled 1 for a positive review or 0 for a negative review.

In [3]:
from keras.datasets import imdb
# Word -> integer index lookup for the IMDB vocabulary; the cells below sort it
# by index value and invert it into an index -> word mapping.
idx = imdb.get_word_index()

In [4]:
# Vocabulary words ordered by ascending index value; preview the first ten.
idx_arr = sorted(idx.keys(), key=lambda word: idx[word])
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [5]:
# Reverse lookup: integer index -> word. `.items()` is used instead of the
# Python-2-only `.iteritems()` so the cell runs on both Python 2 and Python 3.
idx2word = {v: k for k, v in idx.items()}

This is the download path to the reviews.

In [6]:
# Fetch the pickled dataset (get_file is given the expected MD5 of the archive).
imdb_path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(review): pickle.load can execute arbitrary code, so only load this file
# from the pinned origin above; never point imdb_path at untrusted data.
# The `with` block guarantees the file handle is closed once loading finishes.
with open(imdb_path, 'rb') as f:
    # Two (reviews, labels) pairs: the training split and the test split.
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [7]:
# Number of training reviews loaded from the pickle.
len(x_train)

25000

In [8]:
# Spot-check the sorted word list. idx_arr is 0-based while the index values
# start at 1 (idx_arr[1] and idx2word[2] are both 'and'), so idx_arr[n]
# corresponds to word index n + 1.
idx_arr[23021]

'bromwell'

In [9]:
# Spot-check the reverse lookup for word index 2.
idx2word[2]

'and'

Since most rarely seen words are largely irrelevant, we keep only the most frequent words (those with an index below 4999) and map every rarer word to a single shared index, 4999, giving a vocabulary size of 5000.

In [10]:
# Vocabulary size seen by the models; word indices are capped at vocab_size - 1
# in the next cell and it is the input dimension of every Embedding layer.
vocab_size = 5000

In [11]:
# Cap every word index at vocab_size - 1, so all words at or beyond that index
# share the single last slot. Each review becomes its own NumPy array.
trn = [np.array([min(w, vocab_size - 1) for w in review]) for review in x_train]
test = [np.array([min(w, vocab_size - 1) for w in review]) for review in x_test]

We also want every review to be the same length, so each one is padded or truncated to a fixed number of words.

In [12]:
# Length of each training review before padding, summarized as (max, min, mean).
# A list comprehension is used instead of np.array(map(...)): under Python 3,
# map() returns an iterator and NumPy would wrap it in a 0-d object array.
lens = np.array([len(s) for s in trn])
(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)

In [13]:
# Fixed review length used when padding/truncating below; also passed as
# input_length to every Embedding layer.
seq_len = 500

In [14]:
# Bring both splits to exactly seq_len entries per review, filling with 0.
trn, test = (
    sequence.pad_sequences(reviews, maxlen=seq_len, value=0)
    for reviews in (trn, test)
)

In [15]:
# Confirm the padded training set is one 2-D array of (reviews, seq_len).
trn.shape

(25000, 500)

# Single Hidden Layer Neural Network

In [19]:
# Baseline: 32-d word embeddings, flattened and fed through a single 100-unit
# hidden layer with heavy dropout; one sigmoid unit gives the binary prediction.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=seq_len))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(.7))
model.add(Dense(1, activation='sigmoid'))

In [20]:
# Binary classification: Adam optimizer, cross-entropy loss, accuracy reported.
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [21]:
# Two epochs in batches of 64; the test split doubles as the validation set.
model.fit(
    trn, labels_train,
    batch_size=64,
    nb_epoch=2,
    validation_data=(test, labels_test)
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f96ebe0b450>

# Single Convolutional Layer Neural Network

In [22]:
# One width-5 convolution (64 filters) over the embedded review, then pooling,
# followed by the same dense head as the baseline. Dropout is applied inside
# the embedding, after it, and after the convolution.
conv1 = Sequential()
conv1.add(Embedding(vocab_size, 32, input_length=seq_len, dropout=.2))
conv1.add(Dropout(.2))
conv1.add(Convolution1D(64, 5, border_mode='same', activation='relu'))
conv1.add(Dropout(.2))
conv1.add(MaxPooling1D())
conv1.add(Flatten())
conv1.add(Dense(100, activation='relu'))
conv1.add(Dropout(.7))
conv1.add(Dense(1, activation='sigmoid'))

In [23]:
# Same training setup as the baseline: Adam, binary cross-entropy, accuracy.
conv1.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [24]:
# Four epochs in batches of 64; the test split doubles as the validation set.
conv1.fit(
    trn, labels_train,
    batch_size=64,
    nb_epoch=4,
    validation_data=(test, labels_test)
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f96d6dcccd0>

# Multi-Size Convolutional Neural Network

In [35]:
from keras.layers import Merge

In [36]:
# Sub-model applied to the embedding output: parallel convolutions with filter
# widths 3, 4 and 5 whose pooled, flattened feature maps are concatenated.
# Its input is one embedded review, i.e. seq_len timesteps of 50-d vectors
# (the output of Embedding(vocab_size, 50, input_length = seq_len) in the model
# that wraps this graph). The time axis is therefore seq_len, not vocab_size.
graph_in = Input((seq_len, 50))
convs = []
for fsz in range(3,6):
    # One branch per filter width; every branch reads the same input tensor.
    x = Convolution1D(64, fsz, border_mode = 'same', activation = 'relu')(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)
# Join the three flattened branches into a single feature vector.
out = Merge(mode = 'concat')(convs)
graph = Model(graph_in, out)

In [37]:
# Multi-size CNN: 50-d embeddings feed the multi-width convolution sub-model
# `graph`, whose concatenated features go through the usual dense head.
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_len, dropout=.2))
model.add(Dropout(.2))
model.add(graph)
model.add(Dropout(.5))
model.add(Dense(100, activation='relu'))
model.add(Dropout(.7))
model.add(Dense(1, activation='sigmoid'))

In [38]:
# Same training setup as the earlier models: Adam, binary cross-entropy, accuracy.
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [39]:
# Three epochs in batches of 64; the test split doubles as the validation set.
model.fit(
    trn, labels_train,
    batch_size=64,
    nb_epoch=3,
    validation_data=(test, labels_test)
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f96c63b23d0>

# LSTM

In [16]:
# Recurrent model: embeddings (L2-regularized, with dropout) feed a 100-unit
# LSTM and a single sigmoid output. mask_zero = True is set on the embedding;
# 0 is also the fill value used when the reviews were padded.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=seq_len, mask_zero=True,
                    W_regularizer=l2(1e-6), dropout=0.2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 500, 32)       160000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           53200       embedding_1[0][0]                
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1)             101         lstm_1[0][0]                     
Total params: 213301
____________________________________________________________________________________________________


In [None]:
# Five epochs in batches of 64; the test split doubles as the validation set.
model.fit(
    trn, labels_train,
    batch_size=64,
    nb_epoch=5,
    validation_data=(test, labels_test)
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
 1728/25000 [=>............................] - ETA: 340s - loss: 0.6908 - acc: 0.5394