In [1]:
from keras.datasets import imdb

NUM_WORDS = 10000

(dev_X, dev_y) , (test_X, test_y) = imdb.load_data(num_words=NUM_WORDS,
                                                      skip_top=0,
                                                      maxlen=None)

Using TensorFlow backend.


The point of all this thing is to try and beat their score
http://nbviewer.jupyter.org/github/fchollet/deep-learning-with-python-notebooks/blob/master/3.5-classifying-movie-reviews.ipynb

In [2]:
dev_X.shape

(25000,)

In [3]:
dev_y.shape

(25000,)

In [4]:
import numpy as np

In [5]:
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.models import Sequential
from keras.callbacks import TensorBoard

First I will try sequental model with few dense layers without autoencoder

In [6]:
def vectorize_y(Y):
    return np.asarray(Y).astype('float32')

In [7]:
y = vectorize_y(dev_y)
y[:2]

array([ 1.,  0.], dtype=float32)

In [8]:
def one_hot_encode(X):
    seq = np.zeros((len(X),NUM_WORDS))
    for (i,val) in enumerate(X):
        seq[i][val] = 1
    return seq

In [9]:
X = one_hot_encode(dev_X)
X[:2]

array([[ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  1., ...,  0.,  0.,  0.]])

In [120]:
model = Sequential([
    Dense(16, input_shape=(NUM_WORDS,), activation="tanh"),
    Dropout(0.5),
    Dense(64, activation="tanh"),
    Dense(1, activation="sigmoid")
])

In [121]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [122]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_70 (Dense)             (None, 16)                160016    
_________________________________________________________________
dropout_22 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_71 (Dense)             (None, 64)                1088      
_________________________________________________________________
dense_72 (Dense)             (None, 1)                 65        
Total params: 161,169
Trainable params: 161,169
Non-trainable params: 0
_________________________________________________________________


In [123]:
model.fit(X, y, epochs=3, validation_split=0.2, batch_size=712, callbacks=[TensorBoard(log_dir="./logs")])

Train on 20000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f0a603cec18>

In [109]:
X_test = one_hot_encode(test_X)
y_test = vectorize_y(test_y)

In [124]:
results = model.evaluate(X_test, y_test)
results



[0.2873726367521286, 0.88404000000000005]

Yup it seems I have won this one [0.2873726367521286, 0.88404000000000005] :)

And now last but not least, lets check how simple long short term memory network handles the classification

In [9]:
def pad_seq(X):
    max_len_seq = max([len(x) for x in X])
    seq = []
    for x in X:
        x_clone = np.append(x, [np.zeros((max_len_seq - len(x)), dtype="int32") ])
        seq.append(x_clone)
    return (max_len_seq, np.array(seq))

In [10]:
max_len_seq, X = pad_seq(dev_X)
X[:2]

array([[   1,   14,   22, ...,    0,    0,    0],
       [   1,  194, 1153, ...,    0,    0,    0]])

In [11]:
model = Sequential([
    Embedding(max_len_seq, 128),
    LSTM(128),
    Dense(1, activation="sigmoid")
])

In [12]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [13]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         319232    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 450,945
Trainable params: 450,945
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(X, y, epochs=12, validation_split=0.2, batch_size=256)

Train on 20000 samples, validate on 5000 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7fe3ea743588>

In [17]:
from keras.preprocessing.sequence import pad_sequences
maxlen = 80
X = pad_sequences(dev_X, maxlen=maxlen)

In [23]:
model = Sequential([
    Embedding(NUM_WORDS, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation="sigmoid")
])

In [24]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [25]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         1280000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X, y, epochs=64, validation_split=0.2, batch_size=32)

Train on 20000 samples, validate on 5000 samples
Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64