In [59]:
import os
import numpy as np
import keras
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.regularizers import l2, l1
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.utils import get_file


## Setup data

We're going to look at the IMDB dataset, which contains movie reviews from IMDB, along with their sentiment. Keras comes with some helpers for this dataset.

In [2]:
from keras.datasets import imdb
# Word -> integer id dictionary provided by the Keras IMDB helper.
idx = imdb.get_word_index()

In [51]:
# Path prefix prepended to weight file names when models are saved/loaded below.
model_path = '../../models/'

In [3]:
# Vocabulary size, the id assigned to 'the', and a short preview of the mapping.
print(len(idx))
print(idx['the'])
idx_repr = str(idx)
print(idx_repr[:100])

88584
1
{'fawn': 34701, 'tsukino': 52006, 'nunnery': 52007, 'sonja': 16816, 'vani': 63951, 'woods': 1408, 's


This is the word list:

In [4]:
#idx_arr = sorted(idx, key=idx.get)
#idx_arr[:10]

...and this is the mapping from id to word

In [5]:
# Invert the word -> id dictionary so that ids can be turned back into words.
idx2word = dict((word_id, word) for word, word_id in idx.items())
# Spot-check a few ids, then confirm the number of entries in the inverted mapping.
for word_id in (1463, 1, 2, 3):
    print(idx2word[word_id])
print(len(idx2word))

spot
the
and
a
88584


In [6]:
# Load the reviews with every option spelled out: no vocabulary cap, no length
# limit, ids 1 and 2 passed as start/OOV markers, and word ids shifted by
# index_from=3 (which is why lookups below use idx2word[id - 3]).
train_split, test_split = imdb.load_data(
    path="imdb.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)
x_train, y_train = train_split
x_test, y_test = test_split


The reviews were loaded in the cell above with `imdb.load_data` from keras.datasets. Let's check how many there are and look at one of them:

In [7]:
# Number of training and test reviews, then the raw id sequence of the review at index 10.
print(len(x_train),len(x_test))
print(x_train[10])

25000 25000
[1, 785, 189, 438, 47, 110, 142, 7, 6, 7475, 120, 4, 236, 378, 7, 153, 19, 87, 108, 141, 17, 1004, 5, 30432, 883, 10789, 23, 8, 4, 136, 13772, 11631, 4, 7475, 43, 1076, 21, 1407, 419, 5, 5202, 120, 91, 682, 189, 2818, 5, 9, 1348, 31, 7, 4, 118, 785, 189, 108, 126, 93, 13772, 16, 540, 324, 23, 6, 364, 352, 21, 14, 9, 93, 56, 18, 11, 230, 53, 771, 74, 31, 34, 4, 2834, 7, 4, 22, 5, 14, 11, 471, 9, 17547, 34, 4, 321, 487, 5, 116, 15, 6584, 4, 22, 9, 6, 2286, 4, 114, 2679, 23, 107, 293, 1008, 1172, 5, 328, 1236, 4, 1375, 109, 9, 6, 132, 773, 14799, 1412, 8, 1172, 18, 7865, 29, 9, 276, 11, 6, 2768, 19, 289, 409, 4, 5341, 2140, 20250, 648, 1430, 10136, 8914, 5, 27, 3000, 1432, 7130, 103, 6, 346, 137, 11, 4, 2768, 295, 36, 7740, 725, 6, 3208, 273, 11, 4, 1513, 15, 1367, 35, 154, 14040, 103, 19100, 173, 7, 12, 36, 515, 3547, 94, 2547, 1722, 5, 3547, 36, 203, 30, 502, 8, 361, 12, 8, 989, 143, 4, 1172, 3404, 10, 10, 328, 1236, 9, 6, 55, 221, 2989, 5, 146, 165, 179, 770, 15, 50, 713, 5

Here's the review at index 10 (the 11th training review). As you see, the words have been replaced by ids. The ids can be looked up in idx2word.

Here's the whole review, mapped from ids to words.

## Careful: converting ids to words requires an offset of -3. For example, 'french' has id 785 in the reviews, so it is looked up as idx2word[785 - 3].

In [8]:
# Drop the first id of the sequence and undo the +3 id offset before each lookup.
review_words = [idx2word[word_id - 3] for word_id in x_train[10][1:]]
' '.join(review_words)

"french horror cinema has seen something of a revival over the last couple of years with great films such as inside and switchblade romance bursting on to the scene maléfique preceded the revival just slightly but stands head and shoulders over most modern horror titles and is surely one of the best french horror films ever made maléfique was obviously shot on a low budget but this is made up for in far more ways than one by the originality of the film and this in turn is complimented by the excellent writing and acting that ensure the film is a winner the plot focuses on two main ideas prison and black magic the central character is a man named carrère sent to prison for fraud he is put in a cell with three others the quietly insane lassalle body building transvestite marcus and his retarded boyfriend daisy after a short while in the cell together they stumble upon a hiding place in the wall that contains an old journal after translating part of it they soon realise its magical powers

The labels are 1 for positive, 0 for negative.

In [9]:
# Labels of the first ten training reviews.
y_train[:10]

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0])

Reduce vocab size by setting rare words to max index.

In [10]:
VOCAB_SIZE = 5000

# Every id at or above VOCAB_SIZE-1 is collapsed onto the single id VOCAB_SIZE-1,
# so that id acts as the shared bucket for rare words.
RARE_ID = VOCAB_SIZE - 1
trn  = [np.minimum(np.array(s), RARE_ID) for s in x_train]
test = [np.minimum(np.array(s), RARE_ID) for s in x_test]

Look at distribution of lengths of sentences.

In [11]:
# Review lengths (in tokens): longest, shortest and mean.
lens = np.array([len(review) for review in trn])
(lens.max(), lens.min(), lens.mean())

(2494, 11, 238.71364)

Pad (with zero) or truncate each sentence to make consistent length.

In [14]:
SEQ_LEN = 500

# Both splits share the same settings: zeros are appended after short reviews
# ('post' padding) and long reviews lose their tail ('post' truncation).
pad_kwargs = dict(maxlen=SEQ_LEN, value=0, padding='post', truncating='post')
trn_pad  = sequence.pad_sequences(trn,  **pad_kwargs)
test_pad = sequence.pad_sequences(test, **pad_kwargs)

In [15]:
# First padded training review (ids, clipped to the reduced vocabulary).
print(trn_pad[0])

[   1   14   22   16   43  530  973 1622 1385   65  458 4468   66 3941    4
  173   36  256    5   25  100   43  838  112   50  670 4999    9   35  480
  284    5  150    4  172  112  167 4999  336  385   39    4  172 4536 1111
   17  546   38   13  447    4  192   50   16    6  147 2025   19   14   22
    4 1920 4613  469    4   22   71   87   12   16   43  530   38   76   15
   13 1247    4   22   17  515   17   12   16  626   18 4999    5   62  386
   12    8  316    8  106    5    4 2223 4999   16  480   66 3785   33    4
  130   12   16   38  619    5   25  124   51   36  135   48   25 1415   33
    6   22   12  215   28   77   52    5   14  407   16   82 4999    8    4
  107  117 4999   15  256    4 4999    7 3766    5  723   36   71   43  530
  476   26  400  317   46    7    4 4999 1029   13  104   88    4  381   15
  297   98   32 2071   56   26  141    6  194 4999   18    4  226   22   21
  134  476   26  480    5  144   30 4999   18   51   36   28  224   92   25
  104    4  

This results in nice rectangular matrices that can be passed to ML algorithms. Reviews shorter than 500 words are post-padded with zeros (zeros are appended at the end), and longer ones are truncated at the end.

In [16]:
# (number of reviews, SEQ_LEN) after padding/truncation.
trn_pad.shape

(25000, 500)

## Create simple models

### Single hidden layer NN

The simplest model that tends to give reasonable results is a single hidden layer net. So let's try that. Note that we can't expect to get any useful results by feeding word ids directly into a neural net - so instead we use an embedding to replace them with a vector of 32 (initially random) floats for each word in the vocab.

In [21]:
# Single-hidden-layer baseline: learned 32-d embeddings, flattened and fed to
# one ReLU layer with heavy dropout, then a sigmoid output for the binary label.
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 32, input_length=SEQ_LEN))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [24]:
# Binary cross-entropy with the Adam optimizer; accuracy is reported as a metric.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# Print the layer-by-layer shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Train for one epoch, using the test split as validation data.
model.fit(trn_pad, y_train, validation_data=(test_pad, y_test), epochs=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x1169119e8>

The [stanford paper](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf) that this dataset is from cites a state of the art accuracy (without unlabelled data) of 0.883. So we're short of that, but on the right track.

### Single conv layer with max pooling

A CNN is likely to work better, since it's designed to take advantage of ordered data. We'll need to use a 1D CNN, since a sequence of words is 1D.

In [28]:
# 1D CNN: embeddings -> spatial dropout -> one width-5 convolution -> max pooling,
# followed by the same dense head as the baseline model.
conv1 = Sequential()
conv1.add(Embedding(VOCAB_SIZE, 32, input_length=SEQ_LEN))
conv1.add(keras.layers.SpatialDropout1D(0.2))
conv1.add(Conv1D(64, 5, padding='same', activation='relu'))
conv1.add(Dropout(0.2))
conv1.add(MaxPooling1D())
conv1.add(Flatten())
conv1.add(Dense(100, activation='relu'))
conv1.add(Dropout(0.7))
conv1.add(Dense(1, activation='sigmoid'))

In [29]:
# Same loss, optimizer and metric as the baseline model.
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [30]:
# Train the CNN for one epoch, using the test split as validation data.
conv1.fit(trn_pad, y_train, validation_data=(test_pad, y_test), epochs=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x11af179b0>

That's well past the Stanford paper's accuracy - another win for CNNs!

In [None]:
# Persist the CNN weights under the models directory.
conv1_weights_path = os.path.join(model_path, 'conv1.h5')
conv1.save_weights(conv1_weights_path)

In [None]:
# Restore the CNN weights from the models directory.
conv1_weights_path = os.path.join(model_path, 'conv1.h5')
conv1.load_weights(conv1_weights_path)

## Pre-trained vectors

You may want to look at wordvectors.ipynb before moving on.

In this section, we replicate the previous CNN, but using pre-trained embeddings.

In [33]:
# Fetch the GloVe 6B archive through Keras' get_file helper; extract=True asks
# the helper to unpack the archive as well. fpath receives the helper's return value.
filename = 'glove.6B.zip'
origin = "http://nlp.stanford.edu/data/glove.6B.zip"
fpath = get_file(filename,origin=origin,extract=True)

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip


In [37]:
# first, build index mapping words in the embeddings set
# to their embedding vector
print('Indexing word vectors.')
GLOVE_DIR = '../../../.keras/datasets/'

embeddings_index = {}
# Each line is split on whitespace: the first token is the word, the remaining
# tokens are parsed as its float32 coefficients.
# The context manager guarantees the file is closed even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word  = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors.
Found 400000 word vectors.


In [38]:
# Reminder of what the model input looks like: ids from the IMDB vocabulary.
print(trn_pad[0])

[   1   14   22   16   43  530  973 1622 1385   65  458 4468   66 3941    4
  173   36  256    5   25  100   43  838  112   50  670 4999    9   35  480
  284    5  150    4  172  112  167 4999  336  385   39    4  172 4536 1111
   17  546   38   13  447    4  192   50   16    6  147 2025   19   14   22
    4 1920 4613  469    4   22   71   87   12   16   43  530   38   76   15
   13 1247    4   22   17  515   17   12   16  626   18 4999    5   62  386
   12    8  316    8  106    5    4 2223 4999   16  480   66 3785   33    4
  130   12   16   38  619    5   25  124   51   36  135   48   25 1415   33
    6   22   12  215   28   77   52    5   14  407   16   82 4999    8    4
  107  117 4999   15  256    4 4999    7 3766    5  723   36   71   43  530
  476   26  400  317   46    7    4 4999 1029   13  104   88    4  381   15
  297   98   32 2071   56   26  141    6  194 4999   18    4  226   22   21
  134  476   26  480    5  144   30 4999   18   51   36   28  224   92   25
  104    4  

The glove word ids and imdb word ids use different indexes. So we create a simple function that creates an embedding matrix using the indexes from imdb, and the embeddings from glove (where they exist).

In [39]:
print('Preparing embedding matrix: ')

# prepare embedding matrix
# The embedding width is read off the length of the vector stored for 'the'.
EMBEDDING_DIM = len(embeddings_index['the'])

Preparing embedding matrix: 


In [40]:
# Start from random rows so that every id still has an embedding when no GloVe
# vector is copied in (ids below 4, which the loop skips, and words GloVe lacks).
embedding_matrix = np.random.rand(VOCAB_SIZE, EMBEDDING_DIM)

# Dataset ids are shifted by 3 relative to idx2word, so id i maps to idx2word[i-3].
# The last row (VOCAB_SIZE-1) is the shared bucket that all rare words were mapped
# to, so it must not be tied to the GloVe vector of one particular word.
for i in range(1+3, VOCAB_SIZE-1):
    word = idx2word[i-3]
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index keep their random row.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)
#embeddings_index = None

(5000, 50)


## Checking that the embedding is correct: "french" is id 785

In [41]:
# Sanity check: dataset id 785 is looked up as idx2word[785 - 3]; then show the
# GloVe vector stored for 'french'.
print(idx2word[785 - 3])
print(embeddings_index['french'])

french
[ 0.45339     0.26154    -1.30939996 -0.90161997 -0.41122001 -0.069015
 -0.76038998  0.40610999 -0.87055999 -0.42965001  0.91373998 -0.028381
 -0.31878999  0.17625     0.22379    -0.98918003 -0.020874   -0.52390999
 -0.86188    -0.73082     0.1003      0.64379001 -0.51221001  0.47626999
 -0.53171998 -1.64590001 -0.93770999 -0.28375    -0.53267002  1.2579
  2.87129998 -0.13863    -0.79716003  0.67035002 -0.11338    -0.26626
 -0.38982999  0.49285001 -0.28016999 -0.25558001  1.58039999  0.29426
  1.8829     -1.30649996  0.42648     0.13531999 -0.72596002  0.19035999
  0.50769001 -0.61831999]


In [42]:
# Row 785 of the embedding matrix, for comparison with the GloVe vector of 'french'.
print(embedding_matrix[785])

[ 0.45339     0.26154    -1.30939996 -0.90161997 -0.41122001 -0.069015
 -0.76038998  0.40610999 -0.87055999 -0.42965001  0.91373998 -0.028381
 -0.31878999  0.17625     0.22379    -0.98918003 -0.020874   -0.52390999
 -0.86188    -0.73082     0.1003      0.64379001 -0.51221001  0.47626999
 -0.53171998 -1.64590001 -0.93770999 -0.28375    -0.53267002  1.2579
  2.87129998 -0.13863    -0.79716003  0.67035002 -0.11338    -0.26626
 -0.38982999  0.49285001 -0.28016999 -0.25558001  1.58039999  0.29426
  1.8829     -1.30649996  0.42648     0.13531999 -0.72596002  0.19035999
  0.50769001 -0.61831999]


In [43]:
# List the in-vocabulary words that have no GloVe vector. The embedding matrix
# is initialised with random values, so missing words cannot be detected by
# looking for all-zero rows; check the GloVe index directly instead.
for i in range(1+3, VOCAB_SIZE):
    word = idx2word[i-3]
    if word not in embeddings_index:
        print(word)

We pass our embedding matrix to the Embedding constructor, and set it to non-trainable.

In [44]:
# Same CNN shape as before, but the embedding layer starts from the prepared
# embedding matrix and is frozen (trainable=False).
glove_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM,
                            input_length=SEQ_LEN,
                            weights=[embedding_matrix], trainable=False)

model = Sequential()
model.add(glove_embedding)
model.add(keras.layers.SpatialDropout1D(0.25))
model.add(Conv1D(64, 5, padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [45]:
# Binary cross-entropy with the Adam optimizer; accuracy is reported as a metric.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [46]:
# One epoch with the embedding layer frozen; the test split is used for validation.
model.fit(trn_pad, y_train, validation_data=(test_pad, y_test), epochs=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x13934e9b0>

We have already beaten our previous model! But let's fine-tune the embedding weights - especially since the words we couldn't find in glove just have random embeddings.

In [47]:
# Unfreeze the embedding layer (first layer of the model).
model.layers[0].trainable=True
# Keras only honours a changed `trainable` flag once the model is compiled again;
# without this the next fit() would keep training with the embeddings frozen.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [48]:
# Update the optimizer's learning-rate variable in place. Rebinding the attribute
# to a plain float would not reach a training function that has already been built.
keras.backend.set_value(model.optimizer.lr, 1e-4)

In [49]:
# One more epoch of training; the test split is used for validation.
model.fit(trn_pad, y_train, validation_data=(test_pad, y_test), epochs=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x13ac6a7b8>

As expected, that's given us a nice little boost. :)

In [52]:
# Persist the weights of the GloVe-initialised CNN under the models directory.
glove_weights_path = os.path.join(model_path, 'glove50.h5')
model.save_weights(glove_weights_path)

## Multi-size CNN

This is an implementation of a multi-size CNN as shown in Ben Bowles' [excellent blog post](https://quid.com/feed/how-quid-uses-deep-learning-with-small-data).

In [53]:
from keras.layers import Merge

We use the functional API to create multiple conv layers of different sizes, and then concatenate them.

In [54]:
# This sub-model is placed after the Embedding layer, so its input is one review
# of SEQ_LEN timesteps with EMBEDDING_DIM features each (not VOCAB_SIZE timesteps).
graph_in = Input((SEQ_LEN, EMBEDDING_DIM))
convs = []
# One convolution branch per filter width (3, 4 and 5); each branch is pooled
# and flattened so the branches can be concatenated into a single feature vector.
for fsz in range(3, 6):
    x = Conv1D(64, fsz, padding='same', activation="relu")(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)
out = keras.layers.concatenate(convs)
graph = Model(graph_in, out)

We then replace the conv/max-pool layer in our original CNN with the concatenated conv layers.

In [55]:
# Embedding initialised from the prepared matrix (left trainable), followed by the
# multi-width convolution sub-model and the usual dense head.
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM,
                    input_length=SEQ_LEN,
                    weights=[embedding_matrix]))
model.add(keras.layers.SpatialDropout1D(0.2))
model.add(graph)
model.add(Dropout(0.5))
model.add(Dense(100, activation="relu"))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [56]:
# Binary cross-entropy with the Adam optimizer; accuracy is reported as a metric.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [57]:
# One epoch with the embedding layer trainable; the test split is used for validation.
model.fit(trn_pad, y_train, validation_data=(test_pad, y_test), epochs=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x1393c2cc0>

Interestingly, I found that in this case I got best results when I started the embedding layer as being trainable, and then set it to non-trainable after a couple of epochs. I have no idea why!

In [None]:
# Freeze the embedding layer (first layer of the model).
model.layers[0].trainable=False
# Keras only honours a changed `trainable` flag once the model is compiled again;
# without this the next fit() would keep updating the embeddings.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
# Update the optimizer's learning-rate variable in place. Rebinding the attribute
# to a plain float would not reach a training function that has already been built.
keras.backend.set_value(model.optimizer.lr, 1e-5)

In [None]:
# Two more epochs on the padded arrays and the labels loaded in this notebook.
# (The unpadded `trn`/`test` lists are ragged, `labels_train`/`labels_test` are not
# defined here, and `nb_epoch` is the old spelling of `epochs`.)
model.fit(trn_pad, y_train, validation_data=(test_pad, y_test), epochs=2, batch_size=64)

This more complex architecture has given us another boost in accuracy.

## LSTM

We haven't covered this bit yet!

In [61]:
# LSTM classifier: masked (mask_zero=True), L2-regularised embeddings feed a
# 100-unit LSTM whose output goes straight to a sigmoid unit.
lstm_embedding = Embedding(VOCAB_SIZE, 32, input_length=SEQ_LEN,
                           mask_zero=True,
                           embeddings_regularizer=l2(1e-6))

model = Sequential()
model.add(lstm_embedding)
model.add(LSTM(100, implementation=2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


In [62]:
# Train the LSTM for one epoch, using the test split as validation data.
model.fit(trn_pad, y_train, validation_data=(test_pad, y_test), epochs=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x140f089e8>