https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/LICENSE

In [1]:
import keras
keras.__version__

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


'2.1.6'

There are two ways to obtain word embeddings:

1. Learn word embeddings jointly with the main task you care about (e.g. document classification or sentiment prediction). In this setup, you would start with random word vectors, then learn your word vectors in the same way that you learn the weights of a neural network.
2. Load into your model word embeddings that were pre-computed using a different machine learning task than the one you are trying to solve. These are called "pre-trained word embeddings".

# Learning word embeddings with the Embedding layer

In [2]:
from keras.layers import Embedding

In [3]:
# The Embedding layer takes at least two arguments:
#   1. the number of possible tokens (here 1000, i.e. 1 + the maximum word index)
#   2. the dimensionality of the embeddings (here 64)
embedding_layer = Embedding(1000, 64)

- The Embedding layer is best understood as a dictionary mapping integer indices (which stand for specific words) to dense vectors.
- The Embedding layer takes as input a 2D tensor of integers, of shape (samples, sequence_length), where each entry is a sequence of integers
- It can embed sequences of variable lengths
- All sequences in a batch must have the same length, so shorter sequences are padded with zeros and longer ones are truncated
- This layer returns a 3D floating-point tensor of shape (samples, sequence_length, embedding_dimensionality)

# IMDB movie review sentiment prediction task

In [4]:
from keras.datasets import imdb
from keras import preprocessing

In [5]:
# number of words to consider as features (passed to imdb.load_data as num_words)
max_features = 10000
# every review is padded/truncated to this many word indices
# (with pad_sequences' defaults the *last* `maxlen` words of a longer review are kept)
maxlen = 20

In [6]:
# Load the IMDB reviews as lists of word indices, restricted to the `max_features` most frequent words
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = max_features)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [7]:
X_train.shape

(25000,)

In [8]:
X_train[:1,]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32])],
      dtype=object)

In [9]:
# Turn the lists of integers into a 2D integer tensor of shape (samples, maxlen).
# With pad_sequences' default settings, a review longer than `maxlen` keeps only its
# last `maxlen` tokens (the first padded row shown below matches the tail of the raw
# review printed above); shorter reviews are zero-padded at the front by default.
X_train = preprocessing.sequence.pad_sequences(X_train, maxlen = maxlen)
X_test  = preprocessing.sequence.pad_sequences(X_test, maxlen = maxlen)

In [10]:
# IPython help lookup for pad_sequences -- an interactive aid, not needed for the analysis.
# NOTE(review): consider removing this cell before sharing the notebook.
?preprocessing.sequence.pad_sequences

In [11]:
X_train.shape

(25000, 20)

In [12]:
X_train

array([[  65,   16,   38, ...,   19,  178,   32],
       [  23,    4, 1690, ...,   16,  145,   95],
       [1352,   13,  191, ...,    7,  129,  113],
       ...,
       [  11, 1818, 7561, ...,    4, 3586,    2],
       [  92,  401,  728, ...,   12,    9,   23],
       [ 764,   40,    4, ...,  204,  131,    9]], dtype=int32)

In [13]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

In [14]:
# Small classifier that learns 8-dimensional word embeddings jointly with the task.
model = Sequential()
# The vocabulary size comes from `max_features` (instead of a repeated literal) so it
# always matches the `num_words` used when the dataset was loaded.
model.add(Embedding(max_features, 8, input_length = maxlen))
# our activations have shape (samples, maxlen, 8) now
model.add(Flatten())
# shape is (samples, maxlen*8)
# a single sigmoid unit produces the binary sentiment prediction
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train for 10 epochs in batches of 32, holding out 20% of the training data for
# validation; verbose = 2 prints one summary line per epoch.
history = model.fit(X_train, y_train,
                   epochs = 10, batch_size = 32, 
                   validation_split = 0.2, verbose = 2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 2s - loss: 0.6759 - acc: 0.6043 - val_loss: 0.6398 - val_acc: 0.6810
Epoch 2/10
 - 1s - loss: 0.5657 - acc: 0.7428 - val_loss: 0.5467 - val_acc: 0.7206
Epoch 3/10
 - 1s - loss: 0.4752 - acc: 0.7808 - val_loss: 0.5113 - val_acc: 0.7384
Epoch 4/10
 - 1s - loss: 0.4263 - acc: 0.8079 - val_loss: 0.5008 - val_acc: 0.7454
Epoch 5/10
 - 1s - loss: 0.3930 - acc: 0.8257 - val_loss: 0.4981 - val_acc: 0.7540
Epoch 6/10
 - 1s - loss: 0.3668 - acc: 0.8395 - val_loss: 0.5013 - val_acc: 0.7534
Epoch 7/10
 - 1s - loss: 0.3435 - acc: 0.8534 - val_loss: 0.5051 - val_acc: 0.7518
Epoch 8/10
 - 1s - loss: 0.3223 - acc: 0.8658 - val_loss: 0.5132 - val_acc: 0.7486
Epoch 9/10
 - 1s - loss: 0.3022 - acc: 0.8765 - val_loss: 0.5213 - val_acc: 0.7492
Epoch 10/10
 - 1s - loss: 0.2839 - acc: 0.8860 - val_loss: 0.5302 - val_acc: 0.7466


# Using pre-trained word embeddings

## Download imdb raw data

In [23]:
import os

imdb_dir = './data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

# Every review is a .txt file under train/neg or train/pos.
# Negative reviews are labelled 0 and positive reviews 1.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            # Explicit UTF-8 avoids UnicodeDecodeError on platforms whose default
            # encoding differs; `with` closes the file even if reading fails.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

In [24]:
labels[:5]

[0, 0, 0, 0, 0]

In [25]:
texts[:5]

['I love Alec Guinness. And that\'s saying a lot after this film. Actually, he is not bad in it. He just seems to stand aside, be urbane and his usual delightful self, but invest nada. It is obvious the girl he is matched with is a featherweight, even as an inexperienced young French girl. Sir Alec wouldn\'t have chosen her when he was young and very obviously isn\'t too happy about it now.<br /><br />The interesting character is the brooding brother of the odd "Suzanne", another twit. "Donald" aspires to be a French Heathcliffe and I waited in vain for the source of his mystery. What deep dark secret was he hiding behind that forehead? Was he in love with the father\'s mistress? Why did he jerk Suzanne\'s hair when she plotted to bring the disparate parts of this turkey together on the country estate? Or perhaps he had simply had enough of her obnoxious acting.<br /><br />The film would have been charming with Guiness and the "older woman" reminiscing and seeing Paris together. THAT w

## Tokenize the data

In [26]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [27]:
# NOTE: `maxlen` is rebound here; it was 20 in the first part of the notebook.
maxlen = 100  # We will cut reviews after 100 words
training_samples = 200  # We will be training on 200 samples
validation_samples = 10000  # We will be validating on 10000 samples
max_words = 10000  # We will only consider the top 10,000 words in the dataset

In [28]:
# Build the vocabulary from the raw review texts, then encode every review as a
# list of integer word indices. `num_words = max_words` restricts the encoding to
# the most frequent words.
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [29]:
# word -> integer index mapping built by the tokenizer (index 1 is 'the' in the output below)
word_index = tokenizer.word_index
# NOTE(review): this displays the whole dictionary; previewing a small slice would be easier to read
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 "it's": 42,
 'out': 43,
 'if': 44,
 'has': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'she': 56,
 'even': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'only': 61,
 'story': 62,
 'really': 63,
 'see': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'were': 68,
 'me': 69,
 'well': 70,
 'than': 71,
 'we': 72,
 'much': 73,
 'been': 74,
 'bad': 75,
 'get': 76,
 'will': 77,
 'do': 78,
 'also': 79,
 'into': 80,
 'people': 81,
 'other': 82,
 '

In [30]:
# Pad/truncate every encoded review to exactly `maxlen` entries -> 2D array of shape (samples, maxlen)
data = pad_sequences(sequences, maxlen = maxlen)

In [31]:
# Convert the Python list of 0/1 labels to a NumPy array so it can later be
# reordered with an index array.
labels = np.asarray(labels)
labels

array([0, 0, 0, ..., 1, 1, 1])

In [32]:
# The reviews were read as all negatives followed by all positives, so shuffle them
# before splitting. Reviews and labels are reordered with the same permutation to
# keep them aligned. A dedicated, seeded generator makes the train/validation split
# reproducible without touching NumPy's global random state.
indices = np.random.RandomState(42).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [33]:
# The first `training_samples` rows are used for training; the following
# `validation_samples` rows are used for validation.
train_rows = slice(0, training_samples)
val_rows = slice(training_samples, training_samples + validation_samples)

x_train, y_train = data[train_rows], labels[train_rows]
x_val, y_val = data[val_rows], labels[val_rows]

## Pre-process the embeddings

In [34]:
glove_dir = './data/glove'

In [35]:
# Parse the GloVe text file into a dict mapping word -> float32 coefficient vector.
# Each line holds a word followed by its vector coefficients, separated by whitespace.
embeddings_index = {}
# Explicit UTF-8 avoids UnicodeDecodeError on platforms whose default encoding
# differs; `with` guarantees the file is closed even if parsing fails midway.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

Let's build an embedding matrix that we will be able to load into an embedding layer. It must be a matrix of shape (max_words, embedding_dim). Note that the index 0 is not supposed to stand for any word or token -- it's a placeholder.

In [36]:
embedding_dim = 100
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Row 0 is never assigned (word indices start at 1) and stays all-zeros.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        # Outside the vocabulary kept by the model: skip before doing any lookup.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words not found in the embedding index keep their all-zeros row.
        continue
    embedding_matrix[i] = embedding_vector

## Define a model

In [37]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [38]:
# Embedding -> Flatten -> 32-unit ReLU layer -> single sigmoid output,
# declared as a layer list instead of successive add() calls.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length = maxlen),
    Flatten(),
    Dense(32, activation = 'relu'),
    Dense(1, activation = 'sigmoid'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


## Load the GloVe embeddings in the model

In [39]:
# Load the pre-computed GloVe matrix into the Embedding layer (the model's first layer).
model.layers[0].set_weights([embedding_matrix])
# Freeze the layer so training does not update the pre-trained vectors. This is done
# before the model is compiled; the summary printed above ran earlier, which is why
# it still reports every parameter as trainable.
model.layers[0].trainable = False

## Train and evaluate

In [40]:
# Compile after the embedding layer has been frozen, then train on the small
# training split and validate on the separate x_val / y_val split.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val),
                    verbose = 2)

Train on 200 samples, validate on 10000 samples
Epoch 1/10
 - 1s - loss: 1.4161 - acc: 0.5200 - val_loss: 0.7040 - val_acc: 0.5121
Epoch 2/10
 - 0s - loss: 0.6880 - acc: 0.6600 - val_loss: 0.7343 - val_acc: 0.5146
Epoch 3/10
 - 0s - loss: 0.5421 - acc: 0.7350 - val_loss: 0.8689 - val_acc: 0.4946
Epoch 4/10
 - 0s - loss: 0.3586 - acc: 0.8500 - val_loss: 0.8633 - val_acc: 0.5041
Epoch 5/10
 - 0s - loss: 0.2856 - acc: 0.9000 - val_loss: 0.6956 - val_acc: 0.5648
Epoch 6/10
 - 0s - loss: 0.1613 - acc: 0.9850 - val_loss: 0.7378 - val_acc: 0.5529
Epoch 7/10
 - 1s - loss: 0.1113 - acc: 0.9900 - val_loss: 0.7838 - val_acc: 0.5451
Epoch 8/10
 - 1s - loss: 0.1684 - acc: 0.9300 - val_loss: 0.7475 - val_acc: 0.5715
Epoch 9/10
 - 1s - loss: 0.0512 - acc: 1.0000 - val_loss: 0.8037 - val_acc: 0.5654
Epoch 10/10
 - 1s - loss: 0.0306 - acc: 1.0000 - val_loss: 0.8054 - val_acc: 0.5690


In [41]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit().
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training values as dots, validation values as a line.
# Axis labels are set so each figure can be read on its own.
for short_name, long_name, train_values, val_values in (
        ('acc', 'accuracy', acc, val_acc),
        ('loss', 'loss', loss, val_loss)):
    fig, ax = plt.subplots()
    ax.plot(epochs, train_values, 'bo', label='Training ' + short_name)
    ax.plot(epochs, val_values, 'b', label='Validation ' + short_name)
    ax.set_title('Training and validation ' + long_name)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(long_name.capitalize())
    ax.legend()

plt.show()

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>