In [215]:
import keras
import tensorflow as tf
from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from keras.layers import BatchNormalization
from keras.models import Model
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Input
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import os


In [216]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=None,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)

In [217]:
#Look at a small part of the data
print('Training data: ', x_train[0])
print('Training label: ', y_train[0])
print('Length of training data', len(x_train))
print('Length of test data', len(x_test))

Training data:  [1, 27595, 28842, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]
Training label:  3
Length of training data 8982
Length of test data 2246


In [218]:
word_index = reuters.get_word_index(path="reuters_word_index.json")
#Check to see what index the word sport is, a rudimentary test of the index loading
word_index["sport"]

13074

In [219]:
#The index is organized to look up the integer value, it would be better to look up the key
integer_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Now we can search for the word that aligns to a certain key, we looked up sport before so lets check that index
print('The word at index 13074 is: ',integer_word_index[13074])

#how many different words are in the index
print('There are', len(integer_word_index)+1, 'words in the word index')

The word at index 13074 is:  sport
There are 30980 words in the word index


In [220]:
#Max words in an article
max_words = 10000
#46 labels
LABEL_DIMENSIONS = max(y_train)+1

tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

y_train = keras.utils.to_categorical(y_train, LABEL_DIMENSIONS)
y_test = keras.utils.to_categorical(y_test, LABEL_DIMENSIONS)

print(x_train[0])
print(len(x_train[0]))

print(y_train[0])
print(len(y_train[0]))

[0. 1. 0. ... 0. 0. 0.]
10000
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
46


In [240]:
def build_network(vocab_size, embedding_dim, sequence_length):
    input=Input(shape=(sequence_length,), name="Input")
    embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length = sequence_length,
                          name="embedding")(input)
    lstm1 = LSTM(15, activation='tanh', return_sequences=False, dropout=0.1, recurrent_dropout=0.1, name='lstm1')(embedding)
    output= Dense(46, activation = 'sigmoid', name='sigmoid')(lstm1)
    model=Model(inputs=input, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [241]:
def create_callbacks(name):
    tensorboard_callback = TensorBoard(log_dir=os.path.join(os.getcwd(), "tensorboard_log", name), write_graph=True, write_grads=False)
    checkpoint_callback = ModelCheckpoint(filepath="./model-weights-" + name + ".{epoch:02d}-{val_loss:.6f}.hdf5", monitor='val_loss',
                                          verbose=0, save_best_only=True)
    return [tensorboard_callback]

In [242]:
datamodel = build_network(vocab_size = len(integer_word_index), embedding_dim=100, sequence_length=10000)
callbacks =create_callbacks("reuters")
                          

In [243]:
model.fit(x=x_train, y=y_train, batch_size=24, epochs=10, validation_data=(x_test, y_test),callbacks=callbacks)

Train on 8982 samples, validate on 2246 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1947e681438>