# Train and evaluate a simple MLP on the Reuters newswire topic classification task.

In [1]:
from __future__ import print_function

import numpy as np
import keras

from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

Using TensorFlow backend.


In [39]:
# Seed intended for shuffling the Reuters newswire dataset before it is split.
seed = 113
# Fraction of the dataset to hold out as the test set.
test_split = 0.2
# Vocabulary size (not article length): used as `num_words` when loading the
# data and building the Tokenizer, and as the width of the model's input layer.
max_words = 2000

In [2]:
# Word -> integer-id mapping for the Reuters corpus (cached under the given file name).
word_index = reuters.get_word_index("reuters_word_index.json")

In [40]:
print('Loading data...')
# Use the shared configuration values instead of repeating literals, so that
# changing `test_split` or `seed` in the config cell actually takes effect here.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=test_split,
                                                         seed=seed)
# NOTE: `word_index` is already loaded by the preceding cell, so it is not
# reloaded here.

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Assumes labels are integer ids starting at 0, so the number of classes is
# the largest id plus one.
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

Loading data...
8982 train sequences
2246 test sequences
46 classes


In [41]:
# Peek at the raw data: container types, the first encoded article, the size
# of the vocabulary mapping, and the id assigned to one sample word.
first_article = x_train[0]
print(type(x_train), type(first_article), '\n')
print(first_article)

vocabulary_size = len(word_index)
print(vocabulary_size)
print(word_index['spectrum'])

<class 'numpy.ndarray'> <class 'list'> 

[1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]
30979
9385


In [42]:
print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
# Turn each list of word ids into a fixed-width row vector.
# Supported modes: "binary", "count", "tfidf", "freq" (default: "binary").
x_train_m, x_test_m = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (x_train, x_test)
)
print('\t x_train_m shape:', x_train_m.shape)
print('\t x_test_m shape:', x_test_m.shape, '\n')

print('Convert class vector to binary class matrix\n'
      '\t(for use with categorical_crossentropy)')
# One-hot encode the integer labels for both splits.
y_train_m, y_test_m = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)
print('\t y_train shape:', y_train_m.shape)
print('\t y_test shape:', y_test_m.shape)

Vectorizing sequence data...
	 x_train_m shape: (8982, 20000)
	 x_test_m shape: (2246, 20000) 

Convert class vector to binary class matrix
	(for use with categorical_crossentropy)
	 y_train shape: (8982, 46)
	 y_test shape: (2246, 46)


In [43]:
print('Building model...')

# Training hyperparameters.
batch_size = 32
epochs = 10

# MLP with one 512-unit ReLU hidden layer, dropout, and a softmax output
# with one unit per class.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training matrix; 10% of it is set aside for validation.
history = model.fit(
    x_train_m, y_train_m,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)

# `score` holds [loss, accuracy], matching the loss and metrics passed to compile().
score = model.evaluate(
    x_test_m, y_test_m,
    batch_size=batch_size,
    verbose=1,
)

print('Test score:', score[0])
print('Test accuracy:', score[1])

Building model...
Train on 8083 samples, validate on 899 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.790293855797
