# Train and Evaluate a Simple MLP on the Reuters Newswire Topic Classification Task

In [8]:
from __future__ import print_function

import numpy as np
import keras

from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# different words will be kept (The length of the vocabulary)
max_words = 2000
# seed for split the reuters newswire dataset
seed = 113
# how many records from the dataset are used for test
test_split = 0.2



In [4]:
# build inverse word dictionary
word_index = reuters.get_word_index(path="reuters_word_index.json")
inverse_word_dict = np.ndarray(shape=(len(word_index)+1,), dtype=object)
for key in word_index:
    index = word_index[key]
    inverse_word_dict[index] = key

print('Loading data...')
(x_train, y_train), (x_test, y_test) = reuters.load_data(test_split=0.2)

print(len(x_train), 'train sequences are loaded.\n')
print(' '.join(inverse_word_dict[x_train[0]]), '\n')
print(len(x_test), 'test sequences are loaded.\n')
print(' '.join(inverse_word_dict[x_test[0]]), '\n')

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

Loading data...
8982 train sequences
the wattie nondiscriminatory mln loss for plc said at only ended said commonwealth could 1 traders now april 0 a after said from 1985 and from foreign 000 april 0 prices its account year a but in this mln home an states earlier and rise and revs vs 000 its 16 vs 000 a but 3 psbr oils several and shareholders and dividend vs 000 its all 4 vs 000 1 mln agreed largely april 0 are 2 states will billion total and against 000 pct dlrs
2246 test sequences
the in wants intermediate 3 how types could mln at against 2 guidelines vs end products opec he will will along results and willingly exports 3 purchased each it stubbornly profit 3 avondale profit agreement in around corp should for 3 cannot mln in ended said avondale a 54 but 3 stubbornly a only generally political primarily date other under well a in british rate gain if demand at an div its taking report montreal systems were is and production had vs 000 7 issued year for 0 a in buffer rate loss may r

In [5]:
print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words = max_words)
# mode: one of "binary", "count", "tfidf", "freq" (default: "binary").
x_train_m = tokenizer.sequences_to_matrix(x_train, mode = 'binary')
x_test_m = tokenizer.sequences_to_matrix(x_test, mode = 'binary')
print('\t x_train_m shape:', x_train_m.shape)
print('\t x_test_m shape:', x_test_m.shape, '\n')

print('Convert class vector to binary class matrix:\n\t(for use with categorical_crossentropy)')
y_train_m = keras.utils.to_categorical(y_train, num_classes)
y_test_m = keras.utils.to_categorical(y_test, num_classes)
print('\t y_train shape:', y_train_m.shape)
print('\t y_test shape:', y_test_m.shape, '\n')

Vectorizing sequence data...
	 x_train_m shape: (8982, 2000)
	 x_test_m shape: (2246, 2000) 

Convert class vector to binary class matrix
	(for use with categorical_crossentropy)
	 y_train shape: (8982, 46)
	 y_test shape: (2246, 46)


In [13]:
print('Building model...')
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

batch_size = 32
epochs = 10

history = model.fit(x_train_m, y_train_m,
                    batch_size = batch_size,
                    epochs = epochs,
                    verbose = 1,
                    validation_split = 0.1)

score = model.evaluate(x_test_m, y_test_m,
                       batch_size = batch_size, verbose = 1)

print('\nTest score:', score[0])
print('Test accuracy:', score[1])

Building model...
Train on 10182 samples, validate on 1132 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 15.283515053
Test accuracy: 0.0517790759466
