In [1]:
from keras.datasets import reuters

Using TensorFlow backend.


In [2]:
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

Downloading data from https://s3.amazonaws.com/text-datasets/reuters.npz

In [3]:
len(train_data), len(test_data)

(8982, 2246)

In [4]:
train_data[10]

[1,
 245,
 273,
 207,
 156,
 53,
 74,
 160,
 26,
 14,
 46,
 296,
 26,
 39,
 74,
 2979,
 3554,
 14,
 46,
 4689,
 4329,
 86,
 61,
 3499,
 4795,
 14,
 61,
 451,
 4329,
 17,
 12]

In [5]:
word_index = reuters.get_word_index()

Downloading data from https://s3.amazonaws.com/text-datasets/reuters_word_index.json

In [6]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_newswire = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[10]])

In [7]:
decoded_newswire

'? period ended december 31 shr profit 11 cts vs loss 24 cts net profit 224 271 vs loss 511 349 revs 7 258 688 vs 7 200 349 reuter 3'

In [8]:
# Let's proceed to vectorize the data and prepare it for Keras as a tensor vector

In [9]:
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros([len(sequences), dimension])
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.0
    return results

In [10]:
x_train = vectorize_sequences(train_data)

In [11]:
x_test = vectorize_sequences(test_data)

In [12]:
# Let's encode the vector using one-hot encoding instead

In [13]:
def to_one_hot(labels, dimension=64):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.0
    return results

In [14]:
one_hot_train_labels = to_one_hot(train_labels)

In [15]:
one_hot_test_labels = to_one_hot(test_labels)

In [16]:
# Other possible solution could be to use the package utility

In [17]:
from keras.utils.np_utils import to_categorical

one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

In [18]:
# Since we have 46 different classes, 16 hidden units can cause information bottleneck in the system.
# So lets go to 64 hidden units to avoid that

In [19]:
from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

In [20]:
model.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics = ['accuracy'])

In [21]:
x_val = x_train[:10000]
partial_x_train = x_train[10000:]

y_val = one_hot_train_labels[:10000]
partial_y_train = one_hot_train_labels[10000:]

In [22]:
partial_x_train.shape

(0, 10000)

In [23]:
model.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics = ['accuracy'])

history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512, validation_data=(x_val, y_val))

Train on 0 samples, validate on 8982 samples
Epoch 1/20


AttributeError: 'ProgbarLogger' object has no attribute 'log_values'

In [None]:
import matplotlib.pyplot as plt
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, 'bo', label = 'Training loss')
plt.plot(epochs, val_loss_values, 'b', label = 'Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
plt.clf()
acc = history_dict['acc']
val_acc = history_dict['val_acc']
plt.plot(epochs, acc, 'bo', label = 'Training accuracy')
plt.plot(epochs, val_acc, 'b', label = 'Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [24]:
# Retrain the model from scratch again based on new epochs learnings.

In [25]:
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

model.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics = ['accuracy'])

model.fit(partial_x_train, partial_y_train, epochs=9, batch_size=512, validation_data=(x_val, y_val))

results = model.evaluate(x_test, one_hot_test_labels)

Train on 0 samples, validate on 8982 samples
Epoch 1/9


AttributeError: 'ProgbarLogger' object has no attribute 'log_values'

In [None]:
results

In [26]:
import copy

test_lables_copy = copy.copy(test_labels)
np.random.shuffle(test_lables_copy)
hits_array = np.array(test_labels) == np.array(test_lables_copy)
float(np.sum(hits_array)) / len(test_labels)

In [None]:
predictions = model.predict(x_test)

In [None]:
predictions[0].shape

In [None]:
# The co-eff in this vector sum to 1.0 probability

np.sum(predictions[0])

In [None]:
# The largest entry in the predicted class is

np.argmax(predictions[0])