## The Reuters dataset

A set of short newswires and their topics, published by Reuters in 1986. There are 46 different topics and each topic has at least 10 examples in the training set.

### Loading the Reuters dataset

In [1]:
from tensorflow.keras.datasets import reuters
from tensorflow.keras.utils import to_categorical # Converts a class vector (integers) to binary class matrix.
from tensorflow.keras import models
from tensorflow.keras import layers

import numpy as np

In [2]:
# Load the Reuters newswires, keeping only the 10,000 most frequently
# occurring words found in the data.
# train_data / test_data hold the samples (lists of integer word indices);
# train_labels / test_labels hold the matching integer topic indices.
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)


In [3]:
# Report the number of samples and labels in each split, one value per line.
print("Train shape:", train_data.shape, train_labels.shape, sep="\n")
print("\nTest shape:", test_data.shape, test_labels.shape, sep="\n")

Train shape:
(8982,)
(8982,)

Test shape:
(2246,)
(2246,)


In [4]:
# Show the first training sample: a newswire encoded as a list of integer
# word indices rather than as text.
print(train_data[0]) # word indices

[1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]


In [5]:
# Show the label of the first training sample.
print(train_labels[0]) # Topic index (integer between 0 and 45)

3


### Decoding newswires back to text

In [6]:
# word_index maps word -> integer index; invert it to look words up by index.
word_index = reuters.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}

# Encoded indices are shifted by 3 because the lowest indices are reserved,
# so subtract 3 before the lookup; anything not found is rendered as '?'.
decoded_newswire = ' '.join(
    reverse_word_index.get(token - 3, '?') for token in train_data[0]
)
print(decoded_newswire)

? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3


### Preparing the data

#### Encoding the data

In [7]:
def vectorize_seq(sequences, dim=10000):
    """Multi-hot encode a collection of integer sequences.

    Args:
        sequences: A sized iterable of sequences of integer indices, each
            index being a valid column position for a row of width ``dim``.
        dim: Width of every output row.

    Returns:
        A float array of shape ``(len(sequences), dim)`` in which
        ``out[r, c]`` is 1.0 when ``c`` occurs in ``sequences[r]`` and 0.0
        otherwise. Repeated indices within a sequence still yield 1.0.
    """
    n_rows = len(sequences)
    encoded = np.zeros((n_rows, dim))
    for row, indices in enumerate(sequences):
        # Fancy indexing switches on every listed column of this row at once.
        encoded[row, indices] = 1.0
    return encoded

In [8]:
# Multi-hot encode both splits (train first, then test) with vectorize_seq.
x_train, x_test = (vectorize_seq(split) for split in (train_data, test_data))

In [9]:
# One-hot encode the integer topic labels.
# The class count is passed explicitly: left to its default, to_categorical
# sizes the encoding from the largest label it sees in each call, so a split
# that happens to lack the highest topic index would come out narrower than
# the 46 topics the dataset defines.
NUM_CLASSES = 46
one_hot_train_labels = to_categorical(train_labels, num_classes=NUM_CLASSES)
one_hot_test_labels = to_categorical(test_labels, num_classes=NUM_CLASSES)

### Building the network

In [10]:
# Stack of three fully connected layers, declared in one go.
model = models.Sequential([
    # Two hidden layers of 64 ReLU units; the first one fixes the input
    # width at 10000 features per sample.
    layers.Dense(64, activation='relu', input_shape=(10000,)),
    layers.Dense(64, activation='relu'),
    # Output layer: one unit per topic, so every input sample produces a
    # 46-dimensional vector. Softmax turns that real-valued vector into
    # categorical probabilities.
    layers.Dense(46, activation='softmax'),
])

### Compiling the model

In [11]:
# Configure training: categorical cross-entropy loss (labels are one-hot),
# the RMSprop optimizer, and accuracy reported as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

### Validating

#### Setting aside a validation set

In [12]:
# Hold out the first 1000 training samples (and their one-hot labels) for
# validation; the remainder is what the model actually trains on.
x_val, partial_x_train = x_train[:1000], x_train[1000:]
y_val, partial_y_train = one_hot_train_labels[:1000], one_hot_train_labels[1000:]

### Training the model

In [None]:
# Train for 20 epochs in mini-batches of 512 samples, passing the held-out
# split as validation data.
EPOCHS = 20
model.fit(
    x=partial_x_train,
    y=partial_y_train,
    batch_size=512,
    epochs=EPOCHS,
    validation_data=(x_val, y_val),
)