# Text Classification in Keras

In [74]:
import keras
from keras.datasets import reuters

In [75]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None,
                                                        test_split=0.2)
word_index = reuters.get_word_index()

number of classes will be the maximum value in the y_train + 1,  
since the classes start from 0

In [76]:
print('# of training sample: {}'.format(len(x_train)))
print('# of training sample: {}'.format(len(x_test)))

num_classes = max(y_train) + 1
print('# of classes: {}'.format(num_classes))

# of training sample: 8982
# of training sample: 2246
# of classes: 46


[1, 27595, 28842, 8 ..]  
what this means is that, the sentence is composed of, 1st most frequent word, 27595 most frequent word, basically each of these numbers is mapped to a word and the numebr is how frequent that word is used.

In [77]:
print(x_train[0])
print(y_train[0])

[1, 27595, 28842, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]
3


'money' is the 236 most frequent word

In [78]:
word_index['money']

236

In [79]:
index_to_word = {}
for k, v in word_index.items():
    index_to_word[v] = k

In [80]:
index_to_word[236]

'money'

In [81]:
print(' '.join([index_to_word[x] for x in x_train[0]]))


the wattie nondiscriminatory mln loss for plc said at only ended said commonwealth could 1 traders now april 0 a after said from 1985 and from foreign 000 april 0 prices its account year a but in this mln home an states earlier and rise and revs vs 000 its 16 vs 000 a but 3 psbr oils several and shareholders and dividend vs 000 its all 4 vs 000 1 mln agreed largely april 0 are 2 states will billion total and against 000 pct dlrs


In [82]:
from keras.preprocessing.text import Tokenizer

max_words = 10000

tokenizer = Tokenizer(num_words=max_words)

x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [83]:
print(x_train.shape)
print(x_train[0])

print(y_train.shape)
print(y_train[0])

(8982, 10000)
[0. 1. 0. ... 0. 0. 0.]
(8982, 46)
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [84]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [85]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.metrics_names)

['loss', 'acc']


In [90]:
batch_size = 128
epochs = 10

history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs
                   , verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

Train on 8083 samples, validate on 899 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 1.0624586941403995
Test accuray: 0.7996438118037855


In [97]:
print('Test loss: {0}'.format(score[0], 2))
print('Test accuray: {0}'.format(score[1]))

Test loss: 1.0624586941403995
Test accuray: 0.7996438118037855
