In [1]:
import keras
import itertools
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.datasets import reuters
from nltk.corpus import reuters as NLTK_reuters
from sklearn.datasets import fetch_20newsgroups

Using TensorFlow backend.


In [2]:
# Per-document length limit shared across the notebook: it is the slice bound
# in load_reuters_nltk, the `maxlen` passed to the Keras loader and to
# pad_sequences, and the `input_length` of the Embedding layer.
sequence_length = 200

def load_reuters_nltk(max_words=None):
    """Load the NLTK Reuters corpus as parallel (texts, categories) tuples.

    Each document contributes one (text, category) pair per category it is
    tagged with, so a document with several categories appears several times.
    The text of a pair is the first `max_words` tokens of the document.

    Args:
        max_words: Maximum number of leading tokens kept per document.
            Defaults to the notebook-level `sequence_length`.

    Returns:
        ((x_train, y_train), (x_test, y_test)) where each element is a tuple;
        x_* hold token sequences and y_* the matching category names.
        A split with no documents yields two empty tuples.
    """
    if max_words is None:
        max_words = sequence_length

    train_pairs, test_pairs = [], []
    # Walk the file ids directly so the output order follows the corpus order
    # instead of the iteration order of an intermediate dict.
    for fileid in NLTK_reuters.fileids():
        # Reuters file ids carry their split as a path prefix
        # ('training/...' or 'test/...'), so match on the prefix only.
        if fileid.startswith('train'):
            bucket = train_pairs
        elif fileid.startswith('test'):
            bucket = test_pairs
        else:
            continue

        text = NLTK_reuters.words(fileid)[:max_words]
        bucket.extend((text, cat) for cat in NLTK_reuters.categories(fileid))

    # zip(*[]) yields nothing to unpack, so guard the empty-split case.
    (x_train, y_train) = zip(*train_pairs) if train_pairs else ((), ())
    (x_test, y_test) = zip(*test_pairs) if test_pairs else ((), ())
    return (x_train, y_train), (x_test, y_test)

def load_reuters_keras():
    """Load the Reuters newswire dataset bundled with Keras.

    Returns whatever `reuters.load_data` returns for the options below; the
    caller unpacks it as ((x_train, y_train), (x_test, y_test)).
    """
    # NOTE(review): the sample counts in the training log suggest `maxlen`
    # drops documents longer than the limit rather than truncating them --
    # confirm against the installed Keras version.
    load_options = dict(
        path="reuters.npz",
        num_words=None,
        skip_top=0,
        maxlen=sequence_length,
        test_split=0.2,
        seed=113,
        start_char=1,
        oov_char=2,
        index_from=3,
    )
    return reuters.load_data(**load_options)


In [3]:
(x_train, y_train), (x_test, y_test) = load_reuters_keras()

# One-hot encode both splits with the SAME width. Without an explicit
# num_classes, to_categorical infers the width from the largest label present
# in each array, so a class missing from one split would give y_train and
# y_test different shapes.
label_count = int(max(np.max(y_train), np.max(y_test))) + 1
y_train = keras.utils.to_categorical(y_train, num_classes=label_count)
y_test = keras.utils.to_categorical(y_test, num_classes=label_count)

In [4]:
# Width of the one-hot label matrix; used as the size of the output Dense layer.
num_classes = y_test.shape[1]
# NOTE(review): not referenced by any visible cell -- the Embedding layer is
# built with `hidden_neurons` as its output dimension instead. Confirm intent.
embedding_vecor_length = 32
# Passed to model.fit as batch_size.
batch_size = 64
# Passed to model.fit as the number of training epochs.
epochs = 5

In [5]:
# Bring every sample in both splits to exactly `sequence_length` entries.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=sequence_length)
    for split in (x_train, x_test)
)

In [6]:
hidden_neurons = 200

# Size the embedding table from BOTH splits: the test set may contain word
# indices larger than any seen in training, and an index outside the table
# is an out-of-range lookup during validation.
vocab_size = int(max(np.max(x_train), np.max(x_test))) + 1

model = Sequential()
# The embedding dimension deliberately stays at `hidden_neurons`, as in the
# original architecture.
model.add(Embedding(vocab_size, hidden_neurons, input_length=sequence_length))
model.add(LSTM(hidden_neurons, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
model.add(Dense(num_classes))
# categorical_crossentropy expects a probability distribution over classes,
# so the output layer must be softmax (relu outputs are unnormalised and can
# be all zeros, which breaks the loss).
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 200)          6195400   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dense_1 (Dense)              (None, 46)                9246      
_________________________________________________________________
activation_1 (Activation)    (None, 46)                0         
Total params: 6,525,446
Trainable params: 6,525,446
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Train the model; the test split is passed as validation data, so the
# per-epoch validation metrics are computed on it.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

Train on 7076 samples, validate on 1770 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7f2773b090>