In [None]:
# check news data 

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import reuters  # Reuters newswire dataset bundled with Keras (already tokenized and integer-encoded): 11,228 newswires, 46 topic categories


In [None]:
# Load the Reuters data with the full vocabulary (num_words=None) and hold out
# 20% of the samples as the test split (8:2).
reuters_train, reuters_test = reuters.load_data(num_words=None, test_split=0.2)
X_train, y_train = reuters_train
X_test, y_test = reuters_test

# Report the size of each split.
for message, split in (
    ('news as training data : {}', X_train),
    ('news as test data : {}', X_test),
):
    print(message.format(len(split)))

# Number of distinct labels that occur in the training split.
num_classes = np.unique(y_train).size
print('news_classes : {}'.format(num_classes))

In [None]:
# Peek at the first training example: its encoded token sequence and its label.
first_sample, first_label = X_train[0], y_train[0]
print('first news in traing data :', first_sample)
print('news label of the first news :', first_label)

In [None]:
# Token count of every training sample, computed once and reused below.
sample_lengths = [len(sample) for sample in X_train]

print('the longest length of the news samples :{}'.format(max(sample_lengths)))
print('the average length of the news samples :{}'.format(sum(sample_lengths)/len(sample_lengths)))

# Histogram of sample lengths (50 bins).
plt.hist(sample_lengths, bins=50)
plt.ylabel('number of samples')
plt.xlabel('length of samples')
plt.show()

In [None]:
# Distribution of news labels in the training split.
fig, axe = plt.subplots(ncols=1)
fig.set_size_inches(12,5)
# Pass the labels by keyword: seaborn >= 0.12 binds the first positional
# argument to `data` rather than `x` (0.11 only accepted it with a deprecation
# warning), so a positional call does not draw one bar per label.
# `ax=axe` pins the plot to the axes created above instead of relying on
# pyplot's "current axes" state.
sns.countplot(x=y_train, ax=axe)

In [None]:
# Tabulate how many training samples carry each label.
unique_elements, counts_elements = np.unique(y_train, return_counts=True)

print("frequency of each label:")
# Row 0 holds the label ids, row 1 the matching sample counts.
frequency_table = np.array([unique_elements, counts_elements])
print(frequency_table)

In [None]:
# Word index: the word -> integer index lookup provided with the Reuters dataset.
word_to_index = reuters.get_word_index()
# Uncomment to dump the whole mapping:
#print(word_to_index)

In [None]:
# Invert the word index. Every id is shifted by 3 because the encoded samples
# reserve the first three ids for special tokens (0: padding, 1: sos, 2: OOV).
index_to_word = {rank + 3: word for word, rank in word_to_index.items()}

print('first word in the freq list  : {}'.format(index_to_word[4]))
print('128th word in the freq list  : {}'.format(index_to_word[131]))

In [None]:
# Register the three reserved ids (0: <pad>, 1: <sos>, 2: <unk>) in index_to_word,
# following the Keras convention for the Reuters news dataset.
index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

# Decode the first training sample back into text.
print(' '.join(index_to_word[token_id] for token_id in X_train[0]))

In [None]:
# news classification using LSTM

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

vocab_size = 1000  # num_words passed to load_data: cap on the vocabulary used for the LSTM
max_len = 100      # every sequence is padded/truncated to this many tokens

# Reload the data with the restricted vocabulary. This overwrites the
# full-vocabulary X_train / y_train / X_test / y_test loaded earlier.
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=vocab_size, test_split=0.2)

# Bring every sample to exactly max_len tokens.
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

# One-hot encode the labels with ONE shared width. Without an explicit
# num_classes, to_categorical infers the width from the largest label in the
# array it is given, so the train and test matrices could end up with different
# numbers of columns if the highest label is missing from one split.
n_labels = int(max(y_train.max(), y_test.max())) + 1
y_train = to_categorical(y_train, num_classes=n_labels)
y_test = to_categorical(y_test, num_classes=n_labels)

In [None]:
embedding_dim = 128
hidden_units = 128
num_classes = 46

# Embedding -> single LSTM layer -> softmax over the news categories.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(num_classes, activation='softmax'),   # multi-class classification
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

# Stop training once val_loss has failed to improve for 4 consecutive epochs
# (guards against overfitting).
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
# Write best_model.h5 only when val_acc beats the best value seen so far.
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

# NOTE(review): the test split doubles as the validation set here, so early
# stopping and checkpoint selection are tuned on the same data that is later
# reported as "test" -- consider a separate validation split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=128,
    callbacks=[es, mc],
)

In [None]:
# Reload the checkpointed model from disk and score it on the test split.
loaded_model = load_model('best_model.h5')
test_scores = loaded_model.evaluate(X_test, y_test)
# Index 1 is the value that follows the loss in the evaluate() result.
print("\n test accuracy: %.4f" % test_scores[1])

In [None]:
# One x-axis tick per completed epoch (training may have stopped early).
epochs = range(1, len(history.history['acc']) + 1)

# Training loss first, validation loss second -- the legend order below relies on it.
for curve in ('loss', 'val_loss'):
    plt.plot(epochs, history.history[curve])

loss_axes = plt.gca()
loss_axes.set_title('model loss')
loss_axes.set_ylabel('loss')
loss_axes.set_xlabel('epoch')
loss_axes.legend(['train', 'test'], loc='upper left')
plt.show()