In [1]:
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import numpy as np
import os

Using TensorFlow backend.


In [2]:
# --- Notebook configuration -------------------------------------------------
# Tokenizer / model hyper-parameters.
MAX_VOCAB = 20000     # handed to Tokenizer(num_words=...); also caps the embedding rows
MAX_SEQ_LEN = 1000    # maxlen handed to pad_sequences and the model's input length
EMBEDDING_DIM = 50    # width of each embedding vector (the GloVe file used is the 50d one)

# Input locations, all resolved relative to BASE_PATH.
BASE_PATH = './'
NEWS_PATH = os.path.join(BASE_PATH, '20_newsgroup')
GLOVE_PATH = os.path.join(BASE_PATH, 'glove.6B/glove.6B.50d.txt')

In [3]:
# Load the pre-trained GloVe vectors into a dict: token -> float32 vector.
# Each line of the file is "<token> <v1> <v2> ...", whitespace separated.
glove_dic = {}
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can fail to decode the file on systems
# whose default is not UTF-8 (assumes the GloVe file is UTF-8, as distributed).
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        glove_dic[values[0]] = np.array(values[1:], dtype=np.float32)

In [4]:
# Read the corpus: one sub-directory of NEWS_PATH per class, one file per post.
texts = []        # post bodies (header stripped when a blank line is found)
label_index = {}  # class (directory) name -> integer id
label_ids = []    # integer class id for each entry of `texts`, in the same order

# os.listdir() returns entries in arbitrary order; sorting makes the
# label -> id mapping and the document order identical on every run.
for label in sorted(os.listdir(NEWS_PATH)):
    path = os.path.join(NEWS_PATH, label)
    if not os.path.isdir(path):
        continue  # stray files (e.g. .DS_Store) are not classes

    label_id = len(label_index)
    label_index[label] = label_id

    for fname in sorted(os.listdir(path)):
        # latin-1 maps every byte to a character, so decoding never raises.
        with open(os.path.join(path, fname), encoding='latin-1') as f:
            t = f.read()

        i = t.find('\n\n')  # skip header: keep only what follows the first blank line
        if i != -1:
            t = t[i+2:]

        texts.append(t)
        label_ids.append(label_id)

In [5]:
# Turn the raw texts into fixed-length integer sequences and one-hot labels,
# then shuffle both with one shared permutation.
SHUFFLE_SEED = 42  # fixed so the shuffle below gives the same order for the same input

tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(texts)

# Kept under its own name so `data` always refers to the padded 2-D array.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQ_LEN)
labels = to_categorical(np.array(label_ids))

# The documents were appended class by class, so they must be shuffled before
# model.fit(..., validation_split=...) carves off its validation samples.
# A dedicated, seeded generator makes the permutation reproducible without
# touching NumPy's global random state.
indices = np.random.RandomState(SHUFFLE_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [None]:
# Build the initial embedding weights: row i holds the GloVe vector of the
# word whose tokenizer index is i; rows without a GloVe entry stay all-zero.
word_index = tokenizer.word_index
num_vocab = min(MAX_VOCAB, len(word_index) + 1)
embedding_matrix = np.zeros((num_vocab, EMBEDDING_DIM))

for word, index in word_index.items():
    # Only indices below MAX_VOCAB get a row in the matrix.
    if index < MAX_VOCAB:
        word_vec = glove_dic.get(word)
        if word_vec is not None:
            embedding_matrix[index] = word_vec

In [6]:
# Model definition: fixed pre-initialised embedding -> Conv1D -> MaxPooling1D
# -> Conv1D -> GlobalMaxPooling1D -> Dense -> softmax over the labels.
seq_input = Input(shape=(MAX_SEQ_LEN,), dtype='int32')

# Embedding initialised from embedding_matrix; trainable=False marks it as
# not to be updated during training.
glove_embedding = Embedding(
    num_vocab,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQ_LEN,
    trainable=False,
)
seq_embedding = glove_embedding(seq_input)

# Feature extractor, applied in list order.
feature_layers = [
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
]
x = seq_embedding
for layer in feature_layers:
    x = layer(x)

# One softmax output unit per entry of label_index.
output_layer = Dense(len(label_index), activation='softmax')
preds = output_layer(x)

model = Model(seq_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 50)          1000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          32128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
__________

In [7]:
# Train for 3 epochs in batches of 64, with a 0.2 fraction of the samples
# used for validation instead of training.
model.fit(
    x=data,
    y=labels,
    batch_size=64,
    epochs=3,
    validation_split=0.2,
)

Train on 15997 samples, validate on 4000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7d1a288da0>

In [8]:
# Write the model to 'model.h5' (path is relative to the working directory).
model.save(filepath='model.h5')