In [7]:
import tensorflow as tf

from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os
import tarfile

## Getting data from the "20 Newsgroups" dataset

In [3]:
# Fetch the 20 Newsgroups archive and unpack it into the current working
# directory.
path = get_file('news20.tar.gz', origin='http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz')

# The context manager closes the archive even when extraction fails part-way.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run it on archives from a trusted origin.
with tarfile.open(path, "r:gz") as tar:
    tar.extractall()

# Directory the corpus is read from afterwards
# (presumably created by the extraction above - confirm).
NEWS_GROUP_20 = "./20_newsgroup"

Downloading data from http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz

In [68]:
from keras.utils import to_categorical

# Walk the corpus directory: every sub-directory gets its own class id and
# every file with an all-digit name inside it contributes one text.
texts = []          # one string per file
labels_index = {}   # sub-directory name -> integer class id
labels = []         # integer class id per text, aligned with `texts`

for name in sorted(os.listdir(NEWS_GROUP_20)):
    group_dir = os.path.join(NEWS_GROUP_20, name)
    if not os.path.isdir(group_dir):
        continue
    label_id = len(labels_index)
    labels_index[name] = label_id
    for fname in sorted(os.listdir(group_dir)):
        if not fname.isdigit():
            continue
        fpath = os.path.join(group_dir, fname)
        # latin-1 maps every byte to a character, so decoding cannot fail.
        # `with` guarantees the handle is closed even if read() raises.
        # (The notebook already requires Python 3 - it uses f-strings - so no
        # Python 2 fallback is kept.)
        with open(fpath, encoding='latin-1') as f:
            t = f.read()
        i = t.find('\n\n')  # skip header: keep the text from the first blank line on
        if 0 < i:
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))


# Replace the list of integer class ids with its categorical (one-hot) encoding.
labels = to_categorical(np.asarray(labels))


Found 19997 texts.


In [139]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer.
MAX_NUM_WORDS = 20000

# Fit the tokenizer on the corpus, then encode every text as a sequence of
# integer word indices.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
seq = tokenizer.texts_to_sequences(texts)

In [140]:
# Word -> integer index mapping exposed by the fitted tokenizer.
idx = tokenizer.word_index
unique_token_count = len(idx)
print(f'Number of unique tokens {unique_token_count}')

Number of unique tokens 174074


In [141]:
# Common length every encoded text is brought to.
MAX_LEN = 1000
padded_seq = pad_sequences(sequences=seq, maxlen=MAX_LEN)

In [142]:
# Sanity check: report the shapes of the model inputs and targets.
for caption, tensor in (('Shape of data tensor:', padded_seq),
                        ('Shape of label tensor:', labels)):
    print(caption, tensor.shape)

Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)


### Splitting data into training/validation sets

In [143]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples. The seed is fixed so that the split - and
# anything computed from it - is reproducible across kernel restarts.
SPLIT_SEED = 42
train_x, test_x, train_y, test_y = train_test_split(
    padded_seq, labels, test_size=0.3, random_state=SPLIT_SEED)

print('Shape of train data tensor:', train_x.shape)
print('Shape of train label tensor:', train_y.shape)
print('Shape of test data tensor:', test_x.shape)
print('Shape of test label tensor:', test_y.shape)

Shape of train data tensor: (13997, 1000)
Shape of train label tensor: (13997, 20)
Shape of test data tensor: (6000, 1000)
Shape of test label tensor: (6000, 20)


### Downloading GloVe embeddings

The pre-trained vectors are downloaded from this [link](http://nlp.stanford.edu/data/glove.6B.zip).

In [80]:
import zipfile

# Fetch the GloVe archive and unpack all of its members into GLOVE_DIR.
path = get_file('glove.6B.zip', origin='http://nlp.stanford.edu/data/glove.6B.zip')

GLOVE_DIR ="./glove"

# The context manager closes the archive even when extraction fails part-way.
with zipfile.ZipFile(path, 'r') as zip_ref:
    zip_ref.extractall(GLOVE_DIR)

### Reading embedding data

In [83]:
# Parse the GloVe text file: on every line the first whitespace-separated
# field is the word and the remaining fields are its vector components.
embeddings = {}  # word -> float32 vector
glove_file = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which breaks or mis-decodes the file on systems whose default is
# not UTF-8 (e.g. Windows code pages).
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs

print(f'Found {len(embeddings)} word vectors.')

Found 400000 word vectors.


### Building embedding matrix

In [160]:
# Width of each embedding vector (matches the 100d GloVe file read above).
EMBEDDING_DIM = 100

# One row per tokenizer index plus one extra row; rows start as zeros, so
# words without a pre-trained vector keep an all-zero embedding.
# NOTE(review): the extra row presumably accounts for index 0 never being
# assigned to a word - confirm against the tokenizer's indexing.
vocab_rows = len(idx) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in idx.items():
    vector = embeddings.get(token)
    if vector is not None:
        embedding_matrix[row] = vector
print(f'Embedding matrix shape: {embedding_matrix.shape}')

Embedding matrix shape: (174075, 100)


In [163]:
from keras.layers import Embedding

# Embedding layer initialised from the pre-built matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=len(idx) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_LEN,
    trainable=False,
)

### Training a convolutional neural network classifier

In [None]:
from keras.layers import Conv1D, MaxPooling1D, Dense, Flatten, Dropout
from keras.models import Sequential

# Layer stack in the order the data flows through it: the frozen embedding,
# three convolution + max-pooling stages, then a dense classifier head with
# one softmax output per entry in labels_index.
cnn_layers = [
    embedding_layer,
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(30),
    Flatten(),
    Dense(200, activation='relu'),
    Dropout(0.3),
    Dense(len(labels_index), activation='softmax'),
]

model = Sequential()
for layer in cnn_layers:
    model.add(layer)

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

# The held-out split is passed as validation data, so it is evaluated after
# every epoch.
model.fit(train_x, train_y,
          batch_size=128,
          epochs=20,
          validation_data=(test_x, test_y))

Train on 13997 samples, validate on 6000 samples
Epoch 1/20