In [1]:
from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics

Using TensorFlow backend.


# TFIDF

In [2]:
def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    """
    TFIDF(X_train, X_test, MAX_NB_WORDS)
    Turn raw documents into dense TF-IDF feature matrices
    X_train is the training documents; the vectorizer is fitted on these only
    X_test is the test documents; they are transformed with the fitted vectorizer
    MAX_NB_WORDS is passed to TfidfVectorizer as max_features
    Returns the tuple (train matrix, test matrix) after .toarray()
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    # Fit on the training documents only, then reuse the fitted vectorizer for
    # the test documents so both matrices share the same columns.
    train_features = vectorizer_x.fit_transform(X_train).toarray()
    test_features = vectorizer_x.transform(X_test).toarray()
    # Read the column count straight off the matrix: wrapping it in np.array()
    # first would copy the whole dense matrix just to look at its shape.
    print("tf-idf with",str(train_features.shape[1]),"features")
    return (train_features,test_features)

# Building DNN model 

In [3]:
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5, node=512, nLayers=4):
    """
    Build_Model_DNN_Text(shape, nClasses, dropout, node, nLayers)
    Build and compile a fully connected network for text classification
    shape is the number of input features
    nClasses is the number of output classes
    dropout is the rate of the Dropout layer placed after every hidden Dense layer
    node is the number of units in every hidden Dense layer
    nLayers is the number of hidden blocks stacked after the input block,
    so the network has nLayers + 1 hidden Dense layers in total
    Returns the compiled Sequential model
    """
    model = Sequential()
    # Input block: the only layer that has to be told the input width.
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for _ in range(nLayers):
        # Later layers take their input width from the layer before them,
        # so no input_dim is passed here.
        model.add(Dense(node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    # The sparse loss takes integer class ids as targets rather than one-hot rows.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Loading Dataset

In [4]:
# Fetch the two predefined splits of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
# Pull the documents (.data) and their labels (.target) out of each split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [5]:
X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)
# Size the softmax layer from the dataset's label names rather than a literal 20,
# so the cell keeps working if the set of categories changes.
nClasses = len(newsgroups_train.target_names)
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], nClasses)
# NOTE(review): the test split doubles as validation data here, so the val_*
# numbers are not an independent estimate if they are used to tune the model.
model_DNN.fit(X_train_tfidf, y_train,
              validation_data=(X_test_tfidf, y_test),
              epochs=10,
              batch_size=128,
              verbose=2)

tf-idf with 75000 features
Train on 11314 samples, validate on 7532 samples
Epoch 1/10
 - 50s - loss: 2.7930 - accuracy: 0.1034 - val_loss: 2.0432 - val_accuracy: 0.3204
Epoch 2/10
 - 42s - loss: 1.4573 - accuracy: 0.4764 - val_loss: 0.9871 - val_accuracy: 0.6762
Epoch 3/10
 - 42s - loss: 0.6628 - accuracy: 0.7542 - val_loss: 0.8145 - val_accuracy: 0.7430
Epoch 4/10
 - 44s - loss: 0.3388 - accuracy: 0.8828 - val_loss: 0.8094 - val_accuracy: 0.7767
Epoch 5/10
 - 45s - loss: 0.1859 - accuracy: 0.9390 - val_loss: 0.8621 - val_accuracy: 0.7924
Epoch 6/10
 - 44s - loss: 0.1107 - accuracy: 0.9668 - val_loss: 0.9012 - val_accuracy: 0.8020
Epoch 7/10
 - 43s - loss: 0.0994 - accuracy: 0.9725 - val_loss: 0.9136 - val_accuracy: 0.7983
Epoch 8/10
 - 42s - loss: 0.0671 - accuracy: 0.9806 - val_loss: 1.0476 - val_accuracy: 0.7900
Epoch 9/10
 - 42s - loss: 0.0532 - accuracy: 0.9844 - val_loss: 0.9900 - val_accuracy: 0.7973
Epoch 10/10
 - 42s - loss: 0.0474 - accuracy: 0.9864 - val_loss: 0.9722 - val_

<keras.callbacks.callbacks.History at 0x20272ab2278>

In [9]:
# predict_classes() has been removed from newer Keras releases; taking the argmax
# of the softmax output yields the same class indices on old and new versions.
predicted = np.argmax(model_DNN.predict(X_test_tfidf), axis=-1)

In [10]:
# Display the predicted class indices; NumPy abbreviates the repr of long arrays.
predicted

array([ 9, 12,  0, ...,  9,  3, 15], dtype=int64)

In [11]:
# Per-class precision / recall / F1 of the predictions against the test labels.
report = metrics.classification_report(y_test, predicted)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.79      0.76       319
           1       0.64      0.73      0.68       389
           2       0.73      0.64      0.68       394
           3       0.58      0.78      0.67       392
           4       0.83      0.74      0.78       385
           5       0.69      0.77      0.73       395
           6       0.82      0.84      0.83       390
           7       0.95      0.83      0.88       396
           8       0.89      0.95      0.92       398
           9       0.94      0.93      0.93       397
          10       0.98      0.95      0.97       399
          11       0.96      0.87      0.91       396
          12       0.71      0.72      0.72       393
          13       0.86      0.77      0.81       396
          14       0.90      0.85      0.88       394
          15       0.92      0.87      0.89       398
          16       0.74      0.90      0.81       364
          17       0.99    