# Preprocessing Data

Import the necessary libraries.

In [None]:

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from keras.layers import *
import xgboost
from tqdm import tqdm
import numpy as np
import gensim
from tqdm import tqdm
from time import time
from gensim.models import KeyedVectors
import pickle
import matplotlib.pyplot as plt
import scikitplot as skplt
import numpy as np
import keras as kr

## Feature Engineering

In [None]:
from sklearn.datasets import fetch_20newsgroups

# None is passed as `categories` to both fetch calls below.
categories = None
# NOTE(review): `remove` is defined here but is never passed to fetch_20newsgroups
# in this cell -- confirm whether stripping headers/footers/quotes was intended.
remove = ('headers', 'footers', 'quotes')

# Both splits are fetched with identical options; only `subset` differs.
fetch_options = dict(categories=categories, shuffle=True, random_state=42)
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)
target_names = newsgroups_train.target_names


Simple preprocessing: remove stop words and lemmatize each remaining token.

In [None]:
from gensim.utils import simple_preprocess 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()


def preprocessing(corpus):
    """Tokenize, filter and lemmatize every document of ``corpus``.

    Each document is tokenized with ``simple_preprocess``; tokens found in
    ``stop_words`` are dropped and the remaining ones are lemmatized as a
    noun, then as a verb, then as an adjective.

    Note: this function rebinds the name ``preprocessing`` that the import
    cell also imports from ``sklearn``.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One space-joined string of lemmatized tokens per input document.
    """
    def _lemmatize(token):
        # Chain the three lemmatization passes: noun -> verb -> adjective.
        for pos_tag in ("n", "v", "a"):
            token = wordnet_lemmatizer.lemmatize(token, pos=pos_tag)
        return token

    return [
        ' '.join(
            _lemmatize(token)
            for token in simple_preprocess(doc)
            if token not in stop_words
        )
        for doc in corpus
    ]

In [None]:
# Display the first raw training document (before preprocessing).
newsgroups_train.data[0]

In [None]:
# Run the preprocessing function over every training document.
X_data = preprocessing(newsgroups_train.data)
# Display the first preprocessed document.
X_data[0]

In [None]:
# Training labels, taken from the dataset's `target` attribute.
y_data = newsgroups_train.target
# Preview the first 100 labels.
y_data[0:100]

In [None]:
# Apply the same preprocessing to the test documents and take their labels.
X_test = preprocessing(newsgroups_test.data)
y_test = newsgroups_test.target

In [None]:
# Prefix prepended to every saved artifact / model name in this notebook.
type_name = ''
# Number of output classes, derived from the loaded dataset instead of being
# hard-coded so it stays correct if `categories` is restricted.
n_class = len(target_names)

### TF-IDF Vectors

In [None]:
# Word level: cap the vocabulary at 30000 terms instead of keeping all words (100k+).
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
tfidf_vect.fit(X_data)  # learn vocabulary and idf from the training set only
X_data_tfidf = tfidf_vect.transform(X_data)
# The test set is only transformed, never fitted: treat it as unseen data.
X_test_tfidf = tfidf_vect.transform(X_test)
# Persist the fitted vectorizer; the context manager guarantees the file is closed.
with open(type_name + "tfidf.pickle", "wb") as f:
    pickle.dump(tfidf_vect, f)

In [None]:
# N-gram level (ngram_range=(1, 2)): cap the vocabulary at 30000 terms
# instead of keeping all of them (100k+).
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))
tfidf_vect_ngram.fit(X_data)  # fit on the training set only
X_data_tfidf_ngram = tfidf_vect_ngram.transform(X_data)
# The test set is only transformed, never fitted: treat it as unseen data.
X_test_tfidf_ngram = tfidf_vect_ngram.transform(X_test)
# Persist the fitted vectorizer; the context manager guarantees the file is closed.
with open(type_name + "tfidf_ngram.pickle", "wb") as f:
    pickle.dump(tfidf_vect_ngram, f)

#### Transform by SVD to decrease number of dimensions

##### Word Level

In [None]:
# Reduce the word-level TF-IDF matrix to 2000 components.
svd = TruncatedSVD(n_components=2000, random_state=42)
svd.fit(X_data_tfidf)  # fitted on the training matrix only
# Persist the fitted reducer; the context manager guarantees the file is closed.
with open(type_name + "tfidf_svd.pickle", "wb") as f:
    pickle.dump(svd, f)
X_data_tfidf_svd = svd.transform(X_data_tfidf)
X_test_tfidf_svd = svd.transform(X_test_tfidf)

##### ngram Level

In [None]:
# Reduce the n-gram TF-IDF matrix to 2000 components.
svd_ngram = TruncatedSVD(n_components=2000, random_state=42)
svd_ngram.fit(X_data_tfidf_ngram)  # fitted on the training matrix only
# Persist the fitted reducer; the context manager guarantees the file is closed.
with open(type_name + "tfidf_ngram_svd.pickle", "wb") as f:
    pickle.dump(svd_ngram, f)
X_data_tfidf_ngram_svd = svd_ngram.transform(X_data_tfidf_ngram)
X_test_tfidf_ngram_svd = svd_ngram.transform(X_test_tfidf_ngram)

### Encode y as integer class labels

In [None]:
# Import the class directly: `from sklearn import preprocessing` would rebind the
# name `preprocessing` and hide the text-cleaning function defined earlier,
# breaking any re-run of the cells that call it.
from sklearn.preprocessing import LabelEncoder

# Collects one (name, accuracy, train_time, test_time) tuple per trained model.
results = []

encoder = LabelEncoder()
y_data_n = encoder.fit_transform(y_data)
# Reuse the mapping learned from the training labels. Refitting on the test
# labels could silently produce a different label -> index mapping.
y_test_n = encoder.transform(y_test)
# Last expression of the cell so the learned classes are actually displayed.
encoder.classes_

# Model

Helper that plots a neural-network model's learning curves (accuracy and loss per epoch).

In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss, one figure each.

    Parameters
    ----------
    history
        Object whose ``history`` attribute is a mapping containing the keys
        ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``.
    """
    # (training key, validation key, figure title); the training key doubles
    # as the y-axis label.
    panels = (
        ('accuracy', 'val_accuracy', 'model accuracy'),
        ('loss', 'val_loss', 'model loss'),
    )
    for train_key, val_key, title in panels:
        for key in (train_key, val_key):
            plt.plot(history.history[key])
        plt.title(title)
        plt.ylabel(train_key)
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper left')
        plt.show()

Training function

In [None]:
def train_model(name, classifier, X_data, y_data, X_test, y_test, is_neuralnet=False, n_epochs=3):
    """Fit ``classifier``, evaluate it on the test set and plot diagnostics.

    Parameters
    ----------
    name : str
        Label used in plot titles and as the file stem when saving the fitted
        model (``name + '.h5'`` on the neural-net path, ``name + '.pickle'``
        otherwise). An empty string disables saving.
    classifier
        Model to train. On the neural-net path it must accept the Keras-style
        ``fit(..., epochs=, validation_split=, batch_size=, callbacks=)`` call
        and provide ``save``; otherwise it must provide ``fit`` and ``predict``.
    X_data, y_data
        Training features and labels.
    X_test, y_test
        Test features and labels.
    is_neuralnet : bool, default False
        Selects the neural-net training/evaluation path.
    n_epochs : int, default 3
        Maximum number of epochs (neural-net path only).

    Returns
    -------
    tuple
        ``(name, acc_score, train_time, test_time)``; times are in seconds.

    Notes
    -----
    Reads the module-level ``target_names`` for the classification report.
    """
    print('_' * 80)
    print("Training: " + name)
    X_train, y_train = X_data, y_data

    t0 = time()
    if is_neuralnet:
        # Stop once the validation loss has not improved for 10 epochs.
        es_callback = kr.callbacks.EarlyStopping(monitor='val_loss', patience=10)
        history = classifier.fit(X_train, y_train, epochs=n_epochs, validation_split=0.2,
                                 batch_size=512, callbacks=[es_callback], verbose=0)
        train_time = time() - t0
        plot_history(history)

        t0 = time()
        test_probas = classifier.predict(X_test)
        test_time = time() - t0
        # The predicted class is the index of the highest score per sample.
        test_predictions = test_probas.argmax(axis=-1)
        if name != '':
            classifier.save(name + '.h5')
    else:
        classifier.fit(X_train, y_train)
        train_time = time() - t0

        t0 = time()
        test_predictions = classifier.predict(X_test)
        test_time = time() - t0
        if name != '':
            # Context manager closes the file even if pickling fails.
            with open(name + '.pickle', 'wb') as f:
                pickle.dump(classifier, f)
        # Not every estimator exposes class probabilities (e.g. svm.SVC built
        # without probability=True); calling predict_proba unconditionally
        # would abort the whole run after training has already finished.
        if hasattr(classifier, 'predict_proba'):
            test_probas = classifier.predict_proba(X_test)
        else:
            test_probas = None

    if test_probas is not None:
        skplt.metrics.plot_roc(y_test, test_probas, figsize=(10, 8), title=name)
    else:
        print("ROC curve skipped: classifier does not provide predict_proba")
    if not is_neuralnet:
        skplt.estimators.plot_learning_curve(classifier, X_train, y_train,
                                             title='Learning Curve (' + name + ')')
    skplt.metrics.plot_confusion_matrix(y_test, test_predictions, figsize=(10, 9),
                                        title=name, normalize=True)

    acc_score = metrics.accuracy_score(y_test, test_predictions)

    print("train time: %0.3fs" % train_time)
    print("test time:  %0.3fs" % test_time)
    print("accuracy:   %0.3f" % acc_score)
    print("classification report:")
    print(metrics.classification_report(y_test, test_predictions,
                                        target_names=target_names))
    return name, acc_score, train_time, test_time

## Naive Bayes

In [None]:
from sklearn import naive_bayes

In [None]:
# Multinomial naive Bayes on the word-level TF-IDF features.
results.append(
    train_model(
        name=type_name + 'naive_bayes',
        classifier=naive_bayes.MultinomialNB(alpha=0.1),
        X_data=X_data_tfidf, y_data=y_data,
        X_test=X_test_tfidf, y_test=y_test,
        is_neuralnet=False,
    )
)

In [None]:
# Multinomial naive Bayes on the n-gram TF-IDF features.
results.append(
    train_model(
        name=type_name + 'naive_bayes_ngram',
        classifier=naive_bayes.MultinomialNB(alpha=0.1),
        X_data=X_data_tfidf_ngram, y_data=y_data,
        X_test=X_test_tfidf_ngram, y_test=y_test,
        is_neuralnet=False,
    )
)

### Other type Naive Bayes

In [None]:
# Bernoulli naive Bayes on the word-level TF-IDF features.
results.append(
    train_model(
        name=type_name + 'naive_bayes_bernoulli',
        classifier=naive_bayes.BernoulliNB(alpha=0.1),
        X_data=X_data_tfidf, y_data=y_data,
        X_test=X_test_tfidf, y_test=y_test,
        is_neuralnet=False,
    )
)

In [None]:
# Bernoulli naive Bayes on the n-gram TF-IDF features.
results.append(
    train_model(
        name=type_name + 'naive_bayes_bernoulli_ngram',
        classifier=naive_bayes.BernoulliNB(alpha=0.1),
        X_data=X_data_tfidf_ngram, y_data=y_data,
        X_test=X_test_tfidf_ngram, y_test=y_test,
        is_neuralnet=False,
    )
)

In [None]:
# Complement naive Bayes on the word-level TF-IDF features.
results.append(
    train_model(
        name=type_name + 'naive_bayes_complement',
        classifier=naive_bayes.ComplementNB(alpha=0.1),
        X_data=X_data_tfidf, y_data=y_data,
        X_test=X_test_tfidf, y_test=y_test,
        is_neuralnet=False,
    )
)

In [None]:
# Complement naive Bayes on the n-gram TF-IDF features.
# The run name carries the `_ngram` suffix so that its saved pickle and its
# label in the summary plot do not collide with the word-level run.
results.append(
    train_model(
        name=type_name + 'naive_bayes_complement_ngram',
        classifier=naive_bayes.ComplementNB(alpha=0.1),
        X_data=X_data_tfidf_ngram, y_data=y_data,
        X_test=X_test_tfidf_ngram, y_test=y_test,
        is_neuralnet=False,
    )
)

## Linear Classifier

In [None]:
# L2-regularised logistic regression on the word-level TF-IDF features.
results.append(
    train_model(
        name=type_name + 'linear_model',
        classifier=linear_model.LogisticRegression(penalty='l2', C=3.0),
        X_data=X_data_tfidf, y_data=y_data,
        X_test=X_test_tfidf, y_test=y_test,
        is_neuralnet=False,
    )
)


In [None]:
# L2-regularised logistic regression on the n-gram TF-IDF features.
results.append(
    train_model(
        name=type_name + 'linear_model_ngram',
        classifier=linear_model.LogisticRegression(penalty='l2', C=3.0),
        X_data=X_data_tfidf_ngram, y_data=y_data,
        X_test=X_test_tfidf_ngram, y_test=y_test,
        is_neuralnet=False,
    )
)

## SVM Model

In [None]:
# Linear-kernel SVM on the SVD-reduced word-level features.
results.append(
    train_model(
        name=type_name + 'svm',
        classifier=svm.SVC(kernel='linear', C=3.0),
        X_data=X_data_tfidf_svd, y_data=y_data,
        X_test=X_test_tfidf_svd, y_test=y_test,
        is_neuralnet=False,
    )
)

In [None]:
# Linear-kernel SVM on the SVD-reduced n-gram features.
results.append(
    train_model(
        name=type_name + 'svm_ngram',
        classifier=svm.SVC(kernel='linear', C=3.0),
        X_data=X_data_tfidf_ngram_svd, y_data=y_data,
        X_test=X_test_tfidf_ngram_svd, y_test=y_test,
        is_neuralnet=False,
    )
)

## Bagging Model

In [None]:
# Random forest on the SVD-reduced word-level features.
# random_state is fixed (same seed as the rest of the notebook) so that the
# reported scores are reproducible across runs.
results.append(
    train_model(
        name=type_name + 'bagging',
        classifier=ensemble.RandomForestClassifier(criterion='entropy', random_state=42),
        X_data=X_data_tfidf_svd, y_data=y_data,
        X_test=X_test_tfidf_svd, y_test=y_test,
        is_neuralnet=False,
    )
)

In [None]:
# Random forest on the SVD-reduced n-gram features.
# random_state is fixed (same seed as the rest of the notebook) so that the
# reported scores are reproducible across runs.
results.append(
    train_model(
        name=type_name + 'bagging_ngram',
        classifier=ensemble.RandomForestClassifier(criterion='entropy', random_state=42),
        X_data=X_data_tfidf_ngram_svd, y_data=y_data,
        X_test=X_test_tfidf_ngram_svd, y_test=y_test,
        is_neuralnet=False,
    )
)

## Boosting Model

In [None]:
# XGBoost classifier (default settings) on the SVD-reduced word-level features.
results.append(
    train_model(
        name=type_name + 'boosting',
        classifier=xgboost.XGBClassifier(),
        X_data=X_data_tfidf_svd, y_data=y_data,
        X_test=X_test_tfidf_svd, y_test=y_test,
        is_neuralnet=False,
    )
)

In [None]:
# XGBoost classifier (default settings) on the SVD-reduced n-gram features.
results.append(
    train_model(
        name=type_name + 'boosting_ngram',
        classifier=xgboost.XGBClassifier(),
        X_data=X_data_tfidf_ngram_svd, y_data=y_data,
        X_test=X_test_tfidf_ngram_svd, y_test=y_test,
        is_neuralnet=False,
    )
)

## Deep Neural Network

In [None]:
def create_dnn_model(n_class, input_dim=2000):
    """Build and compile a small fully connected softmax classifier.

    Parameters
    ----------
    n_class : int
        Number of output classes (units of the softmax layer).
    input_dim : int, default 2000
        Length of each input feature vector. The default matches the
        ``n_components=2000`` used for the SVD-reduced features this notebook
        feeds into the model; the previous hard-coded 600 did not match them.

    Returns
    -------
    Compiled Keras model (Adam optimizer, sparse categorical cross-entropy
    loss, accuracy metric).
    """
    input_layer = Input(shape=(input_dim,))
    layer = Dense(128, activation='relu')(input_layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(64, activation='relu')(layer)
    layer = Dropout(0.4)(layer)

    output_layer = Dense(n_class, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)
    # The sparse loss takes integer class labels rather than one-hot vectors.
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])

    return classifier

In [None]:
# Fully connected network on the SVD-reduced word-level features.
classifier = create_dnn_model(n_class)
results.append(
    train_model(
        name=type_name + 'dnn',
        classifier=classifier,
        X_data=X_data_tfidf_svd, y_data=y_data_n,
        X_test=X_test_tfidf_svd, y_test=y_test_n,
        is_neuralnet=True,
        n_epochs=200,
    )
)

In [None]:
# Fully connected network on the SVD-reduced n-gram features.
classifier = create_dnn_model(n_class)
results.append(
    train_model(
        name=type_name + 'dnn_ngram',
        classifier=classifier,
        X_data=X_data_tfidf_ngram_svd, y_data=y_data_n,
        X_test=X_test_tfidf_ngram_svd, y_test=y_test_n,
        is_neuralnet=True,
        n_epochs=200,
    )
)

## Recurrent Convolutional Neural Network 

In [None]:
def create_rcnn_model(n_class, input_dim=2000, n_steps=20):
    """Build and compile a bidirectional-GRU + 1D-convolution classifier.

    The flat input vector is reshaped into ``n_steps`` rows of
    ``input_dim // n_steps`` values before being fed to the recurrent layer.

    Parameters
    ----------
    n_class : int
        Number of output classes (units of the softmax layer).
    input_dim : int, default 2000
        Length of each input feature vector. The default matches the
        ``n_components=2000`` used for the SVD-reduced features this notebook
        feeds into the model; the previous hard-coded 600 did not match them.
    n_steps : int, default 20
        Number of rows the input vector is reshaped into; must divide
        ``input_dim`` exactly.

    Returns
    -------
    Compiled Keras model (Adam optimizer, sparse categorical cross-entropy
    loss, accuracy metric).

    Raises
    ------
    ValueError
        If ``input_dim`` is not a multiple of ``n_steps``.
    """
    if input_dim % n_steps != 0:
        raise ValueError(
            "input_dim (%d) must be a multiple of n_steps (%d)" % (input_dim, n_steps))

    input_layer = Input(shape=(input_dim,))

    # input_dim=600 with n_steps=20 reproduces the former fixed (20, 30) layout.
    layer = Reshape((n_steps, input_dim // n_steps))(input_layer)
    layer = Bidirectional(GRU(128, activation='relu', return_sequences=True))(layer)
    layer = Convolution1D(100, 3, activation="relu")(layer)
    layer = Flatten()(layer)
    layer = Dense(128, activation='relu')(layer)
    layer = Dropout(0.2)(layer)

    output_layer = Dense(n_class, activation='softmax')(layer)

    classifier = models.Model(input_layer, output_layer)
    classifier.summary()
    # The sparse loss takes integer class labels rather than one-hot vectors.
    classifier.compile(optimizer=optimizers.Adam(),
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])

    return classifier

In [None]:
# Recurrent-convolutional network on the SVD-reduced word-level features.
classifier = create_rcnn_model(n_class)
results.append(
    train_model(
        name=type_name + 'rcnn',
        classifier=classifier,
        X_data=X_data_tfidf_svd, y_data=y_data_n,
        X_test=X_test_tfidf_svd, y_test=y_test_n,
        is_neuralnet=True,
        n_epochs=200,
    )
)

In [None]:
# Recurrent-convolutional network on the SVD-reduced n-gram features.
classifier = create_rcnn_model(n_class)
results.append(
    train_model(
        # Prefixed with type_name like every other run, so saved artifacts
        # follow one naming scheme.
        name=type_name + 'rcnn_ngram',
        classifier=classifier,
        X_data=X_data_tfidf_ngram_svd, y_data=y_data_n,
        X_test=X_test_tfidf_ngram_svd, y_test=y_test_n,
        is_neuralnet=True,
        n_epochs=200,
    )
)

# Plot

Summarize and compare the performance of the models.

In [None]:
indices = np.arange(len(results))

# Transpose the per-model (name, accuracy, train_time, test_time) tuples into
# four parallel columns. `results` itself is left untouched so this cell can
# be re-run without corrupting the collected runs.
result_columns = [[run[i] for run in results] for i in range(4)]
clf_names, acc_score, training_time, test_time = result_columns

# Scale both timings by their maximum so they share the accuracy axis.
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Accuracy  and time (normalize)")
# Three thin horizontal bars per model, offset vertically so they sit side by side.
plt.barh(indices, acc_score, .2, label="accuracy", color='navy')
plt.barh(indices + .3, training_time, .2, label="training time", color='c')
plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25, top=.95, bottom=.05)

# Model names are written as text to the left of the bars instead of y-ticks.
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)
# Save before show(), while the figure is still the current one.
plt.savefig('acc_time.png')
plt.show()