In [1]:
# using two datasets, they are 20newsgroups and Reuters-21578
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import reuters

from __future__ import absolute_import
from __future__ import print_function
import os
import sys
import re
import math
import numpy as np
from pprint import pprint
from time import time
import random
import json
import collections
from itertools import compress

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import pairwise_distances

import tensorflow as tf

import keras
from keras.datasets import reuters as reuters2
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.models import load_model
from keras.models import Sequential
from keras.optimizers import SGD

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import gensim

Using TensorFlow backend.


# Part 1 Loading Datasets and Classify Documents with Baselines

Two baseline system is used: one is a multilayer perceptron and the other is support vector machine.
Both of them classify documents using tf-idf vectors of documents.

In [3]:
def load_dataset(dataset_name):
    docs_train = []
    docs_test = []
    y_train = []
    y_test = []
    target_names = []
    
    # using reuters dataset from keras package
    if dataset_name == 'reuters':
#         target_names = reuters.categories()
#         for doc_id in reuters.fileids():
#             file_target_list = reuters.categories(doc_id)
#             if doc_id.startswith("train"):
#                 docs_train.append(reuters.raw(doc_id))
#                 y = []
#                 for file_target in file_target_list:                    
#                     y.append(target_names.index(file_target))
#                 y_train.append(y)
#             else:
#                 docs_test.append(reuters.raw(doc_id))
#                 y = []
#                 for file_target in file_target_list:
#                     y.append(target_names.index(file_target))
#                 y_test.append(y)
        word_index = reuters2.get_word_index(path="reuters_word_index.json")
        inverse_word_dict = np.ndarray(shape=(len(word_index)+1,), dtype=object)
        for key in word_index:
            index = word_index[key]
            inverse_word_dict[index] = key

        print('Loading reuters dataset...')
        (x_train, y_train), (x_test, y_test) = reuters2.load_data(test_split=0.2)
        
        for x in x_train:
            x = [t for t in x if t < len(word_index)]
            docs_train.append(' '.join(inverse_word_dict[x]))
        print(len(docs_train), 'train docs', len(y_train))
        
        for x in x_test:
            x = [t for t in x if t < len(word_index)]
            docs_test.append(' '.join(inverse_word_dict[x]))
        print(len(docs_test), 'test docs', len(y_test))
        
        target_names = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                        '1', '2', '3', '4', '5', '6']
    elif dataset_name == '20newsgroups':
        newsgroups_train = fetch_20newsgroups(subset='train')
        docs_train = newsgroups_train.data
        y_train = newsgroups_train.target
        newsgroups_test = fetch_20newsgroups(subset='test')
        docs_test = newsgroups_test.data
        y_test = newsgroups_test.target
        target_names = newsgroups_train.target_names
    
    print(len(docs_train), "training documents are loaded.")
    print(len(docs_test), "test documents are loaded.\n")
    
    return docs_train, y_train, docs_test, y_test, np.array(target_names)

# convert documents to bag of word vectors
def doc_2_matrix(vocab_size, docs_train, y_train, docs_test, y_test):
    
    MAX_NB_WORDS = vocab_size
    MAX_SEQUENCE_LENGTH = 2000
    
    # fit the tokenizer with the corpus
    tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
    docs = []
    docs.extend(docs_train)
    docs.extend(docs_test)
    tokenizer.fit_on_texts(docs)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    
    # vectorize texts into 2D integer tensors
    # mode: "binary", "count", "tfidf", "freq" (default: "binary")
    x_train_m = tokenizer.texts_to_matrix(docs_train, mode='tfidf')
    y_train_m = to_categorical(np.asarray(y_train))
    print('Shape of x_train_m:', x_train_m.shape)
    print('Shape of y_train_m:', y_train_m.shape)
    
    x_test_m = tokenizer.texts_to_matrix(docs_test, mode='tfidf')
    y_test_m = to_categorical(np.asarray(y_test))
    print('Shape of x_test_m:', x_test_m.shape)
    print('Shape of y_test_m:', y_test_m.shape)
    
    return x_train_m, y_train_m, x_test_m, y_test_m

# convert documents to word index sequences
def doc_2_sequences(docs_train, y_train, docs_test, y_test):
    # vectorize the text samples into a 2D integer tensor
    MAX_NB_WORDS = 60000
    MAX_SEQUENCE_LENGTH = 2000
    
    tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
    docs = []
    docs.extend(docs_train)
    docs.extend(docs_test)
    tokenizer.fit_on_texts(docs)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    
    x_train_m = pad_sequences(tokenizer.texts_to_sequences(docs_train), maxlen=MAX_SEQUENCE_LENGTH)
    y_train_m = to_categorical(np.asarray(y_train))
    print('Shape of x_train_m:', x_train_m.shape)
    print('Shape of y_train_m:', y_train_m.shape)
    
    x_test_m = pad_sequences(tokenizer.texts_to_sequences(docs_test), maxlen=MAX_SEQUENCE_LENGTH)
    y_test_m = to_categorical(np.asarray(y_test))
    print('Shape of x_test_m:', x_test_m.shape)
    print('Shape of y_test_m:', y_test_m.shape)
    
    return x_train_m, y_train_m, x_test_m, y_test_m

def mlp_base_line(vocab_size, num_classes, x_train_m, y_train_m, x_test_m, y_test_m, epochs, dataset, isload):
    
    model = Sequential()
    
    if isload == False:
        print('Building a MLP baseline model...')
#         model = Sequential()
        model.add(Dense(512, input_shape=(vocab_size,)))
        model.add(Activation('relu'))
        model.add(Dropout(0.5))
        model.add(Dense(num_classes))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    else:
        print('loading a pretrained MLP baseline model...')
        model = load_model('Model/' + dataset + '_cnn_300.h5')
        
    batch_size = 128
    history = model.fit(x_train_m, y_train_m,
                        batch_size = batch_size,
                        epochs = epochs,
                        verbose = 1,
                        validation_split = 0.1)
    model.save('Model/' + dataset + '_cnn_300.h5')

    result = model.evaluate(x_test_m, y_test_m, batch_size = batch_size, verbose = 1)
    print('\nTest score:', score[0], 'Test accuracy:', score[1])
    
def svm_test(X, y, C1, X_test, y_test, C2):
    for c_value in C1:
        clf = svm.SVC(C=c_value, gamma=1/len(X[0])) 
        clf.fit(X, y)
        y_pred = clf.predict(X_test)

        counter = 0;
        for idx, pred in enumerate(y_pred):
            if y_test[idx] != pred:
                counter = counter + 1
        print("RBF: Correct rate =", 1 - counter/len(y_test), " When C =", c_value)
        
    for c_value in C2:
        lin_clf = svm.LinearSVC(C=c_value)
        lin_clf.fit(X, y)
        y_pred = lin_clf.predict(X_test)

        counter = 0;
        for idx, pred in enumerate(y_pred):
            if y_test[idx] != pred:
                counter = counter + 1
        print("Linear: Correct rate =", 1 - counter/len(y_test), " When C =", c_value)

In [19]:
[reuters_docs_train, reuters_docs_test, reuters_y_train, reuters_y_test,
 reuters_target_names] = load_dataset('reuters')
[news_docs_train, news_docs_test, news_y_train, news_y_test,
 news_target_names] = load_dataset('20newsgroups')

Loading reuters dataset...
8982 train docs 8982
2246 test docs 2246
8982 training documents are loaded.
2246 testing documents are loaded.

11314 training documents are loaded.
7532 testing documents are loaded.



#### Using MLP baseline to classifiy documents

In [5]:
[docs_train, y_train, docs_test, y_test, target_names] = load_dataset('reuters') # 20newsgroups
vocab_size = 30000
x_train_m, y_train_m, x_test_m, y_test_m = doc_2_matrix(vocab_size, docs_train, y_train, docs_test, y_test)
print(len(target_names), 'targets\n')
mlp_base_line(x_train_m.shape[1], len(target_names), x_train_m, y_train_m, x_test_m, y_test_m, 100, 'reuters', True)

Loading reuters dataset...
8982 train docs 8982
2246 test docs 2246
8982 training documents are loaded.
2246 testing documents are loaded.

Found 30976 unique tokens.
Shape of x_train_m: (8982, 30000)
Shape of y_train_m: (8982, 46)
Shape of x_test_m: (2246, 30000)
Shape of y_test_m: (2246, 46)
46 targets

loading a pretrained MLP baseline model...
Train on 8083 samples, validate on 899 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
E

#### Using SVM to classify documents

In [8]:
[docs_train, y_train, docs_test, y_test, target_names] = load_dataset('reuters') # 20newsgroups reuters
vocab_size = 30000
x_train_m, y_train_m, x_test_m, y_test_m = doc_2_matrix(vocab_size, docs_train, y_train, docs_test, y_test)
C1 = [] # RBF [8, 16, 32, 64, 128, 256]
C2 = [0.00005, 0.0001, 0.0005, 0.001, 0.01, 0.05, 0.5, 1.0] # Linear
print(len(target_names), 'targets\n')
svm_test(x_train_m, y_train, C1, x_test_m, y_test, C2)

Loading reuters dataset...
8982 train docs 8982
2246 test docs 2246
8982 training documents are loaded.
2246 test documents are loaded.

Found 30976 unique tokens.
Shape of x_train_m: (8982, 30000)
Shape of y_train_m: (8982, 46)
Shape of x_test_m: (2246, 30000)
Shape of y_test_m: (2246, 46)
46 targets

Linear:Correct rate = 0.8063223508459484 When C = 5e-05
Linear:Correct rate = 0.8227960819234195 When C = 0.0001
Linear:Correct rate = 0.8268032056990204 When C = 0.0005
Linear:Correct rate = 0.825912733748887 When C = 0.001
Linear:Correct rate = 0.8147818343722173 When C = 0.01
Linear:Correct rate = 0.807212822796082 When C = 0.05
Linear:Correct rate = 0.7960819234194123 When C = 0.5
Linear:Correct rate = 0.7853962600178095 When C = 1.0


In [9]:
[docs_train, y_train, docs_test, y_test, target_names] = load_dataset('20newsgroups') # 20newsgroups reuters
vocab_size = 30000
x_train_m, y_train_m, x_test_m, y_test_m = doc_2_matrix(vocab_size, docs_train, y_train, docs_test, y_test)
C1 = [] # RBF [8, 16, 32, 64, 128, 256]
C2 = [0.00005, 0.0001, 0.0005, 0.001, 0.01, 0.05, 0.5, 1.0] # Linear
print(len(target_names), 'targets\n')
svm_test(x_train_m, y_train, C1, x_test_m, y_test, C2)

11314 training documents are loaded.
7532 test documents are loaded.

Found 179209 unique tokens.
Shape of x_train_m: (11314, 30000)
Shape of y_train_m: (11314, 20)
Shape of x_test_m: (7532, 30000)
Shape of y_test_m: (7532, 20)
20 targets

Linear:Correct rate = 0.8572756240042485 When C = 5e-05
Linear:Correct rate = 0.8596654275092936 When C = 0.0001
Linear:Correct rate = 0.8570100902814657 When C = 0.0005
Linear:Correct rate = 0.8511683483802444 When C = 0.001
Linear:Correct rate = 0.8370950610727562 When C = 0.01
Linear:Correct rate = 0.8304567180031864 When C = 0.05
Linear:Correct rate = 0.8171800318640468 When C = 0.5
Linear:Correct rate = 0.8097450876261285 When C = 1.0


# Part 2 Convolutional Neural Network With Word Embeddings

In [10]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'Dataset')
vocab_size = 60000
MAX_SEQUENCE_LENGTH = 2000
EMBEDDING_DIM = 300

print('Loading the Glove word embedding')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'), "rb") # glove.840B.300d   glove.6B.100d
for line in f:
    values = line.split()
    word = values[0].decode('UTF-8')
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Loading the Glove word embedding
Found 2196016 word vectors.


In [14]:
def build_CNN_model(dataset): # 20newsgroups reuters
    [docs_train, y_train, docs_test, y_test, target_names] = load_dataset('reuters') # 20newsgroups reuters
    MAX_NB_WORDS = 60000
    
    tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
    docs = []
    docs.extend(docs_train)
    docs.extend(docs_test)
    tokenizer.fit_on_texts(docs)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    # Prepare embedding matrix
    print('Preparing embedding matrix')
    num_words = min(vocab_size, len(word_index))
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # load pre-trained word embeddings into an Embedding layer
    # set trainable = False to keep the embeddings fixed
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    # train a 1D convnet with global maxpooling
    print('Building CNN model')
    drop_rate = 0.6
    model = Sequential()
    model.add(embedding_layer)
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    # rate: float between 0 and 1. Fraction of the input units to drop.
    model.add(Dropout(drop_rate))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Dropout(drop_rate))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    #model.add(Dense(128, activation='relu'))
    model.add(Dense(len(target_names), activation='softmax'))
    # Choose the optimizer
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop', # sgd rmsprop
                  metrics=['acc'])

    model.summary()
    return model
    
model_20newsgroups = build_CNN_model('20newsgroups') # 20newsgroups reuters
model_20newsgroups.save('Model/news_cnn_300_1.h5')
model_reuters = build_CNN_model('reuters') # 20newsgroups reuters
model_reuters.save('Model/reuter_cnn_300_1.h5')

Loading reuters dataset...
8982 train docs 8982
2246 test docs 2246
8982 training documents are loaded.
2246 test documents are loaded.

Found 30976 unique tokens.
Preparing embedding matrix
Building CNN model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 300)         9000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1996, 128)         192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 399, 128)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 399, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 395, 128)          82048     
________________________________________________________________

### 2.1 CNN Model Trained With 20 Newsgroup Dataset and 300 Dim Word Embedding

In [3]:
[docs_train, y_train, docs_test, y_test, target_names] = load_dataset('20newsgroups') # 20newsgroups reuters
x_train_m, y_train_m, x_test_m, y_test_m = doc_2_sequences(docs_train, y_train, docs_test, y_test)
print(len(target_names), 'targets\n')

model = load_model('Model/news_cnn_300_1.h5') #news_cnn_300_1
model.fit(x_train_m, y_train_m,
          batch_size=128,
          epochs=60,
          validation_data = (x_test_m, y_test_m))
model.save('Model/news_cnn_300_2.h5')

# test the trained model
score = model.evaluate(x_test_m, y_test_m, batch_size = 128, verbose = 1)
print('\nTest score:', score[0], 'Test accuracy:', score[1])

11314 training documents are loaded.
7532 testing documents are loaded.

Found 179209 unique tokens.
Shape of x_train_m: (11314, 2000)
Shape of y_train_m: (11314, 20)
Shape of x_test_m: (7532, 2000)
Shape of y_test_m: (7532, 20)
20 targets

Train on 10182 samples, validate on 1132 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60

### 2.2 CNN Model Trained With Reuter Dataset and 300 Dim Word Embedding

In [None]:
[docs_train, y_train, docs_test, y_test, target_names] = load_dataset('reuters') # 20newsgroups reuters
x_train_m, y_train_m, x_test_m, y_test_m = doc_2_sequences(docs_train, y_train, docs_test, y_test)
print(len(target_names), 'targets\n')

# round 1 - epochs: 60, accuracy: 
# round 2 - epochs:
model = load_model('Model/reuter_cnn_300_1.h5') #reuter_cnn_300_1
model.fit(x_train_m, y_train_m,
          batch_size=128,
          epochs=60,
          validation_data = (x_test_m, y_test_m))
model.save('Model/reuter_cnn_300_2.h5')

# test the trained model
score = model.evaluate(x_test_m, y_test_m, batch_size = 128, verbose = 1)
print('\nTest score:', score[0], 'Test accuracy:', score[1])

Loading reuters dataset...
8982 train docs 8982
2246 test docs 2246
8982 training documents are loaded.
2246 test documents are loaded.

Found 30976 unique tokens.
Shape of x_train_m: (8982, 2000)
Shape of y_train_m: (8982, 46)
Shape of x_test_m: (2246, 2000)
Shape of y_test_m: (2246, 46)
46 targets

Train on 8982 samples, validate on 2246 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60

# Part 3 Doc2Vector
Build the unsupervised model PV_DM

# Load the Reuters and 20newsgroup Datasets

In [3]:
[news_docs_train, news_docs_test, news_y_train, news_y_test, news_target_names] = load_dataset('20newsgroups') # reuters or 20newsgroups
[reuters_docs_train, reuters_docs_test, reuters_y_train, reuters_y_test, reuters_target_names] = load_dataset('reuters') # reuters or 20newsgroups

11314 training documents are loaded.
7532 testing documents are loaded.

7769 training documents are loaded.
3019 testing documents are loaded.



In [4]:
input_file = open('alldata.txt', 'w')

id_ = 0
for doc in news_docs_train:
    doc_id = 'news_train_%i' % id_
    id_ = id_ + 1
    tokens = nltk.word_tokenize(doc)
    doc_tokens = ' '.join(tokens).lower()
    doc_tokens = doc_tokens.encode('ascii', 'ignore')
    input_file.write('%s %s\n' % (doc_id, doc_tokens))
print(id_, "news group doc for training")
    
id_ = 0
for doc in news_docs_test:
    doc_id = 'news_test_%i' % id_
    id_ = id_ + 1
    tokens = nltk.word_tokenize(doc)
    doc_tokens = ' '.join(tokens).lower()
    doc_tokens = doc_tokens.encode('ascii', 'ignore')
    input_file.write('%s %s\n' % (doc_id, doc_tokens))
print(id_, "news group doc for test")
    
id_ = 0
for doc in reuters_docs_train:
    doc_id = 'reuters_train_%i' % id_
    id_ = id_ + 1
    tokens = nltk.word_tokenize(doc)
    doc_tokens = ' '.join(tokens).lower()
    doc_tokens = doc_tokens.encode('ascii', 'ignore')
    input_file.write('%s %s\n' % (doc_id, doc_tokens))
print(id_, "reuters doc for training")
    
id_ = 0
for doc in reuters_docs_test:
    doc_id = 'reuters_test_%i' % id_
    id_ = id_ + 1
    tokens = nltk.word_tokenize(doc)
    doc_tokens = ' '.join(tokens).lower()
    doc_tokens = doc_tokens.encode('ascii', 'ignore')
    input_file.write('%s %s\n' % (doc_id, doc_tokens))
print(id_, "reuters doc for test")
    
input_file.close()

11314 news group doc for training
7532 news group doc for test
7769 reuters doc for training
3019 reuters doc for test


In [11]:
docList = []

for idx in range(len(news_docs_train)):
    docList.append('news_train_' + str(idx))

for idx in range(len(news_docs_test)):
    docList.append('news_test_' + str(idx))

for idx in range(len(reuters_docs_train)):
    docList.append('reuters_train_' + str(idx))
    
for idx in range(len(reuters_docs_test)):
    docList.append('reuters_test_' + str(idx))

print(len(docList))
docList = set(docList)
print(len(docList))

29634
29634


In [36]:
def load_pretrain_doc_voc():
    print('Loading pretrained documents Vectors')
    doc_vector = {}
    f = open(os.path.join('Model', 'vectors300.txt'), "r") # vectors100.txt vectors300.txt
    for line in f:
        values = line.split()
        word = values[0]
        if word in docList:
            coefs = np.array([float(x) for x in values[1].split(',')])
            doc_vector[word] = coefs
    f.close()
    return doc_vector

doc_vector = load_pretrain_doc_voc()
print('%s document vectors are loaded.' % len(doc_vector))

news_x_train = []
for idx in range(len(news_docs_train)):
    news_x_train.append(doc_vector['news_train_' + str(idx)])
print(len(news_x_train), 'news training examples')

news_x_test = []
for idx in range(len(news_docs_test)):
    news_x_test.append(doc_vector['news_test_' + str(idx)])
print(len(news_x_test), 'news training examples')

print(len(news_x_test[0]))

Loading pretrained documents Vectors
29634 document vectors are loaded.
11314 news training examples
7532 news training examples
300


In [37]:
print("Starting test pretrained document vector for document classification:")
X = news_x_train
y = news_y_train
C1 = [8, 16, 32, 64, 128, 256]
X_test = news_x_test
y_test = news_y_test
C2 = [0.01, 0.05, 0.5, 1.0, 2.0, 4.0]
svm_test(X, y, C1, X_test, y_test, C2)

Starting test pretrained document vector for document classification:
RBF:Correct rate = 0.7006107275624005 When C = 8
RBF:Correct rate = 0.7282262347318109 When C = 16
RBF:Correct rate = 0.7338024429102497 When C = 32
RBF:Correct rate = 0.7430961232076474 When C = 64
RBF:Correct rate = 0.7458842272968667 When C = 128
RBF:Correct rate = 0.74429102496017 When C = 256
Linear:Correct rate = 0.7138874137015401 When C = 0.01
Linear:Correct rate = 0.7400424853956453 When C = 0.05
Linear:Correct rate = 0.7497344662772172 When C = 0.5
Linear:Correct rate = 0.7482740308019118 When C = 1.0
Linear:Correct rate = 0.7458842272968667 When C = 2.0
Linear:Correct rate = 0.739644184811471 When C = 4.0


## Part 3.2 Training Doc2Vec using gensim


In [4]:
import gensim

# Define a Function to Preprocess Text
def create_training_data_set():
    
    train_corpus = []
    for idx in range(len(news_docs_train)):
        # For training data, add tags
        train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(news_docs_train[idx]),
                                                                 ['news_train_' + str(idx)]))
    
    for idx in range(len(news_docs_test)):
        train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(news_docs_test[idx]),
                                                                 ['news_test_' + str(idx)]))
    
    for idx in range(len(reuters_docs_train)):
        train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(reuters_docs_train[idx]),
                                                                 ['reuters_train_' + str(idx)]))
    
    for idx in range(len(reuters_docs_test)):
        train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(reuters_docs_test[idx]),
                                                                 ['reuters_test_' + str(idx)]))
    return train_corpus

# load training files and test files
train_corpus = create_training_data_set()

# show first two training data
print(train_corpus[0])
print(news_docs_train[0])



[TaggedDocument(words=['from', 'lerxst', 'wam', 'umd', 'edu', 'where', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neig

In [21]:
# Define the Doc2Vec model
# with a vector size with 50 words and iterating over the training corpus 55 times
# set the minimum word count to 2 in order to give higher frequency words more weighting
# Model accuracy can be improved by increasing the number of iterations
model = gensim.models.doc2vec.Doc2Vec(dm = 1, dm_concat = 1, size=300, window=10, min_count=2, negative=10, iter=100, workers = 4)
model.build_vocab(train_corpus)
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)
model.save("Model/gensim300")

Wall time: 3h 31min 58s


In [22]:
# Test document vector generated by gensim for document classification
# model = Doc2Vec.load(fname)
doc_vector = model.docvecs

news_x_train = []
for idx in range(len(news_docs_train)):
    news_x_train.append(doc_vector['news_train_' + str(idx)])
print(len(news_x_train), 'news training examples')

news_x_test = []
for idx in range(len(news_docs_test)):
    news_x_test.append(doc_vector['news_test_' + str(idx)])
print(len(news_x_test), 'news training examples')

print("Starting test document vector generated by gensim for document classification:")
X = news_x_train
y = news_y_train
X_test = news_x_test
y_test = news_y_test
svm_test(X, y, X_test, y_test)

11314 news training examples
7532 news training examples
Starting test document vector generated by gensim for document classification:
RBF:Correct rate = 0.3332448220924057 When C = 0.5
RBF:Correct rate = 0.35661178969729157 When C = 1.0
RBF:Correct rate = 0.37121614445034523 When C = 2.0
RBF:Correct rate = 0.38528943175783326 When C = 4.0
RBF:Correct rate = 0.4013542219861922 When C = 8.0
RBF:Correct rate = 0.40693043016463093 When C = 16
RBF:Correct rate = 0.41768454593733406 When C = 32
RBF:Correct rate = 0.42644715878916617 When C = 64
RBF:Correct rate = 0.435740839086564 When C = 128
RBF:Correct rate = 0.4403876792352629 When C = 256
Linear:Correct rate = 0.5130111524163569 When C = 0.5
Linear:Correct rate = 0.5289431757833245 When C = 1.0
Linear:Correct rate = 0.5426181625066384 When C = 2.0
Linear:Correct rate = 0.5524429102496017 When C = 4.0
Linear:Correct rate = 0.5584174190122145 When C = 8.0
Linear:Correct rate = 0.5606744556558683 When C = 16
Linear:Correct rate = 0.55390