In [1]:
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import tensorflow as tf
from collections import Counter
%matplotlib inline

## Data Preprocessing

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load the predefined train and test splits with identical options:
# shuffled, with a fixed random_state so the document order is repeatable.
fetch_options = dict(shuffle=True, random_state=11)
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)

In [3]:
# Report how many documents each split contains
train_doc_count = len(newsgroups_train.data)
test_doc_count = len(newsgroups_test.data)
print('Training text number:', train_doc_count)
print('Testing text number:', test_doc_count)

Training text number: 11314
Testing text number: 7532


## News to IDs

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
class readNews:
    '''
    Read the 20 Newsgroups splits and transform them into vectors for training.

    Constructing an instance cleans the text of both splits in place: the
    ``data`` lists of the objects passed in are replaced by cleaned text.

    Args:
        train_data: object exposing ``data`` (list of raw document strings)
            and ``target`` (labels) for the training split.
        test_data: object with the same attributes for the test split.
    '''
    def __init__(self, train_data, test_data):
        self._train_data = train_data
        self._test_data = test_data
        self._preprocess()

    def _preProcessor(self, s):
        '''Return ``s`` lower-cased with each run of non-letters collapsed to one space.

        Punctuation, digits, line ends and non-ASCII characters all fall
        outside ``[a-zA-Z]``, so one substitution covers every case.
        Trailing whitespace is stripped; a leading space, if any, is kept.
        '''
        s = re.sub('[^a-zA-Z]+', ' ', s)
        return s.lower().rstrip()

    def _preprocess(self):
        '''Clean every document of both splits, overwriting the stored text.'''
        train_news = self._train_data.data
        test_news = self._test_data.data
        self._train_data.data = [self._preProcessor(item) for item in train_news]
        self._test_data.data = [self._preProcessor(item) for item in test_news]

    def _tfidf_vectorizer(self):
        '''Fit TF-IDF on the training split and apply it to both splits.

        Returns:
            (train matrix, test matrix, dict mapping each kept word to its
            column index in those matrices).
        '''
        tfidfVectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=5000)
        X_train_tfidf = tfidfVectorizer.fit_transform(self._train_data.data)
        X_test_tfidf = tfidfVectorizer.transform(self._test_data.data)
        vocab_index_dict = tfidfVectorizer.vocabulary_
        return X_train_tfidf, X_test_tfidf, vocab_index_dict

    @staticmethod
    def _word_weights(news_words, tfidf_matrix, vocab_index_dict):
        '''Return, per document, the TF-IDF weight of each word occurrence.

        Words without a column in ``vocab_index_dict`` get weight 0. The
        lookup is explicit so that genuine indexing errors are not hidden.
        '''
        all_weights = []
        for row, words in enumerate(news_words):
            doc_weights = []
            for word in words:
                col = vocab_index_dict.get(word)
                doc_weights.append(0 if col is None else tfidf_matrix[row, col])
            all_weights.append(doc_weights)
        return all_weights

    def tfidf_weight(self):
        '''Calculate the TF-IDF weight of each word within each news item.

        Returns:
            (train_weights, test_weights): lists parallel to the word lists
            of each split, one weight per word occurrence.
        '''
        train_news_words, test_news_words = self._news2words()
        X_train_tfidf, X_test_tfidf, vocab_index_dict = self._tfidf_vectorizer()
        train_weights = self._word_weights(train_news_words, X_train_tfidf, vocab_index_dict)
        test_weights = self._word_weights(test_news_words, X_test_tfidf, vocab_index_dict)
        return train_weights, test_weights

    def _news2words(self):
        '''Split every cleaned document of both splits into a list of words.'''
        train_news_words = [news.split() for news in self._train_data.data]
        test_news_words = [news.split() for news in self._test_data.data]
        return train_news_words, test_news_words

    def buildVocab(self):
        '''Build the vocabulary from the training split.

        Only words occurring more than 3 times are kept; 'UNK' is appended
        as the placeholder for everything else.

        Returns:
            (vocab list, word -> ID dict, ID -> word dict).
        '''
        words = []
        for news in self._train_data.data:
            # Collect every word occurrence
            words.extend(news.split())
        # Count how often each word occurs
        word_freq = Counter(words)
        # Drop the low-frequency words
        vocab = [u for u, v in word_freq.items() if v > 3]
        if 'UNK' not in vocab:
            vocab.append('UNK')
        # Map each word to an ID (its position in vocab) and back
        word_id_map = dict(zip(vocab, range(len(vocab))))
        id_word_map = {idx: word for word, idx in word_id_map.items()}
        return vocab, word_id_map, id_word_map

    def news2vecs(self):
        '''Convert every document into a list of word IDs.

        Words outside the vocabulary are mapped to the ID of 'UNK'.

        Returns:
            (train ID lists, train labels, test ID lists, test labels).
        '''
        train_news_words, test_news_words = self._news2words()
        _, word_id_map, _ = self.buildVocab()
        unk_id = word_id_map['UNK']

        def words_vecs(words):
            # Less frequent (or unseen) words fall back to UNK
            return [word_id_map.get(w, unk_id) for w in words]

        train_news_vecs = [words_vecs(words) for words in train_news_words]
        train_news_labels = self._train_data.target
        test_news_vecs = [words_vecs(words) for words in test_news_words]
        test_news_labels = self._test_data.target
        return train_news_vecs, train_news_labels, test_news_vecs, test_news_labels

In [5]:
# Build the reader; its constructor cleans the text of both splits in place.
rn = readNews(newsgroups_train, newsgroups_test)
# Convert every document into a list of vocabulary IDs and fetch the labels.
train_news_vecs, train_news_labels, test_news_vecs, test_news_labels = rn.news2vecs()

In [6]:
# Per-document lists holding one TF-IDF weight per word occurrence
# (0 for words that are not in the TF-IDF vocabulary)
train_weights, test_weights = rn.tfidf_weight()

In [7]:
# Vocabulary list plus the word->ID and ID->word lookup tables,
# all built from the training split
vocab, word_id_map, id_word_map = rn.buildVocab()

In [8]:
def word2id(c):
    '''Return the vocabulary ID of word ``c``; unknown words map to the ID of 'UNK'.'''
    ID = word_id_map.get(c)
    if ID is None:
        # Turn less frequent (out-of-vocabulary) words into UNK
        ID = word_id_map['UNK']
    return ID
def id2word(c):
    '''Return the word stored for vocabulary ID ``c``, or 'UNK' when the ID is unknown.'''
    return id_word_map.get(c, 'UNK')

In [9]:
# Number of word IDs in every training document
train_news_length = list(map(len, train_news_vecs))
print('Min Length', np.min(train_news_length))
print('Max Length', np.max(train_news_length))
print('Median Length', np.median(train_news_length))

Min Length 17
Max Length 15804
Median Length 184.0


In [10]:
# Training document lengths at the 0th, 25th, 50th, 75th, 90th and 95th percentiles
np.percentile(train_news_length, [0, 25, 50, 75, 90, 95])

array([  17.,  116.,  184.,  301.,  509.,  769.])

Document lengths vary widely, so we may need buckets that group news items of similar length together.

## Create Batch Data Generator

In [95]:
import random
class generateSamples:
    '''Generate fixed-width batches of samples for training and testing.

    Args:
        news_vecs: list of documents, each a list of word IDs.
        news_labels: sequence of labels parallel to ``news_vecs``.
        weights: list of per-word weight lists parallel to ``news_vecs``.
        max_len: width of every batch; longer documents are truncated and
            shorter ones padded.
        pad_id: word ID used for padding. When None (the default) the
            notebook-level ``word2id('UNK')`` is looked up at batch time.
    '''

    def __init__(self, news_vecs, news_labels, weights, max_len=800, pad_id=None):
        '''Store the news vectors, labels, weights and padding settings.'''
        self.index = 0  # position of the next sequential (test) sample
        self.news_vecs = news_vecs
        self.news_labels = news_labels
        self.weights = weights
        self.news_count = len(news_vecs)
        self.max_news_len = max_len
        self.pad_id = pad_id

    def _select(self, batch_size, is_training):
        '''Pick the raw samples, labels and weights that make up one batch.'''
        if is_training:
            # For training, select random samples without repetition
            chosen = np.random.choice(len(self.news_vecs), batch_size, replace=False)
            samples = [self.news_vecs[k] for k in chosen]
            labels = [self.news_labels[k] for k in chosen]
            weights = [self.weights[k] for k in chosen]
            return samples, labels, weights

        # For testing, walk through the samples in order
        start = self.index % self.news_count
        end = (start + batch_size) % self.news_count
        if end > start:
            self.index = end
            return (self.news_vecs[start:end],
                    self.news_labels[start:end],
                    self.weights[start:end])

        # The batch would run past the last sample: return what is left
        # (possibly fewer than batch_size items) and rewind.
        print('Test Samples come to an end!')
        self.index = 0
        return (self.news_vecs[start:],
                self.news_labels[start:],
                self.weights[start:])

    def generate_batch(self, batch_size=64, is_training=True):
        '''Generate one batch.

        Args:
            batch_size: number of documents requested.
            is_training: True draws random documents; False returns the
                next documents in order and rewinds after the last one.

        Returns:
            x: int32 array (n, max_len) of word IDs, padded with the pad ID.
            y: float array (n,) of labels.
            lengths: array (n,) of the original, untruncated lengths.
            w: float32 array (n, max_len) of word weights, padded with 0.
            ``n`` equals ``batch_size`` except for the final sequential
            batch, which may hold fewer documents.
        '''
        selected_samples, selected_labels, batch_weights = self._select(batch_size, is_training)

        # Record the length of each text before truncation
        lengths = np.array([len(item) for item in selected_samples])
        max_len = self.max_news_len
        pad_id = word2id('UNK') if self.pad_id is None else self.pad_id

        # Size the arrays by the samples actually selected so that a short
        # final batch does not index past the end of the selection.
        rows = len(selected_samples)
        x = np.full((rows, max_len), pad_id, np.int32)
        w = np.full((rows, max_len), 0, np.float32)
        y = np.zeros(rows)
        for i in range(rows):
            # Keep the first max_len elements; shorter news stays padded
            ids = selected_samples[i][:max_len]
            x[i, :len(ids)] = ids
            w[i, :len(ids)] = batch_weights[i][:max_len]
            y[i] = selected_labels[i]
        return x, y, lengths, w
    

The data exploration shows that document length varies widely, from 17 to 15,804 words in the training set. To cope with this, we could group documents into length buckets, as seq2seq models do.

In [20]:
class trainConfig:
    '''Hyper-parameters handed to the training model.'''
    # Fixed settings
    batch_size = 64    # documents per training step
    embed_size = 128   # width of each word embedding
    label_size = 20    # number of target classes
    layer_size = 2     # NOTE(review): not read by CNN_Model as shown - confirm it is still needed
    # Settings derived from the prepared data
    vocab_size = len(vocab)
    max_doc_len = max(len(doc) for doc in train_news_vecs)

In [21]:
class testConfig(trainConfig):
    '''Hyper-parameters for batched evaluation.

    Evaluation uses exactly the training settings, so they are inherited
    from ``trainConfig`` rather than repeated and left free to drift apart.
    '''
    
class singleConfig(trainConfig):
    '''Hyper-parameters for evaluating one document at a time.

    Identical to ``trainConfig`` except for the batch size, so only that
    value is overridden.
    '''
    batch_size = 1

In [22]:
# Number of whole batches that fit into each split
train_chunk_num = len(train_news_vecs) // trainConfig.batch_size
test_chunk_num = len(test_news_vecs) // trainConfig.batch_size
# Test documents left over once the whole batches are taken out
remain_num = len(test_news_labels) - test_chunk_num * trainConfig.batch_size
remain_num

44

In [23]:
# Batch generator over the training split (default max_len of 800 word IDs per document)
train_samples = generateSamples(train_news_vecs, train_news_labels, train_weights)

In [24]:
# Batch generator over the test split (default max_len of 800 word IDs per document)
test_samples = generateSamples(test_news_vecs, test_news_labels, test_weights)

In [25]:
# Sanity check: draw one random training batch (defaults: batch_size=64, is_training=True)
x, y, lengths, w = train_samples.generate_batch()

In [73]:
# Sanity check on the test generator. NOTE: with the default is_training=True this
# draws a random batch; pass is_training=False to get sequential test batches.
x, y, lengths, w = test_samples.generate_batch()

## CNN Model

In this model, we first transform each news item into a sequence of word vectors. We then pass that sequence through two convolution + max-pooling layers and a fully connected layer to obtain a fixed-size document vector. Finally, we classify the news item based on that vector.

In [28]:
import functools
from tensorflow.contrib.layers.python.layers import encoders
def lazy_property(function):
    '''Turn a method into a read-only property that is computed once per instance.

    The first access calls ``function`` and stores the result on the
    instance under ``'_cache_' + function.__name__``; later accesses return
    the stored value without calling ``function`` again.
    '''
    cache_name = '_cache_' + function.__name__

    @functools.wraps(function)
    def cached_getter(self):
        try:
            return getattr(self, cache_name)
        except AttributeError:
            # First access on this instance: compute and remember the value
            value = function(self)
            setattr(self, cache_name, value)
            return value

    return property(cached_getter)

In [91]:
#Reference:http://blog.csdn.net/u010223750/article/details/71079036
from tensorflow.contrib import rnn
class CNN_Model:
    '''Convolutional text classifier built as a TensorFlow 1.x graph.

    Word IDs are embedded, passed through two convolution + max-pooling
    stages and one fully connected layer, then projected to ``label_size``
    logits. The graph pieces are exposed as lazily built, cached properties.

    The fully connected layer is sized for 197 pooled positions, which is
    what the two VALID convolution/pooling stages produce for inputs that
    are 800 word IDs wide; other input widths will not fit the reshape.

    Args:
        config: object providing ``vocab_size``, ``embed_size``,
            ``label_size``, ``batch_size`` and ``max_doc_len``.
        x: integer tensor of word IDs, one row per document.
        y: integer tensor of class labels, one per document.
        lengths: tensor of document lengths (stored, not used by the graph).
        is_training: when True, dropout is applied to the embedded inputs
            and the optimizer op is built.
    '''
    def __init__(self, config, x, y, lengths, is_training=True):
        self.x = x
        self.y = y
        self.vocab_size = config.vocab_size
        self.embed_size = config.embed_size
        self.label_size = config.label_size
        self.batch_size = config.batch_size
        self.lengths = lengths
        self.max_doc_len = config.max_doc_len
        self.is_training = is_training
        # Step size used by `optimize` and reported by `learningRate`
        self._learning_rate = 0.0005
        # Touch the properties now so the variables are created while the
        # caller's variable scope is still active.
        self.predict
        if is_training:
            self.optimize
        print('Model Initialized!')

    @lazy_property
    def cost(self):
        '''Softmax cross-entropy between the logits and the one-hot labels.'''
        logits = self.inference
        targets = tf.one_hot(self.y, self.label_size, 1, 0)
        targets = tf.cast(targets, tf.float32)
        loss = tf.losses.softmax_cross_entropy(targets, logits)
        return loss

    @lazy_property
    def predict(self):
        '''Index of the highest logit for every document.'''
        logits = self.inference
        predictions = tf.argmax(logits, 1)
        return predictions

    @lazy_property
    def _correct_prediction(self):
        '''Boolean tensor marking the documents whose prediction equals the label.'''
        targets = tf.reshape(self.y, [-1])
        # tf.argmax yields int64, so the labels are cast to match
        targets = tf.cast(targets, tf.int64)
        return tf.equal(self.predict, targets)

    @lazy_property
    def correct_num(self):
        '''Number of correctly classified documents in the batch (float scalar).'''
        return tf.reduce_sum(tf.cast(self._correct_prediction, "float"))

    @lazy_property
    def accuracy(self):
        '''Fraction of correctly classified documents in the batch.'''
        return tf.reduce_mean(tf.cast(self._correct_prediction, "float"))

    @lazy_property
    def optimize(self):
        '''Adam training op that minimises ``cost``.'''
        with tf.variable_scope('optimizer'):
            cost = self.cost
            train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(cost)
        return train_op

    @lazy_property
    def inference(self):
        '''Build the forward pass and return the logits.'''
        # Create the embedding matrix and look up the word vectors
        with tf.device("/cpu:0"):
            embeddings = tf.get_variable('embedding', [self.vocab_size,  self.embed_size])
            inputs = tf.nn.embedding_lookup(embeddings, self.x)
        if self.is_training:
            inputs = tf.nn.dropout(inputs, 0.5)
        # Add a trailing channel dimension, as conv2d expects 4-D input
        inputs_expanded = tf.expand_dims(inputs, -1)
        # The shape comments below assume inputs that are 800 word IDs wide
        with tf.variable_scope('CNN1'):
            filter_shape = [3, self.embed_size, 1, 8]
            W1 = tf.get_variable('W1', shape=filter_shape)
            b1 = tf.get_variable('b1', shape=[8])
            # conv1: batch, 798, 1, 8
            conv1 = tf.nn.conv2d(inputs_expanded, W1, strides=[1, 1, 1, 1], padding="VALID", name="conv1")
            # Apply nonlinearity
            h1 = tf.nn.relu(tf.nn.bias_add(conv1, b1), name="relu1")
            # pooled1: batch, 398, 1, 8
            pooled1 = tf.nn.max_pool(h1, ksize=[1, 4, 1, 1],
                                     strides=[1, 2, 1, 1],
                                     padding='VALID',
                                     name="pool1")

        with tf.variable_scope('CNN2'):
            filter_shape = [3, 1, 8, 8]
            W2 = tf.get_variable('W2', shape=filter_shape)
            b2 = tf.get_variable('b2', shape=[8])
            # conv2: batch, 396, 1, 8
            conv2 = tf.nn.conv2d(pooled1, W2, strides=[1, 1, 1, 1], padding="VALID", name="conv2")
            # Apply nonlinearity
            h2 = tf.nn.relu(tf.nn.bias_add(conv2, b2), name="relu2")
            # pooled2: batch, 197, 1, 8
            pooled2 = tf.nn.max_pool(h2, ksize=[1, 4, 1, 1],
                                     strides=[1, 2, 1, 1],
                                     padding='VALID',
                                     name="POOL2")

        # Fully connected layer.
        # NOTE(review): tf.name_scope does not prefix variables created with
        # tf.get_variable, so W_fc1/b_fc1 live directly in the enclosing
        # variable scope - confirm this is intended.
        with tf.name_scope('fully_connected'):
            W_fc1 = tf.get_variable('W_fc1', shape=[197*8, 128])
            b_fc1 = tf.get_variable('b_fc1', shape=[128])
            h_pool2_flat = tf.reshape(pooled2, [-1, 197*8])
            h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

        with tf.variable_scope('output'):
            weights = tf.get_variable('weights', [128, self.label_size], dtype=tf.float32)
            biases = tf.get_variable('biases', [self.label_size], dtype=tf.float32)
        # Unnormalised class scores
        logits = tf.matmul(h_fc1, weights) + biases
        return logits

    @property
    def learningRate(self):
        '''Learning rate used by the Adam optimizer.'''
        return self._learning_rate
        

In [92]:
# Dedicated graph that holds the training model and both evaluation models
graph_cnn = tf.Graph()
# Create models for training and testing data. The statement order matters:
# the training model is built first and creates the variables, the
# evaluation models are then built with reuse=True and share them.
with graph_cnn.as_default():
    # Passed to the "Model" variable scopes below as their initializer
    initializer = tf.random_uniform_initializer(-0.02, 0.02)
    with tf.name_scope('train'):
        train_data = tf.placeholder(tf.int32, [trainConfig.batch_size, None])
        train_label = tf.placeholder(tf.int32, [trainConfig.batch_size])
        # The *_weight placeholders are created but not passed to any model
        train_weight = tf.placeholder(tf.float32, [trainConfig.batch_size, None])
        train_lengths = tf.placeholder(tf.float32, [trainConfig.batch_size])
        # Training model: reuse=None, so it creates the variables under "Model"
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = CNN_Model(trainConfig, train_data, train_label, train_lengths)
    with tf.name_scope('test'):
        # Inputs for batched evaluation
        test_data = tf.placeholder(tf.int32, [testConfig.batch_size, None])
        test_label = tf.placeholder(tf.int32, [testConfig.batch_size])
        test_weight = tf.placeholder(tf.float32, [testConfig.batch_size, None])
        test_lengths = tf.placeholder(tf.float32, [testConfig.batch_size])
        # Inputs for one-document-at-a-time evaluation
        single_data = tf.placeholder(tf.int32, [singleConfig.batch_size, None])
        single_label = tf.placeholder(tf.int32, [singleConfig.batch_size])
        single_weight = tf.placeholder(tf.float32, [singleConfig.batch_size, None])
        single_lengths = tf.placeholder(tf.float32, [singleConfig.batch_size])
        # Evaluation models: reuse=True shares the variables created above;
        # is_training=False disables dropout and skips the optimizer
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            test_model = CNN_Model(testConfig, test_data, test_label, test_lengths, False)
            single_model = CNN_Model(singleConfig, single_data, single_label, single_lengths, False)

Model Initialized!
Model Initialized!
Model Initialized!


In [62]:
# Number of test batches, fractional part included
len(test_news_vecs)/testConfig.batch_size

117.6875

In [63]:
# Test documents left over after the whole batches
len(test_news_vecs) - testConfig.batch_size*test_chunk_num

44

In [94]:
import time
epochs = 30
with tf.Session(graph=graph_cnn) as sess:
    # Initialize parameters
    init = tf.global_variables_initializer()
    sess.run(init)
    start_time = time.time()
    for m in range(epochs):
        for i in range(train_chunk_num):
            x, y, lengths, w = train_samples.generate_batch(trainConfig.batch_size)
            feed_dict = {train_data:x, train_label:y, train_lengths:lengths}
            l, a, _ = sess.run([train_model.cost, train_model.accuracy, train_model.optimize], feed_dict=feed_dict)
            # Report the loss and accuracy of every 100th batch
            if i%100 == 0:
                print('Loss:', round(l, 4), 'Accuracy:', round(a, 3))
        end_time = time.time()
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))
        start_time = end_time
    # Calculate testing accuracy
    print('Testing...')
    count = 0
    # Fresh generator so that sequential reading starts at the first test document
    test_samples = generateSamples(test_news_vecs, test_news_labels, test_weights)
    # Whole batches go through the batched evaluation model ...
    for _ in range(test_chunk_num):
        x, y, lengths, w = test_samples.generate_batch(testConfig.batch_size, False)
        feed_dict = {test_data:x, test_label:y, test_lengths:lengths}
        n = sess.run(test_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    # ... and the leftover documents go one at a time through the single model
    for _ in range(remain_num):
        x, y, lengths, w = test_samples.generate_batch(singleConfig.batch_size, False)
        feed_dict = {single_data:x, single_label:y, single_lengths:lengths}
        n = sess.run(single_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    print(count*1.0/len(test_news_vecs))   


Loss: 2.9959 Accuracy: 0.094
Loss: 2.9656 Accuracy: 0.062
Epoch 0 time:11.11
Loss: 2.4277 Accuracy: 0.141
Loss: 2.1634 Accuracy: 0.219
Epoch 1 time:11.10
Loss: 2.1547 Accuracy: 0.109
Loss: 1.731 Accuracy: 0.297
Epoch 2 time:11.00
Loss: 1.5872 Accuracy: 0.469
Loss: 1.9427 Accuracy: 0.438
Epoch 3 time:11.07
Loss: 1.4907 Accuracy: 0.359
Loss: 1.4578 Accuracy: 0.406
Epoch 4 time:11.06
Loss: 1.3905 Accuracy: 0.484
Loss: 1.2998 Accuracy: 0.594
Epoch 5 time:11.06
Loss: 1.139 Accuracy: 0.484
Loss: 1.0314 Accuracy: 0.578
Epoch 6 time:11.05
Loss: 1.2717 Accuracy: 0.594
Loss: 1.0393 Accuracy: 0.609
Epoch 7 time:11.13
Loss: 0.8287 Accuracy: 0.797
Loss: 0.8785 Accuracy: 0.688
Epoch 8 time:11.12
Loss: 0.837 Accuracy: 0.719
Loss: 0.9559 Accuracy: 0.75
Epoch 9 time:11.09
Loss: 0.7941 Accuracy: 0.734
Loss: 0.7599 Accuracy: 0.703
Epoch 10 time:11.04
Loss: 0.7663 Accuracy: 0.75
Loss: 0.7257 Accuracy: 0.734
Epoch 11 time:11.10
Loss: 0.6414 Accuracy: 0.828
Loss: 0.5011 Accuracy: 0.812
Epoch 12 time:11.13
L

The convolutional neural network appears to overfit the training data.