In [1]:
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import tensorflow as tf
from collections import Counter
%matplotlib inline

## Data Preprocessing

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load both splits with identical options; the fixed random_state keeps the
# shuffled order the same from run to run.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=11)
    for split in ('train', 'test')
)

In [3]:
# Report how many documents each split contains.
for split_name, split in (('Training', newsgroups_train), ('Testing', newsgroups_test)):
    print(split_name, 'text number:', len(split.data))

Training text number: 11314
Testing text number: 7532


## News to IDs

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
class readNews:
    '''
    Read the 20 Newsgroups splits and turn them into model inputs.

    Every document of both splits is normalised in place when the object is
    created (see `_preProcessor`), so the objects passed in are modified.

    Args:
    train_data: object with a `data` list of document strings and a `target`
        sequence of labels; its vocabulary defines the word IDs
    test_data: object with the same attributes, used for evaluation
    '''
    # Token that stands in for every word missing from the vocabulary.
    UNKNOWN_TOKEN = 'UNK'
    # Words seen this many times or fewer in the training split are left out of
    # the vocabulary (and therefore map to UNKNOWN_TOKEN).
    RARE_WORD_MAX_COUNT = 3
    # A run of anything that is not an ASCII letter (punctuation, digits, line
    # ends, non-English characters) becomes one separator.
    _NON_LETTERS = re.compile('[^a-zA-Z]+')

    def __init__(self, train_data, test_data):
        self._train_data = train_data
        self._test_data = test_data
        self._preprocess()

    def _preProcessor(self, s):
        '''Return `s` lower-cased with each run of non-letters replaced by a space.

        Trailing whitespace is removed. A single leading space can remain when
        the text starts with a non-letter; callers tokenise with `str.split()`,
        which ignores it.
        '''
        s = self._NON_LETTERS.sub(' ', s)
        return s.lower().rstrip()

    def _preprocess(self):
        '''Normalise every document of both splits, replacing the `data` lists.'''
        self._train_data.data = [self._preProcessor(item) for item in self._train_data.data]
        self._test_data.data = [self._preProcessor(item) for item in self._test_data.data]

    def _tfidf_vectorizer(self):
        '''Fit a TF-IDF vectorizer on the training split and apply it to both.

        Returns:
        (train matrix, test matrix, dict mapping each kept word to its column)
        '''
        tfidfVectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=5000)
        X_train_tfidf = tfidfVectorizer.fit_transform(self._train_data.data)
        X_test_tfidf = tfidfVectorizer.transform(self._test_data.data)
        vocab_index_dict = tfidfVectorizer.vocabulary_
        return X_train_tfidf, X_test_tfidf, vocab_index_dict

    @staticmethod
    def _word_weights(news_words, tfidf_matrix, vocab_index_dict):
        '''Look up, document by document, the TF-IDF weight of each word.'''
        weights = []
        for row, words in enumerate(news_words):
            doc_weights = []
            for word in words:
                column = vocab_index_dict.get(word)
                # A word the vectorizer did not keep has no column: weight 0.
                # Checking explicitly avoids indexing the matrix with None.
                doc_weights.append(0 if column is None else tfidf_matrix[row, column])
            weights.append(doc_weights)
        return weights

    def tfidf_weight(self):
        '''Calculate the TF-IDF weight of each word within each news item.

        Returns:
        (train_weights, test_weights): one list per document, holding one
        weight per word in document order; words outside the TF-IDF vocabulary
        get 0.
        '''
        train_news_words, test_news_words = self._news2words()
        X_train_tfidf, X_test_tfidf, vocab_index_dict = self._tfidf_vectorizer()
        train_weights = self._word_weights(train_news_words, X_train_tfidf, vocab_index_dict)
        test_weights = self._word_weights(test_news_words, X_test_tfidf, vocab_index_dict)
        return train_weights, test_weights

    def _news2words(self):
        '''Split every document of both splits into its list of words.'''
        train_news_words = [news.split() for news in self._train_data.data]
        test_news_words = [news.split() for news in self._test_data.data]
        return train_news_words, test_news_words

    def buildVocab(self):
        '''Build the vocabulary from the training split.

        Returns:
        vocab: words occurring more than RARE_WORD_MAX_COUNT times, in order of
            first appearance, followed by UNKNOWN_TOKEN if not already present
        word_id_map: dict word -> ID (the position of the word in `vocab`)
        id_word_map: dict ID -> word
        '''
        # Count how often each word occurs across all training documents
        word_freq = Counter(word
                            for news in self._train_data.data
                            for word in news.split())
        # Filter out the low frequency words
        vocab = [word for word, count in word_freq.items()
                 if count > self.RARE_WORD_MAX_COUNT]
        if self.UNKNOWN_TOKEN not in vocab:
            vocab.append(self.UNKNOWN_TOKEN)
        word_id_map = {word: idx for idx, word in enumerate(vocab)}
        id_word_map = {idx: word for word, idx in word_id_map.items()}
        return vocab, word_id_map, id_word_map

    def news2vecs(self):
        '''Turn every document into a list of word IDs.

        Words that are not in the vocabulary are replaced by the ID of
        UNKNOWN_TOKEN.

        Returns:
        (train_news_vecs, train_news_labels, test_news_vecs, test_news_labels)
        where the labels are the `target` attributes of the two splits.
        '''
        train_news_words, test_news_words = self._news2words()
        _, word_id_map, _ = self.buildVocab()
        unk_id = word_id_map[self.UNKNOWN_TOKEN]

        def words_vecs(words):
            return [word_id_map.get(w, unk_id) for w in words]

        train_news_vecs = [words_vecs(words) for words in train_news_words]
        train_news_labels = self._train_data.target
        test_news_vecs = [words_vecs(words) for words in test_news_words]
        test_news_labels = self._test_data.target
        return train_news_vecs, train_news_labels, test_news_vecs, test_news_labels

In [5]:
#Create a readnews object
rn = readNews(newsgroups_train, newsgroups_test)
train_news_vecs, train_news_labels, test_news_vecs, test_news_labels = rn.news2vecs()

In [6]:
#Record tfidf weights for each word in each news
#train_weights, test_weights = rn.tfidf_weight()

In [7]:
#Get the vocabulary and dictionary of words as well as corresponding ids
vocab, word_id_map, id_word_map = rn.buildVocab()

In [8]:
def word2id(c):
    '''Return the vocabulary ID of word `c`, or the ID of 'UNK' if `c` is unknown.'''
    try:
        return word_id_map[c]
    except KeyError:
        # Turn the less frequent (out-of-vocabulary) words into UNK.
        # Only a missing key is handled; other errors are no longer hidden.
        return word_id_map['UNK']
def id2word(c):
    '''Return the word stored for ID `c`, or 'UNK' when the ID is not in the map.'''
    # dict.get handles only the missing-ID case; other errors are not hidden.
    return id_word_map.get(c, 'UNK')

In [9]:
# Number of word IDs in each training document, then its basic statistics.
train_news_length = list(map(len, train_news_vecs))
for label, statistic in (('Min Length', np.min),
                         ('Max Length', np.max),
                         ('Median Length', np.median)):
    print(label, statistic(train_news_length))

Min Length 17
Max Length 15804
Median Length 184.0


In [10]:
# Document-length distribution: minimum, quartiles, and the 90th/95th percentiles.
np.percentile(train_news_length, [0, 25, 50, 75, 90, 95])

array([  17.,  116.,  184.,  301.,  509.,  769.])

The document lengths vary widely (from 17 words to more than 15,000, with a median of 184), so it may be worth using buckets that group news items of similar length together.

## Create Batch Data Generator

In [27]:
import random
class generateSamples:
    '''Serve padded batches of word-ID sequences for training and testing.

    Args:
    news_vecs: list of documents, each a list of word IDs
    news_labels: one label per document, aligned with `news_vecs`
    max_len: documents longer than this are cut to their first `max_len` IDs
    pad_id: word ID used to fill the unused tail of shorter documents; when
        None, the ID of 'UNK' is obtained from the notebook-level `word2id`
        helper each time a batch is built
    '''

    def __init__(self, news_vecs, news_labels, max_len=800, pad_id=None):
        # Position of the next sample served in testing mode
        self.index = 0
        self.news_vecs = news_vecs
        self.news_labels = news_labels
        self.news_count = len(news_vecs)
        self.max_news_len = max_len
        self.pad_id = pad_id

    def generate_batch(self, batch_size=64, is_training=True):
        '''Build one padded batch.

        In training mode `batch_size` documents are drawn at random with
        replacement. In testing mode documents are served in order starting at
        `self.index`; the batch that reaches the end of the data may hold fewer
        than `batch_size` documents, after which serving restarts from the
        first document.

        Returns:
        x: int32 array of shape (n, width) holding the word IDs, where width is
            the longest document of the batch capped at `max_news_len`
        y: float64 array of shape (n,) holding the labels
        lengths: array of shape (n,) with the number of real (non-padding) IDs
            in each row of `x`
        '''
        if is_training:
            # For training, select random samples
            picks = np.random.choice(len(self.news_vecs), batch_size, replace=True)
            selected_samples = [self.news_vecs[k] for k in picks]
            selected_labels = [self.news_labels[k] for k in picks]
        else:
            # For testing, walk through the samples in order
            start = self.index % self.news_count
            # Never read past the last sample
            stop = min(start + batch_size, self.news_count)
            selected_samples = self.news_vecs[start:stop]
            selected_labels = self.news_labels[start:stop]
            if stop == self.news_count:
                print('Test Samples come to an end!')
                self.index = 0
            else:
                self.index = stop

        pad_id = self.pad_id
        if pad_id is None:
            pad_id = word2id('UNK')

        # The final test batch can be short, so size the arrays from the
        # samples actually selected rather than from batch_size.
        sample_count = len(selected_samples)
        raw_lengths = [len(item) for item in selected_samples]
        # Width of the batch: its longest document, capped at max_news_len
        max_len = min(max(raw_lengths, default=0), self.max_news_len)
        # Truncated documents contribute max_len IDs, so a reported length can
        # never exceed the number of columns of x.
        lengths = np.minimum(np.array(raw_lengths), max_len)

        # Create input and label
        x = np.full((sample_count, max_len), pad_id, np.int32)
        y = np.zeros(sample_count)
        for i, (sample, label) in enumerate(zip(selected_samples, selected_labels)):
            # Keep the first max_len IDs; shorter documents leave padding behind
            clipped = sample[:max_len]
            x[i, :len(clipped)] = clipped
            y[i] = label
        return x, y, lengths
    

The data exploration shows that the length of the news items varies widely, from 17 words to more than 15,000. To deal with this, we can group documents of similar length into buckets, as is done in seq2seq models.

In [12]:
class trainConfig:
    '''Settings for the model that is trained (batches of 64 documents).'''
    batch_size = 64
    layer_size = 2
    # Size of the word embeddings
    embed_size = 128
    # Number of newsgroup classes
    label_size = 20
    vocab_size = len(vocab)
    # Length of the longest training document
    max_doc_len = max(len(doc) for doc in train_news_vecs)

In [13]:
class testConfig:
    '''Settings for batched evaluation; same values as trainConfig.'''
    batch_size = 64
    layer_size = 2
    # Size of the word embeddings
    embed_size = 128
    # Number of newsgroup classes
    label_size = 20
    vocab_size = len(vocab)
    # Length of the longest training document
    max_doc_len = max(len(doc) for doc in train_news_vecs)
    
class singleConfig:
    '''Settings for evaluating one document at a time (batch size 1).'''
    batch_size = 1
    layer_size = 2
    # Size of the word embeddings
    embed_size = 128
    # Number of newsgroup classes
    label_size = 20
    vocab_size = len(vocab)
    # Length of the longest training document
    max_doc_len = max(len(doc) for doc in train_news_vecs)

In [14]:
# Whole batches that fit into each split, and the test samples left over
# after the last whole batch.
train_chunk_num = len(train_news_vecs) // trainConfig.batch_size
test_chunk_num = len(test_news_vecs) // trainConfig.batch_size
remain_num = len(test_news_labels) - test_chunk_num * trainConfig.batch_size
remain_num

44

In [28]:
# Batch generator over the training documents (max_len left at its default of 800).
train_samples = generateSamples(train_news_vecs, train_news_labels)

In [29]:
# Batch generator over the test documents (max_len left at its default of 800).
test_samples = generateSamples(test_news_vecs, test_news_labels)

In [30]:
# Smoke test: draw one random training batch with the default batch size of 64.
x, y, lengths = train_samples.generate_batch()

In [31]:
# Smoke test on the test generator. NOTE(review): is_training defaults to True,
# so this draws a random batch and does not advance test_samples.index; pass
# is_training=False to exercise the sequential path.
x, y, lengths = test_samples.generate_batch()

## Dynamic RNN Model

In this model, we first transform each news item into a sequence of word vectors. We then feed that sequence into a bidirectional RNN and take the final hidden states of the two directions as the vector representing the news item. Finally, we classify the news item based on that vector.

In [19]:
import functools
from tensorflow.contrib.layers.python.layers import encoders
def lazy_property(function):
    '''Turn a method into a read-only property whose value is computed once.

    The first access calls `function(self)` and stores the result on the
    instance under the name `_cache_<function name>`; every later access
    returns the stored value without calling `function` again.
    '''
    cache_name = '_cache_' + function.__name__

    def cached_getter(self):
        # Already computed for this instance: hand back the stored value
        if hasattr(self, cache_name):
            return getattr(self, cache_name)
        setattr(self, cache_name, function(self))
        return getattr(self, cache_name)

    # Keep the wrapped method's name and docstring on the property getter
    return property(functools.wraps(function)(cached_getter))

In [20]:
#Reference:http://blog.csdn.net/u010223750/article/details/71079036
from tensorflow.contrib import rnn
class BiRNN_Model:
    '''Bidirectional LSTM text classifier built on TensorFlow 1.x graph ops.

    Word IDs are embedded and run through a forward and a backward LSTM; the
    final hidden states of both directions are concatenated and projected to
    `label_size` logits.

    Args:
    config: object providing vocab_size, embed_size, label_size, batch_size
        and max_doc_len
    x: word-ID tensor fed to the embedding lookup (one row per document)
    y: tensor of class labels, one per document
    lengths: tensor holding the number of valid time steps of each document
    is_training: when True, dropout is applied to the embedded inputs and the
        optimizer op is built
    '''
    # Step size handed to the Adam optimizer
    LEARNING_RATE = 0.0005

    def __init__(self, config, x, y, lengths, is_training=True):
        self.x = x
        self.y = y
        self.vocab_size = config.vocab_size
        self.embed_size = config.embed_size
        self.label_size = config.label_size
        self.batch_size = config.batch_size
        self.lengths = lengths
        self.max_doc_len = config.max_doc_len
        self.is_training = is_training
        # Backs the `learningRate` property, which previously read an
        # attribute that was never assigned.
        self._learning_rate = self.LEARNING_RATE
        # Touching the lazy properties builds their ops now, inside whatever
        # name/variable scope the caller opened around this constructor.
        self.predict
        self.correct_num
        if is_training:
            self.optimize
        print('Model Initialized!')

    @lazy_property
    def cost(self):
        '''Softmax cross-entropy between the logits and the one-hot labels.'''
        logits = self.inference
        # The one-hot depth follows the configured number of classes
        targets = tf.one_hot(self.y, self.label_size, 1, 0)
        targets = tf.cast(targets, tf.float32)
        loss = tf.losses.softmax_cross_entropy(targets, logits)
        return loss

    @lazy_property
    def predict(self):
        '''Index of the highest logit for each document.'''
        logits = self.inference
        predictions = tf.argmax(logits, 1)
        return predictions

    @lazy_property
    def correct_num(self):
        '''Number of documents in the batch whose prediction equals the label.'''
        prediction = self.predict
        targets = tf.reshape(self.y, [-1])
        # Cast the labels to int64 so they can be compared with the predictions
        targets = tf.cast(targets, tf.int64)
        correct_prediction = tf.equal(prediction, targets)
        correct_num = tf.reduce_sum(tf.cast(correct_prediction, "float"))
        return correct_num

    @lazy_property
    def optimize(self):
        '''Training op: one Adam step that minimises `cost`.'''
        with tf.variable_scope('optimizer'):
            cost = self.cost
            train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(cost)
        return train_op

    @lazy_property
    def inference(self):
        '''Build the network and return the logits, one row per document.'''
        # Create embedding matrix
        with tf.device("/cpu:0"):
            embeddings = tf.get_variable('embedding', [self.vocab_size,  self.embed_size])
            inputs = tf.nn.embedding_lookup(embeddings, self.x)
        if self.is_training:
            inputs = tf.nn.dropout(inputs, 0.5)

        def lstm():
            return rnn.BasicLSTMCell(self.embed_size, forget_bias=0.0,
                                     state_is_tuple=True)

        lstm_fw_cell = lstm()
        lstm_bw_cell = lstm()
        initial_fw_state = lstm_fw_cell.zero_state(self.batch_size, tf.float32)
        initial_bw_state = lstm_bw_cell.zero_state(self.batch_size, tf.float32)
        # Bidirectional dynamic RNN with given lengths for each text
        outputs, status = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, inputs,
                                                          initial_state_fw=initial_fw_state,
                                                          initial_state_bw=initial_bw_state,
                                                          sequence_length=self.lengths, dtype=tf.float32)
        # With padding, the last output would be computed from padding values,
        # so the final hidden state is used instead of outputs[:, -1, :].
        # In a dynamic RNN the state stops changing after the given number of
        # time steps: for a text of length 10 padded to 20 words, the final
        # state is the one of the 10th time step.
        # Concatenate the hidden states of the two directions
        output = tf.concat([status[0].h, status[1].h], axis=1)

        weights = tf.get_variable('weights', [2*self.embed_size, self.label_size], dtype=tf.float32)
        biases = tf.get_variable('biases', [self.label_size], dtype=tf.float32)
        # Predicted values: unnormalised class scores
        logits = tf.matmul(output, weights) + biases
        return logits

    @property
    def learningRate(self):
        '''Learning rate used by the optimizer.'''
        return self._learning_rate
        

In [21]:
graph_birnn = tf.Graph()
# Create models for training and testing data; all three share one graph
with graph_birnn.as_default():
    # Default initializer for the variables created inside the "Model" scopes
    initializer = tf.random_uniform_initializer(-0.02, 0.02)
    with tf.name_scope('train'):
        # The second dimension is None because the padded width differs per batch
        train_data = tf.placeholder(tf.int32, [trainConfig.batch_size, None])
        train_label = tf.placeholder(tf.int32, [trainConfig.batch_size])
        train_lengths = tf.placeholder(tf.int32, [trainConfig.batch_size])
        # The training model is built first (reuse=None) under the variable scope "Model"
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = BiRNN_Model(trainConfig, train_data, train_label, train_lengths)
    with tf.name_scope('test'):
        test_data = tf.placeholder(tf.int32, [testConfig.batch_size, None])
        test_label = tf.placeholder(tf.int32, [testConfig.batch_size])
        test_lengths = tf.placeholder(tf.int32, [testConfig.batch_size])
        # Placeholders for evaluating one document at a time (batch size 1)
        single_data = tf.placeholder(tf.int32, [singleConfig.batch_size, None])
        single_label = tf.placeholder(tf.int32, [singleConfig.batch_size])
        single_lengths = tf.placeholder(tf.int32, [singleConfig.batch_size])
        # reuse=True: both evaluation models share the "Model" variables of the training model
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            test_model = BiRNN_Model(testConfig, test_data, test_label, test_lengths, False)
            single_model = BiRNN_Model(singleConfig, single_data, single_label, single_lengths, False)

Model Initialized!
Model Initialized!
Model Initialized!


In [22]:
# Test batches at batch size 64; a fractional part means the last batch is incomplete.
len(test_news_vecs)/64

117.6875

In [24]:
# Test samples left over after 117 full batches of 64.
len(test_news_vecs) - 64*117

44

In [33]:
import time
epochs = 50
with tf.Session(graph=graph_birnn) as sess:
    # Initialize parameters
    sess.run(tf.global_variables_initializer())
    start_time = time.time()
    for m in range(epochs):
        for i in range(train_chunk_num):
            x, y, lengths = train_samples.generate_batch(trainConfig.batch_size)
            feed_dict = {train_data: x, train_label: y, train_lengths: lengths}
            loss_value, _ = sess.run([train_model.cost, train_model.optimize],
                                     feed_dict=feed_dict)
            if i % 100 == 0:
                print('Loss:', round(loss_value, 4))
        end_time = time.time()
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))
        start_time = end_time

    # Calculate Testing Accuracy
    print('Testing...')
    count = 0
    # Fresh generator so evaluation starts at the first test sample
    test_samples = generateSamples(test_news_vecs, test_news_labels)
    # The placeholders have fixed batch sizes, so the test set is covered with
    # as many full batches as fit, and the left-over samples go through the
    # single-document model. The counts are derived from the data instead of
    # being hard-coded.
    full_batches, leftover = divmod(len(test_news_vecs), testConfig.batch_size)
    evaluation_plan = [
        (test_model, test_data, test_label, test_lengths,
         testConfig.batch_size, full_batches),
        (single_model, single_data, single_label, single_lengths,
         singleConfig.batch_size, leftover // singleConfig.batch_size),
    ]
    for model, data_ph, label_ph, lengths_ph, size, batch_num in evaluation_plan:
        for _ in range(batch_num):
            x, y, lengths = test_samples.generate_batch(size, False)
            feed_dict = {data_ph: x, label_ph: y, lengths_ph: lengths}
            n = sess.run(model.correct_num, feed_dict=feed_dict)
            count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    # Accuracy over the whole test set
    print(count*1.0/len(test_news_vecs))


Loss: 2.995
Loss: 2.979
Epoch 0 time:172.94
Loss: 2.8216
Loss: 2.5011
Epoch 1 time:173.14
Loss: 2.4381
Loss: 2.2444
Epoch 2 time:172.67
Loss: 1.8115
Loss: 2.0834
Epoch 3 time:171.38
Loss: 1.827
Loss: 1.4616
Epoch 4 time:171.21
Loss: 1.3666
Loss: 2.3901
Epoch 5 time:173.48
Loss: 1.1999
Loss: 1.1597
Epoch 6 time:173.18
Loss: 1.1915
Loss: 0.8363
Epoch 7 time:173.63
Loss: 0.7313
Loss: 0.919
Epoch 8 time:172.54
Loss: 0.9635
Loss: 0.7441
Epoch 9 time:171.12
Loss: 0.8278
Loss: 0.6081
Epoch 10 time:172.59
Loss: 0.6202
Loss: 0.4946
Epoch 11 time:173.95
Loss: 0.3739
Loss: 0.7374
Epoch 12 time:171.25
Loss: 0.3572
Loss: 0.3614
Epoch 13 time:173.87
Loss: 0.2641
Loss: 0.2091
Epoch 14 time:172.71
Loss: 0.3236
Loss: 0.3306
Epoch 15 time:174.83
Loss: 0.2707
Loss: 0.3751
Epoch 16 time:171.60
Loss: 0.2333
Loss: 0.3598
Epoch 17 time:171.86
Loss: 0.2254
Loss: 0.2952
Epoch 18 time:170.41
Loss: 0.1887
Loss: 0.1345
Epoch 19 time:170.75
Loss: 0.1172
Loss: 0.1832
Epoch 20 time:173.30
Loss: 0.2014
Loss: 0.3127
E

The bidirectional RNN works much better than the one-directional RNN.