In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import tensorflow as tf
from collections import Counter
%matplotlib inline

## Data Preprocessing

In [2]:
# Load the pre-processed train/test splits. Later cells read the 'text' and
# 'label' columns of both frames.
training_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in ('training_data_processed.csv', 'testing_data_processed.csv')
)

In [3]:
# Each 'text' cell appears to hold a stringified list of token ids
# (brackets, blanks, comma-separated values) -- TODO confirm against the CSV.
# Strip brackets and blanks with plain ``str.replace``, which is always a
# literal replacement, instead of ``Series.str.replace`` whose regex default
# differs between pandas versions ('[' is a regex metacharacter). Then split
# on commas to get a list of token strings per article.
training_data['text'] = training_data['text'].map(
    lambda raw: raw.replace('[', '').replace(']', '').replace(' ', '').split(',')
)

In [4]:
# Same clean-up as for the training split: drop brackets and blanks from the
# stringified id list, then split on commas. Plain ``str.replace`` is used so
# that '[' and ']' are always treated literally, independent of the pandas
# version's regex default for ``Series.str.replace``.
testing_data['text'] = testing_data['text'].map(
    lambda raw: raw.replace('[', '').replace(']', '').replace(' ', '').split(',')
)

In [30]:
# Turn every article's token strings into integer word ids. ``.values`` gives
# a NumPy array holding one Python list per article.
train_news_vecs = training_data['text'].map(
    lambda tokens: list(map(int, tokens))
).values

In [31]:
# Turn every test article's token strings into integer word ids. ``.values``
# gives a NumPy array holding one Python list per article.
test_news_vecs = testing_data['text'].map(
    lambda tokens: list(map(int, tokens))
).values

In [32]:
# Class labels of both splits, kept as pandas Series taken from the frames.
train_news_labels, test_news_labels = training_data['label'], testing_data['label']

## News to IDs

The length of the news articles varies widely, so we may need buckets that group articles of similar length together.

## Create Batch Data Generator

In [46]:
import random
class generateSamples:
    '''Serve fixed-width, padded batches of tokenised news for training/testing.

    Training batches are drawn at random with replacement; evaluation batches
    walk through the data sequentially using an internal cursor (``index``).
    '''

    def __init__(self, news_vecs, news_labels, max_len=150, pad_id=9999):
        '''Store the corpus and the batching parameters.

        Args:
            news_vecs: indexable, sliceable sequence of token-id sequences,
                one entry per article.
            news_labels: labels aligned position-by-position with ``news_vecs``.
            max_len: width of every returned batch; longer articles are
                truncated to it and shorter ones are padded.
            pad_id: token id written into the padded positions.
        '''
        self.index = 0
        self.news_vecs = news_vecs
        # Store labels as an ndarray so every lookup below is positional, even
        # if a pandas Series with a non-default index is passed in.
        self.news_labels = np.asarray(news_labels)
        self.news_count = len(news_vecs)
        self.max_news_len = max_len
        self.pad_id = pad_id

    def generate_batch(self, batch_size=64, is_training=True):
        '''Return one batch as ``(x, y, lengths)``.

        Args:
            batch_size: number of samples requested.
            is_training: if True, sample ``batch_size`` articles at random with
                replacement; otherwise return the next sequential slice and
                advance the internal cursor.

        Returns:
            x: int32 array of shape ``(n, max_len)`` filled with ``pad_id``
                beyond each article's end.
            y: array of the ``n`` matching labels.
            lengths: int array with the number of real (non-padding) tokens in
                each row of ``x``, i.e. the article length capped at ``max_len``.

            ``n`` equals ``batch_size`` except for the last sequential batch,
            which may be shorter when the data does not divide evenly.
        '''
        if is_training:
            # Random sampling with replacement over the whole corpus.
            picked = np.random.choice(self.news_count, batch_size, replace=True)
            selected_samples = [self.news_vecs[i] for i in picked]
            selected_labels = [self.news_labels[i] for i in picked]
        else:
            start = self.index % self.news_count
            end = (start + batch_size) % self.news_count
            if end > start:
                selected_samples = self.news_vecs[start:end]
                selected_labels = self.news_labels[start:end]
                self.index = end
            else:
                # The slice would run past the last sample: return the tail
                # only and rewind the cursor for the next pass.
                print('Test Samples come to an end!')
                selected_samples = self.news_vecs[start:]
                selected_labels = self.news_labels[start:]
                self.index = 0

        max_len = self.max_news_len
        # Size the batch by what was actually selected; using ``batch_size``
        # here would raise IndexError on a short final batch.
        sample_count = len(selected_samples)

        # Effective lengths: truncated articles contribute exactly max_len.
        lengths = np.array(
            [min(len(item), max_len) for item in selected_samples], dtype=int
        )

        x = np.full((sample_count, max_len), self.pad_id, np.int32)
        y = np.array(selected_labels)
        for row, tokens in enumerate(selected_samples):
            kept = tokens[:max_len]
            kept_len = len(kept)
            if kept_len:
                x[row, :kept_len] = kept
        return x, y, lengths
    

The data exploration shows that the length of the news articles varies widely, ranging from 10 to 10000 tokens. To deal with this, we can consider bucketing articles by length, as is done in seq2seq models.

In [47]:
class trainConfig:
    '''Hyper-parameters passed to the training model.'''
    # Batching
    batch_size = 64
    max_doc_len = 150    # same value as generateSamples' default max_len
    # Vocabulary and output sizes
    vocab_size = 10000   # rows of the embedding matrix
    label_size = 20      # number of output classes
    # Network shape
    embed_size = 128     # embedding width; also used as the LSTM unit count
    layer_size = 2       # presumably the number of stacked LSTM layers

In [48]:
class testConfig:
    '''Hyper-parameters for evaluating in full batches.'''
    batch_size = 64
    max_doc_len = 150
    vocab_size = 10000
    label_size = 20
    embed_size = 128
    layer_size = 2


class singleConfig(testConfig):
    '''Same settings as ``testConfig`` but evaluating one sample at a time.'''
    batch_size = 1

In [49]:
# Number of full batches in one pass over each split. The test split is sized
# with testConfig's batch size so it stays correct if the two configs diverge.
train_chunk_num = len(train_news_vecs) // trainConfig.batch_size
test_chunk_num, remain_num = divmod(len(test_news_vecs), testConfig.batch_size)
# Test samples left over after the last full batch.
remain_num

56

In [50]:
# Batch generator over the training split (default max_len).
train_samples = generateSamples(news_vecs=train_news_vecs, news_labels=train_news_labels)

In [51]:
# Batch generator over the test split (default max_len).
test_samples = generateSamples(news_vecs=test_news_vecs, news_labels=test_news_labels)

In [52]:
# Smoke test: draw one random training batch using the default batch size.
x, y, lengths = train_samples.generate_batch(batch_size=64, is_training=True)

In [54]:
# Smoke test: fetch the next sequential test batch. This advances the
# generator's internal cursor.
x, y, lengths = test_samples.generate_batch(batch_size=64, is_training=False)

## Dynamic RNN Model

In this model, we first transform each news article into a sequence of word vectors. We then feed that sequence into an RNN and take its final state as a vector representing the article. Finally, we classify the article based on this vector.

In [16]:
import functools
from tensorflow.contrib.layers.python.layers import encoders
def lazy_property(function):
    '''Turn a zero-argument method into a property that is computed once.

    The first access calls ``function(self)`` and stores the result on the
    instance under ``_cache_<function name>``; later accesses return the
    stored value without calling ``function`` again.
    '''
    cache_name = '_cache_' + function.__name__

    @functools.wraps(function)
    def getter(self):
        if hasattr(self, cache_name):
            return getattr(self, cache_name)
        setattr(self, cache_name, function(self))
        return getattr(self, cache_name)

    return property(getter)

In [66]:
#Reference:http://blog.csdn.net/u010223750/article/details/71079036
from tensorflow.contrib import rnn
class RNN_Model:
    '''Multi-layer LSTM text classifier built on ``tf.nn.dynamic_rnn``.

    Word ids are embedded, run through a stack of LSTM layers, and the final
    hidden state of the top layer is projected to ``label_size`` logits. Each
    graph piece is a lazy property, so it is added to the graph only once.
    '''

    def __init__(self, config, x, y, lengths, is_training=True):
        '''Build the prediction graph and, when training, the optimizer.

        Args:
            config: object exposing ``vocab_size``, ``embed_size``,
                ``label_size``, ``batch_size``, ``max_doc_len`` and optionally
                ``layer_size``.
            x: tensor of word ids, one row per document.
            y: tensor of integer class labels, one per document.
            lengths: tensor of per-document sequence lengths, passed to
                ``dynamic_rnn`` as ``sequence_length``.
            is_training: enables input dropout and creation of the train op.
        '''
        self.x = x
        self.y = y
        self.vocab_size = config.vocab_size
        self.embed_size = config.embed_size
        self.label_size = config.label_size
        self.batch_size = config.batch_size
        # Configs without ``layer_size`` fall back to the former fixed depth.
        self.layer_size = getattr(config, 'layer_size', 2)
        self.lengths = lengths
        self.max_doc_len = config.max_doc_len
        self.is_training = is_training
        # Set before ``optimize`` is built; also backs ``learningRate``.
        self._learning_rate = 0.0005
        # Touching the lazy properties is what adds their ops to the graph.
        self.predict
        if is_training:
            self.optimize
        print('Model Initialized!')

    @lazy_property
    def cost(self):
        '''Softmax cross-entropy between the logits and one-hot labels.'''
        logits = self.inference
        # Depth follows the configured number of classes.
        targets = tf.one_hot(self.y, self.label_size, 1, 0)
        targets = tf.cast(targets, tf.float32)
        loss = tf.losses.softmax_cross_entropy(targets, logits)
        return loss

    @lazy_property
    def predict(self):
        '''Index of the highest logit for each document.'''
        logits = self.inference
        predictions = tf.argmax(logits, 1)
        return predictions

    @lazy_property
    def correct_num(self):
        '''Number of documents in the batch whose prediction equals the label.'''
        prediction = self.predict
        targets = tf.reshape(self.y, [-1])
        # argmax yields int64, so the labels are cast to match.
        targets = tf.cast(targets, tf.int64)
        correct_prediction = tf.equal(prediction, targets)
        correct_num = tf.reduce_sum(tf.cast(correct_prediction, "float"))
        return correct_num

    @lazy_property
    def optimize(self):
        '''Adam training op minimising ``cost``.'''
        with tf.variable_scope('optimizer'):
            cost = self.cost
            train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(cost)
        return train_op

    @lazy_property
    def inference(self):
        '''Embed the input, run the LSTM stack and return the class logits.'''
        # Create embedding matrix
        with tf.device("/cpu:0"):
            embeddings = tf.get_variable('embedding', [self.vocab_size,  self.embed_size])
            inputs = tf.nn.embedding_lookup(embeddings, self.x)
        if self.is_training:
            inputs = tf.nn.dropout(inputs, 0.5)

        def lstm():
            return rnn.BasicLSTMCell(self.embed_size, forget_bias=0.0,
                                      state_is_tuple=True)
        with tf.variable_scope('RNNLayer'):
            cell = rnn.MultiRNNCell([lstm() for _ in range(self.layer_size)],
                                    state_is_tuple=True)
            initial_state = cell.zero_state(self.batch_size, tf.float32)
            # Dynamic RNN with given lengths for each text
            outputs, status = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state,
                                            sequence_length=self.lengths, dtype=tf.float32)
        # With padding, outputs[:, -1, :] would be computed from padding values.
        # The returned state is used instead: dynamic_rnn stops updating it
        # after ``sequence_length`` steps, so for a length-10 text padded to 20
        # words it is the state of the 10th step. A multi-layer RNN returns one
        # state per layer; take the hidden state of the last (top) layer.
        output = status[-1].h

        with tf.variable_scope('outputlayer'):
            weights = tf.get_variable('weights', [self.embed_size, self.label_size], dtype=tf.float32)
            biases = tf.get_variable('biases', [self.label_size], dtype=tf.float32)
            logits = tf.matmul(output, weights) + biases
        # Unnormalised class scores
        return logits

    @property
    def learningRate(self):
        '''Learning rate used by the Adam optimizer.'''
        return self._learning_rate
        

In [67]:
graph_rnn = tf.Graph()
# Build one training model and two evaluation models that share its weights:
# one for full test batches and one for single samples (the leftover tail).
with graph_rnn.as_default():
    initializer = tf.random_uniform_initializer(-0.02, 0.02)

    with tf.name_scope('train'):
        train_data = tf.placeholder(tf.int32, [trainConfig.batch_size, None])
        train_label = tf.placeholder(tf.int32, [trainConfig.batch_size])
        # Lengths are fed to dynamic_rnn as sequence_length, which is
        # documented as an int32/int64 vector, so declare them as integers.
        train_lengths = tf.placeholder(tf.int32, [trainConfig.batch_size])
        # reuse=None: this model creates the variables under "Model".
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = RNN_Model(trainConfig, train_data, train_label, train_lengths)
    with tf.name_scope('test'):
        test_data = tf.placeholder(tf.int32, [testConfig.batch_size, None])
        test_label = tf.placeholder(tf.int32, [testConfig.batch_size])
        test_lengths = tf.placeholder(tf.int32, [testConfig.batch_size])
        single_data = tf.placeholder(tf.int32, [singleConfig.batch_size, None])
        single_label = tf.placeholder(tf.int32, [singleConfig.batch_size])
        single_lengths = tf.placeholder(tf.int32, [singleConfig.batch_size])
        # reuse=True: both evaluation models read the training model's variables.
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            test_model = RNN_Model(testConfig, test_data, test_label, test_lengths, False)
            single_model = RNN_Model(singleConfig, single_data, single_label, single_lengths, False)
    saver = tf.train.Saver()

Model Initialized!
Model Initialized!
Model Initialized!


In [68]:
# How many test batches the test split fills (a fractional part means a leftover tail).
len(test_news_vecs) / testConfig.batch_size

103.875

In [69]:
# Test samples left over after the last full batch.
len(test_news_vecs) % testConfig.batch_size

56

In [70]:
import os
import time

epochs = 30
checkpoint_prefix = 'model/model.ckpt'
# Saving may fail if the checkpoint's parent directory is missing, so create
# it up front (a no-op when it already exists).
os.makedirs(os.path.dirname(checkpoint_prefix), exist_ok=True)

with tf.Session(graph=graph_rnn) as sess:
    # Initialize parameters
    sess.run(tf.global_variables_initializer())
    start_time = time.time()
    for m in range(epochs):
        for i in range(train_chunk_num):
            # Ask for exactly the batch size the training placeholders expect.
            x, y, lengths = train_samples.generate_batch(trainConfig.batch_size)
            feed_dict = {train_data: x, train_label: y, train_lengths: lengths}
            loss, _ = sess.run([train_model.cost, train_model.optimize], feed_dict=feed_dict)
            if i % 100 == 0:
                print('Loss:', round(loss, 4))
        end_time = time.time()
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))
        start_time = end_time
        # One checkpoint per epoch, suffixed with the 1-based epoch number.
        saver.save(sess, checkpoint_prefix, global_step=m+1)


Loss: 2.9968
Loss: 3.0664
Epoch 0 time:37.43
Loss: 2.7493
Loss: 2.9734
Epoch 1 time:38.97
Loss: 2.9718
Loss: 2.9568
Epoch 2 time:39.09
Loss: 2.979
Loss: 3.0325
Epoch 3 time:38.96
Loss: 2.9921
Loss: 2.9809
Epoch 4 time:39.45
Loss: 2.952
Loss: 2.9359
Epoch 5 time:39.05
Loss: 3.0217
Loss: 2.9496
Epoch 6 time:39.03
Loss: 2.8805
Loss: 2.9681
Epoch 7 time:39.13
Loss: 3.0039
Loss: 2.946
Epoch 8 time:39.05
Loss: 2.893
Loss: 2.9756
Epoch 9 time:39.14
Loss: 2.9267
Loss: 2.9998
Epoch 10 time:38.96
Loss: 2.9894
Loss: 2.9838
Epoch 11 time:38.92
Loss: 2.8845
Loss: 2.6776
Epoch 12 time:38.90
Loss: 2.7426
Loss: 2.4841
Epoch 13 time:38.98
Loss: 2.4402
Loss: 2.3949
Epoch 14 time:38.78
Loss: 2.3496
Loss: 2.2987
Epoch 15 time:39.32
Loss: 2.2507
Loss: 2.6053
Epoch 16 time:38.42
Loss: 2.3727
Loss: 2.39
Epoch 17 time:38.81
Loss: 2.3345
Loss: 2.2796
Epoch 18 time:38.84
Loss: 2.298
Loss: 2.1731
Epoch 19 time:39.38
Loss: 2.3299
Loss: 2.1623
Epoch 20 time:39.08
Loss: 2.0615
Loss: 2.2393
Epoch 21 time:39.05
Loss:

In [71]:
import time

with tf.Session(graph=graph_rnn) as sess:
    # Get the latest model
    model_file = tf.train.latest_checkpoint('model/')
    if model_file is None:
        raise ValueError("No checkpoint found in 'model/'; run the training cell first.")
    # Pass the saved model parameters to sess
    saver.restore(sess, model_file)
    # Calculate Testing Accuracy
    print('Testing...')
    # Time the evaluation itself rather than reusing the training loop's timer.
    start_time = time.time()
    count = 0
    test_samples = generateSamples(test_news_vecs, test_news_labels)
    # Full batches go through test_model; the leftover tail is fed one sample
    # at a time through single_model (singleConfig.batch_size is 1).
    full_batches, leftover = divmod(len(test_news_vecs), testConfig.batch_size)
    for _ in range(full_batches):
        x, y, lengths = test_samples.generate_batch(testConfig.batch_size, False)
        feed_dict = {test_data: x, test_label: y, test_lengths: lengths}
        n = sess.run(test_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    for _ in range(leftover):
        x, y, lengths = test_samples.generate_batch(singleConfig.batch_size, False)
        feed_dict = {single_data: x, single_label: y, single_lengths: lengths}
        n = sess.run(single_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    # Overall accuracy: correct predictions over all test samples.
    print(count*1.0/len(test_news_vecs))

INFO:tensorflow:Restoring parameters from model/model.ckpt-30
Testing...
Test Samples come to an end!
Testing Time:201.73
0.180505415162
