This experiment aims to classify the 20 Newsgroups dataset with a convolutional neural network model. Based on the original [paper](http://arxiv.org/abs/1408.5882), [dennybritz](https://github.com/dennybritz/cnn-text-classification-tf) implemented this model for sentiment classification using TensorFlow, and he was generous enough to share his work on [Github](https://github.com/dennybritz/cnn-text-classification-tf). Referring to his work, I made some modifications to the model and used it to classify 20 Newsgroups.

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn

## Read and Preprocess Texts
I have preprocessed the original texts and saved them as CSV files.

In [2]:
# Load the preprocessed train and test splits from their CSV files.
# Later cells read the `text` and `target` columns of both frames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_data.csv', 'data/test_data.csv')
)

In [3]:
# Build vocabulary
# Token count of the longest training article, splitting on single spaces.
# NOTE: this value is not used below; the processor is given a fixed length.
max_document_length = max(len(article.split(" ")) for article in train_data.text)
# Cut long articles to 800 words. Pad short ones
vocab_processor = learn.preprocessing.VocabularyProcessor(800)
# Fit the vocabulary on the training texts and collect one id row per article
# into a 2-D array (its second dimension is used as the model sequence length).
train_id_rows = vocab_processor.fit_transform(train_data.text)
x_train = np.array(list(train_id_rows))

In [10]:
# Encode the test articles with the vocabulary processor that was fitted on
# the training texts, collecting one id row per article.
test_id_rows = vocab_processor.transform(test_data.text)
x_test = np.array(list(test_id_rows))

In [11]:
# Turn each label column into an (n_samples, 1) column vector, which is the
# 2-D layout handed to the one-hot encoder in the next cell.
y_train = np.array(train_data.target).reshape(-1, 1)
y_test = np.array(test_data.target).reshape(-1, 1)

In [12]:
# Encode the labels as one-hot vectors
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
# Learn the label categories from the training labels only, then encode both
# splits with that same fitted encoder.
ohe.fit(y_train)
y_train, y_test = ohe.transform(y_train), ohe.transform(y_test)

In [13]:
# Restore dense arrays from the sparse one-hot matrices.
# A single .toarray() call per matrix replaces the former Python-level loop
# that converted and flattened one row at a time; the resulting
# (n_samples, n_classes) array is the same.
y_train = y_train.toarray()
y_test = y_test.toarray()

In [14]:
graph = tf.Graph()
with graph.as_default():
    # Sequence length, class count and vocabulary size come from the prepared
    # data; the remaining entries are fixed hyperparameters of this experiment.
    cnn_params = dict(
        sequence_length=x_train.shape[1],
        num_classes=y_train.shape[1],
        vocab_size=len(vocab_processor.vocabulary_),
        embedding_size=64,
        filter_sizes=[3, 4, 5, 6],
        num_filters=32,
        l2_reg_lambda=0.02,
    )
    cnn = TextCNN(**cnn_params)

In [15]:
with graph.as_default():
    # Define Training procedure
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    # Gradients are computed and applied in two explicit calls (instead of
    # optimizer.minimize) so they can also be fed to the summaries below.
    grads_and_vars = optimizer.compute_gradients(cnn.loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Keep track of gradient values and sparsity (optional)
    grad_summaries = []
    for grad, var in grads_and_vars:
        if grad is None:
            # No gradient was produced for this variable; nothing to record.
            continue
        # ':' is replaced in the variable name before it is used in a summary tag.
        tag_prefix = var.name.replace(':', '_')
        grad_summaries.append(
            tf.summary.histogram("{}/grad/hist".format(tag_prefix), grad))
        grad_summaries.append(
            tf.summary.scalar("{}/grad/sparsity".format(tag_prefix), tf.nn.zero_fraction(grad)))
    grad_summaries_merged = tf.summary.merge(grad_summaries)

    # Output directory for models and summaries: ./runs/<unix timestamp>
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Summaries for loss and accuracy
    loss_summary = tf.summary.scalar("loss", cnn.loss)
    acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

    # Train summaries additionally include the merged gradient summaries
    train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, graph)

    # Dev summaries: loss and accuracy only
    dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
    dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
    dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, graph)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # Only the 5 most recent checkpoints are kept.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

Writing to C:\Users\onlooker\Documents\deeplearning_projects\text_classification_cnn\cnn_text_classification_tf\runs\1509495505



In [17]:
with tf.Session(graph=graph) as sess:
    # Build the variable-initializer op once and run that same op. Previously
    # a second initializer op was created for the run call and `init` was
    # never executed.
    init = tf.global_variables_initializer()
    sess.run(init)
    def train_step(x_batch, y_batch):
        """
        A single training step

        Feeds the batch with a dropout keep probability of 0.5, runs the
        training op, writes the training summaries for the resulting global
        step, and prints loss and accuracy whenever that step is a multiple
        of 100.
        """
        fetches = [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy]
        feed_dict = {
            cnn.input_x: x_batch,
            cnn.input_y: y_batch,
            cnn.dropout_keep_prob: 0.5,
        }
        _, step, summaries, loss, accuracy = sess.run(fetches, feed_dict)
        if step % 100 == 0:
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
        train_summary_writer.add_summary(summaries, step)

    def dev_step(x_batch, y_batch, writer=None):
        """
        Evaluates model on a dev set

        The set is evaluated in chunks of at most 32 examples with dropout
        disabled (keep probability 1.0). Prints the global step, the
        example-weighted mean of the per-chunk losses and the accuracy over
        the whole set.

        Args:
            x_batch: input rows fed to cnn.input_x.
            y_batch: label rows fed to cnn.input_y, aligned with x_batch.
            writer: optional summary writer. When given, the summaries of the
                last evaluated chunk are added to it.
        """
        print('Evaluation....')
        total = len(x_batch)
        if total == 0:
            # Nothing to evaluate; without this guard the step and loss used
            # in the report below would be undefined.
            print('Empty evaluation set, nothing to evaluate.')
            return

        chunk_size = 32
        count = 0
        loss_sum = 0.0
        # Stepping by chunk_size lets the final slice pick up the remainder as
        # one smaller chunk instead of feeding the leftover examples one by one.
        for start in range(0, total, chunk_size):
            end = start + chunk_size
            x = x_batch[start: end]
            y = y_batch[start: end]
            feed_dict = {
                cnn.input_x: x,
                cnn.input_y: y,
                cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, correct_num = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.correct_num],
                feed_dict)
            count += correct_num
            # Weight each chunk by its size so the short last chunk does not
            # skew the mean (assumes cnn.loss is a per-chunk average -- TODO
            # confirm against TextCNN).
            loss_sum += loss * len(x)

        time_str = datetime.datetime.now().isoformat()
        # Accuracy is computed from the accumulated correct count, not from
        # the correct count of the last chunk only.
        print("{}: step {}, loss {:g}, acc {:g}".format(
            time_str, step, loss_sum / total, float(count) / total))
        if writer:
            # NOTE: these summaries describe the last evaluated chunk only.
            writer.add_summary(summaries, step)

    # Generate batches
    # (32 and 50 are presumably batch size and number of epochs -- confirm
    # against data_helpers.batch_iter.)
    training_pairs = list(zip(x_train, y_train))
    batches = data_helpers.batch_iter(training_pairs, 32, 50)
    # Training loop. For each batch...
    for batch in batches:
        # Split the (x, y) pairs of the batch into one array of inputs and one of labels.
        x_batch, y_batch = (np.array(column) for column in zip(*batch))

        train_step(x_batch, y_batch)
        current_step = tf.train.global_step(sess, global_step)
        if current_step % 300 != 0:
            continue
        # Periodic checkpoint every 300 global steps.
        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
        print("Saved model checkpoint to {}\n".format(path))
    # Final checkpoint once all batches have been consumed.
    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
    print("Saved model checkpoint to {}\n".format(path))

2017-11-01T08:19:19.986761: step 100, loss 4.95034, acc 0.09375
2017-11-01T08:19:44.527734: step 200, loss 4.74812, acc 0.09375
Saved model checkpoint to C:\Users\onlooker\Documents\deeplearning_projects\text_classification_cnn\cnn_text_classification_tf\runs\1509495505\checkpoints\model-200

2017-11-01T08:20:10.927117: step 300, loss 3.64749, acc 0.09375
2017-11-01T08:20:35.603756: step 400, loss 3.72247, acc 0.09375
Saved model checkpoint to C:\Users\onlooker\Documents\deeplearning_projects\text_classification_cnn\cnn_text_classification_tf\runs\1509495505\checkpoints\model-400

2017-11-01T08:21:01.370469: step 500, loss 3.69736, acc 0.0625
2017-11-01T08:21:25.801734: step 600, loss 2.91698, acc 0.1875
Saved model checkpoint to C:\Users\onlooker\Documents\deeplearning_projects\text_classification_cnn\cnn_text_classification_tf\runs\1509495505\checkpoints\model-600

2017-11-01T08:21:51.764804: step 700, loss 3.23904, acc 0.0625
2017-11-01T08:22:16.486764: step 800, loss 2.8626, acc 0.

In [19]:
with tf.Session(graph=graph) as sess:
    # Continue training from the most recent checkpoint found in the
    # hard-coded run directory below.
    model_file = tf.train.latest_checkpoint('runs/1509495505/checkpoints/')
    saver.restore(sess, model_file)
    # Generate batches
    # (32 and 10 are presumably batch size and number of epochs -- confirm
    # against data_helpers.batch_iter.)
    training_pairs = list(zip(x_train, y_train))
    batches = data_helpers.batch_iter(training_pairs, 32, 10)
    # Training loop. For each batch...
    for batch in batches:
        # Split the (x, y) pairs of the batch into one array of inputs and one of labels.
        x_batch, y_batch = (np.array(column) for column in zip(*batch))

        # train_step comes from the earlier training cell; it reads the name
        # `sess` when called, so (assuming these cells run at notebook top
        # level) it operates on the session opened here.
        train_step(x_batch, y_batch)
        current_step = tf.train.global_step(sess, global_step)
        if current_step % 300 != 0:
            continue
        # Periodic checkpoint every 300 global steps.
        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
        print("Saved model checkpoint to {}\n".format(path))
    # Final checkpoint once all batches have been consumed.
    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
    print("Saved model checkpoint to {}\n".format(path))

INFO:tensorflow:Restoring parameters from C:\Users\onlooker\Documents\deeplearning_projects\text_classification_cnn\cnn_text_classification_tf\runs\1509495505\checkpoints\model-10600
2017-11-01T10:53:14.178573: step 10700, loss 0.130815, acc 1
2017-11-01T10:53:38.938346: step 10800, loss 0.125105, acc 1
Saved model checkpoint to C:\Users\onlooker\Documents\deeplearning_projects\text_classification_cnn\cnn_text_classification_tf\runs\1509495505\checkpoints\model-10800

2017-11-01T10:54:05.163908: step 10900, loss 0.154567, acc 1
2017-11-01T10:54:29.679825: step 11000, loss 0.15521, acc 1
2017-11-01T10:54:54.389944: step 11100, loss 0.241976, acc 0.9375
Saved model checkpoint to C:\Users\onlooker\Documents\deeplearning_projects\text_classification_cnn\cnn_text_classification_tf\runs\1509495505\checkpoints\model-11100

2017-11-01T10:55:20.553063: step 11200, loss 0.162005, acc 1
2017-11-01T10:55:45.300993: step 11300, loss 0.128682, acc 1
2017-11-01T10:56:09.944910: step 11400, loss 0.135

In [20]:
#Restore the parameters and do testing
with tf.Session(graph=graph) as sess:
    # Load the most recent checkpoint from the hard-coded run directory.
    model_file=tf.train.latest_checkpoint('runs/1509495505/checkpoints/')
    saver.restore(sess, model_file)

    batch_size = 32
    count = 0
    # Walk the test set in consecutive slices of at most batch_size rows. The
    # last slice holds the remainder, so inputs and labels always cover the
    # same rows. Previously the remainder loop fed x_test[start:] against a
    # single label row, which paired mismatched inputs and labels.
    for start in range(0, len(x_test), batch_size):
        end = start + batch_size
        feed_dict = {
            cnn.input_x: x_test[start:end],
            cnn.input_y: y_test[start:end],
            # Dropout is disabled for evaluation.
            cnn.dropout_keep_prob: 1.0
        }
        count += sess.run(cnn.correct_num, feed_dict)

    # Fraction of test examples counted as correct across all slices.
    print("Accuracy:{:.5f}".format(float(count)/len(y_test)))

INFO:tensorflow:Restoring parameters from C:\Users\onlooker\Documents\deeplearning_projects\text_classification_cnn\cnn_text_classification_tf\runs\1509495505\checkpoints\model-14140
Accuracy:0.79607


注意：进入对应路径下运行 `tensorboard --logdir=train`，注意参数中不要有空格。