In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from pprint import pprint

import tensorflow as tf
import tensorflow.contrib.slim as slim

# 0. Data Loading and processing

In [2]:
# Load training set and test set
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
X_train = newsgroups_train.data
Y_train = newsgroups_train.target
X_test  = newsgroups_test.data
Y_test  = newsgroups_test.target

In [3]:
print(X_test[0:3])
print(Y_test[0:3])

['TRry the SKywatch project in  Arizona.', 'The Vatican library recently made a tour of the US.\n Can anyone help me in finding a FTP site where this collection is \n available.', 'Hi there,\n\nI am here looking for some help.\n\nMy friend is a interior decor designer. He is from Thailand. He is\ntrying to find some graphics software on PC. Any suggestion on which\nsoftware to buy,where to buy and how much it costs ? He likes the most\nsophisticated \nsoftware(the more features it has,the better)']
[2 1 1]


In [4]:
# Declare two vectorizers
tfidf_vectorizer = TfidfVectorizer(min_df=40)

In [5]:
# Fitting vectorizers to the training set
tfidf_vectorizer = tfidf_vectorizer.fit(X_train)

In [6]:
# Transform X_train and X_test using 2 vectorizers
# X_train_count = count_vectorizer.transform(X_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
# X_test_count  = count_vectorizer.transform(X_test)
X_test_tfidf  = tfidf_vectorizer.transform(X_test)

# dense vector

In [7]:
# Convert sparse matrix into dense matrix
X_train = X_train_tfidf.toarray()
X_test = X_test_tfidf.toarray()

In [8]:
num_train = Y_train.shape[0]
num_test = Y_test.shape[0]

print("Number of training points: ", num_train)
print("Number of test points: ", num_test)

Number of training points:  2034
Number of test points:  1353


In [9]:
dim_X = X_train.shape[1]
print("Dimension of X: %d" % dim_X)

Dimension of X: 758


In [10]:
labels = np.unique(Y_test)
print("Labels: ", labels)

Labels:  [0 1 2 3]


# 1. Fitting classifiers with TF-IDF vectorizer with TensorFlow

## 1.1. Placeholder
- Shape of the placeholder for inputs: [batch_size, dim_X]
- Shape of the placeholder for outputs: [batch_size]

In [11]:
X = tf.placeholder(tf.float32, [None, dim_X], name="Inputs")
Y = tf.placeholder(tf.int32, [None], name="Labels")

## 1.2. Build the model
- with TF-Slim
- https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim 참조

In [12]:
def fully_connected(inputs, num_labels, hidden_sizes=[100, 100], scope='FCN'):
    """
    [fully_connected] n개의 hidden layer를 갖는 feed-forward network 생성 (with TF-Slim)
    
    [Args]
      - inputs: 입력 데이터를 위한 placeholder
      - hidden_sizes: a list (은닉 노드 수를 원하는 층 수 만큼 기록한 리스트)
      - Scope: default value ("FCN")
    """
    # Inputs에서 1차원의 텐서들이 placeholder로 들어온다고 가정
    input_dim = inputs.get_shape()[1]

    # Number of hidden layers
    num_hidden_layers = len(hidden_sizes)
    
    with slim.arg_scope([slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_initializer=tf.contrib.layers.xavier_initializer(),
                        biases_initializer=tf.constant_initializer(0.0)):
        net = inputs
        for i in range(num_hidden_layers):
            scope_name = 'fc' + str(i)
            net = slim.fully_connected(inputs=net, num_outputs=hidden_sizes[i], scope=scope_name)
        net = slim.fully_connected(inputs=net, num_outputs=num_labels, activation_fn=None, scope='logits')
    
    return net

In [13]:
logits = fully_connected(inputs=X, num_labels=len(labels), hidden_sizes=[100, 100], scope='FCN')

## 1.3. Cost function and optimizer

In [14]:
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(cost)

## 1.4. Predicting operator

In [15]:
correct_prediction = tf.nn.in_top_k(logits, Y, 1)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

## 1.5. Run

In [16]:
# Parameters
NUM_EPOCHS = 40
BATCH_SIZE = 20

In [17]:
# 결과를 저장할 리스트를 생성
train_cost_list = []
test_cost_list = []
test_accuracy_list = []

In [18]:
with tf.Session() as sess:
    # Variable initialization
    sess.run(tf.global_variables_initializer())
    
    # Indices for constructing batches
    start_idx = range(0, num_train, BATCH_SIZE)
    end_idx = range(BATCH_SIZE, num_train + 1, BATCH_SIZE)
    
    NUM_BATCHES = len(start_idx)
    
    for epoch in range(0,NUM_EPOCHS):

        # Set "train_cost" as 0 before starting the epoch
        train_cost = 0
        
        # Training phase
        for start, end in zip(start_idx, end_idx):

            # Construct the input batch
            batch_xs = X_train[start:end]
            batch_ys = Y_train[start:end]
            
            # Calculate cost
            tmp_cost, _ = sess.run([cost, train_op], feed_dict={X: batch_xs, Y: batch_ys})
            train_cost += tmp_cost
        
        train_cost = train_cost / NUM_BATCHES
        train_cost_list.append(train_cost)
        print("[{} epoch] training cost {:0.4f}".format((epoch + 1), train_cost))
        
        # Validation phase
        if (epoch + 1) % 10 == 0:
            test_cost, test_accuracy = sess.run([cost, accuracy], feed_dict={X: X_test, Y: Y_test})
            test_cost_list.append(test_cost)
            test_accuracy_list.append(test_accuracy)
            print("\t[{} epoch] test accuracy {:0.4f}".format((epoch + 1), test_accuracy))
            
    # Test phase
    test_accuracy = sess.run(accuracy, feed_dict={X: X_test, Y: Y_test})
    print("\n")
    print("Test accuracy: {:0.4f}".format(test_accuracy))

[1 epoch] training cost 0.8086
[2 epoch] training cost 0.3784
[3 epoch] training cost 0.2137
[4 epoch] training cost 0.1946
[5 epoch] training cost 0.1616
[6 epoch] training cost 0.1125
[7 epoch] training cost 0.0743
[8 epoch] training cost 0.0686
[9 epoch] training cost 0.0649
[10 epoch] training cost 0.0741
	[10 epoch] test accuracy 0.6364
[11 epoch] training cost 0.0556
[12 epoch] training cost 0.0574
[13 epoch] training cost 0.0455
[14 epoch] training cost 0.0451
[15 epoch] training cost 0.0439
[16 epoch] training cost 0.0438
[17 epoch] training cost 0.0438
[18 epoch] training cost 0.0438
[19 epoch] training cost 0.0438
[20 epoch] training cost 0.0438
	[20 epoch] test accuracy 0.6563
[21 epoch] training cost 0.0438
[22 epoch] training cost 0.0438
[23 epoch] training cost 0.0438
[24 epoch] training cost 0.0438
[25 epoch] training cost 0.0438
[26 epoch] training cost 0.0438
[27 epoch] training cost 0.0437
[28 epoch] training cost 0.0437
[29 epoch] training cost 0.0437
[30 epoch] trai