In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from pprint import pprint

import tensorflow as tf
import tensorflow.contrib.slim as slim

# 0. Data Loading and processing
앞선 실험에서 TfidfVectorizer를 사용하는 것이 근소하게 성능이 더 좋았기 때문에, 여기서는 TfidfVectorizer만 사용하였음

In [None]:
# Fetch the train and test splits of the four selected newsgroups.
# The same `remove` tuple (headers, footers, quotes) is passed for both splits.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Documents (.data) and labels (.target) of each split
X_train, Y_train = newsgroups_train.data, newsgroups_train.target
X_test, Y_test = newsgroups_test.data, newsgroups_test.target

In [None]:
# Declare the TF-IDF vectorizer (the only vectorizer used in this notebook;
# the CountVectorizer variant is intentionally left out).
tfidf_vectorizer = TfidfVectorizer(min_df=40)

In [None]:
# Fit the TF-IDF vectorizer on the training documents only
# (the test documents are not passed to fit).
tfidf_vectorizer = tfidf_vectorizer.fit(X_train)

In [None]:
# Vectorize both splits with the same fitted TF-IDF vectorizer (train first, then test)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(docs) for docs in (X_train, X_test)
)

In [None]:
# Densify the TF-IDF matrices so they can be fed to TensorFlow placeholders.
# NOTE: this rebinds X_train / X_test, which held the raw documents until now,
# so the vectorizer cells above must not be re-run after this cell without
# reloading the data first.
X_train, X_test = X_train_tfidf.toarray(), X_test_tfidf.toarray()

In [None]:
# Number of examples in each split, taken from the label arrays
num_train, num_test = len(Y_train), len(Y_test)

print("Number of training points: ", num_train)
print("Number of test points: ", num_test)

In [None]:
# Feature dimension = number of columns of the dense training matrix
dim_X = X_train.shape[-1]
print("Dimension of X: %d" % dim_X)

In [None]:
# Distinct class ids. They are taken from the TRAINING labels: the size of the
# model's output layer is derived from len(labels), and it must not depend on
# which classes happen to appear in the test split.
labels = np.unique(Y_train)
print("Labels: ", labels)

# 1. Fitting classifiers with TF-IDF vectorizer and TensorFlow

## 1.1. Placeholder
- Shape of the placeholder for inputs: [batch_size, dim_X]
- Shape of the placeholder for outputs: [batch_size]

In [None]:
# Inputs: float32, dim_X features per example; the batch dimension is left open
X = tf.placeholder(dtype=tf.float32, shape=[None, dim_X], name="Inputs")
# Labels: one int32 class id per example
Y = tf.placeholder(dtype=tf.int32, shape=[None], name="Labels")

## 1.2. Build the model
- TF-Slim을 이용하여 아주 간단하게 모델을 선언해봅시다.
- https://github.com/tensorflow/tensorflow/tree/r1.15/tensorflow/contrib/slim 참조 (`tf.contrib`는 TensorFlow 2.x에서 제거되었으므로 1.x 브랜치의 문서를 참조)

In [None]:
def fully_connected(inputs, num_labels, hidden_sizes=(100, 100), scope='FCN'):
    """
    [fully_connected] Build a feed-forward network with one hidden layer per
    entry of `hidden_sizes`, followed by a linear output layer (with TF-Slim).

    [Args]
      - inputs: tensor / placeholder holding the input batch
                (the notebook feeds a [batch_size, dim_X] placeholder)
      - num_labels: number of units of the final 'logits' layer
      - hidden_sizes: sequence of hidden-layer widths, one entry per hidden layer
      - scope: name of the variable scope that wraps all layers (default "FCN");
               if None, a scope derived from the default name "FCN" is used

    [Returns]
      - Output tensor of the final layer, which has no activation (the logits)
    """
    # Wrap every layer in `scope` so the argument is actually honoured and the
    # layer scopes (fc0, fc1, ..., logits) of differently-scoped networks
    # do not collide.
    with tf.variable_scope(scope, default_name='FCN'):
        # Shared defaults for all layers: ReLU, Xavier weights, zero biases
        with slim.arg_scope([slim.fully_connected],
                            activation_fn=tf.nn.relu,
                            weights_initializer=tf.contrib.layers.xavier_initializer(),
                            biases_initializer=tf.constant_initializer(0.0)):
            net = inputs
            for i, num_hidden in enumerate(hidden_sizes):
                net = slim.fully_connected(inputs=net, num_outputs=num_hidden, scope='fc' + str(i))
            # Output layer: activation_fn=None overrides the ReLU default
            net = slim.fully_connected(inputs=net, num_outputs=num_labels, activation_fn=None, scope='logits')

    return net

In [None]:
# Build the classifier on the input placeholder: two hidden layers of 100 units
# each and one output unit per class label.
logits = fully_connected(inputs=X, num_labels=len(labels), hidden_sizes=[100, 100], scope='FCN')

## 1.3. Cost function and optimizer

In [None]:
# Mean softmax cross-entropy over the fed batch; Y holds integer class ids
cost = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y, logits=logits)
)

# Adam with a fixed learning rate of 0.01
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss=cost)

## 1.4. Predicting operator

In [None]:
# A prediction is correct when the true label is among the top-1 logits
correct_prediction = tf.nn.in_top_k(predictions=logits, targets=Y, k=1)
# Fraction of correct predictions in the fed batch
accuracy = tf.reduce_mean(tf.cast(correct_prediction, dtype=tf.float32))

## 1.5. Run

In [None]:
# Training hyper-parameters: number of epochs and mini-batch size
NUM_EPOCHS, BATCH_SIZE = 40, 20

In [None]:
# Create the (initially empty) lists that collect the results of the run
train_cost_list, test_cost_list, test_accuracy_list = [], [], []

In [None]:
with tf.Session() as sess:
    # Variable initialization
    sess.run(tf.global_variables_initializer())

    # Indices for constructing batches.
    # Every start index gets a matching end index, clipped to num_train, so the
    # trailing partial batch (when num_train is not a multiple of BATCH_SIZE)
    # is trained on instead of being silently dropped by zip().
    start_idx = range(0, num_train, BATCH_SIZE)
    end_idx = [min(start + BATCH_SIZE, num_train) for start in start_idx]

    NUM_BATCHES = len(start_idx)

    # NOTE: batches are visited in the same order every epoch (no shuffling).
    for epoch in range(NUM_EPOCHS):

        # Accumulates the summed per-example cost over the epoch
        train_cost = 0.0

        # Training phase
        for start, end in zip(start_idx, end_idx):

            # Construct the input batch
            batch_xs = X_train[start:end]
            batch_ys = Y_train[start:end]

            # Run one optimization step and fetch the batch cost
            tmp_cost, _ = sess.run([cost, train_op], feed_dict={X: batch_xs, Y: batch_ys})

            # `cost` is a mean over the batch; weight it by the batch size so
            # a smaller final batch does not skew the epoch average.
            train_cost += tmp_cost * (end - start)

        # Mean cost per training example over the whole epoch
        train_cost = train_cost / num_train
        train_cost_list.append(train_cost)
        print("[{} epoch] training cost {:0.4f}".format((epoch + 1), train_cost))

        # Periodic evaluation on the test set (every 10th epoch)
        if (epoch + 1) % 10 == 0:
            test_cost, test_accuracy = sess.run([cost, accuracy], feed_dict={X: X_test, Y: Y_test})
            test_cost_list.append(test_cost)
            test_accuracy_list.append(test_accuracy)
            print("\t[{} epoch] test accuracy {:0.4f}".format((epoch + 1), test_accuracy))

    # Test phase: final accuracy on the full test set after the last epoch
    test_accuracy = sess.run(accuracy, feed_dict={X: X_test, Y: Y_test})
    print("\n")
    print("Test accuracy: {:0.4f}".format(test_accuracy))