In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import  LabelBinarizer

In [2]:
# Load the MNIST CSV exports. header=None: the files are read without a
# header row, so pandas assigns integer column labels.
df_train = pd.read_csv('./MNIST_data_csv/mnist_train.csv', header=None)
df_test = pd.read_csv('./MNIST_data_csv/mnist_test.csv', header=None)

# Column labels of each frame (not referenced by the other visible cells;
# kept so interactive inspection keeps working).
train_list = list(df_train)
test_list = list(df_test)

# Column 0 is used as the label; every remaining column is an image feature.
train_images = df_train.iloc[:, 1:].values
test_images = df_test.iloc[:, 1:].values

# Fit the binarizer ONCE on the training labels and reuse it for the test
# labels. Fitting a second binarizer on the test split derives the column
# order from the test labels alone, so a class missing from (or only present
# in) one split would silently misalign the one-hot columns.
lb1 = LabelBinarizer()
lb2 = lb1  # backward-compatible alias: both names refer to the train-fitted encoder
train_labels = lb1.fit_transform(df_train.iloc[:, 0].values)
test_labels = lb1.transform(df_test.iloc[:, 0].values)

# Single container consumed by the training cells below.
mnist = {
    'train': {
        'images': train_images,
        'labels': train_labels,
    },
    'test': {
        'images': test_images,
        'labels': test_labels,
    },
}

### softmax classifier 

In [109]:
# Graph inputs: one row of 784 features per image, one row of 10 label
# indicators per image (as produced by the label binarizer above).
x = tf.placeholder('float', [None, 784])
y = tf.placeholder('float', [None, 10])

w = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# construct model
logits = tf.matmul(x, w) + b
activation = tf.nn.softmax(logits)

# minimize cost
# Cross entropy is computed from the logits rather than as
# -sum(y * log(softmax)): the explicit log turns into -inf/NaN as soon as a
# predicted probability underflows to 0.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
l_r = 0.00001
optimizer = tf.train.GradientDescentOptimizer(l_r).minimize(cost)  # gradient descent

# Evaluation ops are built once, outside the training loop. Creating them
# inside the loop adds a fresh set of nodes to the graph on every evaluation.
correc_prediction = tf.equal(tf.argmax(activation, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correc_prediction, tf.float32))

init = tf.global_variables_initializer()

# launch the graph
with tf.Session() as sess:
    sess.run(init)
    feed_train = {x: mnist['train']['images'], y: mnist['train']['labels']}
    feed_test = {x: mnist['test']['images'], y: mnist['test']['labels']}

    # training cycle: full-batch gradient descent (the whole training set is
    # fed on every step)
    for step in range(501):
        sess.run(optimizer, feed_dict=feed_train)

        if step % 20 == 0:
            print(step, sess.run(cost, feed_dict=feed_train))
        if step % 100 == 0:
            print(sess.run(accuracy, feed_dict=feed_test))

0 1.71979
0.6804
20 0.543549
40 0.453451
60 0.414749
80 0.391904
100 0.376385
0.9042
120 0.364952
140 0.356068
160 0.348901
180 0.342957
200 0.337923
0.911
220 0.333583
240 0.329792
260 0.326441
280 0.323451
300 0.320759
0.9143
320 0.31832
340 0.316094
360 0.314052
380 0.31217
400 0.310427
0.9179
420 0.308806
440 0.307294
460 0.305879
480 0.304549


### NN with initialization & dropout

#### Xavier initialization

In [3]:
def xavier_init(n_inputs, n_outputs, uniform=True):
    """Return a Xavier-style weight initializer for a layer.

    The scale is computed as a plain Python float instead of a ``tf.sqrt``
    tensor, so calling this function adds no ops to the current graph and the
    returned initializer is not tied to the graph that was the default at
    call time.

    Args:
        n_inputs: number of input units of the layer.
        n_outputs: number of output units of the layer.
        uniform: if True, return a uniform initializer over
            ``[-r, r]`` with ``r = sqrt(6 / (n_inputs + n_outputs))``;
            otherwise return a truncated-normal initializer with
            ``stddev = sqrt(3 / (n_inputs + n_outputs))``.

    Returns:
        ``tf.random_uniform_initializer`` or
        ``tf.truncated_normal_initializer`` configured as described above.

    Raises:
        ZeroDivisionError: if ``n_inputs + n_outputs`` is 0.
    """
    fan_sum = n_inputs + n_outputs
    if uniform:
        init_range = (6.0 / fan_sum) ** 0.5
        return tf.random_uniform_initializer(-init_range, init_range)
    else:
        stddev = (3.0 / fan_sum) ** 0.5
        return tf.truncated_normal_initializer(stddev=stddev)

In [4]:
# parameter
learning_rate = 0.001
# NOTE(review): the three values below are defined but not used by the
# full-batch training loop at the end of this cell.
training_epochs = 15
batch_size = 100
display_step = 1

# tf graph input
x = tf.placeholder('float', [None, 784])
y = tf.placeholder('float', [None, 10])


# default initial values (kept for reference)
# w1 = tf.Variable(tf.random_normal([784,256]))
# w2 = tf.Variable(tf.random_normal([256,256]))
# w3 = tf.Variable(tf.random_normal([256,10]))

# Xavier initialization
# NOTE: tf.get_variable registers each name in the current graph, so
# re-running this cell without resetting the graph raises "variable already
# exists".
w1 = tf.get_variable('w1', shape=[784, 256], initializer=xavier_init(784, 256))
w2 = tf.get_variable('w2', shape=[256, 256], initializer=xavier_init(256, 256))
w3 = tf.get_variable('w3', shape=[256, 128], initializer=xavier_init(256, 128))
w4 = tf.get_variable('w4', shape=[128, 64], initializer=xavier_init(128, 64))
w5 = tf.get_variable('w5', shape=[64, 10], initializer=xavier_init(64, 10))
b1 = tf.Variable(tf.random_normal([256]))
b2 = tf.Variable(tf.random_normal([256]))
b3 = tf.Variable(tf.random_normal([128]))
b4 = tf.Variable(tf.random_normal([64]))
b5 = tf.Variable(tf.random_normal([10]))


# basic model (kept for reference)
# l1 = tf.nn.relu(tf.matmul(x,w1)+b1)
# l2 = tf.nn.relu(tf.matmul(l1,w2)+b2) # hidden layer with relu activation
# hypothesis = tf.matmul(l2,w3)+b3 # no need to use softmax here

# more deep & Dropout
# Despite its name, `dropout_rate` is passed as the second positional
# argument of tf.nn.dropout and is fed 0.7 for training and 1 for testing,
# i.e. it is used as the probability of KEEPING a unit.
dropout_rate = tf.placeholder('float')
_l1 = tf.nn.relu(tf.add(tf.matmul(x, w1), b1))
l1 = tf.nn.dropout(_l1, dropout_rate)
_l2 = tf.nn.relu(tf.add(tf.matmul(l1, w2), b2))
l2 = tf.nn.dropout(_l2, dropout_rate)
_l3 = tf.nn.relu(tf.add(tf.matmul(l2, w3), b3))
l3 = tf.nn.dropout(_l3, dropout_rate)
_l4 = tf.nn.relu(tf.add(tf.matmul(l3, w4), b4))
l4 = tf.nn.dropout(_l4, dropout_rate)

# Raw class scores (logits); softmax is applied inside the loss.
hypothesis = tf.add(tf.matmul(l4, w5), b5)


# define cost & optimizer
# Named arguments are required: TensorFlow >= 1.0 rejects positional
# arguments here, and they make the logits/labels roles explicit.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=hypothesis))  # softmax loss
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluation ops are built once, outside the training loop. Creating them
# inside the loop adds a fresh set of nodes to the graph on every evaluation.
correc_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correc_prediction, tf.float32))

# initializing the variables
init = tf.global_variables_initializer()

# launch the graph
with tf.Session() as sess:
    sess.run(init)
    # NOTE(review): images are fed exactly as loaded from the CSV; if they are
    # raw 0-255 pixel values, consider scaling them before training.
    feed_train = {x: mnist['train']['images'], y: mnist['train']['labels'], dropout_rate: 0.7}
    feed_test = {x: mnist['test']['images'], y: mnist['test']['labels'], dropout_rate: 1}

    # training cycle: every step feeds the whole training set
    for step in range(501):
        sess.run(optimizer, feed_dict=feed_train)

        if step % 20 == 0:
            # Cost is evaluated with the training feed, so dropout is active.
            print(step, sess.run(cost, feed_dict=feed_train))
        if step % 100 == 0:
            print(sess.run(accuracy, feed_dict=feed_test))

0 79.8873
0.1402


KeyboardInterrupt: 

### 정리 (Summary)

- Softmax vs Neural Nets for MNIST => 91.4% vs 94.4%
- Xavier initialization: 97.8%
- Deep Neural Nets and Dropout: 98%
- Adam optimizer