# Tutorial of implementing Transfer learning
- **Ch1. Supervised pre-training for an auxiliary task**  
mnist 데이터 중 숫자 0~4를 예측하는 5개의 hidden layer를 가지는 Deep Neural Network를 학습하고, 이 모형의 weight들을 save
  
  
- **Ch2. Transfer learning 1**  
Ch1에서 학습한 모형의 전체 weight를 restore한다. 이후 hidden layer를 제외한 output layer의 weight를 다시 initialize를 하고,
5~9에 해당하는 mnist image를 넣어서, output layer의 weight만 training 한다. 또한 Ch3에서 hidden layer의 weight를 활용하기위해서 hidden layer의 weight들만 저장하고, 후에 Ch3에서 restore한다.  

    *(이 예제에서는 hidden layer의 weight는 학습을 통해 update하지 않는다. 전체 Network를 update를 해야하는 가의 여부에 대한 guide는 아래의 링크를 참고)*  
  
    링크 : http://cs231n.github.io/transfer-learning/
  
  
- **Ch3. Transfer learning 2**  
Ch2에서 저장한 모형의 hidden layer의 weight를 restore한다. Ch2의 예제와 다른 점은 5~9의 5개의 숫자를 분류하기위해 output layer의 weight만 training 했다면, 이번 예제에서는 0~9의 숫자를 예측하기위해 output layer의 architecture를 바꾸고, weight를 initialize한 뒤, architecture가 바뀐 output layer의 weight만 update 한다.  
  
  
***전체 내용을 구성하기위한 reference는 아래와 같다.***  
* 참고 

    링크1 : https://wookayin.github.io/TensorFlowKR-2017-talk-bestpractice/ko/#1  
    링크2 : https://github.com/ageron/handson-ml/blob/master/11_deep_learning.ipynb

## Preliminary

### Load modules and mnist dataset

In [1]:
import os, sys
import shutil 
import tensorflow as tf
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from tensorflow.examples.tutorials.mnist import input_data # load mnist dataset
# Read MNIST from ./MNIST_data with one-hot labels and a fixed seed; the cells
# below slice these label columns per digit range.
mnist = input_data.read_data_sets(train_dir = './MNIST_data', one_hot = True, reshape = True, seed = 777)

Extracting ./MNIST_data\train-images-idx3-ubyte.gz
Extracting ./MNIST_data\train-labels-idx1-ubyte.gz
Extracting ./MNIST_data\t10k-images-idx3-ubyte.gz
Extracting ./MNIST_data\t10k-labels-idx1-ubyte.gz


### Define DNNClassifier class

In [2]:
class DnnClassifier:
    """Fully connected softmax classifier built with the TF1 graph API.

    Constructing an instance creates the placeholders, the dense hidden layers,
    the output layer, the loss and the prediction op. Every part lives in its
    own variable scope ('input_layer', 'hidden_layer1', 'hidden_layer2', ...,
    'output_layer', 'loss', 'predict'), which is what the rest of the notebook
    relies on to select, save and restore layers by scope name.
    """
    def __init__(self, sess, n_features, n_class, hidden_dims = (100, 100, 100, 100, 100),
               activation_fn = tf.nn.elu, initializer = tf.contrib.layers.variance_scaling_initializer()):
        """Build the network.

        Args:
            sess: session used by predict() to run the graph.
            n_features: width of the input placeholder.
            n_class: number of output units (and width of the label placeholder).
            hidden_dims: sequence with one unit count per hidden layer.
            activation_fn: activation applied after every hidden layer.
            initializer: kernel initializer of the hidden layers.
        """
        self._sess = sess
        
        with tf.variable_scope('input_layer'):
            self._x = tf.placeholder(dtype = tf.float32, shape = [None, n_features])
            self._y = tf.placeholder(dtype = tf.float32, shape = [None, n_class])
                
        _net = self._x
            
        # Hidden layers are numbered from 1: hidden_layer1, hidden_layer2, ...
        for layer, h_dim in enumerate(hidden_dims, start = 1):
            with tf.variable_scope('hidden_layer{}'.format(layer)):
                _net = tf.layers.dense(inputs = _net, units = h_dim, activation = activation_fn,
                                        kernel_initializer = initializer)
                    
        # The output layer has no activation: it produces the logits.
        with tf.variable_scope('output_layer'):
            self._score = tf.layers.dense(inputs = _net, units = n_class,
                                             kernel_initializer = tf.contrib.layers.xavier_initializer())
                
        with tf.variable_scope('loss'):
            # Mean softmax cross-entropy over the batch.
            self._ce_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = self._y, logits = self._score))
                
        with tf.variable_scope('predict'):
            # Index of the largest logit along the last axis.
            self._prediction = tf.argmax(input = self._score, axis = -1)
                
    def predict(self, x_data):
        """Return the predicted class index for every row of x_data."""
        feed_predict = {self._x : x_data}
        return self._sess.run(fetches = self._prediction, feed_dict = feed_predict)                        

### Define Solver class

In [3]:
class Solver:
    """Attaches an optimizer to a DnnClassifier and runs training / loss evaluation.

    The learning rate is a placeholder, so it is supplied on every train() call.
    """
    def __init__(self, sess, model, optimizer = tf.train.AdamOptimizer, var_list = None):
        """Create the optimizer and its training op.

        Args:
            sess: session used to run the ops.
            model: DnnClassifier instance whose loss is minimized.
            optimizer: optimizer class; it is instantiated with the learning-rate placeholder.
            var_list: forwarded unchanged to minimize(); pass a list of variables
                to restrict which variables the optimizer updates.
        """
        self._sess = sess
        self._model = model
        self._lr = tf.placeholder(dtype = tf.float32)
        self._optimizer = optimizer(self._lr)
        self._training_op = self._optimizer.minimize(loss = model._ce_loss, var_list = var_list)

    def train(self, x_data, y_data, lr):
        """Run one optimization step; returns [training op result, loss on this batch]."""
        net = self._model
        step_fetches = [self._training_op, net._ce_loss]
        step_feed = {net._x : x_data, net._y : y_data, self._lr : lr}
        return self._sess.run(fetches = step_fetches, feed_dict = step_feed)

    def evaluate(self, x_data, y_data):
        """Return the cross-entropy loss on the given data without updating any variable."""
        net = self._model
        return self._sess.run(fetches = net._ce_loss,
                              feed_dict = {net._x : x_data, net._y : y_data})

## Ch1. Supervised pre-training for an auxiliary task
0~4를 예측하는 5개의 hidden layer를 가지는 Deep Neural Network를 학습한다. 추후 이 모형의 hidden layer들의 weight는 5~9를 예측하는 Deep Neural Network의 hidden layer의 weight 값으로 활용된다.

### Pre-processing

In [4]:
'''
Pick out only the MNIST images and labels of the digits 0~4.
The one-hot labels are cut down to their first five columns.
'''
train_mask1 = np.argmax(mnist.train.labels, axis=1) < 5
valid_mask1 = np.argmax(mnist.validation.labels, axis=1) < 5
test_mask1 = np.argmax(mnist.test.labels, axis=1) < 5

x_train1, y_train1 = mnist.train.images[train_mask1], mnist.train.labels[train_mask1][:,:5]
x_valid1, y_valid1 = mnist.validation.images[valid_mask1], mnist.validation.labels[valid_mask1][:,:5]
x_test1, y_test1 = mnist.test.images[test_mask1], mnist.test.labels[test_mask1][:,:5]

### Generate DNN model and Adam solver

In [5]:
# Open the Ch1 session and build the 5-class network (digits 0~4) on 784 input features.
sess = tf.Session()
pre_trained_model = DnnClassifier(sess = sess, n_class = 5, n_features = 784)

In [6]:
# No var_list is given, so the optimizer is not restricted to a subset of the variables.
adam_opt1 = Solver(sess = sess, model = pre_trained_model, optimizer = tf.train.AdamOptimizer)

### Training

In [7]:
# Saver used by the training loop below to checkpoint the model whenever the validation loss improves.
saver = tf.train.Saver()

In [8]:
# Start from an empty checkpoint directory: remove a leftover one, then recreate it.
ckpt_dir = 'pre_trained'
if ckpt_dir in os.listdir():
    shutil.rmtree(ckpt_dir)
os.mkdir(ckpt_dir)

In [9]:
# hyper-parameters
n_epochs = 10
batch_size = 100

# early-stopping state
max_checks_without_progress = 5
checks_without_progress = 0
best_loss = np.inf # np.infty is only an alias and no longer exists in NumPy 2.0

sess.run(tf.global_variables_initializer())

for epoch in range(n_epochs):
    avg_tr_loss, avg_val_loss = 0, 0
    total_batch = x_train1.shape[0] // batch_size
    
    for step in range(total_batch):
        # Mini-batches are drawn uniformly at random (with replacement) from the
        # training and the validation set; the validation loss of a step is
        # therefore an estimate on batch_size samples, not on the full set.
        tr_indices = np.random.randint(low = 0, high = x_train1.shape[0], size = batch_size)
        val_indices = np.random.randint(low = 0, high = x_valid1.shape[0], size = batch_size)
        batch_xs, batch_ys = x_train1[tr_indices], y_train1[tr_indices]
        val_xs, val_ys = x_valid1[val_indices], y_valid1[val_indices]
        _, tr_loss = adam_opt1.train(x_data = batch_xs, y_data = batch_ys, lr = 1e-3)
        val_loss = adam_opt1.evaluate(x_data = val_xs, y_data = val_ys)
        
        # running mean of the per-step losses over the epoch
        avg_tr_loss += tr_loss / total_batch 
        avg_val_loss += val_loss / total_batch
        
        if step % 100 == 0:
            print('step : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(step, tr_loss, val_loss))

    print('epoch : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch, avg_tr_loss, avg_val_loss))
    # early stopping, checked once per epoch
    if avg_val_loss < best_loss: # checkpoint the model whenever the epoch's avg_val_loss is the lowest so far
        save_path = saver.save(sess = sess, save_path = './pre_trained/pre_trained_model.ckpt')
        best_loss = avg_val_loss
        checks_without_progress = 0
    else:
        checks_without_progress += 1
        # stop once more than max_checks_without_progress epochs in a row brought no improvement
        if checks_without_progress > max_checks_without_progress:
            print('Early stopping')
            break

step :   0, tr_loss : 1.510, val_loss : 1.059
step : 100, tr_loss : 0.026, val_loss : 0.131
step : 200, tr_loss : 0.067, val_loss : 0.038
epoch :   0, tr_loss : 0.106, val_loss : 0.098
step :   0, tr_loss : 0.133, val_loss : 0.067
step : 100, tr_loss : 0.049, val_loss : 0.168
step : 200, tr_loss : 0.008, val_loss : 0.012
epoch :   1, tr_loss : 0.047, val_loss : 0.046
step :   0, tr_loss : 0.044, val_loss : 0.010
step : 100, tr_loss : 0.043, val_loss : 0.040
step : 200, tr_loss : 0.059, val_loss : 0.022
epoch :   2, tr_loss : 0.029, val_loss : 0.034
step :   0, tr_loss : 0.013, val_loss : 0.034
step : 100, tr_loss : 0.006, val_loss : 0.005
step : 200, tr_loss : 0.035, val_loss : 0.036
epoch :   3, tr_loss : 0.021, val_loss : 0.034
step :   0, tr_loss : 0.006, val_loss : 0.067
step : 100, tr_loss : 0.032, val_loss : 0.056
step : 200, tr_loss : 0.005, val_loss : 0.076
epoch :   4, tr_loss : 0.021, val_loss : 0.040
step :   0, tr_loss : 0.021, val_loss : 0.015
step : 100, tr_loss : 0.001, 

In [10]:
# Accuracy on the 0~4 test images, using the weights currently held in the session.
hat = pre_trained_model.predict(x_data = x_test1)
test_labels1 = np.argmax(y_test1, axis = 1)
test_accuracy1 = np.mean(hat == test_labels1)
print('test accuracy : {:.2%}'.format(test_accuracy1))

test accuracy : 99.18%


In [11]:
# Close the Ch1 session; Ch2 opens a new one.
sess.close()

## Ch2. Transfer learning 1
Ch1에서 학습한 모형을 restore하고, output layer의 weight만 initialize를 한다. 그리고 숫자 5~9에 해당하는 mnist image를 feed하여 output layer의 weight만 update한다.

### Pre-processing

In [12]:
'''
Pick out only the MNIST images and labels of the digits 5~9.
The one-hot labels are cut down to their last five columns.
'''
train_mask2 = np.argmax(mnist.train.labels, axis = 1) >= 5
valid_mask2 = np.argmax(mnist.validation.labels, axis=1) >= 5
test_mask2 = np.argmax(mnist.test.labels, axis=1) >= 5

x_train2, y_train2 = mnist.train.images[train_mask2], mnist.train.labels[train_mask2][:,5:]
x_valid2, y_valid2 = mnist.validation.images[valid_mask2], mnist.validation.labels[valid_mask2][:,5:]
x_test2, y_test2 = mnist.test.images[test_mask2], mnist.test.labels[test_mask2][:,5:]

### Generate Dnn model

In [13]:
# Clear the default graph and drop the Ch1 objects before the Ch2 model is built.
tf.reset_default_graph()
del pre_trained_model, adam_opt1

In [14]:
# Same architecture as in Ch1 (5 outputs, 784 input features), so the Ch1 checkpoint fits these variables.
sess = tf.Session()
transfer_model1 = DnnClassifier(sess = sess, n_class = 5, n_features = 784)

### Restore all weights

In [15]:
'''
Restore the model that was trained on 0~4. transfer_model1 uses the same variable scopes,
so its weights receive the values of the pre_trained_model weights.
'''
saver = tf.train.Saver()
saver.restore(sess = sess, save_path = './pre_trained/pre_trained_model.ckpt')

INFO:tensorflow:Restoring parameters from ./pre_trained/pre_trained_model.ckpt


In [16]:
'''
The goal is to update only the output-layer weights. To be able to check that this works,
keep a copy of the hidden-layer weights before transfer learning starts.
'''
before_hidden_weights = sess.run(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope ='hidden'))

In [17]:
'''
Check that the restore worked.
In the recorded run the restored weights are those saved at epoch 8 of the pre_trained_model training.
'''
hat = transfer_model1.predict(x_data = x_test1)
restored_labels1 = np.argmax(y_test1, axis = 1)
print('test accuracy : {:.2%}'.format(np.mean(hat == restored_labels1)))

test accuracy : 99.01%


In [18]:
'''
The weights were trained to classify 0~4 and are used here to predict 5~9,
so a rather low test accuracy is expected.
'''
hat = transfer_model1.predict(x_data = x_test2)
baseline_labels2 = np.argmax(y_test2, axis = 1)
np.mean(hat == baseline_labels2)

0.37235136803126928

### Generate Adam solver and initialize weights of output layer and initial parameters of Adam

In [19]:
'''
The hidden-layer weights are not updated (frozen). To update only output_layer, the variables
that have to be updated can be selected as below. This list is passed to the var_list of Solver.
'''
tf.contrib.framework.get_trainable_variables(scope = 'output_layer')
# Alternative spelling kept from the original notebook:
# tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'output_layer')

[<tf.Variable 'output_layer/dense/kernel:0' shape=(100, 5) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/bias:0' shape=(5,) dtype=float32_ref>]

In [20]:
# Transfer learning: var_list restricts the optimizer to the output_layer variables
adam_opt2 = Solver(sess = sess, model = transfer_model1,
                    optimizer = tf.train.AdamOptimizer,
                    var_list = tf.contrib.framework.get_trainable_variables(scope = 'output_layer'))

In [21]:
'''
The output_layer weights have to be initialized first. Because transfer learning uses the
Adam optimizer, the initial values of the Adam optimizer's own variables have to be initialized as well.
'''
tf.global_variables()

[<tf.Variable 'hidden_layer1/dense/kernel:0' shape=(784, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer1/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer2/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer2/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer3/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer3/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer4/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer4/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer5/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer5/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/kernel:0' shape=(100, 5) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/bias:0' shape=(5,) dtype=float32_ref>,
 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>,

In [22]:
# The last six global variables are the ones added for Adam (see the cell output below).
tf.global_variables()[-6:]

[<tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/kernel/Adam:0' shape=(100, 5) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/kernel/Adam_1:0' shape=(100, 5) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/bias/Adam:0' shape=(5,) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/bias/Adam_1:0' shape=(5,) dtype=float32_ref>]

In [23]:
# The output_layer variables, listed again before they are re-initialized.
tf.contrib.framework.get_trainable_variables(scope = 'output_layer')

[<tf.Variable 'output_layer/dense/kernel:0' shape=(100, 5) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/bias:0' shape=(5,) dtype=float32_ref>]

In [24]:
# Re-initialize the output layer and initialize the Adam variables while keeping
# the restored hidden layers untouched. Every global variable outside the
# 'hidden_layer*' scopes is selected by name, so the selection does not depend
# on how many variables the optimizer created or in which order.
vars_params = [var for var in tf.global_variables() if not var.name.startswith('hidden_layer')]
print(vars_params)
sess.run(tf.variables_initializer(var_list = vars_params))

[<tf.Variable 'output_layer/dense/kernel:0' shape=(100, 5) dtype=float32_ref>, <tf.Variable 'output_layer/dense/bias:0' shape=(5,) dtype=float32_ref>, <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>, <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>, <tf.Variable 'output_layer/dense/kernel/Adam:0' shape=(100, 5) dtype=float32_ref>, <tf.Variable 'output_layer/dense/kernel/Adam_1:0' shape=(100, 5) dtype=float32_ref>, <tf.Variable 'output_layer/dense/bias/Adam:0' shape=(5,) dtype=float32_ref>, <tf.Variable 'output_layer/dense/bias/Adam_1:0' shape=(5,) dtype=float32_ref>]


### Training

In [25]:
# hyper-parameters
n_epochs = 10
batch_size = 100

# Fixed number of epochs: this loop neither checkpoints nor stops early.
for epoch in range(n_epochs):
    avg_tr_loss, avg_val_loss = 0, 0
    total_batch = x_train2.shape[0] // batch_size
    
    for step in range(total_batch):
        # Mini-batches are drawn uniformly at random (with replacement) from the
        # 5~9 training and validation sets.
        tr_indices = np.random.randint(low = 0, high = x_train2.shape[0], size = batch_size)
        val_indices = np.random.randint(low = 0, high = x_valid2.shape[0], size = batch_size)
        batch_xs, batch_ys = x_train2[tr_indices], y_train2[tr_indices]
        val_xs, val_ys = x_valid2[val_indices], y_valid2[val_indices]
        _, tr_loss = adam_opt2.train(x_data = batch_xs, y_data = batch_ys, lr = 1e-3)
        val_loss = adam_opt2.evaluate(x_data = val_xs, y_data = val_ys)
        
        # running mean of the per-step losses over the epoch
        avg_tr_loss += tr_loss / total_batch 
        avg_val_loss += val_loss / total_batch
        
        if step % 100 == 0:
            print('step : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(step, tr_loss, val_loss))

    print('epoch : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch, avg_tr_loss, avg_val_loss))

step :   0, tr_loss : 3.998, val_loss : 4.143
step : 100, tr_loss : 1.194, val_loss : 0.740
step : 200, tr_loss : 0.697, val_loss : 0.503
epoch :   0, tr_loss : 1.131, val_loss : 1.066
step :   0, tr_loss : 0.798, val_loss : 0.498
step : 100, tr_loss : 0.566, val_loss : 0.425
step : 200, tr_loss : 0.420, val_loss : 0.396
epoch :   1, tr_loss : 0.462, val_loss : 0.452
step :   0, tr_loss : 0.488, val_loss : 0.337
step : 100, tr_loss : 0.531, val_loss : 0.524
step : 200, tr_loss : 0.427, val_loss : 0.343
epoch :   2, tr_loss : 0.385, val_loss : 0.358
step :   0, tr_loss : 0.255, val_loss : 0.512
step : 100, tr_loss : 0.429, val_loss : 0.315
step : 200, tr_loss : 0.257, val_loss : 0.431
epoch :   3, tr_loss : 0.338, val_loss : 0.314
step :   0, tr_loss : 0.238, val_loss : 0.262
step : 100, tr_loss : 0.315, val_loss : 0.391
step : 200, tr_loss : 0.332, val_loss : 0.272
epoch :   4, tr_loss : 0.306, val_loss : 0.294
step :   0, tr_loss : 0.262, val_loss : 0.225
step : 100, tr_loss : 0.300, 

In [26]:
'''
To check that only the output_layer weights were updated, fetch the hidden-layer weights
once the updates have finished.
'''
after_hidden_weights = sess.run(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope ='hidden'))

In [27]:
'''
Verification: the hidden-layer weights were not updated.
One line is printed per hidden-layer variable; True means it is unchanged.
'''
for weights_before, weights_after in zip(before_hidden_weights, after_hidden_weights):
    print(np.all(weights_before == weights_after))

True
True
True
True
True
True
True
True
True
True


In [28]:
'''
After transfer learning, the test accuracy of the model that classifies 5~9 improves.
'''
hat = transfer_model1.predict(x_data = x_test2)
transfer_labels2 = np.argmax(y_test2, axis = 1)
np.mean(hat == transfer_labels2)

0.91853528080641844

In [29]:
'''
Ch3 reuses only the hidden-layer weights, so only the hidden-layer weights of transfer_model1
(the model that classifies 5~9) are saved.
'''
reuse_vars = tf.contrib.framework.get_trainable_variables(scope = 'hidden_layer')
saver = tf.train.Saver(var_list = reuse_vars)
# Create the target directory up front (as Ch1 does for 'pre_trained') so that
# the save cannot fail because of a missing parent directory.
os.makedirs('transfer_model1', exist_ok = True)
saver.save(sess = sess, save_path = './transfer_model1/transfer_model1.ckpt')

'./transfer_model1/transfer_model1.ckpt'

In [30]:
# Close the Ch2 session; Ch3 opens a new one.
sess.close()

## Ch3. Transfer learning 2
Ch2에서 학습한 transfer_model1의 hidden layer의 weight를 restore하고, 0~9를 분류하는 모형의 output layer의 weight와 Adam optimizer의 parameter의 initial value를 initialize해야한다.

### Generate Dnn model

In [31]:
# Clear the default graph and drop the Ch2 objects before the Ch3 model is built.
tf.reset_default_graph()
del transfer_model1, adam_opt2

In [32]:
# Same hidden architecture as before, but with 10 output units for the digits 0~9.
sess = tf.Session()
transfer_model2 = DnnClassifier(sess = sess, n_class = 10, n_features = 784)

### Restore weights of hidden layers

In [33]:
'''
Restore only the hidden-layer weights that were saved from transfer_model1 in Ch2.
The Saver is built from the 'hidden_layer' variables alone, so the output_layer of
transfer_model2 is not restored here.
'''
reuse_vars = tf.contrib.framework.get_trainable_variables(scope = 'hidden_layer')
saver = tf.train.Saver(var_list = reuse_vars)
saver.restore(sess = sess, save_path = './transfer_model1/transfer_model1.ckpt')

INFO:tensorflow:Restoring parameters from ./transfer_model1/transfer_model1.ckpt


### Generate Adam solver and initialize weights of output layer and initial parameters of Adam

In [34]:
'''
Among the trainable variables below, the output_layer weights were not restored and hold no
values yet. They are initialized later, together with the initial parameters of the Adam optimizer.
'''
tf.contrib.framework.get_trainable_variables()

[<tf.Variable 'hidden_layer1/dense/kernel:0' shape=(784, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer1/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer2/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer2/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer3/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer3/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer4/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer4/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer5/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer5/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/kernel:0' shape=(100, 10) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/bias:0' shape=(10,) dtype=float32_ref>]

In [35]:
'''
The hidden-layer weights are not updated (frozen). To update only output_layer, the variables
that have to be updated are selected by scope and passed to the var_list of Solver.
'''
adam_opt3 = Solver(sess = sess, model = transfer_model2,
                    optimizer = tf.train.AdamOptimizer,
                    var_list = tf.contrib.framework.get_trainable_variables(scope = 'output_layer'))

In [36]:
'''
The output_layer weights have to be initialized first. Because transfer learning uses the
Adam optimizer, the initial values of the Adam optimizer's own variables have to be initialized as well.
This is the same procedure as in Ch2.
'''
tf.global_variables()

[<tf.Variable 'hidden_layer1/dense/kernel:0' shape=(784, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer1/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer2/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer2/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer3/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer3/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer4/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer4/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'hidden_layer5/dense/kernel:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'hidden_layer5/dense/bias:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/kernel:0' shape=(100, 10) dtype=float32_ref>,
 <tf.Variable 'output_layer/dense/bias:0' shape=(10,) dtype=float32_ref>,
 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref

In [37]:
# Initialize everything that was not restored from the checkpoint: the new
# 10-unit output layer and the Adam variables. Every global variable outside
# the 'hidden_layer*' scopes is selected by name, so the selection does not
# depend on how many variables the optimizer created or in which order.
vars_params = [var for var in tf.global_variables() if not var.name.startswith('hidden_layer')]
print(vars_params)
sess.run(tf.variables_initializer(var_list = vars_params))

[<tf.Variable 'output_layer/dense/kernel:0' shape=(100, 10) dtype=float32_ref>, <tf.Variable 'output_layer/dense/bias:0' shape=(10,) dtype=float32_ref>, <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>, <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>, <tf.Variable 'output_layer/dense/kernel/Adam:0' shape=(100, 10) dtype=float32_ref>, <tf.Variable 'output_layer/dense/kernel/Adam_1:0' shape=(100, 10) dtype=float32_ref>, <tf.Variable 'output_layer/dense/bias/Adam:0' shape=(10,) dtype=float32_ref>, <tf.Variable 'output_layer/dense/bias/Adam_1:0' shape=(10,) dtype=float32_ref>]


### Training

In [38]:
# hyper-parameters
n_epochs = 10
batch_size = 100

# Fixed number of epochs: this loop neither checkpoints nor stops early.
for epoch in range(n_epochs):
    avg_tr_loss, avg_val_loss = 0, 0
    total_batch = mnist.train.images.shape[0] // batch_size
    
    for step in range(total_batch):
        # All ten digits are used here, so the batches come straight from the dataset iterators.
        batch_xs, batch_ys = mnist.train.next_batch(batch_size = batch_size)
        val_xs, val_ys = mnist.validation.next_batch(batch_size = batch_size)
        _, tr_loss = adam_opt3.train(x_data = batch_xs, y_data = batch_ys, lr = 1e-3)
        val_loss = adam_opt3.evaluate(x_data = val_xs, y_data = val_ys)
        
        # running mean of the per-step losses over the epoch
        avg_tr_loss += tr_loss / total_batch 
        avg_val_loss += val_loss / total_batch
        
        if step % 100 == 0:
            print('step : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(step, tr_loss, val_loss))

    print('epoch : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch, avg_tr_loss, avg_val_loss))

step :   0, tr_loss : 4.771, val_loss : 4.367
step : 100, tr_loss : 0.975, val_loss : 0.854
step : 200, tr_loss : 0.518, val_loss : 0.556
step : 300, tr_loss : 0.502, val_loss : 0.449
step : 400, tr_loss : 0.473, val_loss : 0.457
step : 500, tr_loss : 0.441, val_loss : 0.379
epoch :   0, tr_loss : 0.734, val_loss : 0.719
step :   0, tr_loss : 0.327, val_loss : 0.509
step : 100, tr_loss : 0.330, val_loss : 0.315
step : 200, tr_loss : 0.581, val_loss : 0.305
step : 300, tr_loss : 0.361, val_loss : 0.246
step : 400, tr_loss : 0.392, val_loss : 0.321
step : 500, tr_loss : 0.227, val_loss : 0.299
epoch :   1, tr_loss : 0.346, val_loss : 0.332
step :   0, tr_loss : 0.424, val_loss : 0.277
step : 100, tr_loss : 0.243, val_loss : 0.334
step : 200, tr_loss : 0.306, val_loss : 0.283
step : 300, tr_loss : 0.378, val_loss : 0.329
step : 400, tr_loss : 0.330, val_loss : 0.380
step : 500, tr_loss : 0.306, val_loss : 0.172
epoch :   2, tr_loss : 0.299, val_loss : 0.289
step :   0, tr_loss : 0.252, va

In [39]:
# Accuracy of the 10-class model on the complete MNIST test set.
hat = transfer_model2.predict(x_data = mnist.test.images)
test_labels_all = np.argmax(mnist.test.labels, axis = 1)
test_accuracy_all = np.mean(hat == test_labels_all)
print('test accuracy : {:.2%}'.format(test_accuracy_all))

test accuracy : 92.03%
