# Assignment on Regularization and Optimization of Deep Learning

이번 과제에서는 regularization과 optimization에서 배운 내용들을 이용해 Deep Model의 성능을 최대한 높여보고자 합니다. Layer 4개짜리 MLP(각 hidden layer는 512개의 unit을 가짐)에서 정규화와 최적화 방법론들을 총동원해 성능을 높여주시면 됩니다.

먼저, 아래 코드는 데이터셋을 세팅하는 부분입니다. 이 부분은 건드리시면 안 됩니다. 이 부분을 건드리시면 0점 처리됩니다. 외부 데이터를 사용하셔도 안 됩니다.

In [1]:
%matplotlib inline
import math
import random 

import tensorflow as tf
import numpy as np 
import matplotlib.pyplot as plt

from tensorflow import keras

# Seed Python's, NumPy's and TensorFlow's random number generators with the
# same value so the shuffle and the split below are repeatable.
seed = 1
random.seed(seed)
np.random.seed(seed=seed)
tf.random.set_random_seed(seed)

# Load CIFAR-100 and merge its two official splits into a single pool; the
# train/test partition used here is rebuilt from that pool further down.
(x_1, y_1), (x_2, y_2) = tf.keras.datasets.cifar100.load_data()
x_total = np.concatenate([x_1, x_2], axis=0).astype(np.float64)
y_total = np.concatenate([y_1, y_2], axis=0)

# Number of classes kept for this task (labels 0 .. n_output - 1).
n_output = 10

# Keep only the samples whose label is below n_output. np.where yields a
# (row, column) index pair for the 2-D label array; only the rows are needed.
# The labels are then flattened to a 1-D vector.
valid_index, _ = np.where(y_total < n_output)
y_total = y_total[valid_index].reshape([-1])
x_total = x_total[valid_index]

# Shuffle images and labels together by applying one shared permutation.
i = np.arange(x_total.shape[0])
np.random.shuffle(i)
x_total = x_total[i]
y_total = y_total[i]

# The first train_size shuffled samples form the training set; everything
# after them becomes the test set.
train_size = 100 * n_output
x_train = x_total[:train_size]
y_train = y_total[:train_size]
x_test = x_total[train_size:]
y_test = y_total[train_size:]

# Report the resulting array shapes.
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1000, 32, 32, 3)
(1000,)
(5000, 32, 32, 3)
(5000,)


validation set을 나눕니다. 
- 실습시간에 배웠던 것처럼 Validation set 비율은 조정하셔도 됩니다. 

In [2]:
# Hold out the leading fifth of the training samples as the validation set;
# the remaining samples stay in the training set.
split = x_train.shape[0] // 5
x_valid, x_train = x_train[:split], x_train[split:]
y_valid, y_train = y_train[:split], y_train[split:]

이미지를 greyscale로 변경합니다. 
1. RGB 값을 고려한 코드로 변경하셔도 됩니다. 
2. Augmentation을 고려해보세요.

In [3]:
# Average over the channel axis (axis 3) so every image becomes a
# single-channel greyscale array, then show the new shapes.
x_train, x_valid, x_test = (
    np.mean(images, axis=3) for images in (x_train, x_valid, x_test)
)
for images in (x_train, x_valid, x_test):
    print(images.shape)

# Number of values per image once a 32 x 32 picture is flattened.
n_input = 32 * 32

# Flatten every image into one row of n_input values and show the shapes.
x_train, x_valid, x_test = (
    images.reshape([-1, n_input]) for images in (x_train, x_valid, x_test)
)
for images in (x_train, x_valid, x_test):
    print(images.shape)

(800, 32, 32)
(200, 32, 32)
(5000, 32, 32)
(800, 1024)
(200, 1024)
(5000, 1024)


이제 모델을 만듭니다.

1. Optimizer를 다른 걸로 바꿔보세요
2. Learning Rate를 바꿔보세요. Learning Rate Scheduling도 고려해보세요.
3. Activation Function을 바꿔보세요. 
4. Dropout, DropConnect, Gaussian Dropout 을 고려해보세요.
5. Augmentation을 고려해보세요. 

In [4]:
# Graph inputs: flattened images, integer class labels, and a boolean flag
# that is passed to the dropout layer to switch its training behaviour.
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.int32, [None])
training = tf.placeholder(tf.bool)

# Layer widths from input to output: three hidden layers of 512 units each.
n_units = [n_input, 512, 512, 512, n_output]

# Create one weight matrix and one zero-initialised bias vector for every
# pair of consecutive layer widths.
weights, biases = [], []
for i, (n_in, n_out) in enumerate(zip(n_units[:-1], n_units[1:])):
    stddev = math.sqrt(2 / n_in) # Kaiming He Initialization
    weight = tf.Variable(tf.random.truncated_normal([n_in, n_out], mean=0, stddev=stddev))
    bias = tf.Variable(tf.zeros([n_out]))
    weights.append(weight)
    biases.append(bias)

# Forward pass, starting from the input placeholder.
layer = x

for i, (weight, bias) in enumerate(zip(weights, biases)):
    # Affine transformation of the current layer.
    layer = tf.matmul(layer, weight) + bias
    # Every layer except the last gets an activation followed by dropout;
    # the last layer is left as raw logits.
    if i < len(weights) - 1:

        # Activation function (alternatives kept below for experiments).
        layer = tf.nn.tanh(layer)
        # layer = tf.nn.sigmoid(layer)
        # layer = tf.nn.relu(layer)

        # Regularisation: Gaussian dropout controlled by the `training` flag
        # (alternatives kept below for experiments).
        layer = tf.keras.layers.GaussianDropout(rate=0.3)(layer, training=training)
        # NOTE(review): the line below applies tf.nn.dropout to the
        # activations and ignores `training` - confirm it is the intended
        # DropConnect variant before enabling it.
        # layer = tf.nn.dropout(layer, keep_prob=0.5)*0.5   # Drop connect
        # layer = tf.layers.dropout(layer, rate=0.4, training=training)  # Drop out (layers Dropout)


# Logits produced by the final layer.
y_hat = layer

# Loss: softmax cross-entropy against one-hot labels, averaged over the batch.
y_hot = tf.one_hot(y, n_output)
costs = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=y_hot, logits=y_hat)
cross_entropy_loss = tf.reduce_mean(costs)
loss = cross_entropy_loss

# Accuracy: number of samples whose arg-max logit matches the label, divided
# by the number of samples in the batch.
y_label = tf.argmax(y_hat, 1)
accuracy = tf.count_nonzero(
        tf.cast(tf.equal(tf.argmax(y_hot, 1), y_label),
                tf.int64)) / tf.cast(tf.shape(y_hot)[0], tf.int64)

# Ops registered in the UPDATE_OPS collection; the optimizer and train op are
# created under a control dependency on them.
extra_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)


with tf.control_dependencies(extra_ops):

    # Optimizer and learning rate (alternatives kept below for experiments).
    optimizer = tf.train.AdamOptimizer(1e-4)
    # optimizer = tf.train.AdamOptimizer(1e-3)
    # optimizer = tf.train.AdamOptimizer(1e-5)
    # optimizer = tf.train.GradientDescentOptimizer(1e-4)
    # optimizer = tf.train.RMSPropOptimizer(1e-4)
    # optimizer = tf.train.AdagradOptimizer(1e-4)

    # Single op that performs one optimisation step on `loss`.
    train_op = optimizer.minimize(loss)

In [5]:
# Open a session that is allowed to grow its GPU memory allocation, then
# initialise every variable in the graph.
gpu_options = tf.GPUOptions(allow_growth=True)
session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
session.run(tf.global_variables_initializer())


def _evaluate(inputs, labels):
    """Return the loss and accuracy on the given data with `training` fed as False."""
    return session.run(
        [loss, accuracy],
        feed_dict={x: inputs, y: labels, training: False})


# Best validation accuracy seen so far, the epoch at which it occurred, and
# the test accuracy measured at that same epoch.
max_valid_epoch_idx = 0
max_valid_accuracy = 0.0
final_test_accuracy = 0.0
for epoch_idx in range(1, 10001):
    # One optimisation step on the whole training set.
    session.run(train_op, feed_dict={x: x_train, y: y_train, training: True})

    # Every tenth epoch: measure and print loss/accuracy on all three sets.
    if epoch_idx % 10 == 0:
        train_loss_value, train_accuracy_value = _evaluate(x_train, y_train)
        valid_loss_value, valid_accuracy_value = _evaluate(x_valid, y_valid)
        test_loss_value, test_accuracy_value = _evaluate(x_test, y_test)

        report = (
            train_loss_value, valid_loss_value, test_loss_value,
            train_accuracy_value, valid_accuracy_value, test_accuracy_value,
        )
        print(epoch_idx, *('%.4f' % value for value in report))

        # Keep the test accuracy that goes with the best validation accuracy.
        if valid_accuracy_value > max_valid_accuracy:
            max_valid_accuracy = valid_accuracy_value
            max_valid_epoch_idx = epoch_idx
            final_test_accuracy = test_accuracy_value

    # Early stop: give up once more than 100 epochs have passed since the
    # best validation accuracy was recorded.
    if epoch_idx - max_valid_epoch_idx > 100:
        break

print(final_test_accuracy)

10 2.1106 2.3066 2.2474 0.2338 0.1350 0.1698
20 1.9816 2.1953 2.1651 0.3050 0.1500 0.2084
30 1.8698 2.1782 2.1287 0.3475 0.2150 0.2410
40 1.7896 2.1558 2.1038 0.3925 0.2150 0.2610
50 1.7074 2.1338 2.0967 0.4200 0.2350 0.2720
60 1.6562 2.1240 2.0787 0.4400 0.2550 0.2816
70 1.5941 2.1205 2.0874 0.4525 0.2850 0.2870
80 1.5394 2.1116 2.0872 0.4738 0.2800 0.2876
90 1.4983 2.1618 2.0953 0.4888 0.2450 0.2940
100 1.4582 2.1381 2.1044 0.5000 0.2550 0.2882
110 1.4188 2.1496 2.0899 0.5162 0.2650 0.2960
120 1.3730 2.1803 2.1298 0.5312 0.2400 0.2960
130 1.3132 2.1913 2.1145 0.5613 0.2650 0.2958
140 1.2787 2.1318 2.1289 0.5813 0.2900 0.3020
150 1.2450 2.1681 2.1439 0.5713 0.2600 0.3020
160 1.1820 2.1696 2.1505 0.6212 0.2800 0.3022
170 1.1206 2.1877 2.1614 0.6575 0.2800 0.3028
180 1.0917 2.2416 2.1935 0.6462 0.3050 0.3072
190 1.0423 2.2084 2.2058 0.6462 0.2600 0.3096
200 0.9848 2.2550 2.2200 0.6837 0.2700 0.3042
210 0.9666 2.2712 2.2307 0.7025 0.2650 0.3054
220 0.9398 2.2880 2.2680 0.7087 0.2700 0.30

32.88%의 성능을 확인할 수 있습니다. 실습시간에 배운 몇 가지 정규화와 최적화 과정을 동원하면 50% 정도의 성능까지는 쉽게 달성할 수 있음을 확인했습니다. 수업시간에 배운 내용들을 사용해 최대한 높은 성능을 나타내는 모델을 만들어보세요! 
주피터 노트북 파일을 제출해주시면 되며, 성능을 기준으로 점수를 매길 예정입니다. (상대평가)