# ConvNet and Tensorflow 2.0 beta

이 notebook은 cs231n-2019 ConvNet numpy 구현 과제와 Tensorflow 2.0 beta 과제 정리본입니다.

개인적인 정리이니 틀린 내용이 있을수 있습니다.

## ConvNet implementation with numpy

In [25]:
# As usual, a bit of setup
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.cnn import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient_array, eval_numerical_gradient
from cs231n.layers import *
from cs231n.fast_layers import *
from cs231n.solver import Solver
import os
import tensorflow as tf
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    The denominator is clamped to at least 1e-8 so that entries where both
    inputs are (near) zero do not divide by zero.
    """
    difference = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(difference / scale)

# Load the (preprocessed) CIFAR10 data.

# get_CIFAR10_data() returns a dict-like object; print the name and array
# shape of every entry as a quick sanity check of the loaded splits.
data = get_CIFAR10_data()
for k, v in data.items():
  print('%s: ' % k, v.shape)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
X_train:  (49000, 3, 32, 32)
y_train:  (49000,)
X_val:  (1000, 3, 32, 32)
y_val:  (1000,)
X_test:  (1000, 3, 32, 32)
y_test:  (1000,)


cs231n ConvNet implementation 과제는 cs231n/layers.py, cs231n/classifiers/cnn.py 를 구현하는 과제 입니다.

따라서 python import 자체는 해당 파일에서 될것이고, 이 notebook 에서는 제가 구현한 소스코드를 해당 파일에서 가져와서 설명만 진행할것입니다.

### cs231n/layers.py

In [15]:
def conv_forward_naive(x, w, b, conv_param):
    """
    A naive implementation of the forward pass for a convolutional layer.

    The input consists of N data points, each with C channels, height H and
    width W. We convolve each input with F different filters, where each filter
    spans all C channels and has height HH and width WW.

    Input:
    - x: Input data of shape (N, C, H, W)
    - w: Filter weights of shape (F, C, HH, WW)
    - b: Biases, of shape (F,)
    - conv_param: A dictionary with the following keys:
      - 'stride': The number of pixels between adjacent receptive fields in the
        horizontal and vertical directions.
      - 'pad': The number of pixels that will be used to zero-pad the input.

    During padding, 'pad' zeros are placed symmetrically (i.e. equally on both
    sides) along the height and width axes of the input. The original input x
    is not modified; np.pad returns a new array.

    Returns a tuple of:
    - out: Output data, of shape (N, F, H', W') where H' and W' are given by
      H' = 1 + (H + 2 * pad - HH) / stride
      W' = 1 + (W + 2 * pad - WW) / stride
    - cache: (x, w, b, conv_param)
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Input layout: (batch size, channels, image height, image width).
    batch_size, _, height, width = x.shape
    # Filter layout: (number of filters, channels, filter height, filter width).
    filter_num, _, filter_height, filter_width = w.shape
    stride, padding = conv_param['stride'], conv_param['pad']

    # Spatial size of the activation map produced by the convolution.
    output_height = 1 + (height + 2 * padding - filter_height) // stride
    output_width = 1 + (width + 2 * padding - filter_width) // stride

    out = np.zeros((batch_size, filter_num, output_height, output_width))
    # Zero-pad only the two spatial axes; batch and channel axes are untouched.
    x_pad = np.pad(
        x,
        ((0, 0), (0, 0), (padding, padding), (padding, padding)),
        mode='constant',
        constant_values=0,
    )

    for n in range(batch_size):
        for f in range(filter_num):
            for height_index in range(output_height):
                top = height_index * stride
                for width_index in range(output_width):
                    left = width_index * stride
                    # Crop a filter-sized window across all channels.
                    window = x_pad[n, :, top:top + filter_height, left:left + filter_width]
                    # Element-wise multiply with filter f, sum everything to a
                    # single scalar and add the bias: an element-wise W*X + b
                    # on the cropped input.
                    out[n, f, height_index, width_index] = np.sum(window * w[f]) + b[f]

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    cache = (x, w, b, conv_param)
    return out, cache

ConvNet 의 Activation map size 구하는 방법

![Untitled Diagram](https://user-images.githubusercontent.com/26921984/62833190-a233a580-bc75-11e9-8e53-5999bc96fcfd.png)

In [11]:
# Tiny worked example: one 1-channel 4x4 image, one 2x2 filter, bias 3,
# stride 1 and no padding, which yields a 3x3 activation map.
x = np.array([[[[1,2,3,4], [5,6,7,8], [9,10,11,12], [13,14,15,16]]]])
w = np.array([[[[1,2],[3,4]]]])
b = np.array([3])
conv_param = {
    'stride': 1, 
    'pad': 0
}
out, cache = conv_forward_naive(x, w, b, conv_param)
print(out)

[[[[ 47.  57.  67.]
   [ 87.  97. 107.]
   [127. 137. 147.]]]]


ConvNet activation map 생성 과정

![Untitled Diagram (1)](https://user-images.githubusercontent.com/26921984/62833376-b3ca7c80-bc78-11e9-895a-7b4db19fbcaa.png)

In [12]:
def conv_backward_naive(dout, cache):
    """
    Naive backward pass for a convolutional layer.

    Inputs:
    - dout: Upstream derivatives, indexed as (sample, filter, out row, out col).
    - cache: The tuple (x, w, b, conv_param) produced by conv_forward_naive.

    Returns a tuple of:
    - dx: Gradient with respect to x
    - dw: Gradient with respect to w
    - db: Gradient with respect to b
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    x, w, b, conv_param = cache
    num_samples, _, in_height, in_width = x.shape
    num_filters, _, kernel_height, kernel_width = w.shape
    stride = conv_param['stride']
    pad = conv_param['pad']
    _, _, out_height, out_width = dout.shape

    spatial_padding = ((0, 0), (0, 0), (pad, pad), (pad, pad))
    x_pad = np.pad(x, spatial_padding, mode='constant', constant_values=0)

    dx_pad = np.zeros_like(x_pad)
    dw = np.zeros_like(w)
    db = np.zeros_like(b)

    # A value that takes part in several computations receives the SUM of the
    # gradients flowing back from each of them. Every filter weight and every
    # input pixel is reused at many sliding-window positions, so all
    # contributions below are accumulated with += rather than assigned.
    for n in range(num_samples):
        for f in range(num_filters):
            # The bias is added to every output position of filter f.
            db[f] += np.sum(dout[n, f])
            for row in range(out_height):
                rows = slice(row * stride, row * stride + kernel_height)
                for col in range(out_width):
                    cols = slice(col * stride, col * stride + kernel_width)
                    upstream = dout[n, f, row, col]

                    # d(w * x + b)/dw = x: the cropped input window, scaled by
                    # the upstream gradient (chain rule).
                    dw[f] += x_pad[n, :, rows, cols] * upstream

                    # d(w * x + b)/dx = w: the filter, scaled by the upstream
                    # gradient, added onto the window it was applied to.
                    dx_pad[n, :, rows, cols] += w[f] * upstream

    # Strip the zero-padding border to get back to the shape of x.
    dx = dx_pad[:, :, pad:pad + in_height, pad:pad + in_width]

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dw, db

![Untitled Diagram (2)](https://user-images.githubusercontent.com/26921984/62833616-b8912f80-bc7c-11e9-878a-b79c35f15f6c.png)
![Untitled Diagram (3)](https://user-images.githubusercontent.com/26921984/62833615-b8912f80-bc7c-11e9-882f-2762bf44b0c4.png)
![Untitled Diagram (5)](https://user-images.githubusercontent.com/26921984/62833614-b8912f80-bc7c-11e9-8276-d8f8c4ba8cdd.png)

In [None]:
def max_pool_forward_naive(x, pool_param):
    """
    Naive forward pass for a max-pooling layer (no padding).

    Inputs:
    - x: Input data, of shape (N, C, H, W)
    - pool_param: dictionary with the following keys:
      - 'pool_height': The height of each pooling region
      - 'pool_width': The width of each pooling region
      - 'stride': The distance between adjacent pooling regions

    Returns a tuple of:
    - out: Output data, of shape (N, C, H', W') where H' and W' are given by
      H' = 1 + (H - pool_height) / stride
      W' = 1 + (W - pool_width) / stride
    - cache: (x, pool_param)
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    num_samples, num_channels, in_height, in_width = x.shape
    window_height = pool_param['pool_height']
    window_width = pool_param['pool_width']
    step = pool_param['stride']

    out_height = 1 + (in_height - window_height) // step
    out_width = 1 + (in_width - window_width) // step

    out = np.zeros((num_samples, num_channels, out_height, out_width))
    for row in range(out_height):
        rows = slice(row * step, row * step + window_height)
        for col in range(out_width):
            cols = slice(col * step, col * step + window_width)
            # Reduce over the two spatial axes of the window only, so every
            # sample and channel keeps its own maximum.
            out[:, :, row, col] = x[:, :, rows, cols].max(axis=(2, 3))

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    cache = (x, pool_param)
    return out, cache

In [17]:
def max_pool_backward_naive(dout, cache):
    """
    A naive implementation of the backward pass for a max-pooling layer.

    Inputs:
    - dout: Upstream derivatives, indexed as (sample, channel, out row, out col)
    - cache: A tuple of (x, pool_param) as in the forward pass.

    Returns:
    - dx: Gradient with respect to x, same shape and dtype as x
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    x, pool_param = cache
    N, C, H, W = x.shape
    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    stride = pool_param['stride']

    H_prime = 1 + (H - pool_height) // stride
    W_prime = 1 + (W - pool_width) // stride

    dx = np.zeros_like(x)

    # Similar to the ReLU backward pass: only the element that was selected as
    # the maximum of a window influenced the output, so only it receives the
    # upstream gradient; every other element of the window gets zero.
    for n in range(N):
        for c in range(C):
            for h_prime in range(H_prime):
                top = h_prime * stride
                for w_prime in range(W_prime):
                    left = w_prime * stride
                    window = x[n, c, top:top + pool_height, left:left + pool_width]
                    # np.argmax returns the first maximum, so ties route the
                    # whole gradient to a single element.
                    max_row, max_col = np.unravel_index(np.argmax(window), window.shape)
                    # Accumulate instead of assigning: when stride is smaller
                    # than the pooling size, windows overlap and one input
                    # element can be the maximum of several windows. Its
                    # gradient is the sum of all their upstream gradients.
                    dx[n, c, top + max_row, left + max_col] += dout[n, c, h_prime, w_prime]

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx

![cs231n](https://user-images.githubusercontent.com/26921984/62834386-6ead4700-bc86-11e9-8d76-d9c52b7063b3.png)

## Tensorflow 2.0 beta 사용하기

이 과제에서는, 3 가지 방식으로 모델을 생성한다.
각 방식은 다음과같은 특징을 가진다.

| API           | Flexibility | Convenience |
|---------------|-------------|-------------|
| Barebone      | High        | Low         |
| `tf.keras.Model`     | High        | Medium      |
| `tf.keras.Sequential` | Low         | High        |

## Barebone Tensorflow

Barebone 방식은 forward, backward 등을 모두 손수 돌려줘야한다.
물론 numpy 구현보단 쉬운 방법이지만, tensorflow 를 사용해서 크게 편해진 점이 없다.
각 layer를 직접 구현하지 않아도 된다는 점 하나 정도가 장점이다.

심지어 gradient descent 도 함수로 돌려줘야했다....
아무튼 시작해보자

In [1]:
def flatten(x):
    """
    Collapse every axis after the first into a single axis.

    Input:
    - TensorFlow Tensor of shape (N, D1, ..., DM)

    Output:
    - TensorFlow Tensor of shape (N, D1 * ... * DM)
    """
    # The leading dimension is read dynamically and kept; -1 lets reshape
    # infer the size of the merged trailing dimensions.
    return tf.reshape(x, (tf.shape(x)[0], -1))

def three_layer_convnet(x, params):
    """
    A three-layer convolutional network:
    conv (zero-pad 2) -> ReLU -> conv (zero-pad 1) -> ReLU -> fully-connected.

    Inputs:
    - x: A TensorFlow Tensor of shape (N, H, W, 3) giving a minibatch of images
    - params: A list of TensorFlow Tensors giving the weights and biases for the
      network; should contain the following:
      - conv_w1: TensorFlow Tensor of shape (KH1, KW1, 3, channel_1) giving
        weights for the first convolutional layer.
      - conv_b1: TensorFlow Tensor of shape (channel_1,) giving biases for the
        first convolutional layer.
      - conv_w2: TensorFlow Tensor of shape (KH2, KW2, channel_1, channel_2)
        giving weights for the second convolutional layer
      - conv_b2: TensorFlow Tensor of shape (channel_2,) giving biases for the
        second convolutional layer.
      - fc_w: TensorFlow Tensor giving weights for the fully-connected layer;
        its first dimension must match the flattened output of the second
        convolution.
      - fc_b: TensorFlow Tensor giving biases for the fully-connected layer.

    Returns:
    - scores: the output of the fully-connected layer, one row per input image.
    """
    conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b = params
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Zero-pad height and width by 2 on each side before the first convolution;
    # batch and channel axes are left untouched.
    x_pad = tf.pad(x, [[0, 0], [2, 2], [2, 2], [0, 0]], 'CONSTANT')

    # tf.nn ops are used directly, so even the bias has to be added by hand.
    conv1 = tf.nn.conv2d(x_pad, conv_w1, 1, padding='VALID') + conv_b1
    relu1 = tf.nn.relu(conv1)

    # Feed the ACTIVATED output into the second convolution. Padding `conv1`
    # here instead would silently drop the first ReLU from the network.
    relu1_pad = tf.pad(relu1, [[0, 0], [1, 1], [1, 1], [0, 0]], 'CONSTANT')
    conv2 = tf.nn.conv2d(relu1_pad, conv_w2, 1, padding='VALID') + conv_b2
    relu2 = tf.nn.relu(conv2)

    # The fully-connected layer is a plain matmul on the flattened features.
    relu2_flat = flatten(relu2)
    scores = tf.matmul(relu2_flat, fc_w) + fc_b

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return scores

In [2]:
def check_accuracy(dset, x, model_fn, params):
    """
    Print the classification accuracy of a model over a whole dataset.

    Inputs:
    - dset: An iterable of (x_batch, y_batch) pairs to evaluate on
    - x: Not used by this function; kept in the signature for its callers
    - model_fn: Callable invoked as model_fn(x_batch, params); its result must
      provide .numpy() returning per-class scores with one row per sample
    - params: Parameters forwarded to model_fn

    Returns: Nothing, but prints the accuracy of the model
    """
    correct_total = 0
    sample_total = 0
    for x_batch, y_batch in dset:
        # The predicted class is the column with the highest score.
        predictions = model_fn(x_batch, params).numpy().argmax(axis=1)
        correct_total += (predictions == y_batch).sum()
        sample_total += x_batch.shape[0]
    acc = float(correct_total) / sample_total
    print('Got %d / %d correct (%.2f%%)' % (correct_total, sample_total, 100 * acc))

def training_step(model_fn, x, y, params, learning_rate):
    """
    Run one step of vanilla gradient descent on a minibatch.

    Inputs:
    - model_fn: Callable invoked as model_fn(x, params) returning class scores
    - x: Minibatch of inputs passed to model_fn
    - y: Integer class labels for the minibatch
    - params: List of parameters to optimise; each must support assign_sub()
      and is updated in place
    - learning_rate: Step size of the update

    Returns:
    - The mean softmax cross-entropy loss of the minibatch, computed before
      the parameters are updated.
    """
    # The tape records the operations executed inside the context so they can
    # be differentiated afterwards. Only the forward pass and the loss need to
    # be recorded.
    with tf.GradientTape() as tape:
        scores = model_fn(x, params)  # Forward pass of the model
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=scores)
        total_loss = tf.reduce_mean(loss)

    # Differentiate the mean loss with respect to every parameter, outside the
    # recording context.
    grad_params = tape.gradient(total_loss, params)

    # Make a vanilla gradient descent step on all of the model parameters by
    # manually updating the weights with assign_sub().
    for w, grad_w in zip(params, grad_params):
        w.assign_sub(learning_rate * grad_w)

    return total_loss

def create_matrix_with_kaiming_normal(shape):
    """
    Draw a random-normal tensor of the given shape, scaled by sqrt(2 / fan_in).

    Inputs:
    - shape: Sequence of 2 or 4 ints. For length 2, fan_in is the first
      entry; for length 4, fan_in is the product of the first three entries.

    Returns:
    - The scaled random tensor.

    Raises:
    - ValueError: if shape does not have exactly 2 or 4 entries.
    """
    rank = len(shape)
    if rank == 2:
        fan_in = shape[0]
    elif rank == 4:
        fan_in = np.prod(shape[:3])
    else:
        # Fail with a clear message instead of an UnboundLocalError on fan_in.
        raise ValueError(
            'shape must have 2 or 4 dimensions, got %d: %r' % (rank, shape)
        )
    return tf.keras.backend.random_normal(shape) * np.sqrt(2.0 / fan_in)

def train_part2(model_fn, init_fn, learning_rate):
    """
    Train a model with plain SGD for one pass over the training set.

    Reads the module-level names train_dset, val_dset and print_every, which
    must be defined before this function is called.

    Inputs:
    - model_fn: A Python function that performs the forward pass of the model
      using TensorFlow; it should have the following signature:
      scores = model_fn(x, params) where x is a TensorFlow Tensor giving a
      minibatch of image data, params is a list of TensorFlow Tensors holding
      the model weights, and scores is a TensorFlow Tensor of shape (N, C)
      giving scores for all elements of x.
    - init_fn: A Python function that initializes the parameters of the model.
      It should have the signature params = init_fn() where params is a list
      of TensorFlow Tensors holding the (randomly initialized) weights of the
      model.
    - learning_rate: Python float giving the learning rate to use for SGD.
    """
    weights = init_fn()  # freshly initialised model parameters

    for step, batch in enumerate(train_dset):
        inputs, labels = batch
        # One gradient-descent update on this minibatch.
        batch_loss = training_step(model_fn, inputs, labels, weights, learning_rate)

        if step % print_every != 0:
            continue

        # Periodically report the loss and the validation accuracy.
        print('Iteration %d, loss = %.4f' % (step, batch_loss))
        check_accuracy(val_dset, inputs, model_fn, weights)

In [4]:
def three_layer_convnet_init():
    """
    Create the parameters consumed by three_layer_convnet.

    Weights are drawn with create_matrix_with_kaiming_normal; biases start at
    zero.

    Inputs: None

    Returns a list containing, in this order:
    - conv_w1: tf.Variable of shape (5, 5, 3, 32), first conv layer weights
    - conv_b1: tf.Variable of shape (32,), first conv layer biases
    - conv_w2: tf.Variable of shape (3, 3, 32, 16), second conv layer weights
    - conv_b2: tf.Variable of shape (16,), second conv layer biases
    - fc_w: tf.Variable of shape (32 * 32 * 16, 10), fully-connected weights
    - fc_b: tf.Variable of shape (10,), fully-connected biases
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    channel_1, channel_2, num_classes = 32, 16, 10
    # Number of features fed to the fully-connected layer.
    flat_features = 32 * 32 * channel_2

    conv_w1 = tf.Variable(create_matrix_with_kaiming_normal((5, 5, 3, channel_1)))
    conv_b1 = tf.Variable(tf.zeros((channel_1,)))
    conv_w2 = tf.Variable(create_matrix_with_kaiming_normal((3, 3, channel_1, channel_2)))
    conv_b2 = tf.Variable(tf.zeros((channel_2,)))
    fc_w = tf.Variable(create_matrix_with_kaiming_normal((flat_features, num_classes)))
    fc_b = tf.Variable(tf.zeros((num_classes,)))

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b]

# Train the barebone three-layer ConvNet with this SGD step size.
learning_rate = 3e-3
train_part2(three_layer_convnet, three_layer_convnet_init, learning_rate)

## Keras Model Subclassing API

gradient 계산이나 Variable선언 필요 없이, tf.keras.layers 밑에 구현되어있는 layer들을 가져다 쓰면 된다.
layer가 실행되는 순서를 call method에 선언해주면된다.

In [None]:
class ThreeLayerConvNet(tf.keras.Model):
    """
    Three-layer ConvNet written with the Keras Model subclassing API:
    conv 5x5 (zero-pad 2) -> ReLU -> conv 3x3 (zero-pad 1) -> ReLU ->
    flatten -> dense with softmax.
    """

    def __init__(self, channel_1, channel_2, num_classes):
        """
        Inputs:
        - channel_1: Number of filters of the first convolution
        - channel_2: Number of filters of the second convolution
        - num_classes: Number of output units of the final dense layer
        """
        super().__init__()
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        initializer = tf.initializers.VarianceScaling(scale=2.0)
        # Both convolutions use 'valid' padding; the zero-padding is applied
        # explicitly with tf.pad in call().
        self.conv1 = tf.keras.layers.Conv2D(
            filters=channel_1,
            kernel_size=[5, 5],
            strides=[1, 1],
            padding='valid',
            activation='relu',
            kernel_initializer=initializer
        )
        self.conv2 = tf.keras.layers.Conv2D(
            filters=channel_2,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='valid',
            activation='relu',
            kernel_initializer=initializer
        )
        # Every layer is created once here rather than on each forward pass.
        self.flatten = tf.keras.layers.Flatten()
        self.fc1 = tf.keras.layers.Dense(
            units=num_classes,
            kernel_initializer=initializer,
            activation='softmax'
        )

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    def call(self, x, training=False):
        """
        Forward pass.

        Inputs:
        - x: Minibatch of images; axes 1 and 2 are zero-padded as height and
          width
        - training: Accepted for Keras API compatibility; not used by this
          model

        Returns:
        - scores: Output of the final dense layer (softmax-activated)
        """
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        x = tf.pad(x, [[0, 0], [2, 2], [2, 2], [0, 0]], 'CONSTANT')
        x = self.conv1(x)
        x = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]], 'CONSTANT')
        x = self.conv2(x)
        x = self.flatten(x)
        scores = self.fc1(x)

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return scores

Tensorflow 2.0 부터는 Eager Execution을 기본으로 지원한다.

즉, static graph를 만들고 적용하는 일이 없어진다는것이다.

이를 위해서 tf.GradientTape 로 forward 연산을 기록해둔 뒤 tape.gradient 로 gradient 를 구하고, 선언해둔 optimizer(tf.keras.optimizers) 의 method 인 apply_gradients 를 호출하면 gradient descent 작업이 끝난다.

또한, metric과 log작업도 tf.keras.metrics method들을 사용하면 편하게 사용할 수 있다.

In [6]:
def _evaluate_on_validation(model, loss_fn, val_loss, val_accuracy):
    """Reset the validation metrics and refill them with one pass over val_dset."""
    val_loss.reset_states()
    val_accuracy.reset_states()
    for val_x, val_y in val_dset:
        # The model is always called with training=False for validation.
        prediction = model(val_x, training=False)
        val_loss.update_state(loss_fn(val_y, prediction))
        val_accuracy.update_state(val_y, prediction)


def train_part34(model_init_fn, optimizer_init_fn, num_epochs=1, is_training=False):
    """
    Simple training loop for use with models defined using tf.keras. It trains
    a model for num_epochs epochs on the training set and periodically checks
    loss and accuracy on the validation set.

    Reads the module-level names device, train_dset, val_dset and print_every,
    which must be defined before this function is called.

    Inputs:
    - model_init_fn: A function that takes no parameters; when called it
      constructs the model we want to train: model = model_init_fn()
    - optimizer_init_fn: A function which takes no parameters; when called it
      constructs the Optimizer object we will use to optimize the model:
      optimizer = optimizer_init_fn()
    - num_epochs: The number of epochs to train for
    - is_training: Value forwarded as the `training` argument when the model
      is called on training batches

    Returns: Nothing, but prints progress during training
    """
    with tf.device(device):

        # Sparse categorical cross-entropy, constructed with default arguments.
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

        model = model_init_fn()
        optimizer = optimizer_init_fn()

        train_loss = tf.keras.metrics.Mean(name='train_loss')
        train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

        val_loss = tf.keras.metrics.Mean(name='val_loss')
        val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

        template = 'Iteration {}, Epoch {}, Loss: {}, Accuracy: {}, Val Loss: {}, Val Accuracy: {}'

        t = 0
        for epoch in range(num_epochs):

            # Training metrics are accumulated per epoch.
            train_loss.reset_states()
            train_accuracy.reset_states()

            for x_np, y_np in train_dset:
                # Only the forward pass and the loss are recorded on the tape.
                with tf.GradientTape() as tape:
                    scores = model(x_np, training=is_training)
                    loss = loss_fn(y_np, scores)

                # Differentiate and apply the update outside the recording
                # context.
                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))

                # Update the metrics
                train_loss.update_state(loss)
                train_accuracy.update_state(y_np, scores)

                # Every print_every iterations, run a full validation pass and
                # report running training metrics next to validation metrics.
                if t % print_every == 0:
                    _evaluate_on_validation(model, loss_fn, val_loss, val_accuracy)
                    print (template.format(t, epoch+1,
                                         train_loss.result(),
                                         train_accuracy.result()*100,
                                         val_loss.result(),
                                         val_accuracy.result()*100))
                t += 1

## Keras Sequential API

Keras Model subclassing API 또한 편리했지만, 보통의 model들은 sequential한 layer 구조를 가질 때가 많다.

따라서 단순한 sequential한 network를 만들 때에는 tf.keras.Sequential을 이용하면 편하다.

아래 코드 처럼 layer를 list에 넣고 tf.keras.Sequential 에 넣어주기만 하면 끝이다.

In [10]:
def model_init_fn():
    """
    Build the three-layer ConvNet with tf.keras.Sequential.

    Architecture: conv 5x5 with 32 filters -> ReLU -> conv 3x3 with 16
    filters -> ReLU -> flatten -> dense with 10 softmax units. Both
    convolutions use stride 1 and 'same' padding, matching the explicit
    zero-padding of 2 and 1 used by three_layer_convnet and ThreeLayerConvNet
    in this file.

    Inputs: None

    Returns:
    - model: the constructed tf.keras.Sequential model
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    input_shape = (32, 32, 3)
    channel_1, channel_2, num_classes = 32, 16, 10

    initializer = tf.initializers.VarianceScaling(scale=2.0)

    layers = [
        tf.keras.layers.Conv2D(
            filters=channel_1,
            kernel_size=[5, 5],
            strides=[1, 1],
            # 'same' keeps the spatial size at stride 1, so no separate
            # padding layer is needed.
            padding='same',
            activation='relu',
            kernel_initializer=initializer,
            # Declare the input shape on the first layer.
            input_shape=input_shape
        ),
        tf.keras.layers.Conv2D(
            filters=channel_2,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='same',
            activation='relu',
            kernel_initializer=initializer
        ),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(
            units=num_classes,
            kernel_initializer=initializer,
            activation='softmax'
        ),
    ]

    model = tf.keras.Sequential(layers)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return model

learning_rate = 5e-4


def optimizer_init_fn():
    """
    Build the optimizer used by train_part34.

    Returns an SGD optimizer with Nesterov momentum of 0.9. The step size is
    read from the module-level `learning_rate` at call time.
    """
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    sgd_options = {
        'learning_rate': learning_rate,
        'momentum': 0.9,
        'nesterov': True,
    }

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return tf.keras.optimizers.SGD(**sgd_options)

# Train the Sequential model with the custom loop, using the default
# num_epochs and is_training arguments.
train_part34(model_init_fn, optimizer_init_fn)

우리는 지금까지 tensorflow 의 eager execution을 사용하였다.
물론 충분히 high level api이지만, keras는 이 보다 더 high level Training api를 제공한다.

바로 model.fit 이다.

기존에는 아래 작업으로 학습을 진행하였다면, 이 코드를 짜지 않고, fit 함수에 적절한 optimizer, loss function, metric option을 주면 알아서 학습을 진행한다.

```python
for x_np, y_np in train_dset:
    with tf.GradientTape() as tape:
        scores = model(x_np, training=is_training)
        loss = loss_fn(y_np, scores)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
```

In [None]:
# Build a fresh model and train it through the high-level Keras API instead
# of a hand-written loop.
model = model_init_fn()
# compile() attaches the optimizer, the loss and the metric to report.
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss='sparse_categorical_crossentropy',
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])
# One epoch with minibatches of 64, validating on (X_val, y_val).
model.fit(X_train, y_train, batch_size=64, epochs=1, validation_data=(X_val, y_val))
# Report loss and metric on the test split.
model.evaluate(X_test, y_test)