신경망 학습
========
MNIST 이미지 인식 신경망을 역전파를 써서 만들어 보자. 아래와 같은 구조로 만들 것이다.

> 입력층(784) &rarr; 은닉층 &rarr; 시그모이드 &rarr; 결과(10) &rarr; 소프트맥스

In [5]:
import numpy as np
import signal
import sys
import mnist
from common import sigmoid, gradient_sigmoid, softmax, cross_entropy_error_batch

#
# Hyper parameters
#

# --- Model ---
# Number of neurons in the hidden layer (e.g. 50, 100)
HIDDEN_LAYER_SIZE = 100
# Standard deviation of the normally distributed random initial weights
WEIGHT_INIT_STD = 0.01

# --- Optimization ---
# How many gradient-descent steps to perform
ITERATION_COUNT = 10_000
# Number of samples in each training mini-batch
BATCH_SIZE = 100
# Learning rate (step size applied to the gradient)
LEARNING_RATE = 0.1

# --- Reporting ---
# Reporting interval: training progress is printed every EPOCH iterations
# (e.g. 100, 300)
EPOCH = 300

#
# Utility functions
#
def make_predict(input):
    """Build a forward-pass function bound to a fixed input batch.

    Args:
        input: Batch fed to the first layer; it is matrix-multiplied with
            ``w0``, so its last axis must match ``w0``'s first axis.

    Returns:
        A function ``predict(w0, b0, w1, b1)`` that runs the two-layer
        network on ``input`` and returns the list ``[a0, z0, a1, z1]``:
        first-layer pre-activation, ``sigmoid`` of it, second-layer
        pre-activation, and ``softmax`` of that.
    """
    def predict(w0, b0, w1, b1):
        # Layer 0: affine transform followed by the sigmoid helper.
        hidden_pre = input @ w0 + b0
        hidden_out = sigmoid(hidden_pre)
        # Layer 1: affine transform followed by the softmax helper.
        output_pre = hidden_out @ w1 + b1
        # Intermediates are returned too, so backpropagation can reuse them.
        return [hidden_pre, hidden_out, output_pre, softmax(output_pre)]
    return predict

def accuracy(expected, actual):
    """Return the fraction of rows whose arg-max index agrees.

    Args:
        expected: Array whose last axis holds per-class scores.
        actual: Array of the same leading shape, compared the same way.

    Returns:
        Mean of the element-wise equality between the two arg-max results
        taken along the last axis (a value in ``[0, 1]``).
    """
    expected_classes = expected.argmax(axis=-1)
    actual_classes = actual.argmax(axis=-1)
    matches = expected_classes == actual_classes
    return matches.mean()

#
# Main logic
#
# Load the dataset once and keep handles to the training split.
MNIST = mnist.load()
TRAIN_IMG, TRAIN_LABEL = MNIST['train_img'], MNIST['train_label']

# Layer widths: input width comes from the image array's last axis, the
# hidden width is a hyper parameter, and the output width comes from the
# label array's last axis.
layer0_size, layer1_size, layer2_size = (
    TRAIN_IMG.shape[-1],
    HIDDEN_LAYER_SIZE,
    TRAIN_LABEL.shape[-1],
)

# Initial parameters in the order [w0, b0, w1, b1]: weights are scaled
# standard-normal samples, biases start at zero. The two randn calls are
# kept in this order so the random stream is consumed as before.
parameters = [
    np.random.randn(layer0_size, layer1_size) * WEIGHT_INIT_STD,  # w0
    np.zeros(layer1_size),                                        # b0
    np.random.randn(layer1_size, layer2_size) * WEIGHT_INIT_STD,  # w1
    np.zeros(layer2_size),                                        # b1
]

def grad(parameters, batch_img, batch_label):
    """Compute the gradient of the loss for one mini-batch by backpropagation.

    Defined once, outside the training loop, and given the batch explicitly,
    so that it neither has to be re-created on every iteration nor relies on
    loop-scoped globals.

    Args:
        parameters: List ``[w0, b0, w1, b1]`` of network parameters.
        batch_img: Mini-batch of input images.
        batch_label: Labels for ``batch_img``.
            NOTE(review): the ``predicted - batch_label`` form below
            presumably assumes one-hot labels and a softmax + cross-entropy
            loss averaged over the batch -- confirm against
            ``common.cross_entropy_error_batch``.

    Returns:
        List ``[dw0, db0, dw1, db1]``, in the same order as ``parameters``.
    """
    w1 = parameters[2]
    a0, z0, _, predicted = make_predict(batch_img)(*parameters)

    # Backward propagation through the output layer
    dz1 = (predicted - batch_label)/BATCH_SIZE
    dw1 = z0.T @ dz1
    db1 = dz1.sum(axis=0)

    # Backward propagation through the hidden (sigmoid) layer
    dz0 = gradient_sigmoid(a0) * (dz1 @ w1.T)
    dw0 = batch_img.T @ dz0
    db0 = dz0.sum(axis=0)
    return [dw0, db0, dw1, db1]


print('''학습 시작!

반복횟수\t정확도\tLoss
-------------------------------------------''')
for iteration in range(ITERATION_COUNT):
    # Sample a batch from the train image/label set
    # (np.random.choice draws with replacement by default)
    sample = np.random.choice(TRAIN_IMG.shape[0], BATCH_SIZE)
    BATCH_IMG = TRAIN_IMG[sample]
    BATCH_LABEL = TRAIN_LABEL[sample]

    predict = make_predict(BATCH_IMG)

    # Periodically report accuracy and loss on the current mini-batch.
    # Note: `expected` holds the network's output, not the ground truth.
    if iteration % EPOCH == 0:
        expected = predict(*parameters)[-1]
        percentage = accuracy(expected, BATCH_LABEL)*100
        loss = cross_entropy_error_batch(expected, BATCH_LABEL)
        print(f'{iteration:8}\t{percentage:.04}%\t{loss}')

    # Update parameters using gradient descent method.
    # `-=` mutates each ndarray in place, so the arrays held in `parameters`
    # are updated without rebuilding the list. The loop variable is named
    # `delta` so it does not rebind the `grad` function.
    gradient = grad(parameters, BATCH_IMG, BATCH_LABEL)
    for param, delta in zip(parameters, gradient):
        param -= LEARNING_RATE * delta

# Final evaluation on the held-out test split.
expected = make_predict(MNIST['test_img'])(*parameters)[-1]
TEST_LABEL = MNIST['test_label']
percentage = accuracy(expected, TEST_LABEL)*100

print(f'''
학습 완료!

최종 점수
-------------
정확도 : {percentage}%
''')

학습 시작!

반복횟수	정확도	Loss
-------------------------------------------
       0	14.0%	2.3038295694064117
     300	60.0%	1.5374030657050957
     600	83.0%	0.6986352475785611
     900	80.0%	0.5839718143466742
    1200	90.0%	0.3980761341793732
    1500	92.0%	0.35957619490559295
    1800	91.0%	0.3813059236076406
    2100	97.0%	0.21716477230783163
    2400	86.0%	0.36983858549496623
    2700	93.0%	0.24281726580553056
    3000	89.0%	0.31989025317745745
    3300	92.0%	0.36311490339430497
    3600	94.0%	0.31396136521992446
    3900	93.0%	0.2558455911821598
    4200	86.0%	0.32691406575618215
    4500	91.0%	0.2925008002107429
    4800	90.0%	0.29249756723289705
    5100	94.0%	0.25101859679622857
    5400	93.0%	0.2510662758883689
    5700	89.0%	0.26757411253411256
    6000	96.0%	0.18355339708285953
    6300	85.0%	0.4237913523903206
    6600	94.0%	0.2295439851592964
    6900	96.0%	0.13518225149769483
    7200	90.0%	0.2397597220188128
    7500	93.0%	0.2337063350708975
    7800	88.0%	0.31823535296692324
  