In [3]:
# 오차역전파법 - 메인 클래스

# coding: utf-8
import sys, os
sys.path.append(os.pardir)  # 부모 디렉터리의 파일을 가져올 수 있도록 설정
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

class TwoLayerNet:
    """Fully connected network with one hidden layer: Affine -> Relu -> Affine.

    The forward stack lives in ``self.layers`` (an ``OrderedDict``, so the
    layers run in insertion order) and the loss head in ``self.lastLayer``.
    ``Affine``, ``Relu`` and ``SoftmaxWithLoss`` are imported from
    ``common.layers``; their internals are not visible from this file.

    Attributes:
        params: dict with keys 'W1', 'b1', 'W2', 'b2' holding NumPy arrays.
        layers: ordered mapping of layer name -> layer object.
        lastLayer: ``SoftmaxWithLoss`` instance used by ``loss``/``gradient``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and wire up the layer stack.

        Args:
            input_size: number of input features (rows of W1).
            hidden_size: width of the hidden layer.
            output_size: number of outputs (columns of W2).
            weight_init_std: factor applied to the standard-normal samples
                used for the weight matrices.
        """
        # Weights are scaled standard-normal samples; biases start at zero.
        self.params = {
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

        # Each Affine layer receives the very same array objects that are
        # stored in self.params.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for stage in self.layers.values():
            out = stage.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss head's forward value for inputs ``x`` and labels ``t``."""
        scores = self.predict(x)
        return self.lastLayer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax matches the label.

        ``t`` may be a 1-D array of class indices; any other rank is reduced
        with ``argmax`` along axis 1 first.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)

        hits = np.sum(predicted == labels)
        return hits / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients for every parameter via the imported helper.

        The callable handed to ``numerical_gradient`` ignores its argument and
        simply re-evaluates ``self.loss(x, t)``.
        NOTE(review): this presumably relies on the helper perturbing the
        parameter array in place - confirm against ``common.gradient``.
        """
        def loss_W(W):
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return gradients for every parameter via backpropagation.

        Runs one forward pass, then calls ``backward`` on the loss head and on
        each layer in reverse order, and finally collects the ``dW``/``db``
        attributes of the two Affine layers.
        """
        # Forward pass (the returned loss value itself is not needed here).
        self.loss(x, t)

        # Backward pass: loss head first, then the stack from last to first.
        upstream = self.lastLayer.backward(1)
        for stage in list(self.layers.values())[::-1]:
            upstream = stage.backward(upstream)

        # Collect the results left on the Affine layers.
        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }



In [4]:
# 오차역전파법 - 메인 클래스 - 3층 

# coding: utf-8
import sys, os
sys.path.append(os.pardir)  # 부모 디렉터리의 파일을 가져올 수 있도록 설정
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

class ThreeLayerNet:
    """Fully connected network with two hidden layers of equal width.

    Stack order: Affine -> Relu -> Affine -> Relu -> Affine, followed by a
    ``SoftmaxWithLoss`` head kept in ``self.lastLayer``.  The layer classes
    are imported from ``common.layers``; their internals are not visible from
    this file.

    Attributes:
        params: dict with keys 'W1', 'b1', 'W2', 'b2', 'W3', 'b3' holding
            NumPy arrays.
        layers: ``OrderedDict`` of layer name -> layer object, in forward order.
        lastLayer: ``SoftmaxWithLoss`` instance used by ``loss``/``gradient``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and wire up the layer stack.

        Args:
            input_size: number of input features (rows of W1).
            hidden_size: width shared by both hidden layers.
            output_size: number of outputs (columns of W3).
            weight_init_std: factor applied to the standard-normal samples
                used for the weight matrices.
        """
        # Weights are scaled standard-normal samples; biases start at zero.
        self.params = {
            # input_size x hidden_size
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            # hidden_size x hidden_size
            'W2': weight_init_std * np.random.randn(hidden_size, hidden_size),
            'b2': np.zeros(hidden_size),
            # hidden_size x output_size
            'W3': weight_init_std * np.random.randn(hidden_size, output_size),
            'b3': np.zeros(output_size),
        }

        # Each Affine layer receives the very same array objects that are
        # stored in self.params.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
            ('Relu2', Relu()),
            ('Affine3', Affine(self.params['W3'], self.params['b3'])),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        out = x
        for stage in self.layers.values():
            out = stage.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss head's forward value for inputs ``x`` and labels ``t``."""
        scores = self.predict(x)
        return self.lastLayer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax matches the label.

        ``t`` may be a 1-D array of class indices; any other rank is reduced
        with ``argmax`` along axis 1 first.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        hits = np.sum(predicted == labels)
        return hits / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients for every parameter via the imported helper.

        The callable handed to ``numerical_gradient`` ignores its argument and
        simply re-evaluates ``self.loss(x, t)``.
        NOTE(review): this presumably relies on the helper perturbing the
        parameter array in place - confirm against ``common.gradient``.
        """
        def loss_W(W):
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3')
        }

    def gradient(self, x, t):
        """Return gradients for every parameter via backpropagation.

        Runs one forward pass, then calls ``backward`` on the loss head and on
        each layer in reverse order, and finally collects the ``dW``/``db``
        attributes of the three Affine layers.
        """
        # Forward pass (the returned loss value itself is not needed here).
        self.loss(x, t)

        # Backward pass: loss head first, then the stack from last to first.
        upstream = self.lastLayer.backward(1)
        for stage in list(self.layers.values())[::-1]:
            upstream = stage.backward(upstream)

        # Collect the results left on the Affine layers.
        first = self.layers['Affine1']
        second = self.layers['Affine2']
        third = self.layers['Affine3']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
            'W3': third.dW,
            'b3': third.db,
        }


In [5]:
# Relu 함수를 활용한 2계층 신경망 학습 예제
# 하이퍼파라메터 손대지 않고 활성화함수를 Relu로만 바꾸었을 뿐인데 97% 정확률을 달성했다
# 또한 학습 때마다 다른 결과 (정확률)을 내어주는 점도 주목할만 하다
# 레이어를 3개로 늘렸더니 학습데이터에 대한 정확률은 올라가지만(Overfitting) 테스트데이터는 0.1%밖에 향상이 없었다

# coding: utf-8
import sys, os
sys.path.append(os.pardir)

import numpy as np
from dataset.mnist import load_mnist
# from two_layer_net import TwoLayerNet
# from two_layer_net import ThreeLayerNet

# Load the dataset (load_mnist comes from dataset.mnist).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# ---- Two-layer network -----------------------------------------------------
twoLayerNet = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Floor division keeps this an int. With true division a train_size that is
# not a multiple of batch_size yields a fractional value, and the
# `i % iter_per_epoch == 0` check below would then almost never fire.
iter_per_epoch = max(train_size // batch_size, 1)

print("Staring two layer netwrok")
for i in range(iters_num):
    # Draw a random mini-batch (np.random.choice samples with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient via backpropagation. The author notes this is much faster than
    # the numerical alternative, twoLayerNet.numerical_gradient(x_batch, t_batch).
    grad = twoLayerNet.gradient(x_batch, t_batch)

    # Gradient-descent step; `-=` updates the arrays stored in params in place.
    for key in ('W1', 'b1', 'W2', 'b2'):
        twoLayerNet.params[key] -= learning_rate * grad[key]

    # Record the loss on this batch after the update.
    loss = twoLayerNet.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch: evaluate on the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = twoLayerNet.accuracy(x_train, t_train)
        test_acc = twoLayerNet.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("two-layer", train_acc, test_acc)

# ---- Three-layer network ---------------------------------------------------
threeLayerNet = ThreeLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# NOTE: the history lists are rebound here, so the two-layer history recorded
# above is no longer reachable through these names.
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Integer iterations per epoch (see the note in the two-layer section).
iter_per_epoch = max(train_size // batch_size, 1)

print("Staring three layer netwrok")
for i in range(iters_num):
    # Draw a random mini-batch (np.random.choice samples with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient via backpropagation. The author notes this is much faster than
    # the numerical alternative, threeLayerNet.numerical_gradient(x_batch, t_batch).
    grad = threeLayerNet.gradient(x_batch, t_batch)

    # Gradient-descent step; `-=` updates the arrays stored in params in place.
    for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
        threeLayerNet.params[key] -= learning_rate * grad[key]

    # Record the loss on this batch after the update.
    loss = threeLayerNet.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch: evaluate on the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = threeLayerNet.accuracy(x_train, t_train)
        test_acc = threeLayerNet.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("three-layer", train_acc, test_acc)

Staring two layer netwrok
two-layer 0.122683333333 0.121
two-layer 0.903266666667 0.9062
two-layer 0.92165 0.9254
two-layer 0.935133333333 0.936
two-layer 0.941883333333 0.9404
two-layer 0.9493 0.9454
two-layer 0.952016666667 0.9504
two-layer 0.958383333333 0.9549
two-layer 0.9609 0.9566
two-layer 0.96385 0.9581
two-layer 0.966366666667 0.9585
two-layer 0.970033333333 0.96
two-layer 0.9707 0.9621
two-layer 0.972933333333 0.9649
two-layer 0.97465 0.9654
two-layer 0.9759 0.9669
two-layer 0.977033333333 0.965
Staring three layer netwrok
three-layer 0.0993 0.1032
three-layer 0.764766666667 0.7731
three-layer 0.8944 0.8912
three-layer 0.927366666667 0.9217
three-layer 0.943633333333 0.941
three-layer 0.955783333333 0.9498
three-layer 0.961816666667 0.9556
three-layer 0.965233333333 0.9613
three-layer 0.966833333333 0.9587
three-layer 0.972566666667 0.9634
three-layer 0.973933333333 0.9647
three-layer 0.977066666667 0.9676
three-layer 0.978683333333 0.9674
three-layer 0.979866666667 0.9699
t

In [8]:
# hidden_size 를 반으로 줄였으나 성능저하는 크게 나지 않았고, 정확률도 비슷했다
# 레이어를 늘리거나, 히든 노드를 늘리거나 둘다 비슷한 효과를 주는 것 같다

# coding: utf-8
import sys, os
sys.path.append(os.pardir)

import numpy as np
from dataset.mnist import load_mnist
# from two_layer_net import TwoLayerNet
# from two_layer_net import ThreeLayerNet

# Load the dataset (load_mnist comes from dataset.mnist).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

# Same three-layer architecture, with a narrower hidden layer (20 units).
threeLayerNet = ThreeLayerNet(input_size=784, hidden_size=20, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Floor division keeps this an int. With true division a train_size that is
# not a multiple of batch_size yields a fractional value, and the
# `i % iter_per_epoch == 0` check below would then almost never fire.
iter_per_epoch = max(train_size // batch_size, 1)

print("Staring three layer netwrok")
for i in range(iters_num):
    # Draw a random mini-batch (np.random.choice samples with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient via backpropagation. The author notes this is much faster than
    # the numerical alternative, threeLayerNet.numerical_gradient(x_batch, t_batch).
    grad = threeLayerNet.gradient(x_batch, t_batch)

    # Gradient-descent step; `-=` updates the arrays stored in params in place.
    for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
        threeLayerNet.params[key] -= learning_rate * grad[key]

    # Record the loss on this batch after the update.
    loss = threeLayerNet.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch: evaluate on the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = threeLayerNet.accuracy(x_train, t_train)
        test_acc = threeLayerNet.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("three-layer", train_acc, test_acc)

Staring three layer netwrok
three-layer 0.112366666667 0.1135
three-layer 0.216183333333 0.2171
three-layer 0.765233333333 0.7672
three-layer 0.8913 0.8963
three-layer 0.913466666667 0.9109
three-layer 0.929333333333 0.9254
three-layer 0.936333333333 0.9353
three-layer 0.9441 0.9403
three-layer 0.948533333333 0.9436
three-layer 0.955683333333 0.9466
three-layer 0.95585 0.9463
three-layer 0.959233333333 0.9463
three-layer 0.961266666667 0.9538
three-layer 0.963833333333 0.9531
three-layer 0.965033333333 0.9542
three-layer 0.96685 0.9566
three-layer 0.965466666667 0.957


In [5]:
import numpy as np
# Affine/Softmax 계층 구현하기
# One sample with 2 features, a 2x3 weight matrix and a 3-element bias.
X = np.random.rand(2)
W = np.random.rand(2, 3)
B = np.random.rand(3)

# Show the shape of each operand, one per line.
print(X.shape, W.shape, B.shape, sep="\n")

# Affine transform: matrix product plus bias; the result has one value per
# column of W.
Y = X.dot(W) + B
print(Y.shape)


(2,)
(2, 3)
(3,)
(3,)
