학습 절차: 1. 미니배치 선정 -> 2. 기울기 산출 -> 3. 매개변수 갱신 -> 4. 1~3단계 반복
이 방법을 확률적 경사 하강법(SGD)이라 부르는 이유: 훈련 데이터에서 미니배치를 무작위로 선정하기 때문

In [4]:
# two layered Network
import sys, os
sys.path.append(os.pardir)
from common.functions import *
from common.gradient import numerical_gradient

class TwoLayerNet:
    """Two-layer fully connected network: affine -> sigmoid -> affine -> softmax.

    All learnable arrays are stored in ``self.params`` under the keys
    ``'W1'``, ``'b1'``, ``'W2'`` and ``'b2'``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameter arrays.

        Args:
            input_size: Number of input features (rows of ``W1``).
            hidden_size: Number of hidden units (columns of ``W1``, rows of ``W2``).
            output_size: Number of outputs (columns of ``W2``).
            weight_init_std: Factor applied to the standard-normal samples
                used for the initial weights.
        """
        self.params = {}
        # Weights: scaled standard-normal samples. Biases: zeros.
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        """Run the forward pass and return the softmax output for ``x``.

        ``x`` is multiplied by ``W1``, so its last dimension must equal
        ``input_size``.
        """
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        return y

    def loss(self, x, t):
        """Return the cross-entropy error of the prediction for ``x`` against ``t``.

        Args:
            x: Input data.
            t: Target (correct-answer) labels.
        """
        y = self.predict(x)

        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        """Return the fraction of samples in ``x`` that are classified correctly.

        Args:
            x: Batch of inputs; ``predict(x)`` must yield one row per sample.
            t: Targets, either one-hot rows (2-D) or class indices (1-D).

        Returns:
            Number of correct predictions divided by the number of samples.
        """
        y = np.argmax(self.predict(x), axis=1)
        # One-hot targets are reduced to class indices; 1-D targets are taken
        # to be class indices already.
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        # Divide by the sample count so the result is a ratio, not a raw count
        # that changes meaning with the batch size.
        return np.sum(y == t) / float(y.shape[0])

    def numerical_gradient(self, x, t):
        """Return the gradient of the loss w.r.t. every parameter.

        Args:
            x: Input data.
            t: Target labels.

        Returns:
            dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'`` holding the
            value returned by the module-level ``numerical_gradient`` helper
            for the corresponding parameter.
        """
        # The argument W is deliberately unused: the loss is always evaluated
        # from self.params. This relies on the helper perturbing the array it
        # receives in place, since that array is the one stored in self.params.
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        # Inside this method the bare name resolves to the module-level
        # helper, not to this method.
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])

        return grads
    
    

In [5]:
# Build a 784-100-10 network and print the shape of each parameter array.
net = TwoLayerNet(input_size=784, hidden_size=100, output_size=10)
for name in ('W1', 'b1', 'W2', 'b2'):
    print(net.params[name].shape)

(784, 100)
(100,)
(100, 10)
(10,)


In [6]:
# Prediction on dummy data: 100 random rows of 784 features each.
x = np.random.rand(100, 784)
y = net.predict(x)

print(y)

[[0.09489868 0.09697798 0.10029724 0.09822704 0.09715464 0.10368148
  0.08882073 0.1078613  0.10895127 0.10312963]
 [0.09531687 0.09743861 0.09997399 0.09850761 0.09690068 0.10386824
  0.08850936 0.10739063 0.10927418 0.10281984]
 [0.09525205 0.09705974 0.09988768 0.09859943 0.09680277 0.10385995
  0.08853446 0.10743324 0.10928028 0.1032904 ]
 [0.09483393 0.09749759 0.10044797 0.09845506 0.09691886 0.10355617
  0.08875947 0.10736737 0.10936712 0.10279645]
 [0.09464508 0.09732079 0.09998164 0.09836474 0.09686215 0.10406968
  0.08871398 0.10794758 0.10909325 0.10300111]
 [0.09500964 0.09753153 0.09990591 0.09840901 0.09683623 0.10386991
  0.08844827 0.10762909 0.10908128 0.10327912]
 [0.09491363 0.09705274 0.10024125 0.09842547 0.09678245 0.10384275
  0.08870405 0.10747877 0.10915749 0.10340139]
 [0.09472161 0.09740547 0.10017151 0.09834663 0.09675758 0.10368572
  0.08891552 0.1075404  0.10894979 0.10350576]
 [0.09498057 0.09708686 0.10016077 0.09856449 0.09707899 0.1035795
  0.08843964 

In [7]:
# Random inputs and random (not one-hot) targets, used here only to exercise
# the gradient computation and inspect the resulting shapes.
x = np.random.rand(100, 784)
t = np.random.rand(100, 10)

grads = net.numerical_gradient(x, t)

for name in ('W1', 'b1', 'W2', 'b2'):
    print(grads[name].shape)


(784, 100)
(100,)
(100, 10)
(10,)


In [8]:
import numpy as np
from mnist import load_mnist

# Mini-batch SGD training loop that records the loss after every update.
(x_train, t_train), (x_test, t_test) = load_mnist( normalize = True, one_hot_label=True)

# Loss on the current mini-batch, appended once per iteration.
train_loss_list = []

iters_num = 10000  # total number of parameter updates
train_size = x_train.shape[0]
batch_size = 100
learning_late = 0.1  # learning rate (step size); the name is spelled this way in the notebook

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
    # Draw batch_size random indices in [0, train_size); np.random.choice
    # samples with replacement by default.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # NOTE(review): numerical differentiation is presumably very slow at this
    # parameter count; the recorded run of this cell ended in KeyboardInterrupt.
    grad = network.numerical_gradient(x_batch, t_batch)
    
    # Gradient-descent step, applied in place to each parameter array.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_late * grad[key]
    
    # Loss is measured after the update, on the same mini-batch.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)


KeyboardInterrupt: 

에폭(epoch) = 학습에서 훈련 데이터를 모두 소진했을 때의 횟수에 해당하는 단위. 예를 들어 10,000개의 훈련 데이터를 크기 100인 미니배치로 학습할 경우, SGD를 100회 반복하면 훈련 데이터를 모두 소진한 셈이 되며,
이 경우 100회가 1에폭이 됨. 오버피팅이 일어나는지 판단하기 위해 1에폭마다 훈련 데이터와 시험 데이터에 대한 정확도를 평가하여 기록하기로 함.

In [None]:
## 개선 
import numpy as np
from mnist import load_mnist

# Mini-batch SGD training loop that records the loss after every update and
# the train/test accuracy once per epoch.
(x_train, t_train), (x_test, t_test) = load_mnist( normalize = True, one_hot_label=True)

train_loss_list = []  # loss on the current mini-batch, one entry per iteration
train_acc_list = []   # network.accuracy on the full training set, one per epoch
test_acc_list = []    # network.accuracy on the full test set, one per epoch

iters_num = 10000  # total number of parameter updates
train_size = x_train.shape[0]
batch_size = 100
learning_late = 0.1  # learning rate (step size); the name is spelled this way in the notebook

# Number of iterations that make up one epoch. Floor division keeps this an
# integer, so the `i % iter_per_epoch == 0` check below fires once per epoch
# even when train_size is not an exact multiple of batch_size (with true
# division the float modulus would almost never be exactly zero).
iter_per_epoch = max(train_size // batch_size, 1)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
    # Draw batch_size random indices in [0, train_size); np.random.choice
    # samples with replacement by default.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grad = network.numerical_gradient(x_batch, t_batch)

    # Gradient-descent step, applied in place to each parameter array.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_late * grad[key]

    # Loss is measured after the update, on the same mini-batch.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch (including iteration 0), evaluate on the full train and
    # test sets so the two curves can be compared for signs of overfitting.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) +", " + str(test_acc))