# 4.1. Data-driven Approach
# 4.2. Loss Function
## 4.2.1. MSE

- $E = \frac{1}{2}\sum_{k}^{}{(y_{k} - t_{k}) ^2}$

    - ${y_{k}}$는 신경망의 출력, ${t_{k}}$는 정답 레이블, k는 출력 차원의 인덱스

## 4.2.2 Cross Entropy

- $E=-\sum_{k}{t_{k}}\log{y_{k}}$

   - ${y_{k}}$는 신경망의 확률 출력값, ${t_{k}}$는 라벨의 one-hot-encoding 벡터

In [1]:
import numpy as np

In [2]:
# When t is a one-hot-encoded label vector (loss for a single sample)
def cross_entropy_error(y, t):
    """Return the cross-entropy error between prediction ``y`` and target ``t``.

    Evaluates ``-sum(t * log(y + eps))`` over every element of the inputs.
    """
    eps = 1e-7  # small offset so np.log never receives an exact zero
    log_probs = np.log(y + eps)
    return -np.sum(t * log_probs)

## 4.2.3 Mini-Batch Learning

In [3]:
from tensorflow.python.keras.datasets.mnist import load_data

# Load the dataset, then flatten every sample into one row and divide by 255.
(X_train, y_train), (X_test, y_test) = load_data()
X_train = X_train.reshape(60000, -1) / 255
X_test = X_test.reshape(10000, -1) / 255

from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training labels only, then convert both label sets
# to dense one-hot arrays with the same fitted encoder.
enc = OneHotEncoder()
enc.fit(y_train.reshape(-1, 1))
y_train = enc.transform(y_train.reshape(-1, 1)).toarray()
y_test = enc.transform(y_test.reshape(-1, 1)).toarray()

In [4]:
def mini_batch(x, y, batch_size):
    """Return a random mini-batch ``(x_batch, y_batch)`` of ``batch_size`` rows.

    Row indices are drawn with ``np.random.choice`` (sampling with
    replacement, its default) and the same indices are applied to both ``x``
    and ``y`` so that samples and labels stay aligned.
    """
    n_samples = x.shape[0]
    picked = np.random.choice(n_samples, batch_size)
    x_batch = x[picked]
    y_batch = y[picked]
    return x_batch, y_batch

In [5]:
# Quick check: draw one mini-batch and show the shapes of inputs and labels.
tmp1, tmp2 = mini_batch(X_train, y_train, batch_size=128)
print(tmp1.shape, tmp2.shape)

(128, 784) (128, 10)


## 4.2.4 Cross-Entropy for Mini-Batch Learning
- $E=-\frac{1}{N}\sum_{n}\sum_{k}{t_{nk}}\log{y_{nk}}$

    - "평균 손실 함수"를 구하는 것

In [6]:
def cross_entropy_error(y, t):
    """Return the mean cross-entropy error over a mini-batch.

    Args:
        y: Predicted probabilities. Either a 1-D array for a single sample or
            an array whose first axis indexes the samples of a batch.
        t: Target array with the same shape as ``y`` (e.g. one-hot labels).

    Returns:
        ``-sum(t * log(y + delta)) / N`` where ``N`` is the number of samples
        (``y.shape[0]`` for batched input, 1 for a single 1-D sample).
    """
    delta = 1e-7  # small offset so np.log never receives an exact zero

    # The batch size must be read from the un-flattened array: dividing by
    # the flattened length would scale the loss down by the number of classes.
    batch_size = y.shape[0] if y.ndim > 1 else 1

    return -np.sum(t * np.log(y + delta)) / batch_size

# 4.3 수치 미분

# 4.4 기울기

In [7]:
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is perturbed in place by ``+h`` and ``-h``, ``f(x)``
    is evaluated at both points, and the element is then restored to its
    original value. The result has the same shape and dtype as ``x``.
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    # Visit every index of x, whatever its number of dimensions.
    for idx in np.ndindex(*x.shape):
        saved = x[idx]

        x[idx] = float(saved) + h
        f_plus = f(x)  # f(x + h)

        x[idx] = float(saved) - h
        f_minus = f(x)  # f(x - h)

        grad[idx] = (f_plus - f_minus) / (2 * h)
        x[idx] = saved  # restore the original value

    return grad

# 4.5 구현

In [8]:
def sigmoid(x):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)
    
def softmax(x):
    """Return the softmax of ``x`` computed along axis 1.

    The maximum along axis 1 is subtracted before exponentiation so that
    ``np.exp`` is never applied to a large positive value.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    totals = np.sum(exps, axis=1, keepdims=True)
    return exps / totals

In [9]:
class TwoLayerNet(object):
    """Two-layer fully connected network: sigmoid hidden layer, softmax output.

    All trainable parameters are stored in ``self.params`` under the keys
    ``'W1'``, ``'b1'``, ``'W2'`` and ``'b2'``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters.

        Weights are drawn from a normal distribution and scaled by
        ``weight_init_std``; biases start at zero.
        """
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.normal(size=(input_size, hidden_size))
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.normal(size=(hidden_size, output_size))
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        """Run the forward pass and return the softmax output for ``x``."""
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        return softmax(a2)

    def loss(self, x, t):
        """Return the cross-entropy error of the prediction for ``x`` against ``t``."""
        y = self.predict(x)
        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        """Return the fraction of rows whose predicted class matches ``t``.

        The class of each row is taken as the argmax along axis 1, both for
        the network output and for ``t``.
        """
        y = np.argmax(self.predict(x), axis=1)
        t = np.argmax(t, axis=1)
        # np.float was removed in NumPy 1.24; use the explicit float64 type.
        return np.sum(y == t).astype(np.float64) / x.shape[0]

    def numerical_gradient(self, x, t):
        """Return numerical gradients of the loss w.r.t. every parameter.

        The result is a dict with the same keys as ``self.params``.
        """
        # The argument is ignored on purpose: the module-level
        # numerical_gradient perturbs the parameter array in place, and
        # self.loss reads that same array through self.params.
        loss_W = lambda W: self.loss(x, t)

        grads = {}
        for key in ('W1', 'b1', 'W2', 'b2'):
            grads[key] = numerical_gradient(loss_W, self.params[key])

        return grads

In [12]:
# Build the network and print the shapes of its weights and biases.
net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
print(net.params['W1'].shape, net.params['W2'].shape)
print(net.params['b1'].shape, net.params['b2'].shape)

(784, 50) (50, 10)
(50,) (10,)


```python
iter_num = 10000
batch_size = 100
learning_rate = 0.1

# Number of mini-batch updates that make up one pass over the training set.
# The previous condition `i % iter_num / batch_size == 0` parsed as
# `(i % iter_num) / batch_size == 0`, which is true only at i == 0.
iter_per_epoch = max(X_train.shape[0] // batch_size, 1)

train_loss, train_acc, test_acc = [], [], []

for i in range(iter_num):
    x_batch, y_batch = mini_batch(X_train, y_train, batch_size)
    grad = net.numerical_gradient(x_batch, y_batch)

    # Plain gradient-descent step on every parameter (updated in place).
    for key in net.params:
        net.params[key] -= learning_rate * grad[key]

    # Record and report metrics once per epoch.
    if i % iter_per_epoch == 0:
        loss = net.loss(x_batch, y_batch)
        acc1, acc2 = net.accuracy(X_train, y_train), net.accuracy(X_test, y_test)
        train_loss.append(loss)
        train_acc.append(acc1)
        test_acc.append(acc2)

        print(i,":", "loss, train acc, test acc |" , str(loss) +',', str(acc1) +',', str(acc2))
```