In [50]:
%load_ext nb_black
import numpy as np 
import matplotlib.pyplot as plt
import pickle

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [51]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Evaluated in a numerically stable form: ``exp`` is only ever applied to
    non-positive values, so inputs with a large negative value no longer
    overflow (and emit a RuntimeWarning) on the way to a result of 0.

    Args:
        x: Real scalar or array-like.

    Returns:
        Values in [0, 1] with the same shape as ``x``; a scalar input yields
        a NumPy scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in [0, 1], so neither branch below can overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x)        (z == e^-x)
    # x <  0: e^x / (1 + e^x)       (z == e^x), algebraically the same value
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () unwraps a 0-d result to a scalar and is a no-op otherwise.
    return out[()]


def softmax(x):
    """Softmax along the last axis.

    The maximum along the last axis is subtracted before exponentiating so
    that the largest exponent is 0, which guards against overflow without
    changing the result.

    Args:
        x: NumPy array of scores; the last axis indexes the classes.

    Returns:
        Array of the same shape whose entries along the last axis sum to 1.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

<IPython.core.display.Javascript object>

In [52]:
class MulLayer:
    """Multiplication node of a computational graph.

    ``forward`` remembers both operands because the gradient with respect to
    each input is the upstream gradient times the *other* input.
    """

    def __init__(self):
        # Operands of the most recent forward pass (None until forward runs).
        self.x, self.y = None, None

    def forward(self, x, y):
        """Store the operands and return their product ``x * y``."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``."""
        # d(x*y)/dx = y and d(x*y)/dy = x, hence the swapped operands.
        return dout * self.y, dout * self.x

<IPython.core.display.Javascript object>

In [53]:
# Worked example: price of `napple` apples at `apple` each, with tax applied.
apple = 100 
napple = 2 
tax = 1.1 

# One multiplication node per product in the graph:
#   apple * napple -> apple_price,  apple_price * tax -> price
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()
apple_price = mul_apple_layer.forward(apple, napple)
price = mul_tax_layer.forward(apple_price, tax)
print(price)

# Backward pass: start from d(price)/d(price) = 1 and visit the layers in
# the reverse of the forward order.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dnapple = mul_apple_layer.backward(dapple_price)
print(dapple, dnapple, dtax)


220.00000000000003
2.2 110.00000000000001 200


<IPython.core.display.Javascript object>

In [54]:
class AddLayer:
    """Addition node of a computational graph (keeps no state)."""

    def __init__(self):
        """Nothing to initialise: the backward pass needs no cached inputs."""

    def forward(self, x, y):
        """Return the sum ``x + y``."""
        total = x + y
        return total

    def backward(self, dout):
        """Return ``(dx, dy)``; addition passes the upstream gradient through unchanged."""
        dx = dy = dout
        return dx, dy

<IPython.core.display.Javascript object>

In [55]:
# Worked example: total price of apples and oranges, with tax applied.
apple = 100 
apple_num = 2 
orange = 150 
orange_num = 3 
tax = 1.1
# Layers: one node per operation in the graph
#   (apple * apple_num + orange * orange_num) * tax
mul_apple_layer = MulLayer() 
mul_orange_layer = MulLayer() 
add_apple_orange_layer = AddLayer() 
mul_tax_layer = MulLayer()
# Forward pass; the (n) markers pair each step with its backward counterpart
apple_price = mul_apple_layer.forward(apple, apple_num) #(1)
orange_price = mul_orange_layer.forward(orange, orange_num) #(2)
all_price = add_apple_orange_layer.forward(apple_price, orange_price) #(3) 
price = mul_tax_layer.forward(all_price, tax) #(4)
# Backward pass: same layers visited in reverse order, seeded with dprice = 1
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice) #(4)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price) #(3) 
dorange, dorange_num = mul_orange_layer.backward(dorange_price) #(2)
dapple, dapple_num = mul_apple_layer.backward(dapple_price) #(1)
print(price) # 715
print(dapple_num, dapple, dorange, dorange_num, dtax) # 110 2.2 3.3 165 650

715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


<IPython.core.display.Javascript object>

In [56]:
class Relu:
    """ReLU activation layer: ``max(x, 0)`` element-wise.

    ``forward`` records which entries were non-positive so that ``backward``
    can block the gradient at exactly those positions.
    """

    def __init__(self):
        # Boolean array, True where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every entry ``<= 0`` replaced by 0.

        Args:
            x: NumPy array of pre-activations (left unmodified).
        """
        self.mask = x <= 0
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return the gradient with respect to the forward input.

        Args:
            dout: Upstream gradient, same shape as the forward input.

        Returns:
            A new array equal to ``dout`` with zeros where the forward input
            was ``<= 0``.
        """
        # Work on a copy: the caller's gradient array must not be altered
        # as a side effect of back-propagation.
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

<IPython.core.display.Javascript object>

In [64]:
class Sigmoid:
    """Sigmoid activation layer.

    The forward output is cached because the derivative of the sigmoid can
    be written purely in terms of that output: ``out * (1 - out)``.
    """

    def __init__(self):
        # Output of the most recent forward pass (None until forward runs).
        self.out = None

    def forward(self, x):
        """Apply ``sigmoid`` to ``x``, cache the result and return it."""
        activated = sigmoid(x)
        self.out = activated
        return activated

    def backward(self, dout):
        """Scale the upstream gradient ``dout`` by the local sigmoid derivative."""
        complement = 1.0 - self.out
        return dout * complement * self.out

<IPython.core.display.Javascript object>

In [58]:
class Affine:
    """Fully connected layer computing ``x . w + b``.

    Inputs with more than two dimensions are flattened to
    ``(x.shape[0], -1)`` for the matrix product; ``backward`` returns the
    input gradient in the original input shape.
    """

    def __init__(self, w, b):
        # Parameters are stored by reference, not copied.
        self.w, self.b = w, b

        # Cached by forward for use in backward.
        self.x = None
        self.original_x_shape = None
        # Parameter gradients, filled in by backward.
        self.dw = None
        self.db = None

    def forward(self, x):
        """Return ``x . w + b`` after flattening ``x`` to two dimensions."""
        self.original_x_shape = x.shape
        flat = x.reshape(x.shape[0], -1)
        self.x = flat
        return np.dot(flat, self.w) + self.b

    def backward(self, dout):
        """Store ``dw`` / ``db`` and return the gradient w.r.t. the input.

        Args:
            dout: Upstream gradient with the same shape as the forward output.
        """
        # Parameter gradients.
        self.dw = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        # Input gradient, reshaped back to what forward originally received.
        flat_dx = np.dot(dout, self.w.T)
        return flat_dx.reshape(self.original_x_shape)

<IPython.core.display.Javascript object>

In [65]:
def cross_entropy_error(y, t):
    """Mean cross-entropy between predicted probabilities and targets.

    Args:
        y: Predicted probabilities, either one sample (1-D) or a batch (2-D,
            one row per sample).
        t: Targets as a NumPy array, either one-hot encoded (same number of
            elements as ``y``) or integer class indices.

    Returns:
        The negative log-probability assigned to the correct class, averaged
        over the batch.
    """
    # Promote a single sample to a batch of one.
    if y.ndim == 1:
        t, y = t.reshape(1, t.size), y.reshape(1, y.size)

    # Equal element counts are taken to mean one-hot targets; reduce them to
    # class indices.
    labels = t.argmax(axis=1) if t.size == y.size else t

    n_samples = y.shape[0]
    picked = y[np.arange(n_samples), labels]
    # The 1e-7 offset keeps log() finite when a probability is exactly 0.
    return -np.sum(np.log(picked + 1e-7)) / n_samples


class SoftmaxWithLoss:
    """Softmax activation fused with the cross-entropy loss.

    Attributes set by ``forward``:
        loss: the loss returned by ``cross_entropy_error``.
        y: the softmax output.
        t: the targets, one-hot encoded or integer class indices.
    """

    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None

    def forward(self, x, t):
        """Compute softmax probabilities for ``x`` and the loss against ``t``.

        Args:
            x: Raw scores, one sample (1-D) or a batch (2-D).
            t: Targets as a NumPy array (one-hot or class indices).

        Returns:
            The cross-entropy loss, also stored in ``self.loss``.
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Gradient of the loss with respect to the scores given to ``forward``.

        Args:
            dout: Upstream gradient of the loss; the result is scaled by it.

        Returns:
            Array with the same shape as ``self.y``.
        """
        # Treat a single 1-D sample as a batch of one, mirroring what
        # cross_entropy_error does on the forward side; the batch size comes
        # from y because t may be a 1-D one-hot vector or a bare label.
        y = self.y.reshape(1, -1) if self.y.ndim == 1 else self.y
        batch_size = y.shape[0]

        if self.t.size == self.y.size:  # one-hot targets
            dx = (y - self.t.reshape(y.shape)) / batch_size
        else:  # integer class indices
            dx = y.copy()
            dx[np.arange(batch_size), self.t.reshape(batch_size)] -= 1
            dx = dx / batch_size

        # Chain rule: apply the upstream gradient, then restore y's shape.
        return (dx * dout).reshape(self.y.shape)


<IPython.core.display.Javascript object>

In [67]:
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` by central differences.

    Each element of ``x`` is perturbed in place by ``+h`` and ``-h`` (so a
    callable that reads the same array object sees the change) and restored
    before the next element is visited.

    Args:
        f: Callable invoked as ``f(x)`` and returning a scalar.
        x: NumPy array; temporarily modified in place, restored on return.

    Returns:
        Array shaped like ``x`` holding ``(f(x + h) - f(x - h)) / (2 * h)``
        for each element.
    """
    eps = 1e-4
    grad = np.zeros_like(x)

    walker = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
    for _ in walker:
        pos = walker.multi_index
        saved = x[pos]

        x[pos] = saved + eps
        f_plus = f(x)  # f(x + h)

        x[pos] = saved - eps
        f_minus = f(x)  # f(x - h)

        # Put the original value back before moving to the next element.
        x[pos] = saved
        grad[pos] = (f_plus - f_minus) / (2 * eps)

    return grad

<IPython.core.display.Javascript object>

In [68]:
from collections import OrderedDict


class TwoLayerNet:
    """Two-layer fully connected classifier: Affine -> ReLU -> Affine.

    A ``SoftmaxWithLoss`` head turns the scores into a loss. Parameters live
    in ``self.params`` under the keys ``"w1"``, ``"b1"``, ``"w2"``, ``"b2"``;
    the Affine layers hold references to those same arrays.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights are standard-normal draws scaled by weight_init_std; biases
        # start at zero.
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {"w1": w1, "b1": b1, "w2": w2, "b2": b2}

        # Insertion order is the forward order; gradient() walks it reversed.
        self.layers = OrderedDict(
            [
                ("Affine1", Affine(w1, b1)),
                ("Relu1", Relu()),
                ("Affine2", Affine(w2, b2)),
            ]
        )

        self.last_layer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the raw scores."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss of the network's scores for ``x`` against ``t``."""
        scores = self.predict(x)
        return self.last_layer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose top-scoring class matches ``t``.

        ``t`` may be a 1-D array of class indices or a one-hot matrix.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return finite-difference gradients of the loss for every parameter."""

        def loss_w(w):
            # The argument is unused: the module-level numerical_gradient
            # perturbs the parameter array in place, and the layers read
            # that same array.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_w, self.params[key])
            for key in ("w1", "b1", "w2", "b2")
        }

    def gradient(self, x, t):
        """Return back-propagated gradients of the loss for every parameter."""
        # Forward pass: populates the values each layer caches for backward.
        self.loss(x, t)

        # Backward pass: loss head first, then the layers in reverse order.
        dout = self.last_layer.backward(1.0)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        affine1 = self.layers["Affine1"]
        affine2 = self.layers["Affine2"]
        return {
            "w1": affine1.dw,
            "b1": affine1.db,
            "w2": affine2.dw,
            "b2": affine2.db,
        }


<IPython.core.display.Javascript object>

In [69]:
from mnist import load_mnist

# Gradient check: compare back-propagated gradients with finite-difference
# gradients on a tiny batch.
# NOTE(review): load_mnist comes from the local `mnist` module; presumably it
# returns scaled pixel arrays and one-hot labels with these flags -- confirm.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Only the first three samples are used for the check.
x_batch = x_train[:3]
t_batch = t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Mean absolute difference per parameter.
for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(f"{key}: {diff}")

w1: 3.9584387223858237e-10
b1: 2.2756062378914177e-09
w2: 5.158656588409856e-09
b2: 1.3935433495204652e-07


<IPython.core.display.Javascript object>

In [70]:
from mnist import load_mnist

# Train the two-layer network with mini-batch SGD and report accuracy once
# per epoch.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

niter = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list, train_acc_list, test_acc_list = [], [], []
# Whole number of mini-batch steps per epoch. Floor division keeps this an
# int: with true division the value is a float, and `i % iter_per_epoch == 0`
# would only ever hold at i == 0 whenever train_size is not an exact
# multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(niter):
    # Draw a random mini-batch (indices are sampled with replacement).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grad = network.gradient(x_batch, t_batch)

    # In-place update, so the arrays referenced by the layers change too.
    for key in ('w1', 'b1', 'w2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Loss on the current batch, measured after the parameter update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch: accuracy on the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.09928333333333333 0.1
0.9001833333333333 0.904
0.9246 0.9269
0.9375666666666667 0.9373
0.9424333333333333 0.9419
0.9520833333333333 0.9497
0.9568833333333333 0.9526
0.9606166666666667 0.956
0.9621666666666666 0.9584
0.9654166666666667 0.9608
0.9679333333333333 0.9627
0.9709166666666667 0.9657
0.9712 0.9656
0.9742333333333333 0.9685
0.9755666666666667 0.9682
0.9778166666666667 0.9694
0.9795333333333334 0.971


<IPython.core.display.Javascript object>