In [1]:
import numpy as np
import warnings
from collections import OrderedDict
warnings.filterwarnings("ignore")
from tensorflow import keras

In [2]:
def softmax(x):
    if x.ndim == 1:
        c = max(x)
        x = x-c
        return np.exp(x)/np.sum(np.exp(x))
    else:
        x = x.T - np.max(x,axis=1)
        x = x.T
        x = (np.exp(x).T/np.sum(np.exp(x),axis=1)).T
        return x   

def cross_entropy(y,t):
    epsilon = 1e-7
    y = softmax(y)
    return -np.sum(t*np.log(y+epsilon))/y.shape[0]

def sigmoid(x):
    return 1/(np.exp(-x))


def numerical_gradient(f,x):
    h = 1e-4
    grad = np.zeros_like(x)
    it = np.nditer(x,flags=['multi_index'],op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = tmp_val + h
        fxh1 = f(x)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        grad[idx] = (fxh1-fxh2)/(2*h)
        x[idx] = tmp_val
        it.iternext()
    return grad

In [3]:
class Relu:
    def __init__(self):
        self.mask = None
        
    def forward(self,x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self,dout):
        dout[self.mask] = 0
        dx = dout
        return dx  

class Sigmoid:
    def __init_(self):
        self.out = None
    
    def forward(self,x):
        out = sigmoid(x)
        self.out = out
        return self.out
    
    def backward(self,dout):
        dx = dout*self.out*(1. - self.out)
        return dx


class Affine:
    def __init__(self,W,b):
        self.W = W
        self.b = b
        self.x = None
        self.original_shape = None
        self.dW = None
        self.db = None
        
    def forward(self,x):
        # shape유지
        self.original_shape = x.shape
        x = x.reshape(x.shape[0],-1)
        self.x = x
        out = np.dot(self.x,self.W) + self.b
        return out
        
    
    def backward(self,dout):
        dx = np.dot(dout,self.W.T)
        self.dW = np.dot(self.x.T,dout)
        self.db = np.sum(dout,axis=0)
        dx = dx.reshape(*self.original_shape)
        return dx

class SoftmaxWithLoss:
    
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None
    
    def forward(self,x,t):
        self.t = t
        self.y = softmax(x)
        loss = cross_entropy(self.y,t)
        return loss
    
    def backward(self,dout=1):
        dx = (self.y - self.t)/self.t.shape[0]
        return dx
        
class LeakyReLu:
    
    def __init__(self,alpha=0.01):
        self.alpha = alpha
    
    def forward(self,x):
        out = np.where(x>0,x,self.alpha*x)
        return out
    
    def backward(self,dout):
        dx = np.where(dout >0,dout,self.alpha*dout) 
        return dx

In [29]:
class MultiNet:
#     def __init__(self,decay=0.01):
#         self.decay = decay
#         self.W = {
#             'W1':np.random.randn(784,100)*0.01,
#             'W2':np.random.randn(100,50)*0.01,
#             'W3':np.random.randn(50,10)*0.01,
#             'b1':np.zeros(100),
#             'b2':np.zeros(50),
#             'b3':np.zeros(10),
#         }
#         self.layers = OrderedDict()
#         self.layers['Affine1'] = Affine(self.W['W1'],self.W['b1'])
#         self.layers['relu1'] = Relu()
#         self.layers['Affine2'] = Affine(self.W['W2'],self.W['b2'])
#         self.layers['relu2'] = Relu()
#         self.layers['Affine3'] = Affine(self.W['W3'],self.W['b3'])
#         self.last_layer = SoftmaxWithLoss()
    
    def __init__(self,input_shape,x,decay,activation):
        self.decay = decay
        self.activation = activation
        self.model = []
        self.input_shape = input_shape
        self.x = x
        w = np.random.randn(input_shape,x)
        b = np.random.randn(x)
        self.model.append((w,b))
        self.model.append(activation)
        self.activation_type = {
            'relu':Relu(),
            'sigmoid':Sigmoid(),
            'softmax':SoftmaxWithLoss(),
        }
        
    def add(self,x,activation):
        self.input = self.model[-1][1].size
        w = np.random.randn(self.input,x)
        b = np.random.randn(x)
        self.model.append((w,b))
        self.model.append(activation)
        
    def summary():
        pass
    
    
    def predict(self,x):
        for layer in self.layers.values():
            x  = layer.forward(x)
        return x
              
    def loss(self,x,t):
        y = self.predict(x)   
        return self.last_layer.forward(y,t)
    
    def _numeric_gradient(self,x,t,learning_rate):
        self.learning_rate = learning_rate
        f = lambda w : self.loss(x,t)
        for i in range(3):
            self.W['W'+str(i+1)] -= self.learning_rate*numerical_gradient(f,self.W['W'+str(i+1)])
            self.W['b'+str(i+1)] -= self.learning_rate*numerical_gradient(f,self.W['b'+str(i+1)])
    
    def accuracy(self,x,t):
        result = self.predict(x)
        acc = sum(np.argmax(result,axis=1) == np.argmax(t,axis=1))/len(t)
        return acc
    
    def gradient(self,x,t):
        ## forward
        self.loss(x,t)
        ## backward
        dout = 1
        dout = self.last_layer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['W2'] = self.layers['Affine2'].dW
        grads['W3'] = self.layers['Affine3'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['b2'] = self.layers['Affine2'].db
        grads['b3'] = self.layers['Affine3'].db
        
        return grads
        

In [12]:
from sklearn.datasets import load_iris
from keras.datasets import mnist

In [41]:
(X_train, y_train),(X_test,y_test) = mnist.load_data()
X_train = X_train.reshape(-1,28*28)
X_test = X_test.reshape(-1,28*28)
y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

In [31]:
model = MultiNet()

In [26]:
X_test.shape

(10000, 784)

In [32]:
epochs = 1000
lr = 1e-2
for epoch in range(epochs):
    x = X_train.copy()
    grads = model.gradient(x,y_train)
    model.W['W1'] -= lr*grads['W1']
    model.W['W2'] -= lr*grads['W2']
    model.W['W3'] -= lr*grads['W3']
    model.W['b1'] -= lr*grads['b1']
    model.W['b2'] -= lr*grads['b2']
    model.W['b3'] -= lr*grads['b3']
    print(np.sum(x == X_train)/x.size)
    print(epoch+1,"====",model.loss(X_train,y_train))

1.0
1 ==== 2.301711106850885
1.0
2 ==== 2.300072269983733
1.0
3 ==== 2.298317763788285
1.0
4 ==== 2.2963113456435837
1.0
5 ==== 2.2939273144834518
1.0
6 ==== 2.2909998238269202
1.0
7 ==== 2.287295194836455
1.0
8 ==== 2.2825142009899206
1.0
9 ==== 2.2762667038837403
1.0
10 ==== 2.2680229262715943
1.0
11 ==== 2.2571271847740326
1.0
12 ==== 2.2428524371872034
1.0
13 ==== 2.224531635242883
1.0
14 ==== 2.201704913008274
1.0
15 ==== 2.1741530033038035
1.0
16 ==== 2.141962825614152
1.0
17 ==== 2.1059831399145694
1.0
18 ==== 2.068050897292041
1.0
19 ==== 2.0307705843571657
1.0
20 ==== 1.9955967387304818
1.0
21 ==== 1.9653261616666773
1.0
22 ==== 1.935386917206923
1.0
23 ==== 1.9377796137902987
1.0
24 ==== 2.0162150056046158
1.0
25 ==== 2.190263156869877
1.0
26 ==== 2.1204938754184295
1.0
27 ==== 2.193171374655772
1.0
28 ==== 2.1426408260442735
1.0
29 ==== 2.0878735817543657
1.0
30 ==== 2.04509650912428
1.0
31 ==== 2.0118049800003326
1.0
32 ==== 1.9808543812253505
1.0
33 ==== 1.9493869659833145

KeyboardInterrupt: 

In [40]:
y_test

array([[[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]],

       [[0., 1.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]],

       [[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]],

       ...,

       [[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]],

       [[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]],

       [[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]]], dtype=float32)

In [18]:
epochs = 100
lr = 1e-3
for epoch in range(epochs):
    for model.gradient(X,y).items():

{'W1': array([[ 0.00000000e+00,  4.18438157e+01, -1.47128233e+01,
         -1.43852161e-01,  0.00000000e+00,  0.00000000e+00,
          9.68587822e+00,  8.78412220e-40,  1.26756645e+01,
          1.83808221e+01,  1.40976528e+01,  0.00000000e+00,
          2.04852798e+01,  4.96610139e+01,  0.00000000e+00,
         -8.72966360e+00, -1.14831503e+01,  0.00000000e+00,
         -2.15478275e+01,  1.66248143e+01, -8.95180389e+00,
         -9.58744161e-02,  1.28081851e+01,  0.00000000e+00,
          1.07915290e-38,  7.22022583e-39,  3.39572767e+00,
         -1.32358488e+01,  0.00000000e+00, -1.06366903e-39,
          0.00000000e+00,  2.82459543e+00,  0.00000000e+00,
          2.04996575e+01, -1.38746789e+01,  4.98846983e-40,
          1.89842762e+01,  2.17418890e+01, -7.58125402e+00,
         -2.45285613e+01,  0.00000000e+00,  0.00000000e+00,
         -3.20764642e+01,  0.00000000e+00, -1.78879138e+01,
          5.76857216e+00,  2.48275015e+01,  0.00000000e+00,
          2.07904464e+01,  0.00000