In [1]:
import numpy as np
from collections import OrderedDict
from tensorflow import keras

In [2]:
def softmax(x):
    if x.ndim == 1:
        c = max(x)
        x = x-c
        return np.exp(x)/np.sum(np.exp(x))
    else:
        x = x.T - np.max(x,axis=1)
        x = x.T
        x = (np.exp(x).T/np.sum(np.exp(x),axis=1)).T
        return x   

def cross_entropy(y,t):
    epsilon = 1e-7
    y = softmax(y)
    return -np.sum(t*np.log(y+epsilon))/y.shape[0]

def sigmoid(x):
    return 1/(np.exp(-x))


def numerical_gradient(f,x):
    h = 1e-4
    grad = np.zeros_like(x)
    it = np.nditer(x,flags=['multi_index'],op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = tmp_val + h
        fxh1 = f(x)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        grad[idx] = (fxh1-fxh2)/(2*h)
        x[idx] = tmp_val
        it.iternext()
    return grad

In [3]:
class Relu:
    def __init__(self):
        self.mask = None
        
    def forward(self,x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
    
    def backward(self,dout):
        dout[self.mask] = 0
        dx = dout
        return dx  

class Sigmoid:
    def __init_(self):
        self.out = None
    
    def forward(self,x):
        out = sigmoid(x)
        self.out = out
        return self.out
    
    def backward(self,dout):
        dx = dout*self.out*(1. - self.out)
        return dx


class Affine:
    def __init__(self,W,b):
        self.W = W
        self.b = b
        self.x = None
        self.original_shape = None
        self.dW = None
        self.db = None
        
    def forward(self,x):
        # shape유지
        self.original_shape = x.shape
        x = x.reshape(x.shape[0],-1)
        self.x = x
        out = np.dot(self.x,self.W) + self.b
        return out
        
    
    def backward(self,dout):
        dx = np.dot(dout,self.W.T)
        self.dW = np.dot(self.x.T,dout)
        self.db = np.sum(dout,axis=0)
        dx = dx.reshape(*self.original_shape)
        return dx

class SoftmaxWithLoss:
    
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None
    
    def forward(self,x,t):
        self.t = t
        self.y = softmax(x)
        loss = cross_entropy(self.y,t)
        return loss
    
    def backward(self,dout=1):
        dx = (self.y - self.t)/self.t.shape[0]
        return dx
        
class LeakyReLu:
    
    def __init__(self,alpha=0.01):
        self.alpha = alpha
    
    def forward(self,x):
        out = np.where(x>0,x,self.alpha*x)
        return out
    
    def backward(self,dout):
        dx = np.where(dout >0,dout,self.alpha*dout) 
        return dx

In [15]:
class MultiNet:
    # def __init__(self):
    #     self.W = {
    #         # 'W1':np.random.randn(4,100),
    #         # 'W2':np.random.randn(100,50),
    #         # 'W3':np.random.randn(50,3),
    #         # 'b1':np.random.randn(100),
    #         # 'b2':np.random.randn(50),
    #         # 'b3':np.random.randn(3),
    #         'W1':np.random.randn(28 * 28, 100) * 0.01,
    #         'W2':np.random.randn(100, 50) * 0.01,
    #         'W3':np.random.randn(50, 10) * 0.01,
    #         # 'b1':np.random.randn(100) * 0,
    #         # 'b2':np.random.randn(50) * 0,
    #         # 'b3':np.random.randn(10) * 0,
    #         'b1':np.zeros(100),
    #         'b2':np.zeros(50),
    #         'b3':np.zeros(10),
    #     }
    #     self.layers = OrderedDict()
    #     self.layers['Affine1'] = Affine(self.W['W1'],self.W['b1'])
    #     self.layers['relu1'] = Relu()
    #     self.layers['Affine2'] = Affine(self.W['W2'],self.W['b2'])
    #     self.layers['relu2'] = Relu()
    #     self.layers['Affine3'] = Affine(self.W['W3'],self.W['b3'])
    #     self.last_layer = SoftmaxWithLoss()
    def __init__(self, input_shape, x, decay, activation):
        self.decay = decay
        self.activation = activation
        self.model = []
        self.input_shape = input_shape
        self.x = x
        w = np.random.randn(input_shape, x)
        b = np.random.randn(x)
        self.model.append((w, b))
        self.model.append(activation)
        self.activation_type = {
            'relu': Relu(), 
            'sigmoid': Sigmoid(), 
            'softmax': SoftmaxWithLoss(), 
        }
    
    def predict(self,x):
        for layer in self.layers.values():
            x  = layer.forward(x)
        return x
              
    def loss(self,x,t):
        y = self.predict(x)   
        return self.last_layer.forward(y,t)
    
    def _numeric_gradient(self,x,t,learning_rate):
        self.learning_rate = learning_rate
        f = lambda w : self.loss(x,t)
        for i in range(3):
            self.W['W'+str(i+1)] -= self.learning_rate*numerical_gradient(f,self.W['W'+str(i+1)])
            self.W['b'+str(i+1)] -= self.learning_rate*numerical_gradient(f,self.W['b'+str(i+1)])
    
    def accuracy(self,x,t):
        result = self.predict(x)
        acc = sum(np.argmax(result,axis=1) == np.argmax(t,axis=1))/len(t)
        return acc
    
    def gradient(self,x,t):
        ## forward
        self.loss(x,t)
        ## backward
        dout = 1
        dout = self.last_layer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['W2'] = self.layers['Affine2'].dW
        grads['W3'] = self.layers['Affine3'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['b2'] = self.layers['Affine2'].db
        grads['b3'] = self.layers['Affine3'].db
                
        return grads
    
    def add(self, x, activation):
        self.input = self.model[-1][1].size
        w = np.random.randn(self.input, x)
        b = np.random.randn(x)
        self.model.append((w, b))
        self.model.append(activation)


In [10]:
from sklearn.datasets import load_iris
X = load_iris()['data']
Y = load_iris()['target']

lr = 1e-3
epochs = 100

model = MultiNet()

# grad = model.gradient(X, Y)

# model.layers['Affine1'].dW -= lr * grad['W1']
# model.layers['Affine1'].db -= lr * grad['b1']
# model.layers['Affine2'].dW -= lr * grad['W2']
# model.layers['Affine2'].db -= lr * grad['b2']
# model.layers['Affine3'].dW -= lr * grad['W3']
# model.layers['Affine3'].db -= lr * grad['b3']

# np.sum(model.layers['Affine1'].dW)

for e in range(epochs):
    grad = model.gradient(X, Y)
    model.W['W1'] -= lr * grad['W1']
    model.W['b1'] -= lr * grad['b1']
    model.W['W2'] -= lr * grad['W2']
    model.W['b2'] -= lr * grad['b2']
    model.W['W3'] -= lr * grad['W3']
    model.W['b3'] -= lr * grad['b3']

    # model.W['W1'] = grad['W1']
    # model.W['b1'] = grad['b1']
    # model.W['W2'] = grad['W2']
    # model.W['b2'] = grad['b2']
    # model.W['W3'] = grad['W3']
    # model.W['b3'] = grad['b3']

    print(e+1, model.loss(X, Y))

1 1.2181110081880455
2 1.2134637719895558
3 1.2865982550309758
4 1.2827437880670178
5 1.2296364954735546
6 0.9577394315937414
7 1.0664708112198475
8 0.891423924258743
9 0.9820550109591807
10 0.8895179942098483
11 0.9649484167678765
12 0.8805245860561924
13 0.95044802772149
14 0.8782493477485368
15 0.9378090818691672
16 0.8706936001214032
17 0.9319637425821115
18 0.8696097074223983
19 0.9249567752984319
20 0.8597062274921253
21 0.9181461618105751
22 0.8625779668018552
23 0.9111390637837363
24 0.8508734947378429
25 0.9080541114496568
26 0.85928024321915
27 0.9045134168294089
28 0.8447720535258589
29 0.9023307669953734
30 0.857131121753351
31 0.9009759544834499
32 0.839150132922096
33 0.8993564200136678
34 0.8542254368687849
35 0.898785945879798
36 0.8330728002186927
37 0.8977590984675027
38 0.8536837775293801
39 0.8973996302986603
40 0.8251220957545545
41 0.8958785259876542
42 0.8565694511586059
43 0.8956347343573824
44 0.8216249080863135
45 0.893093852799899
46 0.8568275942452819
47 0.8

In [18]:
from tensorflow.keras.datasets import mnist
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
X_train = X_train.reshape(-1, 28 * 28)
X_test = X_test.reshape(-1, 28 * 28)
Y_train = keras.utils.to_categorical(Y_train)
Y_test = keras.utils.to_categorical(Y_test)

lr = 1e-3
epochs = 1000

model = MultiNet()

for e in range(epochs):
    X, Y = np.copy(X_train), np.copy(Y_train)
    grad = model.gradient(X, Y)
    model.W['W1'] -= lr * grad['W1']
    model.W['b1'] -= lr * grad['b1']
    model.W['W2'] -= lr * grad['W2']
    model.W['b2'] -= lr * grad['b2']
    model.W['W3'] -= lr * grad['W3']
    model.W['b3'] -= lr * grad['b3']
    print(e+1, model.loss(X, Y))

1 2.3624324063633075
2 2.3597318990084872
3 2.2967802603571355
4 2.2734422855981653
5 2.3358612570811084
6 2.3046172925146187
7 2.302244130749385
8 2.2655515381758984
9 2.260072448296308
10 2.255010693517982
11 2.280606095021757
12 2.2431183823318483
13 2.2451779546087813
14 2.2469388603343927
15 2.257074620006451
16 2.2411258905981217
17 2.2417722235416626
18 2.2490721826755733
19 2.240429558457887
20 2.2804488840746164
21 2.2301452260989074
22 2.2294068025106104
23 2.229145142015557
24 2.228209977325263
25 2.227811460846818
26 2.2258375348048625
27 2.2258639617292477
28 2.226741393360952
29 2.2604067877407292
30 2.2269257813702987
31 2.227720729726769
32 2.2261467114161535
33 2.2284269505095864
34 2.253995424022629
35 2.2271685872097504
36 2.228654303543956
37 2.2296048330437235
38 2.231508991967227
39 2.272857759480099
40 2.230345473760841
41 2.2313926155123065
42 2.2325366923330603
43 2.2327429598763255
44 2.2332048937521884
45 2.2344990464372065
46 2.2355445265131832
47 2.23683819

In [19]:
model.accuracy(X_test, Y_test)

0.1393

In [21]:
# model.predict(X_test)