In [162]:
import numpy as np
import time
import pickle

In [132]:
# Load the MNIST splits from the local .npz archive.
# np.load's second positional parameter is mmap_mode, not a file mode, so the
# stray 'rb' argument is dropped.
data = np.load('data/mnist.npz')
print(list(data.keys()))
# Look every array up by name. Unpacking data.items() positionally only works
# when the archive happens to iterate in one specific order, and silently
# binds the wrong split to a name (e.g. labels to x_*) when it does not.
x_train, y_train = data['x_train'], data['y_train']
x_valid, y_valid = data['x_valid'], data['y_valid']
x_test, y_test = data['x_test'], data['y_test']

['x_test', 'x_train', 'x_valid', 'y_valid', 'y_train', 'y_test']


In [133]:
# Regularization strength; FCLayer.backward reads this module-level value
# every time it updates a layer's weights.
lam = 0.001
class FCLayer:
    """Fully connected layer ``o = nonlinearity(x @ W + b)`` trained by SGD.

    Parameters
    ----------
    shape : tuple
        ``(n_in, n_out)``. ``W`` is created with this shape and ``b`` with
        ``shape[1]`` entries, both drawn from ``N(0, 0.02)``.
    nonlinearity : callable
        Maps the pre-activation ``r`` to the layer output.
    nonlinearity_der : callable
        Returns the Jacobian of ``nonlinearity`` at ``r`` as a matrix ``J``
        such that ``np.dot(delta, J)`` is the gradient w.r.t. ``r``.

    Notes
    -----
    ``backward`` reads the module-level globals ``learning_rate`` and ``lam``
    at call time; both must be defined before the first backward pass.
    """

    def __init__(self, shape, nonlinearity, nonlinearity_der):
        self.W = np.random.normal(0, 0.02, shape)
        self.b = np.random.normal(0, 0.02, shape[1])
        self.nonlinearity = nonlinearity
        self.nonlinearity_der = nonlinearity_der

    def forward(self, x):
        """Compute the layer output for ``x`` and cache ``x``, ``r``, ``o``.

        The cached input and pre-activation are what ``backward`` uses, so
        ``forward`` must be called on the same sample first.
        """
        self.x = x
        self.r = np.dot(x, self.W) + self.b
        self.o = self.nonlinearity(self.r)
        return self.o

    def backward(self, delta):
        """Apply one SGD step and return the gradient w.r.t. the layer input.

        ``delta`` is the loss gradient w.r.t. this layer's output. The weight
        gradient is built with ``np.outer``, i.e. for a single 1-D sample.
        """
        # Chain rule through the activation: gradient w.r.t. pre-activation r.
        delta = np.dot(delta, self.nonlinearity_der(self.r))

        grad_b = delta
        grad_W = np.outer(self.x, delta)
        # Must use the weights from the forward pass, so compute the
        # propagated gradient before W is modified below.
        delta_in = np.dot(delta, self.W.T)

        # L2 weight decay: the gradient of (lam / 2) * ||W||^2 is lam * W,
        # applied element-wise (not a single scalar mean shared by all weights).
        self.W -= learning_rate * (grad_W + lam * self.W)
        self.b -= learning_rate * grad_b

        return delta_in

In [134]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator

def sigmoid_der(x):
    """Jacobian of ``sigmoid`` at ``x``.

    For a 1-D ``x`` this is the square matrix with ``s * (1 - s)`` on the
    diagonal (``s = sigmoid(x)``) and zeros elsewhere.
    """
    activation = sigmoid(x)
    slope = activation * (1 - activation)
    return np.diag(slope)

def softmax(x):
    """Softmax of ``x``, normalised over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``exp`` from overflowing to ``inf``
    (and the ratio from becoming ``nan``) for large inputs.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; max() of an empty array would raise.
        return np.exp(x)
    exps = np.exp(x - x.max())
    return exps / exps.sum()

def softmax_der(x):
    """Jacobian of ``softmax`` at ``x``: ``diag(p) - p p^T`` with ``p = softmax(x)``."""
    probs = softmax(x)
    return np.diag(probs) - np.outer(probs, probs)

In [135]:
def create_net():
    """Build the 784-256-128-64-10 network as a list of ``FCLayer`` objects.

    The three hidden layers use sigmoid activations and the final layer uses
    softmax. Layers are constructed in input-to-output order.
    """
    widths = (784, 256, 128, 64, 10)
    shapes = list(zip(widths[:-1], widths[1:]))
    hidden_layers = [FCLayer(shape, sigmoid, sigmoid_der) for shape in shapes[:-1]]
    output_layer = FCLayer(shapes[-1], softmax, softmax_der)
    return hidden_layers + [output_layer]

In [156]:
def train(net, x, y):
    """Run one SGD step on a single sample and return its cross-entropy loss.

    Parameters
    ----------
    net : sequence
        Layers exposing ``forward(x)`` and ``backward(delta)``, in input-to-output
        order. Each ``backward`` call is expected to update that layer.
    x : array-like
        Input sample, fed to the first layer.
    y : int
        Index of the true class in the network output.

    Returns
    -------
    float
        ``-log(p)`` where ``p`` is the predicted probability of class ``y``.
    """
    o = x
    for layer in net:
        o = layer.forward(o)

    # Clamp the true-class probability so that a value that underflowed to 0
    # yields a large finite loss/gradient instead of inf (which would turn
    # every weight into nan on the following update).
    eps = 1e-12
    p_true = np.maximum(o[y], eps)

    # Gradient of -log(o[y]) w.r.t. the output; sized from the actual output
    # rather than a hard-coded class count.
    delta = np.zeros_like(o, dtype=float)
    delta[y] = -1. / p_true
    loss = -np.log(p_true)

    for layer in net[::-1]:
        delta = layer.backward(delta)
    return loss

def predict(net, x):
    """Feed ``x`` through every layer of ``net`` and return the argmax of the output."""
    activation = x
    for stage in net:
        activation = stage.forward(activation)
    return np.argmax(activation)

def accuracy(net, X, Y):
    """Fraction of samples in ``X`` whose ``predict`` result equals the paired label in ``Y``.

    The count of matches is divided by ``len(X)``.
    """
    hits = sum(1 for sample, label in zip(X, Y) if predict(net, sample) == label)
    return hits / len(X)

# Build a freshly initialised network. Re-running this cell replaces `net`,
# discarding whatever training the previous instance had received.
net = create_net()

In [161]:
# Training configuration.
learning_rate = 0.1  # read as a module-level global by FCLayer.backward
N_EPOCHS = 10
CHECKPOINT_PATH = 'net.pkl'
# For a quick smoke test, iterate over a slice such as x_train[:1000] /
# y_train[:1000] instead of the full training split.

# Per-epoch history.
losses = []
train_accs = []
valid_accs = []

best_valid = 0

for k in range(N_EPOCHS):
    start = time.time()

    # One pass of per-sample SGD over the training split; `train` updates
    # `net` in place and returns the loss for that sample.
    loss = 0
    for x, y in zip(x_train, y_train):
        loss += train(net, x, y)
    loss /= len(x_train)

    valid_acc = accuracy(net, x_valid, y_valid)
    train_acc = accuracy(net, x_train, y_train)

    losses.append(loss)
    valid_accs.append(valid_acc)
    train_accs.append(train_acc)

    # Checkpoint only when validation accuracy improves.
    if valid_acc > best_valid:
        best_valid = valid_acc
        with open(CHECKPOINT_PATH, 'wb') as out:
            pickle.dump(net, out)

    print('epoch:', k)
    print('loss:', loss)
    print('valid acc:', valid_acc)
    print('train acc:', train_acc)
    # Covers both the training pass and the two accuracy evaluations.
    print('time:', time.time()-start)
    

epoch: 0
loss: 1.09098495751
valid acc: 0.9324
train acc: 0.92616
time: 279.37629103660583
epoch: 1
loss: 0.210268508114
valid acc: 0.9548
train acc: 0.9525
time: 195.05173921585083
epoch: 2
loss: 0.13749972534
valid acc: 0.9517
train acc: 0.95682
time: 197.0195951461792


KeyboardInterrupt: 

In [165]:
# Time one full pass of accuracy() over the validation split.
start = time.time()
valid_accuracy = accuracy(net, x_valid, y_valid)
print(valid_accuracy)
print(time.time() - start)

0.96
1.2019028663635254


In [163]:
# Persist the current in-memory `net` to disk.
# NOTE(review): the training loop writes its best-validation checkpoint to this
# same path, so running this cell replaces that checkpoint with whatever state
# `net` is in right now.
with open('net.pkl', 'wb') as checkpoint_file:
    pickle.dump(net, checkpoint_file)