# Simple Neural Networks

## 概要
- Jupyter notebook上で一望できるくらい小さなニューラルネット(すぐ肥大化しそうな予感)
- synthetic gradients試してみようと思って書きはじめた。勉強にもなるし
- ReLU, Linear, Conv2D, SoftmaxCrossEntropyをひとまず実装(バグあるかも)

## 注意
- Conv2Dは縁の処理をサボっているのでちゃんとしたフレームワークとは挙動が違うかも
- あんまり速くない
- Python2

## TODO
- Pooling, BatchNormalizationの実装
- weightのinitializeをもうちょっと丁寧に
- gpu対応(私のPCにはNvidia GPUがないのでOpenCLを使う？)
- RNNの記述法を考える
- optimizerにmomentumを導入

In [1]:
import numpy as np
import math

## Layer

In [2]:
class ReLU(object):

    def __init__(self):
        self.layer_type = 'activation'
        
    def forward(self, x):
        self.x = x
        return np.maximum(x, 0, dtype=x.dtype)
    
    def backward(self, gy):
        return gy * (self.x > 0)

class Linear(object):
    
    def __init__(self, inputs, outputs):
        self.layer_type = 'linear'
        self.W = np.random.uniform(-1/math.sqrt(inputs), 1/math.sqrt(inputs), (outputs, inputs)).astype('f')
        self.b = np.zeros((outputs), dtype=np.float32)
        
    def forward(self, x):
        self.x = x
        x = x.reshape(x.shape[0], -1)
        y = x.dot(self.W.T) + self.b
        return y
    
    def backward(self, gy):
        x = self.x.reshape(self.x.shape[0], -1)
        gx = gy.dot(self.W).reshape(self.x.shape)
        self.gW = gy.T.dot(x)
        self.gb = gy.sum(0)
        return gx.reshape(self.x.shape)
    
class Convolution2D(object):
    
    def __init__(self, in_ch, out_ch, k, stride=1, pad=1):
        self.layer_type = 'convolution'
        self.ksize = k
        self.stride = stride
        self.pad = pad
        self.W = np.random.uniform(-1/math.sqrt(k*k*in_ch), 1/math.sqrt(k*k*in_ch), (out_ch, in_ch, k, k)).astype('f')
        self.b = np.zeros((out_ch), dtype=np.float32)
        
    def forward(self, x):
        self.x = x
        b, ch, h, w = x.shape
        p = self.pad
        k = self.ksize
        s = self.stride
        
        #padding input image
        _x = np.zeros((b, ch, (h + p*2), (w + p*2)), dtype=np.float32)
        _x[:, :, p:-p, p:-p] = x
        
        #im2col
        self.col = np.zeros((b, ch, k, k, ((h + p*2 - k)//s + 1), ((w + p*2 - k)//s + 1)), dtype=np.float32)
        for i in range(0, h + p*2 - k + 1, s):
            for j in range(0, w + p*2 - k + 1, s):
                self.col[:, :, :, :, i/s, j/s] += _x[:, :, i:i+k, j:j+k]
        
        #convolution
        y = np.tensordot(self.col, self.W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
        y += self.b
        return np.rollaxis(y, 3, 1)
    
    def backward(self, gy):
        self.gW = np.tensordot(gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(self.W.dtype, copy=False)
        self.gb = gy.sum(axis=(0, 2, 3))
        gcol = np.tensordot(self.W, gy, (0, 1)).astype(self.x.dtype, copy=False)
        gcol = np.rollaxis(gcol, 3)
        
        #col2im
        b, ch, h, w = self.x.shape
        p = self.pad
        k = self.ksize
        s = self.stride
        gx = np.zeros((b, ch, (h + p*2), (w + p*2)), dtype=np.float32)
        for i in range(0, h + p*2 - k + 1, s):
            for j in range(0, w + p*2 - k + 1, s):
                 gx[:, :, i:i+k, j:j+k] += gcol[:, :, :, :, i/s, j/s]
        return gx[:, :, p:-p, p:-p]
    
def softmax(x):
    x -= x.max(axis=1, keepdims=True)
    exp_x = np.exp(x)
    return exp_x/np.sum(exp_x, axis=1).reshape(-1, 1)

def softmax_cross_entropy(x, t):
    log_y = np.log(softmax(x))
    log_p = log_y[range(len(t)), t.ravel()] #Labelに対応する値が1になる→log(y)=0．不正解Labelに対して期待される確率は0であるからそれらは無視できる．
    loss = - log_p.sum() / len(t)
    
    gx = np.exp(log_y)
    gx[range(len(t)), t.ravel()] -= 1
    gx *= loss
    
    return loss, gx

def accuracy(x, t):
    t_or_f = (np.argmax(x, axis=1)==t).astype('f')
    return np.sum(t_or_f)/len(t_or_f)

## function

In [3]:
def forward(nnet, x):
    for layer in nnet:
        x = layer.forward(x)
    return x

def backward(nnet, gy):
    for layer in nnet[::-1]:
        gy = layer.backward(gy)
        
def update(nnet):
    lr = 0.001
    for layer in nnet:
        if layer.layer_type is not 'activation':
            layer.W -= layer.gW * lr
            layer.b -= layer.gb * lr

## Training Test
- mnistで試してみる

In [4]:
%matplotlib inline
import pylab as plt

In [5]:
# chainerのutilityをお借りしてmnistを読み込む
import chainer
train, test = chainer.datasets.get_mnist()

X = np.zeros((60000, 1, 28, 28), dtype=np.float32)
Y = np.zeros(60000, dtype=np.int32)
X_test = np.zeros((10000, 1, 28, 28), dtype=np.float32)
Y_test = np.zeros(10000, dtype=np.int32)
for i in range(60000):
    X[i] += train[i][0].reshape(1, 28, 28)
    Y[i] = train[i][1]
for i in range(10000):
    X_test[i] += test[i][0].reshape(1, 28, 28)
    Y_test[i] = test[i][1]
    
#[-1, 1]に正規化
X -= 0.5; X_test -= 0.5
X *= 2; X_test *= 2

In [None]:
#モデルの準備

nnet = [Convolution2D(1, 32, 4, stride=2, pad=1),
        ReLU(),
        Convolution2D(32, 64, 3, stride=1, pad=1),
        ReLU(),
        Convolution2D(64, 128, 4, stride=2, pad=1),
        ReLU(),
        Convolution2D(128, 256, 3, stride=2, pad=1),
        ReLU(),
        Linear(256*4*4, 512),
        ReLU(),
        Linear(512, 10)]

In [None]:
epoch = 100
N = len(X)
N_test = len(X_test)
batchsize = 100
step = 100
n_imgs_trained = 0

for e in range(epoch):
    print 'epoch:', e
    sum_loss = 0.
    sum_acc = 0.
    perm = np.random.permutation(N)
    
    #train
    for i in range(0, N, batchsize):
        y = forward(nnet, X[perm[i:i+batchsize]])
        loss, gy = softmax_cross_entropy(y, Y[perm[i:i+batchsize]])
        acc = accuracy(y, Y[perm[i:i+batchsize]])
        backward(nnet, gy)
        update(nnet)
        
        n_imgs_trained += batchsize
        sum_loss += loss
        sum_acc += acc
        
        if i%(batchsize*step)==0 and i!=0:
            print 'type: train, sample: {}, loss: {}, accuracy: {}'.format(n_imgs_trained, sum_loss/step, sum_acc/step)
            sum_loss = 0.
            sum_acc = 0.
    #val
    sum_loss = 0.
    sum_acc = 0.
    for i in range(0, N_test, batchsize):
        y = forward(nnet, X_test[i:i+batchsize])
        loss, gy = softmax_cross_entropy(y, Y_test[i:i+batchsize])
        acc = accuracy(y, Y_test[i:i+batchsize])
        sum_loss += loss
        sum_acc += acc
    print 'type: val, sample: {}, loss: {}, accuracy: {}'.format(n_imgs_trained, sum_loss/(N_test/batchsize), sum_acc/(N_test/batchsize))

epoch: 0
type: train, sample: 10100, loss: 1.77315624008, accuracy: 0.4031
type: train, sample: 20100, loss: 0.390982931328, accuracy: 0.8817
type: train, sample: 30100, loss: 0.26652086134, accuracy: 0.9231
type: train, sample: 40100, loss: 0.229616665363, accuracy: 0.9303
type: train, sample: 50100, loss: 0.198822689676, accuracy: 0.9405
type: val, sample: 60000, loss: 0.156490174598, accuracy: 0.9508
epoch: 1
type: train, sample: 70100, loss: 0.160351283455, accuracy: 0.9598
type: train, sample: 80100, loss: 0.142623968077, accuracy: 0.9567
type: train, sample: 90100, loss: 0.149283674908, accuracy: 0.9553
type: train, sample: 100100, loss: 0.137186365223, accuracy: 0.9595
type: train, sample: 110100, loss: 0.126101793265, accuracy: 0.9608
type: val, sample: 120000, loss: 0.116489330018, accuracy: 0.9633
epoch: 2
type: train, sample: 130100, loss: 0.113217493129, accuracy: 0.9744
type: train, sample: 140100, loss: 0.122570774388, accuracy: 0.9629
type: train, sample: 150100, loss: 0