In [1]:
import sys
import numpy as np
import cupy as cp

sys.path.append('../')

from mandala.nodecore import Node
from mandala.nodecore import Variable
from mandala.autodiff import autodiff
from mandala.autodiff.linear import Linear
from mandala.autodiff.relu import relu
from mandala.autodiff import initializers
from mandala import cuda

In [2]:
from mandala.autodiff.under_development import basic_math_ho

In [3]:
# Call the experimental helper from `under_development.basic_math_ho`.
# Presumably this is what lets expressions such as `(y - t) ** 2 / batchsize`
# below build graph nodes — TODO confirm against the mandala source.
basic_math_ho.install_node_arithmetics()

In [4]:
# Array module used for every array created in this notebook; bound to CuPy.
xp = cp

In [5]:
def sum_forward(x):
    """Return the sum of all elements of ``x``.

    The array module is looked up from ``x`` itself via
    ``cuda.get_array_module`` and its ``sum`` is used for the reduction.
    """
    array_module = cuda.get_array_module(x)
    total = array_module.sum(x)
    return total


def sum_backward(x, gy):
    """Return the gradient of a full sum with respect to ``x``.

    The result is an array of ones shaped like ``x`` multiplied by the
    upstream gradient ``gy``.
    """
    array_module = cuda.get_array_module(x)
    ones = array_module.ones_like(x)
    return ones * gy


class SumFunction(autodiff.AutoDiff):
    """Differentiable full-sum operation built from graph ``Node`` objects."""

    def forward(self, xs):
        """Return a Node that applies ``sum_forward`` to the first input in ``xs``."""
        return Node(sum_forward, [xs[0]])

    def backward(self, xs, gy):
        """Return a 1-tuple holding the gradient Node for the first input.

        The Node applies ``sum_backward`` to the first input and the upstream
        gradient ``gy``.
        """
        grad_node = Node(sum_backward, [xs[0], gy])
        return (grad_node,)


def _sum(x):
    """Apply a fresh ``SumFunction`` to ``x`` and return whatever the call yields."""
    sum_op = SumFunction()
    return sum_op([x])

In [6]:
# Fully connected stack 5 -> 1000 (x6) -> 3, constructed in order l0..l6.
layer_shapes = [(5, 1000)] + [(1000, 1000)] * 5 + [(1000, 3)]
layer_list = [Linear(n_in, n_out) for n_in, n_out in layer_shapes]

# Individual names are kept because the forward passes below refer to them.
l0, l1, l2, l3, l4, l5, l6 = layer_list

In [7]:
# Call to_gpu() on every layer before it is used with the CuPy arrays below.
for l in layer_list:
    l.to_gpu()

In [8]:
# Ground-truth coefficients: targets below are generated as t = x W^T + b.
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

## test

In [9]:
# One random mini-batch: float32 inputs of shape (batchsize, 5) and the
# matching targets computed from the ground-truth map t = x W^T + b.
batchsize = 32
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [10]:
# Forward pass through the seven Linear layers with ReLU after each one.
# NOTE(review): relu is also applied to the output layer here, whereas the
# training loop further down uses `y = l6(h5)` without it — confirm intended.
h0 = relu(l0(x))
h1 = relu(l1(h0))
h2 = relu(l2(h1))
h3 = relu(l3(h2))
h4 = relu(l4(h3))
h5 = relu(l5(h4))
y  = relu(l6(h5))
# Squared error scaled by 1/batchsize; no sum reduction is applied in this cell.
loss = (y - t) ** 2 / batchsize

In [11]:
# Debugging aid: inspect h5's private reference counter before backward is run.
h5._reference_count

1

In [12]:
# Run backward from the loss built in the forward-check cell above.
loss.backward()

In [13]:
# Interleave prints of h0's private bookkeeping with accesses to parameter gradients.
# NOTE(review): in the recorded output the count goes 3 -> 2 and `_data` goes
# None -> populated right after the first `.grad.data` access, which suggests
# that values are computed lazily on access — confirm against mandala.
print(h0._reference_count)
print(h0._data)
l0.b.grad.data

print(h0._reference_count)
print(h0._data)
l1.W.grad.data

print(h0._reference_count)
print(h0._data)
l2.W.grad.data

print(h0._reference_count)

3
None
2
[[ 0.          0.          0.         ...,  0.12666433  0.76408011  0.        ]
 [ 0.55495471  0.          0.         ...,  0.88499862  0.36393264  0.        ]
 [ 0.54133213  0.          0.         ...,  1.10250103  0.15624939  0.        ]
 ..., 
 [ 0.03998751  0.          0.         ...,  0.32125255  0.88836938  0.        ]
 [ 0.          0.          0.         ...,  0.26192448  1.36837471  0.        ]
 [ 0.16891342  0.          0.         ...,  0.25272056  0.17231254  0.        ]]
2
[[ 0.          0.          0.         ...,  0.12666433  0.76408011  0.        ]
 [ 0.55495471  0.          0.         ...,  0.88499862  0.36393264  0.        ]
 [ 0.54133213  0.          0.         ...,  1.10250103  0.15624939  0.        ]
 ..., 
 [ 0.03998751  0.          0.         ...,  0.32125255  0.88836938  0.        ]
 [ 0.          0.          0.         ...,  0.26192448  1.36837471  0.        ]
 [ 0.16891342  0.          0.         ...,  0.25272056  0.17231254  0.        ]]
2


## 学習

In [14]:
import time

In [15]:
# Train the mandala network with a hand-written SGD step for 100 mini-batches
# and report the wall-clock time of the whole run.
s = time.time()
lr = 1e-5

for i in range(100):
    # make batch: random inputs and targets from the ground-truth map t = x W^T + b
    x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)

    # forward (no activation on the output layer)
    h0 = relu(l0(x))
    h1 = relu(l1(h0))
    h2 = relu(l2(h1))
    h3 = relu(l3(h2))
    h4 = relu(l4(h3))
    h5 = relu(l5(h4))
    y  = l6(h5)

    # loss: squared error summed over all elements, divided by the batch size.
    # It is built exactly once per iteration so that no unused graph nodes
    # are left hanging off y and t.
    loss = _sum((y - t) ** 2) / batchsize

    # reset the gradients before running backward
    for l in layer_list:
        l.W.grad = 0
        if l.b is not None:
            l.b.grad = 0.

    # backward
    loss.backward()

    # update: in-place SGD step on every layer's parameters
    for l in layer_list[::-1]:
        l.W.data -= lr * l.W.grad.data
        if l.b is not None:
            l.b.data -= lr * l.b.grad.data

    print(loss.data)

print('time:', time.time() - s)

1629.709716796875
1433.44189453125
1263.68359375
1458.7252197265625
1375.1968994140625
1187.2822265625
1168.05322265625
1099.1854248046875
918.8353881835938
913.2614135742188
817.6146240234375
850.6698608398438
683.982666015625
575.1829833984375
602.7376708984375
503.7901611328125
399.8293762207031
394.1236267089844
272.1575927734375
216.1071319580078
151.84751892089844
119.29740905761719
86.51712036132812
71.78948974609375
62.75883483886719
34.53779602050781
31.21698570251465
19.06456184387207
13.076095581054688
11.71542739868164
9.851085662841797
7.14787483215332
4.554547309875488
3.810123920440674
6.4520263671875
4.0052642822265625
3.753199815750122
4.13516902923584
3.646177291870117
2.514437198638916
2.9052107334136963
3.3988115787506104
3.251530647277832
2.6922383308410645
1.9552860260009766
3.2730531692504883
3.1700549125671387
3.40663480758667
2.7090768814086914
1.9420878887176514
3.1146209239959717
2.5617852210998535
3.732339382171631
2.2583436965942383
3.5026397705078125
1.727

## Chainer との速度比較

In [16]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [None]:
class Model(chainer.Chain):
    """Chainer counterpart of the mandala network: seven Linear layers
    (5 -> 1000 x6 -> 3) with ReLU after every layer except the last."""

    def __init__(self):
        super().__init__()
        n_in, n_hidden, n_out = 5, 1000, 3
        with self.init_scope():
            self.l0 = L.Linear(n_in, n_hidden)
            self.l1 = L.Linear(n_hidden, n_hidden)
            self.l2 = L.Linear(n_hidden, n_hidden)
            self.l3 = L.Linear(n_hidden, n_hidden)
            self.l4 = L.Linear(n_hidden, n_hidden)
            self.l5 = L.Linear(n_hidden, n_hidden)
            self.l6 = L.Linear(n_hidden, n_out)

    def __call__(self, x):
        """Run ``x`` through l0..l5 with ReLU, then through l6 without activation."""
        h = x
        for hidden_layer in (self.l0, self.l1, self.l2,
                             self.l3, self.l4, self.l5):
            h = F.relu(hidden_layer(h))
        return self.l6(h)

In [None]:
# Instantiate the Chainer model and call to_gpu() so it can consume the CuPy arrays.
model = Model()
model.to_gpu()

In [13]:
# SGD optimizer attached to the Chainer model.
# NOTE(review): lr is 1e-4 here but the mandala training loop uses lr = 1e-5 —
# confirm whether the two runs are meant to use the same learning rate.
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

NameError: name 'chainer' is not defined

In [14]:
# Chainer version of the training loop: 100 mini-batches, timed end to end.
batchsize = 32
s = time.time()

for i in range(100):
    # make batch (raw arrays, not wrapped in mandala's Variable)
    x = xp.random.random((batchsize, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b
    
    # forward
    y = model(x)

    # loss
    loss = F.mean_squared_error(y, t)

    # backward
    model.cleargrads()
    loss.backward()
    
    # update
    opt.update()

    print(loss.data)

print(time.time() - s)

NameError: name 'time' is not defined

In [21]:
# Ratio of two hand-copied timings in seconds; presumably mandala / Chainer for
# inference only (it matches the 1.9x figure quoted in the notes) — TODO confirm.
0.3876018524169922 / 0.19963645935058594

1.9415384027439404

In [22]:
# Ratio of two hand-copied timings in seconds; presumably mandala / Chainer for
# the full training loop (it matches the 2.7x figure quoted in the notes) — TODO confirm.
1.584319829940796 / 0.5818600654602051

2.7228536962537975

Chainer の 2.7 倍の計算時間がかかっている……

推論に限定しても 1.9 倍。

In [23]:
%%timeit
# Micro-benchmark: cost of a single array-module lookup on W.
cuda.get_array_module(W)

394 ns ± 0.249 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [24]:
# Re-create the ground-truth coefficients (same values as defined earlier in the notebook).
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [25]:
# Build one mini-batch wrapped in mandala Variables ahead of the timing cell.
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [26]:
%%timeit
# Time batch creation plus a forward pass through the seven mandala layers.
# NOTE(review): no relu is applied between layers here, while the Chainer
# Model.__call__ timed later applies F.relu after l0..l5, so the two timings
# do not measure the same amount of work.
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
# Fetch the result with .get() so the output is actually materialised inside the timed region.
y.data.get()

1.37 ms ± 2.01 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [27]:
# Build one mini-batch as raw arrays (no Variable wrapper) for the Chainer timing cell.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

In [28]:
%%timeit
# Time batch creation plus a forward pass through the Chainer model.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

y = model(x)
# Fetch the result with .get() so the output is actually materialised inside the timed region.
y.data.get()

2.05 ms ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Chainer よりほんのちょっと速くなった（type check などをやっていないせいかな）。ただし、mandala 側の計測では各層の間に relu を挟んでいないので、厳密には同条件の比較ではない。

In [29]:
# Exploratory: look up the cuDNN tanh activation-mode constant exposed by CuPy.
cp.cuda.cudnn.CUDNN_ACTIVATION_TANH

2

In [30]:
# Exploratory: show which cuDNN binding module CuPy has loaded.
cp.cuda.cudnn

<module 'cupy.cuda.cudnn' from '/home/ubuntu/anaconda3/lib/python3.6/site-packages/cupy/cuda/cudnn.cpython-36m-x86_64-linux-gnu.so'>