In [1]:
import sys
import numpy as np
import cupy as cp

sys.path.append('../')

from mandala.nodecore import Node
from mandala.nodecore import Variable
from mandala.autodiff import autodiff
from mandala.autodiff.linear import Linear
from mandala.autodiff.relu import relu
from mandala.autodiff import initializers
from mandala import cuda

In [2]:
# Array module used to build every array in this notebook (NumPy here).
xp = np

In [3]:
def sum_forward(x):
    """Return the sum of all elements of ``x``.

    The array module is looked up through ``cuda.get_array_module`` and its
    ``sum`` is applied to ``x``.
    """
    return cuda.get_array_module(x).sum(x)


def sum_backward(x, gy):
    """Return the gradient of the sum with respect to ``x``.

    The upstream gradient ``gy`` is multiplied into an array of ones that has
    the same shape and dtype as ``x``.
    """
    array_module = cuda.get_array_module(x)
    ones = array_module.ones_like(x)
    return ones * gy


class SumFunction(autodiff.AutoDiff):
    """Sum over all elements, expressed as ``Node`` objects for forward and backward."""

    def forward(self, xs):
        """Return a ``Node`` that applies ``sum_forward`` to the first input."""
        return Node(sum_forward, [xs[0]])

    def backward(self, xs, gy):
        """Return a 1-tuple holding the ``Node`` that applies ``sum_backward``."""
        grad_node = Node(sum_backward, [xs[0], gy])
        return (grad_node,)


def _sum(x):
    """Apply a freshly created ``SumFunction`` to ``x`` (passed as a one-element list)."""
    reducer = SumFunction()
    return reducer([x])

In [4]:
# Seven Linear layers chained 5 -> 1000 -> ... -> 1000 -> 3, created in order.
layer_sizes = [5, 1000, 1000, 1000, 1000, 1000, 1000, 3]
layer_list = [
    Linear(n_in, n_out)
    for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
]

# Individual names are kept because later cells refer to l0 ... l6 directly.
l0, l1, l2, l3, l4, l5, l6 = layer_list

In [8]:
# Call to_cpu() on every layer so the layers match the NumPy arrays built via `xp`.
for l in layer_list:
    l.to_cpu()

In [9]:
# True coefficients: the targets are generated from them as x @ W.T + b.
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

## test

In [11]:
batchsize = 32
# Random float32 inputs of shape (batchsize, 5).
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
# Targets computed from the true coefficients W and b.
t = Variable(xp.matmul(x.data, W.T) + b)

In [12]:
# Forward pass for the inspection cells that follow; relu is applied after
# every layer here, including the output layer l6.
h0 = relu(l0(x))
h1 = relu(l1(h0))
h2 = relu(l2(h1))
h3 = relu(l3(h2))
h4 = relu(l4(h3))
h5 = relu(l5(h4))
y  = relu(l6(h5))
# Element-wise squared error divided by batchsize; no sum is taken in this cell.
loss = (y - t) ** 2 / batchsize

In [13]:
h5._reference_count

1

In [14]:
loss.backward()

In [15]:
# Inspect h0's reference count and cached data around accesses to gradients.
# The bare `.grad.data` expressions are intentional: in the recorded output,
# h0._data is None before the first access and an array afterwards, and the
# reference count drops from 3 to 2.
# NOTE(review): presumably reading `.data` is what triggers the evaluation --
# confirm against mandala.nodecore.
print(h0._reference_count)
print(h0._data)
l0.b.grad.data

print(h0._reference_count)
print(h0._data)
l1.W.grad.data

print(h0._reference_count)
print(h0._data)
l2.W.grad.data

print(h0._reference_count)

3
None
2
[[ 0.          0.          0.         ...,  1.30759203  0.          0.01450732]
 [ 0.4096289   0.          0.92056292 ...,  1.122908    0.          0.14405206]
 [ 0.41204247  0.          1.06728971 ...,  1.48688972  0.          0.        ]
 ..., 
 [ 0.45349318  0.          1.29238796 ...,  0.95312977  0.          0.37512371]
 [ 0.0862558   0.          0.26748317 ...,  0.64024949  0.          0.        ]
 [ 0.51119685  0.          1.18466616 ...,  1.47835171  0.          0.        ]]
2
[[ 0.          0.          0.         ...,  1.30759203  0.          0.01450732]
 [ 0.4096289   0.          0.92056292 ...,  1.122908    0.          0.14405206]
 [ 0.41204247  0.          1.06728971 ...,  1.48688972  0.          0.        ]
 ..., 
 [ 0.45349318  0.          1.29238796 ...,  0.95312977  0.          0.37512371]
 [ 0.0862558   0.          0.26748317 ...,  0.64024949  0.          0.        ]
 [ 0.51119685  0.          1.18466616 ...,  1.47835171  0.          0.        ]]
2


## 学習

In [16]:
import time

In [17]:
# Train the mandala network with a hand-written SGD step and time 100 iterations.
s = time.time()
lr = 1e-5

for i in range(100):
    # make batch: random inputs and targets generated from the true W and b
    x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)

    # forward: relu after every hidden layer, linear output layer
    h0 = relu(l0(x))
    h1 = relu(l1(h0))
    h2 = relu(l2(h1))
    h3 = relu(l3(h2))
    h4 = relu(l4(h3))
    h5 = relu(l5(h4))
    y  = l6(h5)

    # loss: sum of squared errors divided by batchsize.
    # (An unreduced loss that was assigned here and immediately overwritten
    # has been removed; it only built an unused graph on top of y and t.)
    loss = _sum((y - t) ** 2) / batchsize

    # reset the gradients of every parameter before backward
    for l in layer_list:
        l.W.grad = 0
        if l.b is not None:
            l.b.grad = 0.

    # backward
    loss.backward()

    # update: plain SGD step applied in place to each parameter
    for l in layer_list[::-1]:
        l.W.data -= lr * l.W.grad.data
        if l.b is not None:
            l.b.data -= lr * l.b.grad.data

    print(loss.data)

print('time:', time.time() - s)

1425.75561523
1666.04833984
1266.38574219
1243.77563477
1437.87182617
1087.68701172
1104.49267578
900.745117188
896.819641113
896.51184082
921.512695312
873.22668457
730.631835938
678.827941895
529.70123291
562.189331055
441.067993164
377.487304688
293.370727539
204.905990601
181.571426392
128.131072998
136.892333984
95.0800476074
60.8776779175
44.7653961182
28.8390007019
22.6556854248
18.1653633118
13.4435253143
12.5200576782
8.54035186768
6.77353096008
6.8616104126
5.61371660233
4.25124788284
3.65509462357
4.26582622528
3.47016763687
5.17222213745
4.16736364365
3.64300322533
3.95307016373
2.28883051872
3.42058801651
3.17661952972
3.28876161575
4.97187852859
2.69698691368
2.60293388367
2.35467576981
2.32014131546
2.40879797935
1.89380097389
2.76378822327
3.80360722542
2.60347604752
3.97976112366
2.54765462875
2.8703122139
2.38314795494
2.17342758179
3.3876080513
2.88738489151
2.76563811302
2.06089806557
3.437748909
2.6959528923
4.50141334534
4.42389678955
2.78028011322
2.53992128372
1

## Chainer との速度比較

In [18]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [19]:
class Model(chainer.Chain):
    """Chainer version of the same network: links l0 ... l6, 5 -> 1000 -> ... -> 3."""

    def __init__(self):
        super().__init__()
        sizes = (5, 1000, 1000, 1000, 1000, 1000, 1000, 3)
        with self.init_scope():
            # Registers self.l0 ... self.l6 in order, one Linear link per size pair.
            for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
                setattr(self, 'l{}'.format(idx), L.Linear(n_in, n_out))

    def __call__(self, x):
        """Apply relu after l0 ... l5 and return the raw output of l6."""
        h = x
        for hidden in (self.l0, self.l1, self.l2, self.l3, self.l4, self.l5):
            h = F.relu(hidden(h))
        return self.l6(h)

In [20]:
model = Model()
model.to_cpu()

<__main__.Model at 0x7f2a26ab2d30>

In [21]:
# Plain SGD attached to the Chainer model.
# NOTE(review): lr here is 1e-4, while the mandala training loop uses lr = 1e-5.
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

<chainer.optimizers.sgd.SGD at 0x7f2a26a65320>

In [22]:
# Train the Chainer model for 100 iterations and print the elapsed time.
# NOTE(review): the loss here is F.mean_squared_error, whereas the mandala loop
# uses sum((y - t) ** 2) / batchsize, so the printed loss values of the two
# loops are not on the same scale.
batchsize = 32
s = time.time()

for i in range(100):
    # make batch
    x = xp.random.random((batchsize, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b
    
    # forward
    y = model(x)

    # loss
    loss = F.mean_squared_error(y, t)

    # backward (gradients are cleared first)
    model.cleargrads()
    loss.backward()
    
    # update
    opt.update()

    print(loss.data)

print(time.time() - s)

427.190185546875
495.3069763183594
452.23974609375
432.2911376953125
462.729736328125
486.7976379394531
534.0407104492188
521.8364868164062
453.4081115722656
494.2232971191406
505.9490661621094
493.1531677246094
532.3102416992188
401.0016784667969
504.8416442871094
571.6755981445312
417.8907165527344
465.0994567871094
471.7832946777344
488.6719665527344
459.3736572265625
447.47998046875
439.361572265625
425.8631896972656
505.7315979003906
450.63671875
430.0314636230469
521.3046875
427.0911865234375
402.3125
437.7055358886719
434.9010009765625
360.189697265625
411.5248107910156
334.9768981933594
327.3570556640625
337.0296325683594
320.7826232910156
316.4875793457031
251.3565673828125
225.0121307373047
186.0250244140625
141.16319274902344
128.47906494140625
106.15259552001953
49.983394622802734
36.87723159790039
26.114599227905273
15.827926635742188
6.667320251464844
2.811443328857422
1.819498896598816
1.6840580701828003
1.0880179405212402
0.9277288317680359
1.2546000480651855
0.76315611

In [23]:
# Ratio of two hand-copied timings; the result (~1.9) is the inference-only
# slowdown quoted in the notes below.
0.3876018524169922 / 0.19963645935058594

1.9415384027439404

In [24]:
# Ratio of two hand-copied timings; the result (~2.7) is the overall slowdown
# relative to Chainer quoted in the notes below.
1.584319829940796 / 0.5818600654602051

2.7228536962537975

Chainer の 2.7 倍の計算時間がかかっている……

推論に限定しても 1.9 倍。

In [25]:
%%timeit
cuda.get_array_module(W)

432 ns ± 1.57 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [22]:
# Re-create the true coefficients (same values as the earlier definition of W and b).
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [26]:
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [27]:
%%timeit
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
y.data.get()

1.34 ms ± 2.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [28]:
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

In [29]:
%%timeit
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

y = model(x)
y.data.get()

1.43 ms ± 2.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Chainer よりほんのちょっと速くなった（type check などをやっていないせいかな）。

In [30]:
cp.cuda.cudnn.CUDNN_ACTIVATION_TANH

2

In [31]:
cp.cuda.cudnn

<module 'cupy.cuda.cudnn' from '/home/ubuntu/anaconda3/lib/python3.6/site-packages/cupy/cuda/cudnn.cpython-36m-x86_64-linux-gnu.so'>