In [1]:
import sys
import numpy as np
import cupy as cp

sys.path.append('../')

from mandala.nodecore import Node
from mandala.nodecore import Variable
from mandala.autodiff import autodiff
from mandala.autodiff.linear import Linear
from mandala.autodiff.relu import relu
from mandala.autodiff import initializers
from mandala import cuda

In [2]:
from mandala.autodiff.under_development import basic_math_ho

In [3]:
# Presumably enables arithmetic operators (-, **, /) on Node objects, which the
# loss expressions later in this notebook rely on — TODO confirm against mandala.
basic_math_ho.install_node_arithmetics()

In [4]:
# Array-module alias used by the rest of the notebook; bound to CuPy here.
xp = cp

In [5]:
def sum_forward(x):
    """Return the sum of every element of ``x``.

    The reduction is delegated to whichever array module
    ``cuda.get_array_module`` reports for ``x``.
    """
    return cuda.get_array_module(x).sum(x)


def sum_backward(x, gy):
    """Gradient of the full sum with respect to ``x``.

    Builds an all-ones array shaped like ``x`` (using the array module that
    ``cuda.get_array_module`` reports for ``x``) and multiplies it by the
    upstream gradient ``gy``.
    """
    array_module = cuda.get_array_module(x)
    ones = array_module.ones_like(x)
    return ones * gy


class SumFunction(autodiff.AutoDiff):
    """Differentiable sum assembled from ``sum_forward`` / ``sum_backward`` nodes."""

    def forward(self, xs):
        """Return a ``Node`` applying ``sum_forward`` to the first entry of ``xs``."""
        return Node(sum_forward, [xs[0]])

    def backward(self, xs, gy):
        """Return a 1-tuple with the ``sum_backward`` node for the first entry of ``xs``."""
        grad_node = Node(sum_backward, [xs[0], gy])
        return (grad_node,)


def _sum(x):
    """Apply a fresh ``SumFunction`` to ``x`` and return its result."""
    sum_op = SumFunction()
    return sum_op([x])

In [70]:
# Fully connected stack: 5 -> 1000 (six layers with 1000 outputs) -> 3.
layer_widths = [5, 1000, 1000, 1000, 1000, 1000, 1000, 3]

# Layers are created in order l0 .. l6, one per consecutive pair of widths.
layer_list = [
    Linear(n_in, n_out)
    for n_in, n_out in zip(layer_widths[:-1], layer_widths[1:])
]
l0, l1, l2, l3, l4, l5, l6 = layer_list

In [71]:
# Call to_gpu() on every layer in the stack.
for l in layer_list:
    l.to_gpu()

In [72]:
# True (ground-truth) coefficients: targets are generated as x @ W.T + b.
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

## test

In [73]:
# One random batch of shape (batchsize, 5) in float32, plus its targets
# computed from the true coefficients W and b.
batchsize = 32
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [74]:
# Build the forward graph once so the reference counts can be inspected below.
h0 = relu(l0(x))
h1 = relu(l1(h0))
h2 = relu(l2(h1))
h3 = relu(l3(h2))
h4 = relu(l4(h3))
h5 = relu(l5(h4))
# NOTE(review): relu is applied to the output layer here, whereas the training
# loop further down uses y = l6(h5) without relu — confirm this is intentional.
y  = relu(l6(h5))
# NOTE(review): this loss is not reduced with _sum, unlike the training loop.
loss = (y - t) ** 2 / batchsize

In [75]:
# Inspect h5's reference count after building the graph (before backward is called).
h5._reference_count

1

In [76]:
# Run backward on the loss built in the previous forward cell.
loss.backward()

In [77]:
# Probe h0's reference count and stored data between accesses to gradient data.
# In the recorded output the first pair prints 3 and None; after l0.b.grad.data
# is accessed the count prints 2 and h0._data holds an array.
# NOTE(review): presumably accessing .data triggers lazy evaluation — confirm
# against mandala. Statement order matters here; do not reorder.
print(h0._reference_count)
print(h0._data)
l0.b.grad.data

print(h0._reference_count)
print(h0._data)
l1.W.grad.data

print(h0._reference_count)
print(h0._data)
l2.W.grad.data

# Final reading after all three gradient accesses.
print(h0._reference_count)

3
None
2
[[ 0.          0.57037848  0.97586858 ...,  0.          0.          0.        ]
 [ 0.          0.39933702  1.69326723 ...,  0.          0.          0.57848614]
 [ 0.          0.07765651  0.74141574 ...,  0.          0.          0.00233528]
 ..., 
 [ 0.          0.13806045  0.52868599 ...,  0.          0.          0.        ]
 [ 0.          0.52704078  1.40175164 ...,  0.          0.          0.28490806]
 [ 0.          0.88213855  1.08820438 ...,  0.          0.          0.        ]]
2
[[ 0.          0.57037848  0.97586858 ...,  0.          0.          0.        ]
 [ 0.          0.39933702  1.69326723 ...,  0.          0.          0.57848614]
 [ 0.          0.07765651  0.74141574 ...,  0.          0.          0.00233528]
 ..., 
 [ 0.          0.13806045  0.52868599 ...,  0.          0.          0.        ]
 [ 0.          0.52704078  1.40175164 ...,  0.          0.          0.28490806]
 [ 0.          0.88213855  1.08820438 ...,  0.          0.          0.        ]]
2


## 学習

In [78]:
import time

In [80]:
# Train the mandala network with plain gradient-descent updates and time the
# whole loop (the per-iteration print of the loss is included in the timing).
s = time.time()
lr = 1e-5

for i in range(100):
    # make batch: random inputs and targets from the true coefficients W, b
    x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)

    # forward: ReLU after every hidden layer, linear output layer
    h0 = relu(l0(x))
    h1 = relu(l1(h0))
    h2 = relu(l2(h1))
    h3 = relu(l3(h2))
    h4 = relu(l4(h3))
    h5 = relu(l5(h4))
    y  = l6(h5)

    # loss: sum of squared errors divided by the batch size.
    # (An earlier unreduced `loss = (y - t) ** 2 / batchsize` was overwritten
    # right away and only built unused graph nodes, so it has been removed.)
    loss = _sum((y - t) ** 2) / batchsize

    # reset the stored gradients before running backward
    for l in layer_list:
        l.W.grad = 0
        if l.b is not None:
            l.b.grad = 0.

    # backward
    loss.backward()

    # update: parameter -= lr * gradient, visiting layers last to first
    for l in layer_list[::-1]:
        l.W.data -= lr * l.W.grad.data
        if l.b is not None:
            l.b.data -= lr * l.b.grad.data

    print(loss.data)

print('time:', time.time() - s)

894.57568359375
842.2807006835938
776.6619262695312
647.2041625976562
663.3584594726562
587.43310546875
431.1292724609375
445.94268798828125
321.49884033203125
258.0339050292969
182.3206329345703
182.62779235839844
111.57316589355469
105.64765167236328
80.47604370117188
58.344139099121094
39.61833953857422
31.013195037841797
25.392305374145508
15.32214069366455
12.973474502563477
12.737436294555664
7.291163921356201
6.148909568786621
4.056299686431885
3.795440435409546
4.631786823272705
3.210674285888672
3.762791156768799
4.308149337768555
3.6449215412139893
3.200988531112671
2.987309455871582
2.0498623847961426
3.510840892791748
2.583799362182617
2.7985048294067383
3.279245138168335
2.3766136169433594
3.6609625816345215
2.426819324493408
3.5281293392181396
2.870962142944336
2.987401008605957
2.8914365768432617
2.449875593185425
3.9658966064453125
2.757169485092163
2.931574821472168
3.5221142768859863
2.5374226570129395
2.3056631088256836
3.3382019996643066
3.0123562812805176
1.9876401

## Chainer との速度比較

In [16]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [17]:
class Model(chainer.Chain):
    """Chainer MLP 5 -> 1000 (x6) -> 3 with ReLU after every layer but the last."""

    def __init__(self):
        """Register the seven linear links as ``l0`` .. ``l6`` inside ``init_scope``."""
        super().__init__()
        widths = (5, 1000, 1000, 1000, 1000, 1000, 1000, 3)
        with self.init_scope():
            for index, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:])):
                setattr(self, 'l%d' % index, L.Linear(n_in, n_out))

    def __call__(self, x):
        """Run ``x`` through ``l0`` .. ``l5`` with ReLU, then through ``l6`` without it."""
        h = x
        for hidden in (self.l0, self.l1, self.l2, self.l3, self.l4, self.l5):
            h = F.relu(hidden(h))
        return self.l6(h)

In [18]:
# Instantiate the Chainer model and call to_gpu() on it.
model = Model()
model.to_gpu()

<__main__.Model at 0x7fd8dc4fb668>

In [19]:
# SGD optimizer attached to the Chainer model.
# NOTE(review): lr=1e-4 here, while the mandala training loop uses lr = 1e-5 and
# a differently reduced loss — the two printed loss curves are not directly comparable.
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

<chainer.optimizers.sgd.SGD at 0x7fd8dc4ab9e8>

In [20]:
# Train the Chainer model for the same number of iterations (100) and time the loop.
batchsize = 32
s = time.time()

for i in range(100):
    # make batch (raw arrays here, not wrapped in Variable; rebinds globals x and t)
    x = xp.random.random((batchsize, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b
    
    # forward
    y = model(x)

    # loss
    loss = F.mean_squared_error(y, t)

    # backward: clear the old gradients first, then backpropagate
    model.cleargrads()
    loss.backward()
    
    # update: one step of the optimizer configured as `opt`
    opt.update()

    print(loss.data)

# Elapsed wall-clock time for the whole loop, per-iteration printing included.
print(time.time() - s)

484.5459899902344
440.8442077636719
503.1265563964844
550.2545776367188
441.613037109375
501.4929504394531
413.2272644042969
569.9443969726562
502.867431640625
501.7845764160156
498.6287536621094
449.3643798828125
451.263427734375
489.0592956542969
510.0538635253906
488.62548828125
499.8179626464844
368.2991027832031
382.900634765625
465.1600036621094
454.8226013183594
476.9388122558594
479.9302062988281
421.3287048339844
409.83203125
459.923828125
486.7774963378906
407.2345275878906
395.7827453613281
425.9024353027344
422.9732666015625
407.4546203613281
347.1906433105469
332.0914001464844
310.6220703125
314.3719787597656
269.88616943359375
218.4090576171875
242.2445526123047
167.9180908203125
155.75143432617188
125.58639526367188
93.26728057861328
48.98112869262695
29.781511306762695
21.675058364868164
8.618934631347656
4.790206432342529
2.231900930404663
1.9461106061935425
1.2503701448440552
1.1134284734725952
1.2792353630065918
1.2976678609848022
0.8787908554077148
1.294006705284118

In [21]:
# Ratio of two hand-copied timings; the note further down cites about 1.9x for
# inference only — TODO confirm which runs these two numbers came from.
0.3876018524169922 / 0.19963645935058594

1.9415384027439404

In [22]:
# Ratio of two hand-copied timings; the note below cites about 2.7x the
# computation time of Chainer — TODO confirm which runs these numbers came from.
1.584319829940796 / 0.5818600654602051

2.7228536962537975

Chainer の 2.7 倍の計算時間がかかっている……

推論に限定しても 1.9 倍。

In [23]:
%%timeit
# Measure the per-call cost of the array-module lookup on W.
cuda.get_array_module(W)

389 ns ± 0.379 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [24]:
# Re-create the true coefficients (same values as the earlier definition).
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [25]:
# Rebind x and t as mandala Variables (the Chainer loop above left them as raw arrays).
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [26]:
%%timeit
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
y.data.get()

1.35 ms ± 1.72 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [27]:
# Rebind x and t as raw arrays (no Variable wrapper) for the Chainer benchmark.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

In [28]:
%%timeit
# Forward-only benchmark of the Chainer model, batch creation included.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

y = model(x)
# Fetch the output with .get() inside the timed region.
y.data.get()

2.02 ms ± 10.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Chainer よりほんのちょっと速くなった（type check などやっていないせいかな。）

In [29]:
# Look up the tanh activation constant exposed by cupy.cuda.cudnn (the recorded output is 2).
cp.cuda.cudnn.CUDNN_ACTIVATION_TANH

2

In [30]:
# Display the cupy.cuda.cudnn module object to see where it is loaded from.
cp.cuda.cudnn

<module 'cupy.cuda.cudnn' from '/home/ubuntu/anaconda3/lib/python3.6/site-packages/cupy/cuda/cudnn.cpython-36m-x86_64-linux-gnu.so'>