In [1]:
import sys
import numpy as np
import cupy as cp

sys.path.append('../')

from mandala.nodecore import Node
from mandala.nodecore import Variable
from mandala.autodiff import autodiff
from mandala.autodiff.linear import Linear
from mandala.autodiff.relu import relu
from mandala.autodiff import initializers
from mandala import cuda

In [2]:
from mandala.autodiff.under_development import basic_math_ho

In [3]:
# Presumably patches arithmetic operators onto graph values so that
# expressions such as `(y - t) ** 2 / batchsize` below work -- TODO confirm
# against mandala.autodiff.under_development.basic_math_ho.
basic_math_ho.install_node_arithmetics()

In [4]:
# Array module used for all data created in this notebook; bound to CuPy,
# so inputs and targets are created as GPU arrays.
xp = cp

In [5]:
def sum_forward(x):
    """Return the sum of all elements of ``x``.

    The array module (NumPy or CuPy) is looked up from ``x`` itself via
    ``cuda.get_array_module`` so the reduction runs on the matching backend.
    """
    array_module = cuda.get_array_module(x)
    total = array_module.sum(x)
    return total


def sum_backward(x, gy):
    """Return the gradient of ``sum(x)`` with respect to ``x``.

    The upstream gradient ``gy`` is spread over an array of ones shaped like
    ``x``, using the array module that ``x`` belongs to.
    """
    array_module = cuda.get_array_module(x)
    ones = array_module.ones_like(x)
    return ones * gy


class SumFunction(autodiff.AutoDiff):
    """AutoDiff function that sums all elements of its single input."""

    def forward(self, xs):
        """Build the Node applying ``sum_forward`` to the first input."""
        return Node(sum_forward, [xs[0]])

    def backward(self, xs, gy):
        """Build the Node applying ``sum_backward`` to the first input and ``gy``.

        Returns a one-element tuple holding the gradient node for that input.
        """
        grad_node = Node(sum_backward, [xs[0], gy])
        return (grad_node,)


def _sum(x):
    """Apply a fresh ``SumFunction`` to ``x`` and return its result."""
    sum_function = SumFunction()
    return sum_function([x])

In [6]:
# Seven Linear layers: size 5 in, five 1000-wide hidden layers, size 3 out
# (argument order is presumably (in_size, out_size) -- it matches the
# (batchsize, 5) inputs and the (3, 5) true coefficient matrix used below).
l0 = Linear(   5, 1000)
l1 = Linear(1000, 1000)
l2 = Linear(1000, 1000)
l3 = Linear(1000, 1000)
l4 = Linear(1000, 1000)
l5 = Linear(1000, 1000)
l6 = Linear(1000,    3)

# All layers in forward order; used for device transfer, gradient reset
# and the parameter update.
layer_list = [l0, l1, l2, l3, l4, l5, l6]

In [7]:
# Call to_gpu() on every layer (presumably moves the parameters to the GPU
# so they live alongside the CuPy input arrays -- confirm in Linear).
for l in layer_list:
    l.to_gpu()

In [8]:
# True coefficients of the linear map t = x @ W.T + b that generates the
# regression targets: W has shape (3, 5), b has shape (3,).
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

## test

In [9]:
# One random float32 batch of shape (batchsize, 5) and its targets computed
# from the true coefficients, both wrapped as Variables.
batchsize = 32
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [10]:
# Forward pass used for the reference-count experiment below.
# NOTE(review): relu is applied to the output layer here, whereas the
# training loop later in the notebook uses `y = l6(h5)` without it.
h0 = relu(l0(x))
h1 = relu(l1(h0))
h2 = relu(l2(h1))
h3 = relu(l3(h2))
h4 = relu(l4(h3))
h5 = relu(l5(h4))
y  = relu(l6(h5))
# Element-wise squared error scaled by 1 / batchsize; it is NOT reduced to a
# scalar here (the training loop wraps it in _sum).
loss = (y - t) ** 2 / batchsize

In [11]:
# Inspect h5's private reference counter before the backward pass is run.
h5._reference_count

1

In [12]:
# Run backward from the (non-scalar) test loss so that the layer parameters
# expose the `.grad` values inspected in the next cell.
loss.backward()

In [13]:
# Watch h0's private reference counter and cached data while gradients are
# accessed one by one. The bare `...grad.data` expressions are evaluated only
# for their side effect (mid-cell expressions are not displayed).
# In the recorded output the counter goes 3 -> 2 and `_data` goes from None to
# a populated array after the first access, which suggests gradients are
# evaluated lazily on access -- TODO confirm in mandala.nodecore.
print(h0._reference_count)
print(h0._data)
l0.b.grad.data

print(h0._reference_count)
print(h0._data)
l1.W.grad.data

print(h0._reference_count)
print(h0._data)
l2.W.grad.data

print(h0._reference_count)

3
None
2
[[ 0.59670782  1.09020698  0.30631328 ...,  0.          0.          0.        ]
 [ 0.6659686   1.36422372  1.55435622 ...,  0.          0.          0.        ]
 [ 0.45300755  0.99499518  0.47963625 ...,  0.          0.          0.        ]
 ..., 
 [ 0.26698124  1.20601308  0.58078444 ...,  0.          0.          0.        ]
 [ 0.05644015  1.67113698  0.9623642  ...,  0.          0.          0.        ]
 [ 0.69267452  0.52235496  0.69640452 ...,  0.          0.          0.        ]]
2
[[ 0.59670782  1.09020698  0.30631328 ...,  0.          0.          0.        ]
 [ 0.6659686   1.36422372  1.55435622 ...,  0.          0.          0.        ]
 [ 0.45300755  0.99499518  0.47963625 ...,  0.          0.          0.        ]
 ..., 
 [ 0.26698124  1.20601308  0.58078444 ...,  0.          0.          0.        ]
 [ 0.05644015  1.67113698  0.9623642  ...,  0.          0.          0.        ]
 [ 0.69267452  0.52235496  0.69640452 ...,  0.          0.          0.        ]]
2


## 学習

In [14]:
import time

In [15]:
# Train the mandala network with plain SGD on freshly sampled batches and
# report the wall-clock time of the whole loop (printing included).
s = time.time()
lr = 1e-5

for i in range(100):
    # make batch: random inputs and targets from the true linear map
    x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)

    # forward (no activation on the output layer)
    h0 = relu(l0(x))
    h1 = relu(l1(h0))
    h2 = relu(l2(h1))
    h3 = relu(l3(h2))
    h4 = relu(l4(h3))
    h5 = relu(l5(h4))
    y  = l6(h5)

    # loss: summed squared error divided by the batch size.
    # An element-wise loss graph used to be built just before this line and
    # was overwritten immediately; it was dead work and has been removed.
    loss = _sum((y - t) ** 2) / batchsize

    # reset the stored gradients before the backward pass
    for l in layer_list:
        l.W.grad = 0
        if l.b is not None:
            l.b.grad = 0.

    # backward
    loss.backward()

    # update: vanilla SGD step on every parameter
    for l in layer_list[::-1]:
        l.W.data -= lr * l.W.grad.data
        if l.b is not None:
            l.b.data -= lr * l.b.grad.data

    print(loss.data)

print('time:', time.time() - s)

1396.805419921875
1305.3095703125
1256.254150390625
1312.8984375
1181.73095703125
1146.4600830078125
1029.11865234375
975.6500244140625
1043.843994140625
1033.31982421875
759.9957275390625
674.3994140625
674.7313232421875
669.4490966796875
621.3905029296875
467.6435546875
437.77301025390625
334.9789733886719
323.18743896484375
268.540283203125
219.34317016601562
172.913818359375
127.12586975097656
77.21531677246094
57.07967758178711
54.84688186645508
32.645809173583984
38.06387710571289
21.330846786499023
13.694775581359863
9.410962104797363
11.974480628967285
7.110748291015625
7.429330825805664
5.900032043457031
4.2625885009765625
5.156417369842529
2.918550491333008
3.675497531890869
2.483914613723755
3.2017428874969482
2.0841445922851562
1.7912100553512573
3.3450913429260254
2.7417564392089844
3.0529346466064453
3.971822738647461
2.085923671722412
2.0216760635375977
2.335366725921631
2.7088687419891357
2.5782241821289062
2.4556870460510254
2.569121837615967
1.1891124248504639
2.41433

## Chainer との速度比較

In [16]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [17]:
class Model(chainer.Chain):
    """Chainer counterpart of the mandala network used for the speed comparison.

    Seven fully connected layers (5 -> 1000 x 6 -> 3) with ReLU after every
    layer except the last.
    """

    def __init__(self):
        super().__init__()
        with self.init_scope():
            self.l0 = L.Linear(5, 1000)
            self.l1 = L.Linear(1000, 1000)
            self.l2 = L.Linear(1000, 1000)
            self.l3 = L.Linear(1000, 1000)
            self.l4 = L.Linear(1000, 1000)
            self.l5 = L.Linear(1000, 1000)
            self.l6 = L.Linear(1000, 3)

    def __call__(self, x):
        """Run the forward pass and return the un-activated output of ``l6``."""
        hidden_layers = (self.l0, self.l1, self.l2, self.l3, self.l4, self.l5)
        h = x
        for layer in hidden_layers:
            h = F.relu(layer(h))
        return self.l6(h)

In [18]:
# Instantiate the Chainer model and move it to the GPU. to_gpu() is the last
# expression, so the notebook displays its return value (the model repr).
model = Model()
model.to_gpu()

<__main__.Model at 0x7f852ac013c8>

In [19]:
# Plain SGD optimizer for the Chainer model.
# NOTE(review): lr is 1e-4 here but 1e-5 in the mandala training loop above,
# so the two training runs do not use the same hyper-parameters.
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

<chainer.optimizers.sgd.SGD at 0x7f852ac14cf8>

In [20]:
# Chainer version of the training loop, timed the same way as the mandala one
# (wall-clock over 100 iterations, printing included).
batchsize = 32
s = time.time()

for i in range(100):
    # make batch (raw CuPy arrays; not wrapped in mandala Variables)
    x = xp.random.random((batchsize, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b
    
    # forward
    y = model(x)

    # loss
    # NOTE(review): mean_squared_error is normalised differently from the
    # mandala loop's `_sum(...) / batchsize`, so the printed loss values of
    # the two loops are not on the same scale.
    loss = F.mean_squared_error(y, t)

    # backward
    model.cleargrads()
    loss.backward()
    
    # update
    opt.update()

    print(loss.data)

print(time.time() - s)

423.5740661621094
531.382080078125
443.461669921875
420.7068176269531
506.3071594238281
423.91015625
510.5898742675781
517.8383178710938
551.9871826171875
456.3919982910156
474.23291015625
436.4140319824219
444.9059753417969
567.0789184570312
448.3560485839844
467.77392578125
550.9495239257812
545.7780151367188
497.3526916503906
474.7747497558594
445.9039001464844
414.466796875
457.751953125
442.871337890625
384.0526428222656
282.5722961425781
286.8321838378906
389.83642578125
381.3673400878906
302.9704895019531
308.5923156738281
299.9503479003906
221.6492462158203
242.1636962890625
204.03977966308594
176.8579559326172
109.96512603759766
80.73186492919922
46.962066650390625
21.91271209716797
9.370952606201172
5.117706298828125
2.2746541500091553
1.0506415367126465
0.46438512206077576
0.6512341499328613
0.6124750971794128
0.4400928318500519
0.3481735289096832
0.444492906332016
0.3427414894104004
0.44413816928863525
0.4404982030391693
0.5565033555030823
0.4639120399951935
0.4644575119018

In [21]:
# Ratio of two hard-coded timings. Going by the note below ("1.9x even when
# limited to inference"), this is presumably mandala / Chainer forward-only
# time; the literals are not derived from any variable in this notebook.
0.3876018524169922 / 0.19963645935058594

1.9415384027439404

In [22]:
# Ratio of two hard-coded timings. Going by the note below ("2.7x the
# computation time of Chainer"), this is presumably mandala / Chainer
# training time; the literals are not derived from any variable here.
1.584319829940796 / 0.5818600654602051

2.7228536962537975

Chainer の 2.7 倍の計算時間がかかっている……

推論に限定しても 1.9 倍。

In [23]:
%%timeit
# Measure the per-call overhead of the array-module lookup on a CuPy array.
cuda.get_array_module(W)

394 ns ± 0.249 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [24]:
# Re-create the true coefficients (same values as defined earlier in the
# notebook) before the forward-only benchmarks.
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [25]:
# Sample one batch as mandala Variables (the %%timeit cell that follows
# builds its own batch on every run).
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [26]:
%%timeit
# Forward-only timing of the mandala layers, including batch creation.
# NOTE(review): no relu is applied between the layers here, whereas the
# Chainer Model timed further below applies F.relu after l0..l5, so the two
# measurements do not cover the same amount of work.
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
# Fetch the result (presumably a device-to-host copy of a CuPy array) so the
# computation has actually finished inside the timed region.
y.data.get()

1.37 ms ± 2.01 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [27]:
# Sample one batch as raw CuPy arrays for Chainer; this rebinds the x and t
# that the mandala cells above bound to Variables.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

In [28]:
%%timeit
# Forward-only timing of the Chainer model, including batch creation.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

y = model(x)
# Fetch the result so the computation has finished inside the timed region.
y.data.get()

2.05 ms ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Chainer よりほんの少し速くなった（type check などを行っていないせいかもしれない）。ただし、In [26] の計測では層の間に ReLU を適用しておらず、Chainer の `Model` は l0〜l5 の後に ReLU を適用しているため、同条件の比較ではない点に注意。

In [29]:
# Look up the cuDNN enum constant for the tanh activation mode as exposed by
# CuPy (recorded output: 2).
cp.cuda.cudnn.CUDNN_ACTIVATION_TANH

2

In [30]:
# Display the cuDNN binding module to see which compiled extension CuPy
# loaded.
cp.cuda.cudnn

<module 'cupy.cuda.cudnn' from '/home/ubuntu/anaconda3/lib/python3.6/site-packages/cupy/cuda/cudnn.cpython-36m-x86_64-linux-gnu.so'>