In [1]:
import sys
import numpy as np
import cupy as cp

sys.path.append('../')

from mandala.nodecore import Node
from mandala.nodecore import Variable
from mandala.autodiff import autodiff
from mandala.autodiff.linear import Linear
from mandala.autodiff.relu import relu
from mandala.autodiff import initializers
from mandala import cuda

In [2]:
from mandala.autodiff import basic_math_ho

In [3]:
basic_math_ho.install_node_arithmetics()

In [4]:
xp = cp

In [5]:
def sum_forward(x):
    xp = cuda.get_array_module(x)
    return xp.sum(x)


def sum_backward(x, gy):
    xp = cuda.get_array_module(x)
    return xp.ones_like(x) * gy


class SumFunction(autodiff.AutoDiff):
    def forward(self, xs):
        x = xs[0]
        y = Node(sum_forward, [x])
        return y

    def backward(self, xs, gy):
        x = xs[0]
        gx = Node(sum_backward, [x, gy])
        return gx,


def _sum(x):
    return SumFunction()([x])

In [6]:
l0 = Linear(   5, 1000)
l1 = Linear(1000, 1000)
l2 = Linear(1000, 1000)
l3 = Linear(1000, 1000)
l4 = Linear(1000, 1000)
l5 = Linear(1000, 1000)
l6 = Linear(1000,    3)

layer_list = [l0, l1, l2, l3, l4, l5, l6]

In [7]:
for l in layer_list:
    l.to_gpu()

In [8]:
# 真の係数
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

## test

In [9]:
batchsize = 32
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [10]:
h0 = relu(l0(x))
h1 = relu(l1(h0))
h2 = relu(l2(h1))
h3 = relu(l3(h2))
h4 = relu(l4(h3))
h5 = relu(l5(h4))
y  = relu(l6(h5))
loss = (y - t) ** 2 / batchsize

In [11]:
h5._reference_count

1

In [12]:
loss.backward()

In [13]:
print(h0._reference_count)
print(h0._data)
l0.b.grad.data

print(h0._reference_count)
print(h0._data)
l1.W.grad.data

print(h0._reference_count)
print(h0._data)
l2.W.grad.data

print(h0._reference_count)

3
None
2
[[ 0.64860862  0.28967083  1.13119984 ...,  0.          0.25194496  0.        ]
 [ 0.35686362  0.          0.71489704 ...,  0.11941893  0.29779166  0.        ]
 [ 0.          0.07290179  1.27660513 ...,  0.          0.          0.        ]
 ..., 
 [ 0.20193653  0.          1.21434498 ...,  0.          0.01959161  0.        ]
 [ 0.          0.4635005   0.20899332 ...,  0.54076755  0.          0.        ]
 [ 0.22650175  0.24522282  1.46314657 ...,  0.          0.          0.        ]]
2
[[ 0.64860862  0.28967083  1.13119984 ...,  0.          0.25194496  0.        ]
 [ 0.35686362  0.          0.71489704 ...,  0.11941893  0.29779166  0.        ]
 [ 0.          0.07290179  1.27660513 ...,  0.          0.          0.        ]
 ..., 
 [ 0.20193653  0.          1.21434498 ...,  0.          0.01959161  0.        ]
 [ 0.          0.4635005   0.20899332 ...,  0.54076755  0.          0.        ]
 [ 0.22650175  0.24522282  1.46314657 ...,  0.          0.          0.        ]]
2


## 学習

In [14]:
import time

In [15]:
s = time.time()
lr = 1e-5

for i in range(100):
    # make batch
    x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)
    
    # forward
    h0 = relu(l0(x))
    h1 = relu(l1(h0))
    h2 = relu(l2(h1))
    h3 = relu(l3(h2))
    h4 = relu(l4(h3))
    h5 = relu(l5(h4))
    y  = l6(h5)
    loss = (y - t) ** 2 / batchsize

    # loss
    loss = _sum((y - t) ** 2) / batchsize
    
    for l in layer_list:
        l.W.grad = 0
        if l.b is not None:
            l.b.grad = 0.

    # backward
    loss.backward()
    
    # update
    for l in layer_list[::-1]:
        l.W.data -= lr * l.W.grad.data
        if l.b is not None:
            l.b.data -= lr * l.b.grad.data

    print(loss.data)

print('time:', time.time() - s)

1378.634033203125
1577.5926513671875
1508.54638671875
1440.867919921875
1214.9478759765625
1073.578125
1086.6549072265625
1011.872314453125
1114.7161865234375
906.729736328125
930.90869140625
714.5150146484375
627.3939208984375
538.3048706054688
601.6226806640625
488.2144775390625
413.9114074707031
357.665771484375
266.42926025390625
247.65594482421875
162.33644104003906
140.4581298828125
108.37310791015625
71.75572204589844
59.47739791870117
38.98833465576172
35.296478271484375
17.604759216308594
14.492101669311523
10.682107925415039
8.7097806930542
7.2034502029418945
6.685288906097412
5.958806991577148
4.1995158195495605
4.69024658203125
5.342360496520996
4.324943542480469
5.834258556365967
4.883648872375488
6.356687545776367
4.239684104919434
4.648891448974609
3.93034029006958
3.7006022930145264
3.360018491744995
4.683257102966309
2.8244776725769043
3.436717987060547
3.113896369934082
4.188852310180664
4.472102165222168
4.18264627456665
2.925189971923828
2.7946431636810303
3.0143485

## Chainer との速度比較

In [16]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [17]:
class Model(chainer.Chain):
    def __init__(self):
        super().__init__()
        with self.init_scope():
            self.l0 = L.Linear(   5, 1000)
            self.l1 = L.Linear(1000, 1000)
            self.l2 = L.Linear(1000, 1000)
            self.l3 = L.Linear(1000, 1000)
            self.l4 = L.Linear(1000, 1000)
            self.l5 = L.Linear(1000, 1000)
            self.l6 = L.Linear(1000,    3)

    def __call__(self, x):
        h  = F.relu(self.l0(x))
        h  = F.relu(self.l1(h))
        h  = F.relu(self.l2(h))
        h  = F.relu(self.l3(h))
        h  = F.relu(self.l4(h))
        h  = F.relu(self.l5(h))
        y  = self.l6(h)
        return y

In [22]:
model = Model()
model.to_gpu()

<__main__.Model at 0x7f0670c1a7b8>

In [23]:
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

<chainer.optimizers.sgd.SGD at 0x7f0670b40048>

In [24]:
batchsize = 32
s = time.time()

for i in range(100):
    # make batch
    x = xp.random.random((batchsize, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b
    
    # forward
    y = model(x)

    # loss
    loss = F.mean_squared_error(y, t)

    # backward
    model.cleargrads()
    loss.backward()
    
    # update
    opt.update()

    print(loss.data)

print(time.time() - s)

549.1631469726562
471.0884704589844
376.5736083984375
517.8898315429688
453.1171875
467.7423095703125
435.8872985839844
462.77392578125
458.2810363769531
460.0186767578125
501.9132385253906
451.4162292480469
471.0533752441406
461.5989685058594
445.4179992675781
396.8384704589844
503.6466979980469
411.8832092285156
391.8820495605469
477.526123046875
367.99462890625
458.1058654785156
490.9054870605469
397.1836853027344
485.1408996582031
420.5108337402344
427.3149719238281
392.0328674316406
409.8675842285156
380.3116149902344
421.8919982910156
341.1537780761719
260.7998962402344
304.8304748535156
288.5037536621094
245.0092315673828
213.2000732421875
202.4524688720703
151.91368103027344
126.92919158935547
68.91231536865234
70.22505950927734
32.596317291259766
20.18245506286621
12.99322509765625
4.969906330108643
3.6021602153778076
0.9478638768196106
1.2193830013275146
1.3530960083007812
1.4127308130264282
2.0905981063842773
1.2113178968429565
1.327100157737732
1.20295250415802
0.8615207076

In [25]:
0.3876018524169922 / 0.19963645935058594

1.9415384027439404

In [26]:
1.584319829940796 / 0.5818600654602051

2.7228536962537975

Chainer の 2.7 倍の計算時間がかかっている……

推論に限定しても 1.9 倍。

In [27]:
%%timeit
cuda.get_array_module(W)

390 ns ± 0.122 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [28]:
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [29]:
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [30]:
%%timeit
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
y.data.get()

1.36 ms ± 1.05 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [31]:
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

In [32]:
%%timeit
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

y = model(x)
y.data.get()

2.02 ms ± 20.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Chainer よりほんのちょっと早くなった（type check などやっていないせいかな。）

In [33]:
cp.cuda.cudnn.CUDNN_ACTIVATION_TANH

2

In [34]:
cp.cuda.cudnn

<module 'cupy.cuda.cudnn' from '/home/ubuntu/anaconda3/lib/python3.6/site-packages/cupy/cuda/cudnn.cpython-36m-x86_64-linux-gnu.so'>