In [1]:
import sys
import numpy as np
import cupy as cp

sys.path.append('../')

from mandala.nodecore import Node
from mandala.nodecore import Variable
from mandala.autodiff import autodiff
from mandala.autodiff.linear import Linear
from mandala.autodiff import initializers
from mandala import cuda

In [2]:
xp = cp

In [3]:
def sum_forward(x):
    """Return the sum of every element of ``x``.

    The array module is looked up from ``x`` itself via
    ``cuda.get_array_module`` so the reduction runs with the module that
    matches the input array.
    """
    array_module = cuda.get_array_module(x)
    total = array_module.sum(x)
    return total


def sum_backward(x, gy):
    """Return the gradient of the sum with respect to ``x``.

    The result is an array of ones shaped like ``x`` multiplied by the
    upstream gradient ``gy``.
    """
    array_module = cuda.get_array_module(x)
    ones = array_module.ones_like(x)
    return ones * gy


class SumFunction(autodiff.AutoDiff):
    """Differentiable sum that wraps ``sum_forward`` / ``sum_backward`` in ``Node`` objects."""

    def forward(self, xs):
        """Build the node that sums the first entry of ``xs``."""
        return Node(sum_forward, [xs[0]])

    def backward(self, xs, gy):
        """Build the gradient node for the first entry of ``xs``.

        The node is returned wrapped in a 1-tuple.
        """
        grad_node = Node(sum_backward, [xs[0], gy])
        return (grad_node,)


def _sum(x):
    """Apply a fresh ``SumFunction`` to ``x`` (passed as a one-element list)."""
    func = SumFunction()
    return func([x])

In [4]:
# (input size, output size) of each Linear layer, in application order:
# 5 -> 1000, five 1000 -> 1000 layers, then 1000 -> 3.
layer_sizes = [(5, 1000)] + [(1000, 1000)] * 5 + [(1000, 3)]

# Build the layers in order; the individual names are kept because later
# cells refer to l0..l6 directly.
layer_list = [Linear(n_in, n_out) for n_in, n_out in layer_sizes]
l0, l1, l2, l3, l4, l5, l6 = layer_list

In [5]:
# Call to_gpu() on every layer in layer_list.
for layer in layer_list:
    layer.to_gpu()

In [6]:
# True coefficients of the target map: W is a 3x5 matrix holding 0..14 and
# b is a length-3 vector holding 0..2 (both float32).
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

## test

In [7]:
# Build one synthetic batch: random float32 inputs of shape (batchsize, 5) and
# targets produced by the ground-truth affine map t = x @ W.T + b.
# The `xp` alias is used here, as in the training loop, rather than `cp` directly.
batchsize = 32
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [8]:
# Forward pass through the seven Linear layers in order. Each intermediate
# result keeps its own name (h0..h5) so it can be inspected afterwards.
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
# Element-wise squared error divided by the batch size; no reduction is applied here.
loss = (y - t) ** 2 / batchsize

In [9]:
# Start backpropagation from `loss`.
# NOTE(review): presumably this is what makes `.grad` available on the layer
# parameters read afterwards — confirm against mandala.
loss.backward()

In [10]:
# Watch h0's bookkeeping (_reference_count and the cached _data) while
# parameter gradients are read one at a time.
# NOTE(review): the bare `.grad.data` expressions are not displayed (none is the
# last line of the cell). The recorded output changes between the prints, so
# reading them presumably triggers evaluation — confirm against mandala.
print(h0._reference_count)
print(h0._data)
l0.b.grad.data

print(h0._reference_count)
print(h0._data)
l1.W.grad.data

print(h0._reference_count)
print(h0._data)
l2.W.grad.data

print(h0._reference_count)

2
None
1
[[-0.32309359  0.2884514   1.51070857 ..., -0.95198089 -0.07371156
  -0.03532079]
 [-0.40260783 -0.34337464  1.31651044 ..., -0.472036    0.81224024
   0.48255664]
 [-0.34221035 -0.84781873  1.4369849  ..., -0.48086375  1.42222321
   1.19213176]
 ..., 
 [-0.14121969 -0.42710575  0.58321166 ..., -0.30614865  0.67090493
   0.53001797]
 [-0.27783871 -0.28232253  0.29207876 ..., -0.36407527  0.13322188
   0.45857289]
 [-0.40956718 -0.17585471  0.73497617 ..., -0.46759143  0.3324213
   0.14299998]]
0
[[-0.32309359  0.2884514   1.51070857 ..., -0.95198089 -0.07371156
  -0.03532079]
 [-0.40260783 -0.34337464  1.31651044 ..., -0.472036    0.81224024
   0.48255664]
 [-0.34221035 -0.84781873  1.4369849  ..., -0.48086375  1.42222321
   1.19213176]
 ..., 
 [-0.14121969 -0.42710575  0.58321166 ..., -0.30614865  0.67090493
   0.53001797]
 [-0.27783871 -0.28232253  0.29207876 ..., -0.36407527  0.13322188
   0.45857289]
 [-0.40956718 -0.17585471  0.73497617 ..., -0.46759143  0.3324213
   0.14

## 学習

In [11]:
import time

In [12]:
# Train the stacked Linear layers with a hand-written SGD step and report the
# wall-clock time of 100 iterations.
s = time.time()
lr = 1e-6

for i in range(100):
    # make batch: random inputs and targets from the ground-truth map x @ W.T + b
    x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)

    # forward
    h0 = l0(x)
    h1 = l1(h0)
    h2 = l2(h1)
    h3 = l3(h2)
    h4 = l4(h3)
    h5 = l5(h4)
    y  = l6(h5)

    # loss: summed squared error divided by the batch size.
    # Only this reduced loss is built; an additional unreduced expression that
    # is never used would just add extra nodes hanging off y and t.
    loss = _sum((y - t) ** 2) / batchsize

    # reset gradients before backward
    # NOTE(review): W.grad is reset with int 0 and b.grad with float 0. —
    # kept exactly as written; confirm which form mandala expects.
    for layer in layer_list:
        layer.W.grad = 0
        if layer.b is not None:
            layer.b.grad = 0.

    # backward
    loss.backward()

    # update: in-place SGD step on every parameter
    for layer in layer_list:
        layer.W.data -= lr * layer.W.grad.data
        if layer.b is not None:
            layer.b.data -= lr * layer.b.grad.data

    print(loss.data)

print('time:', time.time() - s)

1551.1934814453125
1227.002685546875
754.77880859375
504.7569885253906
322.0633544921875
212.13937377929688
148.88702392578125
102.05595397949219
63.85234451293945
47.79963684082031
42.99308776855469
29.727035522460938
19.77566146850586
17.944822311401367
16.91914176940918
13.179389953613281
12.85638427734375
14.407691955566406
13.981608390808105
12.85222339630127
12.640665054321289
14.82580280303955
12.276845932006836
11.85091781616211
8.660057067871094
10.931921005249023
9.344804763793945
13.824132919311523
11.176336288452148
11.0354585647583
12.808568954467773
6.382782936096191
8.150248527526855
9.556539535522461
7.627527236938477
10.338979721069336
9.798160552978516
9.085975646972656
9.828990936279297
7.91644811630249
5.717816352844238
7.92717170715332
8.945561408996582
6.558893203735352
5.622929096221924
5.059146881103516
4.753493309020996
7.270957946777344
5.122196197509766
5.1840925216674805
5.467142105102539
5.234935760498047
5.173527717590332
4.1845903396606445
4.5229349136352

## Chainer との速度比較

In [13]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [14]:
class Model(chainer.Chain):
    """Chainer network made of seven stacked ``L.Linear`` links.

    Layer widths are 5 -> 1000, five 1000 -> 1000 links, then 1000 -> 3.
    No activation is applied between the links.
    """

    def __init__(self):
        super().__init__()
        # Links are assigned inside init_scope(), in application order.
        with self.init_scope():
            self.l0 = L.Linear(5, 1000)
            self.l1 = L.Linear(1000, 1000)
            self.l2 = L.Linear(1000, 1000)
            self.l3 = L.Linear(1000, 1000)
            self.l4 = L.Linear(1000, 1000)
            self.l5 = L.Linear(1000, 1000)
            self.l6 = L.Linear(1000, 3)

    def __call__(self, x):
        """Apply l0 through l6 in sequence to ``x`` and return the final output."""
        out = x
        links = (self.l0, self.l1, self.l2, self.l3, self.l4, self.l5, self.l6)
        for link in links:
            out = link(out)
        return out

In [15]:
# Instantiate the Chainer model and move it to the GPU.
model = Model()
model.to_gpu()

<__main__.Model at 0x7f9d6bec1cf8>

In [16]:
# SGD optimizer attached to the Chainer model.
# NOTE(review): lr is 1e-4 here while the mandala training loop uses 1e-6, and
# the two loops build their losses differently (sum / batchsize there,
# F.mean_squared_error here) — keep this in mind when comparing loss curves.
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

<chainer.optimizers.sgd.SGD at 0x7f9d6be767b8>

In [17]:
# Chainer version of the training loop: 100 SGD iterations on freshly generated
# batches, printing the loss each step and the total wall-clock time at the end.
batchsize = 32
s = time.time()

for i in range(100):
    # make batch: raw arrays (no Variable wrapper), targets from x @ W.T + b
    x = xp.random.random((batchsize, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b
    
    # forward
    y = model(x)

    # loss
    loss = F.mean_squared_error(y, t)

    # backward: clear the previous gradients, then backpropagate
    model.cleargrads()
    loss.backward()
    
    # update
    opt.update()

    print(loss.data)

# Elapsed seconds for the whole loop (includes the per-step prints).
print(time.time() - s)

505.260986328125
321.1620178222656
327.0655822753906
247.981201171875
155.99098205566406
76.3377685546875
30.1422176361084
7.860241413116455
2.2543418407440186
1.3737939596176147
1.1086504459381104
1.0362097024917603
0.9039679169654846
1.1105233430862427
0.8453274369239807
1.0538126230239868
0.9996352195739746
0.7425522208213806
0.671079158782959
0.9096050262451172
0.7945298552513123
0.4635123908519745
0.7390081286430359
0.6977785229682922
0.7034571170806885
0.6067087054252625
0.6501199007034302
0.5543020367622375
0.5112015604972839
0.5471829771995544
0.5768411755561829
0.5076608061790466
0.4802683889865875
0.3676702678203583
0.3961446285247803
0.5038512349128723
0.3644340932369232
0.37355533242225647
0.44834694266319275
0.4133519232273102
0.34727799892425537
0.44477400183677673
0.344064325094223
0.4300054609775543
0.3671726882457733
0.3045612871646881
0.3572017252445221
0.3388247489929199
0.3930419981479645
0.36113572120666504
0.35798025131225586
0.285039484500885
0.3495961129665375
0

In [18]:
# Ratio of two elapsed times pasted in as literals; the measurements that
# produced them are not shown in this notebook.
# NOTE(review): this appears to be the inference-only ratio (~1.9x) quoted in
# the notes, presumably mandala / Chainer — confirm.
0.3876018524169922 / 0.19963645935058594

1.9415384027439404

In [19]:
# Ratio of two elapsed times pasted in as literals; the measurements that
# produced them are not shown in this notebook.
# NOTE(review): this appears to be the ~2.7x compute-time ratio quoted in the
# notes, presumably mandala / Chainer — confirm.
1.584319829940796 / 0.5818600654602051

2.7228536962537975

Chainer の 2.7 倍の計算時間がかかっている……

推論に限定しても 1.9 倍。

In [20]:
%%timeit
# Timed statement: a single array-module lookup for W.
cuda.get_array_module(W)

395 ns ± 0.672 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [21]:
# Re-create the ground-truth coefficients: W is 3x5 (values 0..14) and b has
# 3 entries (values 0..2), both float32.
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [22]:
# One batch wrapped in mandala Variables: random float32 inputs of shape
# (batchsize, 5) and targets t = x @ W.T + b.
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [23]:
%%timeit
# Timed body: batch creation plus a forward pass through the mandala layers.
# `t` is built inside the timed region even though the forward pass does not use it.
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
# NOTE(review): .get() presumably copies the result to the host, so the timing
# includes the finished GPU work — confirm y.data is a CuPy array here.
y.data.get()

1.36 ms ± 3.27 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
# One batch as raw arrays (no Variable wrapper) for the Chainer model:
# random float32 inputs of shape (batchsize, 5) and targets t = x @ W.T + b.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

In [25]:
%%timeit
# Timed body: batch creation plus a forward pass through the Chainer model.
# `t` is built inside the timed region even though the forward pass does not use it.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

y = model(x)
# NOTE(review): .get() presumably copies the output to the host, so the timing
# includes the finished GPU work — confirm y.data is a CuPy array here.
y.data.get()

1.44 ms ± 2.94 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Chainer よりほんの少し速くなった（type check などをやっていないせいかな）。