In [1]:
import sys
import numpy as np
import cupy as cp

sys.path.append('../')

from mandala.nodecore import Node
from mandala.nodecore import Variable
from mandala.autodiff import autodiff
from mandala.autodiff.linear import Linear
from mandala.autodiff import initializers
from mandala import cuda

In [2]:
# Array-module alias used when building tensors by hand in the cells below;
# it is bound to CuPy here.
xp = cp

In [3]:
def sum_forward(x):
    """Return the sum of all elements of ``x``.

    The array module is obtained from ``cuda.get_array_module(x)`` and its
    ``sum`` is applied to ``x`` with no axis argument.
    """
    return cuda.get_array_module(x).sum(x)


def sum_backward(x, gy):
    """Return the gradient of the sum with respect to ``x``.

    The result is an array of ones shaped like ``x`` multiplied by the
    upstream gradient ``gy``.
    """
    backend = cuda.get_array_module(x)
    ones = backend.ones_like(x)
    return ones * gy


class SumFunction(autodiff.AutoDiff):
    """Autodiff operation that sums all elements of its single input."""

    def forward(self, xs):
        """Build the node that applies ``sum_forward`` to ``xs[0]``."""
        return Node(sum_forward, [xs[0]])

    def backward(self, xs, gy):
        """Build the gradient node for ``xs[0]``.

        Returns a 1-tuple holding the node that applies ``sum_backward`` to
        ``xs[0]`` and the upstream gradient ``gy``.
        """
        return (Node(sum_backward, [xs[0], gy]),)


def _sum(x):
    """Apply a fresh ``SumFunction`` to ``x`` and return what the call yields."""
    sum_op = SumFunction()
    return sum_op([x])

In [40]:
# Seven Linear layers: 5 -> 1000, five 1000 -> 1000 layers, then 1000 -> 3.
# They are constructed in order l0 .. l6 and also bound to individual names
# because later cells call and inspect them one by one.
layer_list = [
    Linear(n_in, n_out)
    for n_in, n_out in zip([5] + [1000] * 6, [1000] * 6 + [3])
]
l0, l1, l2, l3, l4, l5, l6 = layer_list

In [41]:
# Call to_gpu() on every layer so they match the CuPy inputs used below.
for layer in layer_list:
    layer.to_gpu()

In [42]:
# True coefficients of the linear map used to generate the targets
# (t = x @ W.T + b in the cells below): W has shape (3, 5), b has shape (3,).
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

## test

In [7]:
# One random mini-batch for the smoke test: inputs of shape (batchsize, 5)
# and targets produced by the ground-truth map t = x @ W.T + b.
# The arrays are built through the `xp` alias, like every other batch in this
# notebook, rather than through `cp` directly, so this cell follows the
# backend chosen at the top.
batchsize = 32
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [8]:
# Forward pass through the seven Linear layers, with no activation in between.
# Each intermediate keeps its own name because h0 is inspected in a later cell.
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
# Element-wise squared error scaled by 1 / batchsize; it is not reduced to a
# scalar in this cell.
loss = (y - t) ** 2 / batchsize

In [9]:
# Run backward() on the (unreduced) loss built in the previous cell.
loss.backward()

In [10]:
# Inspect h0's internal bookkeeping between accesses to gradient data.
# The bare `<param>.grad.data` expressions are not displayed (none of them is
# the cell's last expression); they are evaluated only for their effect on h0.
# In the recorded output the reference count goes 2 -> 1 -> 0 -> 0 and h0._data
# goes None -> array -> None.
# NOTE(review): presumably reading `.data` triggers deferred computation and
# h0's buffer is dropped once nothing else needs it - confirm against mandala.
print(h0._reference_count)
print(h0._data)
l0.b.grad.data

print(h0._reference_count)
print(h0._data)
l1.W.grad.data

print(h0._reference_count)
print(h0._data)
l2.W.grad.data

print(h0._reference_count)

2
None
1
[[-1.45269883 -1.14134967  0.40837008 ..., -0.28470078  0.76509279
   0.19574267]
 [-0.99341041  0.94193918  1.17227781 ..., -1.48845172  1.69526696
  -0.0106365 ]
 [-0.7119956  -1.23975325  0.83523363 ..., -0.19874306  1.32450438
  -0.20664756]
 ..., 
 [-0.72323817 -1.85495985  0.12650588 ...,  0.74108309  0.13316025
   0.35768634]
 [-1.39501119 -1.93006849  0.48279673 ...,  0.29560131  0.653211
   0.36548492]
 [-1.17374861 -1.7825135   1.00569379 ...,  0.08792435  0.80141807
  -0.23789522]]
0
None
0


## 学習

In [12]:
import time

In [20]:
# Train the mandala network with hand-written SGD for 100 steps and time it.
# Relies on `batchsize`, `W`, `b`, `layer_list` and l0..l6 from earlier cells.
s = time.time()
lr = 1e-6

for step in range(100):
    # make batch: random inputs and targets from the ground-truth linear map
    x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)

    # forward
    h0 = l0(x)
    h1 = l1(h0)
    h2 = l2(h1)
    h3 = l3(h2)
    h4 = l4(h3)
    h5 = l5(h4)
    y = l6(h5)

    # loss: sum of squared errors divided by the batch size.
    # (An unreduced loss used to be built here first and was immediately
    # overwritten; that dead graph construction was part of the timed region.)
    loss = _sum((y - t) ** 2) / batchsize

    # reset the gradients before backpropagation
    for layer in layer_list:
        layer.W.grad = 0
        if layer.b is not None:
            layer.b.grad = 0.

    # backward
    loss.backward()

    # update: plain SGD step on every weight and (if present) bias
    for layer in layer_list:
        layer.W.data -= lr * layer.W.grad.data
        if layer.b is not None:
            layer.b.data -= lr * layer.b.grad.data

    print(loss.data)

print('time:', time.time() - s)

1022.7572021484375
974.2899169921875
644.5301513671875
373.63812255859375
257.5555114746094
189.54306030273438
164.09173583984375
132.56117248535156
118.82176208496094
76.20274353027344
62.10357666015625
69.38015747070312
43.59333801269531
30.510448455810547
42.937278747558594
33.311737060546875
37.1468391418457
33.093849182128906
29.00464630126953
28.886247634887695
29.530941009521484
24.751270294189453
29.86236572265625
21.546585083007812
35.712520599365234
23.965333938598633
26.268512725830078
25.813737869262695
20.795028686523438
19.924579620361328
24.809545516967773
16.80914306640625
23.0338191986084
19.310558319091797
23.546424865722656
20.26229476928711
21.46619987487793
18.038280487060547
19.623844146728516
16.68825912475586
21.560611724853516
17.24473762512207
18.96063995361328
13.776128768920898
16.683149337768555
13.212377548217773
19.090614318847656
15.614459991455078
13.525449752807617
16.461498260498047
13.263801574707031
10.483776092529297
15.686051368713379
15.336315155

## Chainer との速度比較

In [14]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [15]:
class Model(chainer.Chain):
    """Chainer network of seven Linear links (5 -> 1000 x 6 -> 3), no activations."""

    def __init__(self):
        """Create the links l0 .. l6 inside ``init_scope``."""
        super().__init__()
        with self.init_scope():
            self.l0 = L.Linear(5, 1000)
            self.l1 = L.Linear(1000, 1000)
            self.l2 = L.Linear(1000, 1000)
            self.l3 = L.Linear(1000, 1000)
            self.l4 = L.Linear(1000, 1000)
            self.l5 = L.Linear(1000, 1000)
            self.l6 = L.Linear(1000, 3)

    def __call__(self, x):
        """Feed ``x`` through l0 .. l6 in order and return the last output."""
        h = x
        for link in (self.l0, self.l1, self.l2, self.l3,
                     self.l4, self.l5, self.l6):
            h = link(h)
        return h

In [16]:
# Instantiate the Chainer model and call to_gpu() on it. The call is the
# cell's last expression, so its return value (a Model instance, per the
# recorded output) is echoed.
model = Model()
model.to_gpu()

<__main__.Model at 0x7fb453d40c88>

In [17]:
# Plain SGD optimizer attached to the Chainer model.
# NOTE: lr here is 1e-4 whereas the hand-written mandala loop uses lr = 1e-6,
# so the two printed loss curves are not directly comparable.
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

<chainer.optimizers.sgd.SGD at 0x7fb453cc9d68>

In [21]:
# Chainer side of the benchmark: 100 SGD steps on random batches, timed
# end to end. Relies on `model`, `opt`, `W` and `b` from earlier cells.
batchsize = 32
start = time.time()

for step in range(100):
    # Random inputs and their targets under the ground-truth linear map.
    x = xp.random.random((batchsize, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b

    # Forward pass followed by the mean-squared-error loss.
    y = model(x)
    loss = F.mean_squared_error(y, t)

    # Clear old gradients, backpropagate, then take one optimizer step.
    model.cleargrads()
    loss.backward()
    opt.update()

    print(loss.data)

print(time.time() - start)

505.6922912597656
377.7620544433594
306.7679748535156
232.7377471923828
153.09405517578125
115.99675750732422
45.89982223510742
11.542582511901855
2.639256238937378
0.7446701526641846
0.5373705625534058
0.5749110579490662
0.3841663897037506
0.5908631682395935
0.6780630946159363
0.5003799796104431
0.5126085877418518
0.34513425827026367
0.3841787874698639
0.45766568183898926
0.3691488206386566
0.3792717456817627
0.3119562566280365
0.4458272457122803
0.3733067512512207
0.28308025002479553
0.31647855043411255
0.5060803294181824
0.44729065895080566
0.26748839020729065
0.2679491937160492
0.43642210960388184
0.27527356147766113
0.3281126618385315
0.3682795464992523
0.29757025837898254
0.3379487097263336
0.278115838766098
0.1977410465478897
0.302072674036026
0.29780301451683044
0.31758785247802734
0.24617637693881989
0.31168970465660095
0.2683892548084259
0.22383351624011993
0.261991024017334
0.280195951461792
0.25830817222595215
0.20409195125102997
0.22746340930461884
0.20843298733234406
0.24

In [19]:
# Ratio of two hand-copied timings; the result (~1.94) is the "1.9x"
# inference-only slowdown quoted in the notes below.
# NOTE(review): presumably mandala seconds / Chainer seconds from earlier
# runs - the cells that produced these numbers are not in this notebook.
0.3876018524169922 / 0.19963645935058594

1.9415384027439404

In [18]:
# Ratio of two hand-copied timings; the result (~2.72) is the "2.7x"
# slowdown relative to Chainer quoted in the notes below.
# NOTE(review): presumably mandala seconds / Chainer seconds for the training
# loops - confirm which runs these numbers came from.
1.584319829940796 / 0.5818600654602051

2.7228536962537975

Chainer の 2.7 倍の計算時間がかかっている……

推論に限定しても 1.9 倍。

In [22]:
%%timeit
# Micro-benchmark of the array-module lookup on its own, to see how much
# per-call overhead it contributes.
cuda.get_array_module(W)

398 ns ± 0.139 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [48]:
# Re-create the ground-truth coefficients (same values as defined earlier in
# the notebook): W has shape (3, 5), b has shape (3,).
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [49]:
# Build one mini-batch wrapped in mandala Variables before the timed cell:
# random inputs and targets t = x @ W.T + b.
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)

In [55]:
%%timeit
# Timed body: batch creation plus a forward pass through the mandala layers.
# `t` is not used by the forward pass; the Chainer timing cell builds it too,
# so both measurements include the same work.
x = Variable(xp.random.random((batchsize, 5)).astype(np.float32))
t = Variable(xp.matmul(x.data, W.T) + b)
h0 = l0(x)
h1 = l1(h0)
h2 = l2(h1)
h3 = l3(h2)
h4 = l4(h3)
h5 = l5(h4)
y  = l6(h5)
# Fetch the output with .get() so the timing covers actually producing y.
# NOTE(review): presumably this forces evaluation and a device-to-host copy - confirm.
y.data.get()

2.72 ms ± 114 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [57]:
# Build one mini-batch as raw arrays (no Variable wrapper) before the timed
# Chainer cell: random inputs and targets t = x @ W.T + b.
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

In [60]:
%%timeit
# Timed body: batch creation plus a forward pass through the Chainer model,
# mirroring the mandala timing cell (including the unused target `t`).
x = xp.random.random((batchsize, 5)).astype(np.float32)
t = xp.matmul(x, W.T) + b

y = model(x)
# Fetch the output with .get() so the timing covers actually producing y.
y.data.get()

1.42 ms ± 768 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
