In [1]:
import sys
import numpy as np
import cupy as cp

sys.path.append('../')

import mandala
from mandala import Node
from mandala import Variable

import mandala.autodiff as ad
import mandala.autodiff.functions as F

cp.cuda.Device(0).use()

In [2]:
class Model(ad.Graph):
    def __init__(self):
        super(Model, self).__init__()

        self.l0 = F.Linear(   5, 1000)
        self.l1 = F.Linear(1000, 1000)
        self.l2 = F.Linear(1000, 1000)
        self.l3 = F.Linear(1000, 1000)
        self.l4 = F.Linear(1000, 1000)
        self.l5 = F.Linear(1000, 1000)
        self.l6 = F.Linear(1000,    3)

    def __call__(self, x):
        h = F.relu(self.l0(x))
        h = F.relu(self.l1(h))
        h = F.relu(self.l2(h))
        h = F.relu(self.l3(h))
        h = F.relu(self.l4(h))
        h = F.relu(self.l5(h))
        y = F.relu(self.l6(h))
        return y

In [3]:
xp = cp

In [4]:
model = Model()
model.to_gpu()

In [5]:
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [6]:
model.subgraphs

{'l0': <mandala.autodiff.functions.linear.Linear at 0x7f5410ef42b0>,
 'l1': <mandala.autodiff.functions.linear.Linear at 0x7f5410ef4390>,
 'l2': <mandala.autodiff.functions.linear.Linear at 0x7f5410ef4400>,
 'l3': <mandala.autodiff.functions.linear.Linear at 0x7f5410ef45f8>,
 'l4': <mandala.autodiff.functions.linear.Linear at 0x7f5410ef4518>,
 'l5': <mandala.autodiff.functions.linear.Linear at 0x7f5410ef4748>,
 'l6': <mandala.autodiff.functions.linear.Linear at 0x7f5410ef4908>}

## Test

In [7]:
import time

In [8]:
batch_size = 1024

In [9]:
model.l1.b.data.device

<CUDA Device 0>

In [10]:
model.l0.W.data

array([[ 0.4282168 , -0.2958299 , -0.38699242, -0.7930306 , -0.18369986],
       [ 0.26051593, -0.09872897, -0.18696935,  0.38902462, -0.5075895 ],
       [ 1.0514548 , -0.38491637,  0.18238868, -0.40378782, -0.6972679 ],
       ...,
       [ 0.43771583, -1.1421463 , -0.08425725, -0.03215528, -0.2581966 ],
       [ 0.63309914, -0.01810372,  0.09087458, -0.03959232, -0.13156793],
       [ 0.1572466 , -0.14136283, -0.18202619, -0.04973411, -0.48832953]],
      dtype=float32)

In [11]:
x = Variable(xp.random.random((batch_size, 5)).astype(np.float32))
y = model(x)

In [12]:
y.reserve(10)

In [13]:
y.data

array([[0.3221437 , 0.01131536, 0.        ],
       [0.68380153, 0.1905182 , 0.        ],
       [0.5967949 , 0.23613709, 0.        ],
       ...,
       [0.7532366 , 0.        , 0.        ],
       [1.0009242 , 0.7052074 , 0.        ],
       [0.5347421 , 0.59838414, 0.        ]], dtype=float32)

In [14]:
y.reserve_count

9

In [15]:
# model(x).data

In [18]:
y.reserve_count

0

In [21]:
s = time.time()
lr = 1e-4

for i in range(1000):
    # make batch
    x = Variable(xp.random.random((batch_size, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)
    
    # forward
    y  = model(x)
    loss = (y - t) ** 2 / batch_size

    # loss
    loss = F.sum((y - t) ** 2) / batch_size

    # backward
    model.cleargrads()
    loss.backward()

    # update
    for p in model.params.values():
        p.grad.reserve()

    print(y.reserve_count)
    for p in model.params.values():
        p.data -= lr * p.grad.data

    #print(loss.data)



print('time:', time.time() - s)

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2


KeyboardInterrupt: 

In [None]:
p.__class__

In [15]:
def check_memory(start):
    if start._data is not None:
        print(start.__class__, start.shape)
    for node in start.input_nodes:
        check_memory(node)

In [None]:
check_memory(x.grad)

In [None]:
%who

## Chainer との速度比較

In [16]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [17]:
cp.cuda.Device(0).use()

In [18]:
W = xp.arange(15, dtype=np.float32).reshape(3, 5)
b = xp.arange(3, dtype=np.float32)

In [19]:
class Model(chainer.Chain):
    def __init__(self):
        super().__init__()
        with self.init_scope():
            self.l0 = L.Linear(   5, 1000)
            self.l1 = L.Linear(1000, 1000)
            self.l2 = L.Linear(1000, 1000)
            self.l3 = L.Linear(1000, 1000)
            self.l4 = L.Linear(1000, 1000)
            self.l5 = L.Linear(1000, 1000)
            self.l6 = L.Linear(1000,    3)

    def __call__(self, x):
        h  = F.relu(self.l0(x))
        h  = F.relu(self.l1(h))
        h  = F.relu(self.l2(h))
        h  = F.relu(self.l3(h))
        h  = F.relu(self.l4(h))
        h  = F.relu(self.l5(h))
        y  = self.l6(h)
        return y

In [20]:
model = Model()
model.to_gpu()

<__main__.Model at 0x7fcac3fab240>

In [21]:
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

<chainer.optimizers.sgd.SGD at 0x7fcac3fbb7f0>

In [24]:
batch_size = 1024
s = time.time()

for i in range(1000):
    # make batch
    x = xp.random.random((batch_size, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b
    
    # forward
    y = model(x)

    # loss
    loss = F.mean_squared_error(y, t)

    # backward
    model.cleargrads()
    loss.backward(retain_grad=True)
    
    # update
    opt.update()

    # print(loss.data)

print(time.time() - s)

6.175689697265625


In [23]:
model.l0.W.data

array([[-0.6417413 ,  0.49128303, -0.57387   ,  0.04320413, -0.4345516 ],
       [-0.42418316,  0.06069907,  0.02359849,  0.34330904,  0.48370636],
       [-0.15359078, -0.15393898, -0.16277276,  0.27090344, -0.44372785],
       ...,
       [-0.09313224,  0.37690037, -0.24420881, -0.48479527, -1.469176  ],
       [ 0.6556039 ,  0.15861796, -0.5061554 , -0.199916  ,  0.46875098],
       [ 0.03929974, -0.07333896,  0.5156775 , -0.6401377 , -0.17092767]],
      dtype=float32)

In [30]:
def hoge(x):
    pass

In [24]:
print (hoge(0))

None


In [25]:
y is None

False

In [26]:
y = None

In [27]:
print(y)

None
