In [1]:
import sys
import numpy as np
import time

# Select the array backend: CuPy when it can be imported, NumPy otherwise.
try:
    import cupy as cp
except ImportError:
    # Only a failed import triggers the fallback. A bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated errors.
    xp = np
else:
    xp = cp

sys.path.append('../')

import mandala
from mandala import Node
from mandala import Variable

import mandala.autodiff as ad
import mandala.autodiff.functions as F

In [2]:
class Model(ad.Graph):
    """Fully connected network built from seven Mandala `F.Linear` layers.

    Layer sizes: 5 -> 1000 -> 1000 -> 1000 -> 1000 -> 1000 -> 1000 -> 3.
    """

    def __init__(self):
        # Zero-argument super() does not look up the global name `Model`,
        # which this notebook rebinds to a different class later on.
        super().__init__()

        self.l0 = F.Linear(   5, 1000)
        self.l1 = F.Linear(1000, 1000)
        self.l2 = F.Linear(1000, 1000)
        self.l3 = F.Linear(1000, 1000)
        self.l4 = F.Linear(1000, 1000)
        self.l5 = F.Linear(1000, 1000)
        self.l6 = F.Linear(1000,    3)

    def __call__(self, x):
        """Run the forward pass on `x` and return the output of the last layer.

        ReLU is applied after each of the six hidden layers. The output layer
        is left linear: this matches the Chainer model defined further down
        (which returns `self.l6(h)` directly) and avoids clamping a regression
        output to be non-negative.
        """
        h = F.relu(self.l0(x))
        h = F.relu(self.l1(h))
        h = F.relu(self.l2(h))
        h = F.relu(self.l3(h))
        h = F.relu(self.l4(h))
        h = F.relu(self.l5(h))
        y = self.l6(h)
        return y

In [3]:
# Build the network; call to_gpu() only when the backend is not NumPy.
model = Model()
if xp is not np:
    model.to_gpu()

In [4]:
# Fixed weight matrix (3 x 5) and bias (3,) of the reference affine map;
# the training loops build their targets as x @ W.T + b.
W = xp.arange(3 * 5, dtype=np.float32).reshape((3, 5))
b = xp.arange(3, dtype=np.float32)

In [5]:
# Display the model's `subgraphs` mapping (attribute name -> layer object).
model.subgraphs

{'l0': <mandala.autodiff.functions.linear.Linear at 0x1948536cd30>,
 'l1': <mandala.autodiff.functions.linear.Linear at 0x1948536ce80>,
 'l2': <mandala.autodiff.functions.linear.Linear at 0x1948536c908>,
 'l3': <mandala.autodiff.functions.linear.Linear at 0x19485362048>,
 'l4': <mandala.autodiff.functions.linear.Linear at 0x19485362160>,
 'l5': <mandala.autodiff.functions.linear.Linear at 0x19485362278>,
 'l6': <mandala.autodiff.functions.linear.Linear at 0x19485362390>}

In [6]:
# Number of samples per mini-batch; read by the training loops below.
batch_size = 32

## Mandala

In [7]:
# Time 100 SGD steps of the Mandala model on randomly generated regression data.
lr = 1e-4
s = time.perf_counter()  # monotonic high-resolution clock, meant for measuring intervals

for i in range(100):
    # make batch: random inputs and their targets under t = x @ W.T + b
    x = Variable(xp.random.random((batch_size, 5)).astype(np.float32))
    t = Variable(xp.matmul(x.data, W.T) + b)

    # forward
    y = model(x)

    # loss: squared error summed over all elements, divided by the batch size.
    # (A second, immediately overwritten loss expression used to be built
    # here; it only added work inside the timed region.)
    loss = F.sum((y - t) ** 2) / batch_size

    # backward
    model.cleargrads()
    loss.backward()

    # unbind the loss and output names before updating the parameters
    del loss
    del y

    # update: plain SGD step applied in place to every parameter
    for p in model.params.values():
        p.data -= lr * p.grad.data

print('time:', time.perf_counter() - s)

time: 5.399904251098633


## Chainer

In [9]:
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizer

In [10]:
class Model(chainer.Chain):
    """Chainer version of the benchmark network.

    Seven `L.Linear` links: 5 -> 1000, five times 1000 -> 1000, then 1000 -> 3.
    """

    def __init__(self):
        super().__init__()
        hidden = 1000  # width shared by all hidden layers
        with self.init_scope():
            self.l0 = L.Linear(5, hidden)
            self.l1 = L.Linear(hidden, hidden)
            self.l2 = L.Linear(hidden, hidden)
            self.l3 = L.Linear(hidden, hidden)
            self.l4 = L.Linear(hidden, hidden)
            self.l5 = L.Linear(hidden, hidden)
            self.l6 = L.Linear(hidden, 3)

    def __call__(self, x):
        """Forward pass: ReLU after l0..l5, then l6 without an activation."""
        hidden_layers = (self.l0, self.l1, self.l2, self.l3, self.l4, self.l5)
        h = x
        for layer in hidden_layers:
            h = F.relu(layer(h))
        return self.l6(h)

In [11]:
# Build the Chainer network; call to_gpu() only when the backend is not NumPy.
model = Model()
if xp is not np:
    model.to_gpu()

In [12]:
# SGD optimizer with learning rate 1e-4, set up on the Chainer model.
opt = chainer.optimizers.SGD(lr=1e-4)
opt.setup(model)

In [13]:
# Time 100 SGD steps of the Chainer model on randomly generated regression data.
s = time.perf_counter()  # monotonic high-resolution clock, meant for measuring intervals

for i in range(100):
    # make batch: random inputs and their targets under t = x @ W.T + b
    x = xp.random.random((batch_size, 5)).astype(np.float32)
    t = xp.matmul(x, W.T) + b

    # forward
    y = model(x)

    # loss
    loss = F.mean_squared_error(y, t)

    # backward: the default retain_grad=False is sufficient here, because the
    # optimizer only reads parameter gradients; keeping the gradients of
    # intermediate variables would just add overhead to the timed loop.
    model.cleargrads()
    loss.backward()

    # update
    opt.update()

print(time.perf_counter() - s)

6.075519323348999
