* Tensor Coreの効果を調査
* FP16 拡大6層 MNIST ,単層全結合
* 自宅PC RTX2080

In [1]:
# FP16 拡大6層 MNIST
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer.training import extensions
import numpy as np

chainer.global_config.dtype =  np.float16
#chainer.global_config.dtype =  np.float32
device = chainer.get_device('0')
unit =4096
batchsize = 4096
epoch = 20

chainer.print_runtime_info()

Platform: Windows-10-10.0.17763-SP0
Chainer: 7.0.0a1
NumPy: 1.16.2
CuPy:
  CuPy Version          : 7.0.0a1
  CUDA Root             : C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
  CUDA Build Version    : 10010
  CUDA Driver Version   : 10010
  CUDA Runtime Version  : 10010
  cuDNN Build Version   : 7500
  cuDNN Version         : 7500
  NCCL Build Version    : None
  NCCL Runtime Version  : None
iDeep: Not Available


In [2]:
# Network definition
class MLP(chainer.Chain):

    def __init__(self, n_units, n_out):
        super(MLP, self).__init__()
        with self.init_scope():
            # the size of the inputs to each layer will be inferred
            self.l1 = L.Linear(None, n_units)  # n_in -> n_units
            self.l2 = L.Linear(None, n_units)  # n_units -> n_units
            self.l3 = L.Linear(None, n_units)  # n_units -> n_units
            self.l4 = L.Linear(None, n_units)  # n_units -> n_units
            self.l5 = L.Linear(None, n_units)  # n_units -> n_units
            self.l6 = L.Linear(None, n_out)  # n_units -> n_out

    def forward(self, x):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        h3 = F.relu(self.l3(h2))
        h4 = F.relu(self.l4(h3))
        h5 = F.relu(self.l5(h4))
        return self.l6(h5)

# Set up a neural network to train
model = L.Classifier(MLP(unit, 10))
model.to_device(device)
device.use()

# Setup an optimizer
#optimizer = chainer.optimizers.Adam()
optimizer = chainer.optimizers.SGD()
optimizer.setup(model)

# Load the MNIST dataset
train, test = chainer.datasets.get_mnist()
train_iter = chainer.iterators.SerialIterator(train, batchsize)
test_iter = chainer.iterators.SerialIterator(test, batchsize,repeat=False, shuffle=False)

# Set up a trainer
updater = training.updaters.StandardUpdater( train_iter, optimizer, device=device)
trainer = training.Trainer(updater, (epoch, 'epoch'), out='result')
trainer.extend(extensions.Evaluator(test_iter, model, device=device))
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport(
    ['epoch', 'main/loss', 'validation/main/loss',
     'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

In [3]:
# Run the training mnist
trainer.run()

epoch       main/loss   validation/main/loss  main/accuracy  validation/main/accuracy  elapsed_time
1           2.29492     2.2832                0.149536       0.232422                  3.08555       
2           2.27344     2.25977               0.265137       0.291504                  3.887         
3           2.25195     2.24023               0.315186       0.339844                  4.63864       
4           2.23047     2.21484               0.371826       0.4104                    5.44276       
5           2.20898     2.18945               0.44458        0.477539                  6.24275       
6           2.18359     2.16602               0.500977       0.527344                  7.008         
7           2.1582      2.13477               0.55127        0.574707                  7.80822       
8           2.12891     2.10352               0.587891       0.61084                   8.63134       
9           2.09766     2.06836               0.619629       0.638672               

In [5]:
# FP16 単層全結合
import chainer
import chainer.functions as F
import numpy as np
import cupy as cp
import time

COUNT = 1000
N = 4096   
x = np.random.uniform(size=(N, N))
W = np.random.uniform(size=(N, N))

x = chainer.Variable(cp.asarray(x,dtype= np.float16))
W = chainer.Variable(cp.asarray(W,dtype= np.float16))

start = time.time()

for i in range(COUNT):
    y = F.linear(x, W, b=None, n_batch_axes=1)  
    
print(y[0][0])
end = time.time()
elapsed = end - start

print('計算時間：{:.3f} s'.format(elapsed))
print('計算速度：{:.3f} TFlops'.format(1e-12* COUNT * 2*N*N*N / elapsed))

variable(1031.)
計算時間：3.340 s
計算速度：41.147 TFlops
