# 微分可能LUTモデルによるCNNでのCIFAR-10学習

Differentiable LUTモデルで畳み込み層を形成して、一般的なデータに対してCNNによる回路学習を行います。 

In [1]:
import os
import numpy as np
from tqdm.notebook import tqdm

import torch
import torchvision
import torchvision.transforms as transforms

import binarybrain as bb

### データセット

データセットの準備には torchvision を使います

In [2]:
# configuration
net_name              = 'Cifar10DifferentiableLutCnn_ave1'
data_path             = os.path.join('./data/', net_name)
rtl_sim_path          = '../../verilog/cifar10'
rtl_module_name       = 'Cifar10LutCnn'
output_velilog_file   = os.path.join(data_path, rtl_module_name + '.v')
sim_velilog_file      = os.path.join(rtl_sim_path, rtl_module_name + '.v')

bin_mode              = True
frame_modulation_size = 7
depth_modulation_size = 15
epochs                = 8
mini_batch_size       = 32

# dataset
dataset_path = './data/'
dataset_train = torchvision.datasets.CIFAR10(root=dataset_path, train=True, transform=transforms.ToTensor(), download=True)
dataset_test  = torchvision.datasets.CIFAR10(root=dataset_path, train=False, transform=transforms.ToTensor(), download=True)
loader_train = torch.utils.data.DataLoader(dataset=dataset_train, batch_size=mini_batch_size, shuffle=True, num_workers=2)
loader_test  = torch.utils.data.DataLoader(dataset=dataset_test,  batch_size=mini_batch_size, shuffle=False, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


## ネットワーク構築

Convolution2d を使って畳み込み層を作ります。<br>
Convolution2d は指定した層を im2col と col2im で挟み込んで Lowering による畳み込みをサポートします。<br>
DenseAffine を Lowering すると一般にCNNで知られる畳み込み層になりますが、LUT-Network では
ここに DifferentiableLut を組み合わせて作った層を設定することでDenseAffineとは異なる効率の良い畳み込み層を実現します。

In [3]:
# バイナリ時は BIT型を使えばメモリ削減可能
bin_dtype = bb.DType.BIT if bin_mode else bb.DType.FP32

class DifferentiableLutBlock(bb.Sequential):
    def __init__(self, output_shape, depth, name=None, batch_norm=True, binarize=True, average=True, bin_dtype=bb.DType.FP32):
        self.layers = []
        for i in range(depth):
            if name is None:
                layer_name = None
            else:
                layer_name = name + '_' + str(i)
            
            connection ='serial' if i < depth-1 else 'random'
            if i == 0 and average:
                self.layers.insert(0, bb.AverageLut(output_shape, connection=connection, binarize=binarize, name=layer_name, N=4, bin_dtype=bin_dtype))
            else:
                self.layers.insert(0, bb.DifferentiableLut(output_shape, connection=connection, batch_norm=batch_norm, binarize=binarize, name=layer_name, N=4, bin_dtype=bin_dtype))
            output_shape[0] *= 4
        super(DifferentiableLutBlock, self).__init__(self.layers, name=name)

In [4]:
class DifferentiableLutConvolution2d(bb.Convolution2d):
    def __init__(self, output_ch, depth, filter_size=(3, 3), padding='valid', batch_norm=True, binarize=True, name=None, fw_dtype=bb.DType.FP32):
        super(DifferentiableLutConvolution2d, self).__init__(
                DifferentiableLutBlock([output_ch, 1, 1], depth, batch_norm=batch_norm, binarize=binarize, name=name, bin_dtype=fw_dtype),
                filter_size=filter_size, padding=padding, name=name, fw_dtype=fw_dtype)

In [5]:
# define network
net = bb.Sequential([
            bb.RealToBinary(frame_modulation_size=frame_modulation_size, depth_modulation_size=depth_modulation_size, bin_dtype=bin_dtype),
            bb.Sequential([
                bb.Convolution2d(  # pointwise
                    bb.Sequential([
                        DifferentiableLutBlock([32], 3, bin_dtype=bin_dtype),
                    ]),
                    filter_size=(1, 1),
                    fw_dtype=bin_dtype),

                bb.Convolution2d(  # 32x32 -> 30x30
                    bb.Sequential([
                        DifferentiableLutBlock([32], 3, bin_dtype=bin_dtype),
                    ]),
                    filter_size=(3, 3),
                    fw_dtype=bin_dtype),
                bb.Convolution2d(  # 30x30 -> 28x28
                    bb.Sequential([
                        DifferentiableLutBlock([64], 3, bin_dtype=bin_dtype),
                    ]),
                    filter_size=(3, 3),
                    fw_dtype=bin_dtype),
                bb.MaxPooling(filter_size=(2, 2), fw_dtype=bin_dtype),  # 28x28-> 14x14
            ]),
            bb.Sequential([
                bb.Convolution2d(  # 14x14-> 12x12
                    bb.Sequential([
                        DifferentiableLutBlock([64], 3, bin_dtype=bin_dtype),
                    ]),
                    filter_size=(3, 3),
                    fw_dtype=bin_dtype),
                bb.Convolution2d(  # 12x12-> 10x10
                    bb.Sequential([
                        DifferentiableLutBlock([128], 3, bin_dtype=bin_dtype),
                    ]),
                    filter_size=(3, 3),
                    fw_dtype=bin_dtype),
                bb.MaxPooling(filter_size=(2, 2), fw_dtype=bin_dtype),  # 10x10-> 5x5
            ]),
            bb.Sequential([
                bb.Convolution2d(  # 5x5-> 1x1
                    bb.Sequential([
                        DifferentiableLutBlock([512], 2, bin_dtype=bin_dtype),
                        DifferentiableLutBlock([10], 3, average=True, bin_dtype=bin_dtype),
                    ]),
                    filter_size=(5, 5),
                    fw_dtype=bin_dtype),
            ]),
            bb.BinaryToReal(frame_integration_size=frame_modulation_size, bin_dtype=bin_dtype)
        ])

net.set_input_shape([3, 32, 32])

if bin_mode:
    net.send_command("binary true")

# print(net.get_info()) # ネットを表示

In [6]:
net.print_info()

----------------------------------------------------------------------
[Sequential] 
 input  shape : [3, 32, 32] output shape : [10, 1, 1]
  --------------------------------------------------------------------
  [RealToBinary] 
   input  shape : {3, 32, 32} output shape : {45, 32, 32}
  --------------------------------------------------------------------
  [Sequential] 
   input  shape : [45, 32, 32]   output shape : [64, 14, 14]
    ------------------------------------------------------------------
    [Convolution2d] 
     input  shape : [45, 32, 32]     output shape : [32, 32, 32]
      ----------------------------------------------------------------
      [ConvolutionIm2Col] 
       input  shape : {45, 32, 32} output shape : {45, 1, 1}
      ----------------------------------------------------------------
      [Sequential] 
       input  shape : [45, 1, 1]       output shape : [32]
        --------------------------------------------------------------
        [Sequential] 
       

## 学習実施

学習を行います

In [7]:
#bb.load_networks(data_path, net)

# learning
loss      = bb.LossSoftmaxCrossEntropy()
metrics   = bb.MetricsCategoricalAccuracy()
optimizer = bb.OptimizerAdam(learning_rate=0.001)

optimizer.set_variables(net.get_parameters(), net.get_gradients())

for epoch in range(epochs):
    loss.clear()
    metrics.clear()

    # learning
    with tqdm(loader_train) as t:
        for images, labels in t:
            x_buf = bb.FrameBuffer.from_numpy(np.array(images).astype(np.float32))
            t_buf = bb.FrameBuffer.from_numpy(np.identity(10)[np.array(labels)].astype(np.float32))

            y_buf = net.forward(x_buf, train=True)

            dy_buf = loss.calculate(y_buf, t_buf)
            metrics.calculate(y_buf, t_buf)
            net.backward(dy_buf)

            optimizer.update()

            t.set_postfix(loss=loss.get(), acc=metrics.get())

    # test
    loss.clear()
    metrics.clear()
    for images, labels in loader_test:
        x_buf = bb.FrameBuffer.from_numpy(np.array(images).astype(np.float32))
        t_buf = bb.FrameBuffer.from_numpy(np.identity(10)[np.array(labels)].astype(np.float32))

        y_buf = net.forward(x_buf, train=False)

        loss.calculate(y_buf, t_buf)
        metrics.calculate(y_buf, t_buf)

    bb.save_networks(data_path, net)

    print('epoch[%d] : loss=%f accuracy=%f' % (epoch, loss.get(), metrics.get()))

  0%|          | 0/1563 [00:00<?, ?it/s]

  x_buf = bb.FrameBuffer.from_numpy(np.array(images).astype(np.float32))
  t_buf = bb.FrameBuffer.from_numpy(np.identity(10)[np.array(labels)].astype(np.float32))
  x_buf = bb.FrameBuffer.from_numpy(np.array(images).astype(np.float32))
  t_buf = bb.FrameBuffer.from_numpy(np.identity(10)[np.array(labels)].astype(np.float32))


epoch[0] : loss=1.926563 accuracy=0.314200


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch[1] : loss=1.938607 accuracy=0.315900


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch[2] : loss=1.918276 accuracy=0.345000


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch[3] : loss=1.944370 accuracy=0.277600


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch[4] : loss=1.897659 accuracy=0.339800


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch[5] : loss=1.902620 accuracy=0.337000


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch[6] : loss=1.912261 accuracy=0.345900


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch[7] : loss=1.928044 accuracy=0.357200


## RTL(Verilog)変換

FPGA化するために Verilog に変換します。インターフェースはXilinx社のAXI4 Stream Video 仕様(フレームスタートでtuserが立つ)となります。
MaxPooling の単位で画像サイズが縮小されてしまうので、現状、この単位でしか変換できないため3つに分けて出力しています。

In [8]:
# export verilog
with open(output_velilog_file, 'w') as f:
    f.write('`timescale 1ns / 1ps\n\n')
    bb.dump_verilog_lut_cnv_layers(f, rtl_module_name + 'Cnv0', net[1])
    bb.dump_verilog_lut_cnv_layers(f, rtl_module_name + 'Cnv1', net[2])
    bb.dump_verilog_lut_cnv_layers(f, rtl_module_name + 'Cnv2', net[3])