# DnnWeaver v2.0 tutorial
### The tutorial covers the basics of using DnnWeaver v2.0

## Part 1: Basics

In [1]:
import logging
import numpy as np
import array

from dnnweaver2.benchmarks import get_graph
from dnnweaver2.simulator.accelerator import Accelerator
from dnnweaver2.compiler import *
from dnnweaver2.fpga.fpgamanager import FPGAManager

from dnnweaver2.scalar.dtypes import FixedPoint

### Step 1: Express the DNN
DnnWeaver v2.0 expresses a DNN as a graph in which the nodes are operations/layers
(such as convolution) and the edges are tensors.

The datatypes and bitwidths of the tensors are programmable.
By default, DnnWeaver v2.0 uses 16-bit fixed-point datatypes for all tensors.

The code snippet below expresses a single-layer convolutional neural network (one convolution followed by a max-pooling op).

In [2]:
# Build a small test graph: one input tensor, one convolution (weights + biases)
# and one max-pooling op, then list the ops and tensors the graph registered.
batch_size = 1
graph = Graph('YOLOv2-Test: 16-bit', dataset='imagenet', log_level=logging.INFO)

with graph.as_default():

    # Input activations: a non-trainable tensor declared as FQDtype.FXP16.
    with graph.name_scope('inputs'):
        i = get_tensor(shape=(batch_size, 32, 32, 3),
                       name='data',
                       dtype=FQDtype.FXP16,
                       trainable=False)

    # Convolution parameters, each with its own explicit fixed-point format.
    with graph.name_scope('conv0'):
        weight_dtype = FixedPoint(16, 12)
        weights = get_tensor(shape=(128, 3, 3, 3), name='weights', dtype=weight_dtype)

        # NOTE: (128) is the plain int 128, not a one-element tuple.
        bias_dtype = FixedPoint(32, 20)
        biases = get_tensor(shape=(128), name='biases', dtype=bias_dtype)

        # The output is requested as FixedPoint(16, 8). DnnWeaver2 automatically
        # takes care of the type conversion (it shows up as conv0/TypeCastOp in
        # the op listing printed by this cell).
        output_dtype = FixedPoint(16, 8)
        conv = conv2D(i, weights, biases, pad='SAME', dtype=output_dtype)

    # 2x2 max pooling with stride 2 applied to the convolution output.
    with graph.name_scope('pool1'):
        pool = maxPool(conv,
                       pooling_kernel=(1, 2, 2, 1),
                       stride=(1, 2, 2, 1),
                       pad='VALID')


# Separator line reused around each listing.
banner = '*' * 50

# Ops (nodes) registered in `graph`.
print(banner)
print('List of ops (nodes) in the graph')
for op_name in graph.op_registry:
    print('\tOp name: {}'.format(op_name))
print(banner)

# Tensors (edges) registered in `graph`; only the tensor object is printed.
print(banner)
print('List of tensors (edges) in the graph')
for tensor_key, tensor_obj in graph.tensor_registry.items():
    print('\t{}'.format(tensor_obj))
print(banner)

**************************************************
List of ops (nodes) in the graph
	Op name: conv0/Convolution
	Op name: conv0/TypeCastOp
	Op name: pool1/MaxPooling
**************************************************
**************************************************
List of tensors (edges) in the graph
	inputs/data[1,32,32,3] (FXP16 (8,8))
	conv0/weights[128,3,3,3] (FXP16 (4,12))
	conv0/biases[128] (FXP32 (12,20))
	conv0/Convolution[1,32,32,128] (FXP64 (44,20))
	conv0/TypeCastOp[1,32,32,128] (FXP16 (8,8))
	pool1/MaxPooling[1,16,16,128] (FXP16 (8,8))
**************************************************


### Step 2: Compile the graph to generate instructions for the FPGA accelerator
1. Define the accelerator object
2. Optimize tiling for the accelerator and generate instruction binary for the accelerator

In [3]:
# Step 2.1
# Describe the accelerator: systolic-array dimensions plus the on-chip BRAM
# buffer sizes, each computed as (number_bram * data_type * entries) // 2.
# Floor division (//) keeps every size an exact integer on both Python 2 and
# Python 3; with plain `/`, Python 3 would produce floats. All products are
# even, so the values are unchanged.
# NOTE(review): the sizes appear to be in bits (the printed byte counts equal
# these values / 8) -- confirm against the Accelerator class.
num_rows = 32
num_cols = 32
bram = {
    'ibuf':            num_cols * 16 * 2048 // 2,
    'obuf':            num_rows * 64 * 2048 // 2,
    'wbuf': num_cols * num_rows * 16 *  512 // 2,
    'bbuf':            num_rows * 32 * 2048 // 2,
}
acc_obj = Accelerator(
    N=num_rows, M=num_cols,
    prec=16,
    mem_if_width=256,
    frequency=150e6,
    sram=bram
)

# print() already uses the object's string form; no explicit __str__() call needed.
print(acc_obj)

Accelerator object
	Precision: 16
	Systolic array size: 32 -rows x 32 -columns
	IBUF size:     65,536 Bytes
	WBUF size:    524,288 Bytes
	OBUF size:    262,144 Bytes
	BBUF size:    131,072 Bytes
Double buffering enabled. Sizes of SRAM are halved


In [4]:
# Step 2.2
# Create the graph compiler with INFO-level logging and compile `graph` for the
# accelerator described by `acc_obj`; the result is kept in `inst_binary`.
log_level = logging.INFO
compiler = GraphCompiler(log_level=log_level)
inst_binary = compiler.compile(
    graph=graph,
    acc_obj=acc_obj
)

# Report the size of the compiled result as the instruction count.
num_instructions = inst_binary.size
print('Number of instructions: {}'.format(num_instructions))

Number of instructions: 124
