In [1]:
!pip install git+https://github.com/benoitmartin88/pytorchtrainer.git

!pip install torchsummary

!pip install brevitas

!pip install -U netron

# PyTorch libraries and modules
import torch
import torch.nn as nn
from torch.optim import Adam, SGD
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary

import pandas as pd
import numpy as np
from sklearn.utils import resample

from torch.nn import Module, ModuleList, BatchNorm2d, MaxPool2d, BatchNorm1d, ReLU, Softmax, CrossEntropyLoss, Sequential, Dropout, Conv2d, Linear

from brevitas.nn import QuantConv2d, QuantIdentity, QuantLinear, QuantReLU
from brevitas.core.restrict_val import RestrictValueType
from tensor_norm import TensorNorm
from common import CommonWeightQuant, CommonActQuant

from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.registry import getCustomOp

Requirement already up-to-date: netron in /opt/conda/lib/python3.8/site-packages (7.3.6)


In [2]:
       
CNV_OUT_CH_POOL = [(21, False), (21, True), (21, False)]#, (128, True), (256, False), (256, False)]
INTERMEDIATE_FC_FEATURES = [(3549, 16), (16, 16)]
LAST_FC_IN_FEATURES = 16
LAST_FC_PER_OUT_CH_SCALING = False
POOL_SIZE = 2
KERNEL_SIZE = 6

class CNV(Module):

    def __init__(self, num_classes, weight_bit_width, act_bit_width, in_bit_width, in_ch):
        super(CNV, self).__init__()

        self.conv_features = ModuleList()
        self.linear_features = ModuleList()

        self.conv_features.append(QuantIdentity( # for Q1.7 input format
            act_quant=CommonActQuant,
            bit_width=in_bit_width,
            min_val=- 1.0,
            max_val=1.0 - 2.0 ** (-7),
            narrow_range=False,
            restrict_scaling_type=RestrictValueType.POWER_OF_TWO))

        for out_ch, is_pool_enabled in CNV_OUT_CH_POOL:
            self.conv_features.append(QuantConv2d(
                kernel_size=KERNEL_SIZE,
                in_channels=in_ch,
                out_channels=out_ch,
                bias=True,
                padding=4,
                weight_quant=CommonWeightQuant,
                weight_bit_width=weight_bit_width))
            in_ch = out_ch
            self.conv_features.append(BatchNorm2d(in_ch, eps=1e-4))
            self.conv_features.append(QuantIdentity(
                act_quant=CommonActQuant,
                bit_width=act_bit_width))
            if is_pool_enabled:
                self.conv_features.append(MaxPool2d(kernel_size=2))

        for in_features, out_features in INTERMEDIATE_FC_FEATURES:
            self.linear_features.append(QuantLinear(
                in_features=in_features,
                out_features=out_features,
                bias=True,
                weight_quant=CommonWeightQuant,
                weight_bit_width=weight_bit_width))
            self.linear_features.append(BatchNorm1d(out_features, eps=1e-4))
            self.linear_features.append(QuantIdentity(
                act_quant=CommonActQuant,
                bit_width=act_bit_width))

        self.linear_features.append(QuantLinear(
            in_features=LAST_FC_IN_FEATURES,
            out_features=num_classes,
            bias=False,
            weight_quant=CommonWeightQuant,
            weight_bit_width=weight_bit_width))
        self.linear_features.append(TensorNorm())
        
        for m in self.modules():
            if isinstance(m, QuantConv2d) or isinstance(m, QuantLinear):
                torch.nn.init.uniform_(m.weight.data, -1, 1)


    def clip_weights(self, min_val, max_val):
        for mod in self.conv_features:
            if isinstance(mod, QuantConv2d):
                mod.weight.data.clamp_(min_val, max_val)
        for mod in self.linear_features:
            if isinstance(mod, QuantLinear):
                mod.weight.data.clamp_(min_val, max_val)

    def forward(self, x):
        x = 2.0 * x - torch.tensor([1.0], device=x.device)
        for mod in self.conv_features:
            x = mod(x)
        x = x.view(x.shape[0], -1)
        for mod in self.linear_features:
            x = mod(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNV(num_classes=5, weight_bit_width=1, act_bit_width=1, in_bit_width=8, in_ch=1)

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
def print_summary(model):
    total_params = 0
    trainable_params = 0
    non_trainable_params = 0

    layer_num = 0
    print("________________________________________________________________________________")
    print("Layer (type)                           Output Shape                    Param #")
    print("--------------------------------------------------------------------------------")

    for name, param in model.named_parameters():
        if param.requires_grad:
            trainable_params += param.numel()
        else:
            non_trainable_params += param.numel()
        total_params += param.numel()

        layer_num += 1
        layer_name = name.split('.')[0]
        output_shape = list(param.size())

        print(f"{layer_num:3d} {layer_name:<34s}{str(output_shape):28s}{param.numel():10d}")

    print("_______________________________________________________________________________")
    print("Total parameters =", total_params)
    print("Trainable parameters =", trainable_params)
    print("Non-trainable parameters =", non_trainable_params)
    print("-------------------------------------------------------------------------------")
    
    
    input_size = (1, 14, 14)  # Example input size
    
    print("Input Size (MB):", input_size[0] * input_size[1] * input_size[2] * 4 / (1024 * 1024))
    print("Forward/Backward pass size (MB):", 2 * trainable_params * 4 / (1024 * 1024))
    print("Param Size (MB):", total_params * 4 / (1024 * 1024))
    print("Estimated Total Size (MB) :", (input_size[0] * input_size[1] * input_size[2] + 2 * trainable_params + total_params) * 4 / (1024 * 1024))

print_summary(model)

________________________________________________________________________________
Layer (type)                           Output Shape                    Param #
--------------------------------------------------------------------------------
  1 conv_features                     [21, 1, 6, 6]                      756
  2 conv_features                     [21]                                21
  3 conv_features                     [21]                                21
  4 conv_features                     [21]                                21
  5 conv_features                     [21, 21, 6, 6]                   15876
  6 conv_features                     [21]                                21
  7 conv_features                     [21]                                21
  8 conv_features                     [21]                                21
  9 conv_features                     [21, 21, 6, 6]                   15876
 10 conv_features                     [21]                        

In [2]:
xtrain_reshape = torch.load("xtrain25000014x14float32.pth")
ytrain_tensor = torch.load("ytrain250000int64.pth")
xval_reshape = torch.load("xval4412014x14float32.pth")
yval_tensor = torch.load("yval44120int64.pth")
xtest_reshape = torch.load("xtest4527014x14float32.pth")
ytest_tensor = torch.load("ytest45270int64.pth")

In [3]:
class Data(Dataset):
    def __init__(self, X, y):
        self.X = X.unsqueeze(1)
        self.y = y
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return self.len

    
batch_size = 100
#Just for wast training making batch_size = 100
#batch_size = 2500

#

# Instantiate training and test data
train_data = Data(xtrain_reshape, ytrain_tensor)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

val_data = Data(xval_reshape, yval_tensor)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=True)

test_data = Data(xtest_reshape, ytest_tensor)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

# # Check it's working
# for batch, (X, y) in enumerate(train_dataloader):
#     print(f"Batch: {batch+1}")
#     print(f"XTrain shape: {X.shape}")
#     print(f"yTrain shape: {y.shape}")
#     break
# for batch, (X, y) in enumerate(val_dataloader):
#     print(f"Batch: {batch+1}")
#     print(f"XVal: {X.shape}")
#     print(f"yVal: {y.shape}")
#     break
# for batch, (X, y) in enumerate(test_dataloader):
#     print(f"Batch: {batch+1}")
#     print(f"XTest: {X.shape}")
#     print(f"yTest: {y.shape}")
#     break

In [16]:
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = CrossEntropyLoss()
local_ep = 10

In [17]:
from tqdm import tqdm

epoch_loss = []
batch_loss = []

for iter in range(local_ep):
    model.train()
    criterion.train()
    
    train_correct = 0
    train_total = 0
    train_batch_loss =[]
    
    total_batches = len(train_dataloader)
    
    progress_bar = tqdm(total=total_batches, desc="Processing Local Epoch(s)", unit="batch", position=0, leave=True)
    
    for batch_idx, (xtrain, ytrain) in enumerate(train_dataloader):
        xtrain, ytrain = xtrain.to(device), ytrain.to(device)
        model_preds = model(xtrain)
        _, pred_labels = torch.max(model_preds, 1)
        train_correct += torch.sum(pred_labels == ytrain).item()
        train_total += ytrain.size(0)
        loss = criterion(model_preds, ytrain)
        train_batch_loss.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        model.clip_weights(-1, 1)############This Line in DPFL needs to be added
        
        batch_loss.append(loss.item())
                
        progress_bar.update(1)
        progress_bar.set_postfix(batch=batch_idx + 1, refresh=True)
        
    progress_bar.close()
    
    epoch_loss.append(sum(batch_loss)/len(batch_loss))
    
    model.eval()  # Set the model to evaluation mode
    criterion.eval()
    
    val_correct = 0
    val_total = 0
    val_batch_loss = []       
    
    with torch.no_grad():
        for batch_idx, (xval, yval) in enumerate(val_dataloader):
            xval, yval = xval.to(device), yval.to(device)
            val_model_preds = model(xval)
            val_loss = criterion(val_model_preds, yval)
            val_batch_loss.append(val_loss.item())
            _, val_pred_labels = torch.max(val_model_preds, 1)
            val_correct += torch.sum(val_pred_labels == yval).item()
            val_total += yval.size(0)
    
    # Calculate and print average training and validation losses and accuracies
    avg_train_loss = sum(train_batch_loss) / len(train_batch_loss)
    train_accuracy = train_correct / train_total if train_total > 0 else 0.0
    avg_val_loss = sum(val_batch_loss) / len(val_batch_loss)
    val_accuracy = val_correct / val_total if val_total > 0 else 0.0
    print('Local Epoch: ', iter + 1,
          'Training Loss: ', avg_train_loss, 'Training Accuracy:{:.4f}%'.format(100*train_accuracy))
    print('Validation Loss: ', avg_val_loss, 'Validation Accuracy:{:.4f}%'.format(100*val_accuracy))
    model.train() 

Processing Local Epoch(s): 100%|█| 2500/2500 [02:35<00:00, 15.17batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 3/2500 [00:00<02:48, 14.83batch/s, batch=3]

Local Epoch:  1 Training Loss:  0.8144089071512223 Training Accuracy:70.8996%
Validation Loss:  0.703330924203493 Validation Accuracy:75.4420%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:37<00:00, 16.01batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 3/2500 [00:00<02:46, 14.99batch/s, batch=3]

Local Epoch:  2 Training Loss:  0.5757969007968903 Training Accuracy:80.8348%
Validation Loss:  0.553743796043806 Validation Accuracy:81.2761%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:38<00:00, 15.57batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 2/2500 [00:00<03:07, 13.30batch/s, batch=2]

Local Epoch:  3 Training Loss:  0.5236251120567321 Training Accuracy:83.1824%
Validation Loss:  0.6119288470410653 Validation Accuracy:78.1732%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:36<00:00, 15.67batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 3/2500 [00:00<02:44, 15.17batch/s, batch=3]

Local Epoch:  4 Training Loss:  0.4869573950946331 Training Accuracy:84.5176%
Validation Loss:  0.4945628519646183 Validation Accuracy:82.9556%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:35<00:00, 15.99batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 3/2500 [00:00<02:46, 14.98batch/s, batch=3]

Local Epoch:  5 Training Loss:  0.45384656904339793 Training Accuracy:85.7292%
Validation Loss:  0.5168481111121933 Validation Accuracy:82.5091%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:35<00:00, 16.27batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 3/2500 [00:00<02:48, 14.82batch/s, batch=3]

Local Epoch:  6 Training Loss:  0.4362562894046307 Training Accuracy:86.2844%
Validation Loss:  0.489858385124897 Validation Accuracy:84.5512%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:35<00:00, 15.83batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 3/2500 [00:00<02:45, 15.10batch/s, batch=3]

Local Epoch:  7 Training Loss:  0.41627859105467796 Training Accuracy:86.9676%
Validation Loss:  0.5268518675236681 Validation Accuracy:83.1233%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:37<00:00, 16.19batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 3/2500 [00:00<02:44, 15.13batch/s, batch=3]

Local Epoch:  8 Training Loss:  0.3989079293906689 Training Accuracy:87.4712%
Validation Loss:  0.5963216067304439 Validation Accuracy:81.9606%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:37<00:00, 16.67batch/s, batch=2500]
Processing Local Epoch(s):   0%|   | 3/2500 [00:00<02:43, 15.29batch/s, batch=3]

Local Epoch:  9 Training Loss:  0.3842227696299553 Training Accuracy:87.9516%
Validation Loss:  0.5213541446498077 Validation Accuracy:82.7516%


Processing Local Epoch(s): 100%|█| 2500/2500 [02:37<00:00, 16.31batch/s, batch=2500]


Local Epoch:  10 Training Loss:  0.3745816120862961 Training Accuracy:88.3344%
Validation Loss:  0.48162411348852097 Validation Accuracy:85.0816%


In [18]:
torch.save(model.state_dict(), 'DPFL_AIAccel34_state_dict.pth')
print ("Loss", sum(epoch_loss) / len(epoch_loss))

Loss 0.5895327242701595


In [19]:
def test_inference(model, test_dataloader):
    model.eval()

    loss, total, correct = 0.0, 0.0, 0.0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    criterion = nn.CrossEntropyLoss().to(device)

    for batch_idx, (xtest, ytest) in enumerate(test_dataloader):
        xtest, ytest = xtest.to(device), ytest.to(device)
        # Inference
        outputs = model(xtest)
        batch_loss = criterion(outputs, ytest)
        loss += batch_loss.item()
        # Prediction
        #print(outputs)
        _, pred_labels = torch.max(outputs, 1)
        pred_labels = pred_labels.view(-1)
        correct += torch.sum(torch.eq(pred_labels, ytest)).item()
        total += len(ytest)

    accuracy = correct/total
    loss = loss/total
    return accuracy, loss

test_inference(model, test_dataloader)

(0.853611663353214, 0.004737781558654048)

In [20]:
import brevitas.onnx as bo
bo.export_finn_onnx(model, (1, 1, 14, 14), "DPFL_AIAccel34_export.onnx");

ir_version: 6
producer_name: "pytorch"
producer_version: "1.7"
graph {
  node {
    input: "0"
    input: "41"
    output: "42"
    name: "Mul_1"
    op_type: "Mul"
  }
  node {
    input: "42"
    input: "43"
    output: "44"
    name: "Sub_3"
    op_type: "Sub"
  }
  node {
    input: "44"
    input: "45"
    output: "46"
    name: "MultiThreshold_5"
    op_type: "MultiThreshold"
    attribute {
      name: "out_dtype"
      s: "INT8"
      type: STRING
    }
    domain: "finn.custom_op.general"
  }
  node {
    input: "46"
    input: "47"
    output: "48"
    name: "Add_7"
    op_type: "Add"
    domain: ""
  }
  node {
    input: "48"
    input: "49"
    output: "50"
    name: "Mul_9"
    op_type: "Mul"
  }
  node {
    input: "50"
    input: "51"
    output: "52"
    name: "Conv_11"
    op_type: "Conv"
    attribute {
      name: "dilations"
      ints: 1
      ints: 1
      type: INTS
    }
    attribute {
      name: "group"
      i: 1
      type: INT
    }
    attribute {
      

In [4]:
build_dir = '.'
import onnx
from finn.util.test import get_test_model_trained
import brevitas.onnx as bo
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.fold_constants import FoldConstants
from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs
from qonnx.transformation.infer_datatypes import InferDataTypes

model = ModelWrapper(build_dir + "/DPFL_AIAccel34_export.onnx")
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model.save(build_dir + "/DPFL_AIAccel34_tidy.onnx")

                i.e. domain=finn to domain=qonnx.custom_op.<general|fpgadataflow|...>


In [5]:
test_Onnx_dataloader = DataLoader(dataset=test_data, batch_size=1, shuffle=True)

In [6]:
import torch
import matplotlib.pyplot as plt
from pkgutil import get_data
import onnx
import onnx.numpy_helper as nph
# raw_i = get_data("qonnx.data", "onnx/mnist-conv/test_data_set_0/input_0.pb")
# input_tensor = onnx.load_tensor_from_string(raw_i)
import finn.core.onnx_exec as oxe
from qonnx.core.modelwrapper import ModelWrapper
model_t = ModelWrapper("DPFL_AIAccel34_tidy.onnx")
lossf = 0.0
correctf = 0
totalf = 0
max_iterations = 1800


for batch_idv, (xtest, ytest) in enumerate(test_Onnx_dataloader):
    if batch_idv >= max_iterations:
        break
    # Process the first batch
    # Access the first element of xtest and ytest
    single_xtest = xtest
    single_ytest = ytest
    input_tensor_pyt = single_xtest.float()
    input_tensor_npy = input_tensor_pyt.numpy()
    # Your processing code here..
    input_dict = {"0": input_tensor_npy}
    output_dict = oxe.execute_onnx(model_t, input_dict)
    produced_finn = output_dict[list(output_dict.keys())[0]]
    #print(produced_finn)
    produced_finnt = torch.from_numpy(produced_finn)
    _, pred_finn = torch.max(produced_finnt, 1)
    pred_finn = pred_finn.view(-1)
    correctf += torch.sum(torch.eq(pred_finn, ytest)).item()
    totalf += len(ytest)
        

accuracyf = correctf/totalf
lossf = lossf/totalf
print(accuracyf,lossf)

0.8483333333333334 0.0


In [7]:
from finn.util.pytorch import ToTensor
from qonnx.transformation.merge_onnx_models import MergeONNXModels
from qonnx.core.datatype import DataType
from qonnx.transformation.insert_topk import InsertTopK
from qonnx.transformation.infer_datatypes import InferDataTypes

model = ModelWrapper(build_dir+"/DPFL_AIAccel34_tidy.onnx")
global_inp_name = model.graph.input[0].name
global_inp_name = model.graph.input[0].name
ishape = model.get_tensor_shape(global_inp_name)
# preprocessing: torchvision's ToTensor divides uint8 inputs by 255
totensor_pyt = ToTensor()                                              
chkpt_preproc_name = build_dir+"/DPFL_AIAccel34_preproc.onnx"
bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)

# join preprocessing and core model
pre_model = ModelWrapper(chkpt_preproc_name)
model = model.transform(MergeONNXModels(pre_model))
# add input quantization annotation: UINT8 for all BNN-PYNQ models
global_inp_name = model.graph.input[0].name
model.set_tensor_datatype(global_inp_name, DataType["UINT8"])



In [8]:
model = model.transform(InsertTopK(k=1))
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
# model = model.transform(GiveUniqueNodeNames())
# model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
# model = model.transform(RemoveStaticGraphInputs())
model.save(build_dir+"/DPFL_AIAccel34_preproc.onnx")

In [9]:
from finn.transformation.streamline import Streamline
from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
import finn.transformation.streamline.absorb as absorb
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from qonnx.transformation.infer_data_layouts import InferDataLayouts
from qonnx.transformation.general import RemoveUnusedTensors

model = ModelWrapper(build_dir + "/DPFL_AIAccel34_preproc.onnx")
model = model.transform(MoveScalarLinearPastInvariants())
model = model.transform(Streamline())
model = model.transform(LowerConvsToMatMul())
model = model.transform(MakeMaxPoolNHWC())
model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
model = model.transform(ConvertBipolarMatMulToXnorPopcount())
model = model.transform(Streamline())
# absorb final add-mul nodes into TopK
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())
model.save(build_dir + "/DPFL_AIAccel34_streamlines.onnx")

In [10]:
import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.infer_data_layouts import InferDataLayouts

# choose the memory mode for the MVTU units, decoupled or const
mem_mode = "decoupled"

model = ModelWrapper(build_dir + "/DPFL_AIAccel34_streamlines.onnx")
model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))
model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
# TopK to LabelSelect
model = model.transform(to_hls.InferLabelSelectLayer())
# input quantization (if any) to standalone thresholding
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(to_hls.InferConvInpGen())
model = model.transform(to_hls.InferStreamingMaxPool())
# get rid of Reshape(-1, 1) operation between hlslib nodes
model = model.transform(RemoveCNVtoFCFlatten())
# get rid of Tranpose -> Tranpose identity seq
model = model.transform(absorb.AbsorbConsecutiveTransposes())
# infer tensor data layouts
model = model.transform(InferDataLayouts())

parent_model = model.transform(CreateDataflowPartition())
parent_model.save(build_dir + "/DPFL_AIAccel34_dataflow_parent.onnx")
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
sdp_node = getCustomOp(sdp_node)
dataflow_model_filename = sdp_node.get_nodeattr("model")
# save the dataflow partition with a different name for easier access
dataflow_model = ModelWrapper(dataflow_model_filename)
dataflow_model.save(build_dir + "/DPFL_AIAccel34_dataflow_model.onnx")

In [11]:
from finn.util.visualization import showInNetron
showInNetron(build_dir + "/DPFL_AIAccel34_dataflow_parent.onnx")

Serving './v83qfloat32_dataflow_parent.onnx' at http://0.0.0.0:8081


In [1]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

model_file = "DPFL_AIAccel34_dataflow_model.onnx"

rtlsim_output_dir = "output_AI-Accel-4"

#Delete previous run results if exist
if os.path.exists(rtlsim_output_dir):
    shutil.rmtree(rtlsim_output_dir)
    print("Previous run results deleted!")

cfg_stitched_ip = build.DataflowBuildConfig(
    output_dir          = rtlsim_output_dir,
    mvau_wwidth_max     = 10,
    target_fps          = 1000000,
    synth_clk_period_ns = 10.0,
    fpga_part           = "xczu9eg-ffvb1156-2-e",
    board               = "ZCU102",
    shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,

    folding_config_file = "AIAccel4_hw_config.json",
    
    steps=["step_apply_folding_config",
           "step_generate_estimate_reports",
           "step_hls_codegen",
           "step_hls_ipgen",
           "step_set_fifo_depths",
           "step_create_stitched_ip",
           "step_measure_rtlsim_performance",
           "step_out_of_context_synthesis",
           "step_synthesize_bitfile",
           "step_make_pynq_driver",
          ],
    generate_outputs=[
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE]
)

In [2]:
%%time
build.build_dataflow_cfg(model_file, cfg_stitched_ip)

Building dataflow accelerator from v83qfloat32_dataflow_model.onnx
Intermediate outputs will be generated in /tmp/finn_dev_shakeelarkam00
Final outputs will be generated in output_v83qfloat32
Build log is at output_v83qfloat32/build_dataflow.log
Running step: step_apply_folding_config [1/10]
Running step: step_generate_estimate_reports [2/10]
Running step: step_hls_codegen [3/10]
Running step: step_hls_ipgen [4/10]
Running step: step_set_fifo_depths [5/10]
Running step: step_create_stitched_ip [6/10]
Running step: step_measure_rtlsim_performance [7/10]
Running step: step_out_of_context_synthesis [8/10]
Running step: step_synthesize_bitfile [9/10]
Running step: step_make_pynq_driver [10/10]
Completed successfully
CPU times: user 6.92 s, sys: 1.28 s, total: 8.2 s
Wall time: 47min 28s


0