In [1]:
!pip install git+https://github.com/benoitmartin88/pytorchtrainer.git

!pip install torchsummary

!pip install brevitas

!pip install -U netron

# PyTorch libraries and modules
import torch
import torch.nn as nn
from torch.optim import Adam, SGD
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
# from torchsummary import summary

import pandas as pd
import numpy as np
# from sklearn.utils import resample

from torch.nn import Module, ModuleList, BatchNorm2d, MaxPool2d, BatchNorm1d, ReLU, Softmax, CrossEntropyLoss, Sequential, Dropout, Conv2d, Linear

from brevitas.nn import QuantConv2d, QuantIdentity, QuantLinear, QuantReLU
from brevitas.core.restrict_val import RestrictValueType
from tensor_norm import TensorNorm
from common import CommonWeightQuant, CommonActQuant

from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.registry import getCustomOp

  _dtype_to_storage = {data_type(0).dtype: data_type for data_type in _storages}


In [2]:
# Conv stack layout consumed by CNV.__init__: one (out_channels, max_pool_after) pair per conv stage.
CNV_OUT_CH_POOL = [(21, False), (21, True), (21, False)]#, (128, True), (256, False), (256, False)]
# (in_features, out_features) of the hidden fully-connected layers.
# 3549 = 21 * 13 * 13, which is the flattened conv output for a 14x14 input:
# each conv (kernel 6, padding 4) adds 3 to the spatial size and the single pool halves it
# (14 -> 17 -> 20 -> 10 -> 13). Changing the layout above or the input size requires updating this.
INTERMEDIATE_FC_FEATURES = [(3549, 16), (16, 16)]
# Input width of the final classifier layer (matches the last hidden layer's 16 outputs).
LAST_FC_IN_FEATURES = 16
# NOTE(review): not referenced anywhere else in the visible cells.
LAST_FC_PER_OUT_CH_SCALING = False
# Max-pool window size for the pooled conv stages (CNV pools with a 2x2 window).
POOL_SIZE = 2
# Square kernel size used by every conv layer in CNV.
KERNEL_SIZE = 6

class CNV(Module):
    """Compact quantized convolutional classifier built from Brevitas layers.

    The architecture is driven by the module-level layout constants:
    ``CNV_OUT_CH_POOL`` (conv stack), ``INTERMEDIATE_FC_FEATURES`` (hidden
    fully-connected layers), ``LAST_FC_IN_FEATURES`` (classifier input width),
    ``KERNEL_SIZE`` and ``POOL_SIZE``.  No conv/linear layer uses a bias.

    Args:
        num_classes: Number of outputs of the final ``QuantLinear`` layer.
        weight_bit_width: Bit width passed to every quantized conv/linear layer.
        act_bit_width: Bit width of the activation quantizer that follows each
            hidden conv/linear layer.
        in_bit_width: Bit width of the input quantizer.
        in_ch: Number of channels of the input tensor.
    """

    def __init__(self, num_classes, weight_bit_width, act_bit_width, in_bit_width, in_ch):
        super().__init__()

        # The order in which modules are appended defines both the forward pass
        # and the state_dict keys, so it must not be changed casually.
        self.conv_features = ModuleList()
        self.linear_features = ModuleList()

        # Input quantizer: range [-1, 1 - 2^-7] with power-of-two scaling.
        self.conv_features.append(QuantIdentity( # for Q1.7 input format
            act_quant=CommonActQuant,
            bit_width=in_bit_width,
            min_val=- 1.0,
            max_val=1.0 - 2.0 ** (-7),
            narrow_range=False,
            restrict_scaling_type=RestrictValueType.POWER_OF_TWO))

        # Conv stages: quantized conv -> activation quantizer -> optional max-pool.
        for out_ch, is_pool_enabled in CNV_OUT_CH_POOL:
            self.conv_features.append(QuantConv2d(
                kernel_size=KERNEL_SIZE,
                in_channels=in_ch,
                out_channels=out_ch,
                bias=False,
                padding=4,
                weight_quant=CommonWeightQuant,
                weight_bit_width=weight_bit_width))
            # The next stage consumes what this one produced.
            in_ch = out_ch
            #self.conv_features.append(BatchNorm2d(in_ch, eps=1e-4))
            self.conv_features.append(QuantIdentity(
                act_quant=CommonActQuant,
                bit_width=act_bit_width))
            if is_pool_enabled:
                # Use the shared constant instead of a duplicated literal.
                self.conv_features.append(MaxPool2d(kernel_size=POOL_SIZE))

        # Hidden fully-connected layers, each followed by an activation quantizer.
        for in_features, out_features in INTERMEDIATE_FC_FEATURES:
            self.linear_features.append(QuantLinear(
                in_features=in_features,
                out_features=out_features,
                bias=False,
                weight_quant=CommonWeightQuant,
                weight_bit_width=weight_bit_width))
            #self.linear_features.append(BatchNorm1d(out_features, eps=1e-4))
            self.linear_features.append(QuantIdentity(
                act_quant=CommonActQuant,
                bit_width=act_bit_width))

        # Classifier layer followed by TensorNorm (defined in the local tensor_norm module).
        self.linear_features.append(QuantLinear(
            in_features=LAST_FC_IN_FEATURES,
            out_features=num_classes,
            bias=False,
            weight_quant=CommonWeightQuant,
            weight_bit_width=weight_bit_width))
        self.linear_features.append(TensorNorm())

        # Initialise every quantized conv/linear weight uniformly in [-1, 1].
        for m in self.modules():
            if isinstance(m, (QuantConv2d, QuantLinear)):
                torch.nn.init.uniform_(m.weight.data, -1, 1)


    def clip_weights(self, min_val, max_val):
        """Clamp, in place, the weights of all quantized conv and linear layers.

        Args:
            min_val: Lower bound applied to every weight.
            max_val: Upper bound applied to every weight.
        """
        for mod in self.conv_features:
            if isinstance(mod, QuantConv2d):
                mod.weight.data.clamp_(min_val, max_val)
        for mod in self.linear_features:
            if isinstance(mod, QuantLinear):
                mod.weight.data.clamp_(min_val, max_val)

    def forward(self, x):
        """Run the network on a batch ``x`` and return the classifier output.

        The input is first rescaled as ``2 * x - 1``.
        NOTE(review): that maps [0, 1] onto [-1, 1]; confirm the inputs fed to
        the model are actually in [0, 1].
        """
        x = 2.0 * x - torch.tensor([1.0], device=x.device)
        for mod in self.conv_features:
            x = mod(x)
        # Flatten everything except the batch dimension before the linear layers.
        x = x.view(x.shape[0], -1)
        for mod in self.linear_features:
            x = mod(x)
        return x

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 5-class network with 1-bit weights and activations, an 8-bit input quantizer and one input channel.
# The model is only constructed here; this cell does not move it to `device`.
model = CNV(num_classes=5, weight_bit_width=1, act_bit_width=1, in_bit_width=8, in_ch=1)

In [3]:
def print_summary(model, input_size=(1, 14, 14)):
    """Print a per-parameter table and rough float32 memory estimates for ``model``.

    One row is printed for every entry of ``model.named_parameters()``.  The
    row label is the first two dotted components of the parameter name (for a
    ``ModuleList`` that is the container name plus the layer index), so rows
    belonging to different layers can be told apart.  The shape column shows
    the shape of the parameter tensor itself.

    The memory figures assume 4 bytes per value.  The "Forward/Backward pass"
    figure is derived from the trainable parameter count only, so treat it as
    a coarse estimate rather than a measurement of activation memory.

    Args:
        model: Any ``torch.nn.Module``.
        input_size: Shape of a single input sample, used only for the size
            estimates.  Defaults to ``(1, 14, 14)``.
    """
    bytes_per_value = 4
    bytes_per_mb = 1024 * 1024

    total_params = 0
    trainable_params = 0

    print("________________________________________________________________________________")
    print("Layer (type)                           Output Shape                    Param #")
    print("--------------------------------------------------------------------------------")

    for layer_num, (name, param) in enumerate(model.named_parameters(), start=1):
        param_count = param.numel()
        total_params += param_count
        if param.requires_grad:
            trainable_params += param_count

        # Keep container + index so e.g. conv_features.1 and conv_features.3 stay distinct.
        layer_name = '.'.join(name.split('.')[:2])
        param_shape = list(param.size())

        print(f"{layer_num:3d} {layer_name:<34s}{str(param_shape):28s}{param_count:10d}")

    non_trainable_params = total_params - trainable_params

    print("_______________________________________________________________________________")
    print("Total parameters =", total_params)
    print("Trainable parameters =", trainable_params)
    print("Non-trainable parameters =", non_trainable_params)
    print("-------------------------------------------------------------------------------")

    # Number of values in one input sample, for an input of any rank.
    input_values = 1
    for dim in input_size:
        input_values *= dim

    print("Input Size (MB):", input_values * bytes_per_value / bytes_per_mb)
    print("Forward/Backward pass size (MB):", 2 * trainable_params * bytes_per_value / bytes_per_mb)
    print("Param Size (MB):", total_params * bytes_per_value / bytes_per_mb)
    print("Estimated Total Size (MB) :", (input_values + 2 * trainable_params + total_params) * bytes_per_value / bytes_per_mb)

# Print the parameter table and size estimates for the CNV instance built earlier.
print_summary(model)

________________________________________________________________________________
Layer (type)                           Output Shape                    Param #
--------------------------------------------------------------------------------
  1 conv_features                     [21, 1, 6, 6]                      756
  2 conv_features                     [21, 21, 6, 6]                   15876
  3 conv_features                     [21, 21, 6, 6]                   15876
  4 linear_features                   [16, 3549]                       56784
  5 linear_features                   [16, 16]                           256
  6 linear_features                   [5, 16]                             80
  7 linear_features                   [1]                                  1
  8 linear_features                   [1]                                  1
_______________________________________________________________________________
Total parameters = 89630
Trainable parameters = 89630
Non-train

In [4]:
# Load the pre-split train / validation / test tensors from the working directory.
# NOTE(review): torch.load unpickles its input, so only load files from a trusted source.
# The file names suggest 14x14 uint8 inputs and int64 labels with 250000 / 44120 / 45270
# samples per split -- TODO confirm against the files; the label variables are named
# `*_uint8` although their file names say int64.
xtrain_reshape_uint8 = torch.load("xtrain2500001414reshapeduint8s3.pth")
scaled_ytrain_uint8 = torch.load("ytrain250000reshapedint64s3.pth")
xval_reshape_uint8 = torch.load("xval441201414reshapeduint8s3.pth" )
scaled_yval_uint8 = torch.load("yval44120reshapedint64s3.pth")
xtest_reshape_uint8 = torch.load("xtest452701414reshapeduint8s3.pth")
scaled_ytest_uint8 = torch.load("ytest45270reshapedint64s3.pth")

In [5]:
class Data(Dataset):
    """Map-style dataset pairing input samples with their labels.

    A channel axis is inserted at position 1 of ``X`` so that each sample
    returned by ``__getitem__`` has a leading channel dimension of size 1.

    Args:
        X: Tensor of samples, indexed along its first dimension.
        y: Labels, indexed along their first dimension; must contain exactly
            one entry per sample in ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        # Fail early: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently ignore the surplus labels.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}")
        self.X = X.unsqueeze(1)
        self.y = y
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return self.X[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

    
# Mini-batch size shared by the three loaders below.
batch_size = 503
# A different batch size was tried for faster training (the original note mentions 100):
#batch_size = 2500

#

# Instantiate training, validation and test data.
# Only the training loader shuffles: evaluation metrics do not depend on sample
# order, and a fixed order keeps validation/test runs deterministic.
train_data = Data(xtrain_reshape_uint8, scaled_ytrain_uint8)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

val_data = Data(xval_reshape_uint8, scaled_yval_uint8)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=False)

test_data = Data(xtest_reshape_uint8, scaled_ytest_uint8)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

# # Check it's working
# for batch, (X, y) in enumerate(train_dataloader):
#     print(f"Batch: {batch+1}")
#     print(f"XTrain shape: {X.shape}")
#     print(f"yTrain shape: {y.shape}")
#     break
# for batch, (X, y) in enumerate(val_dataloader):
#     print(f"Batch: {batch+1}")
#     print(f"XVal: {X.shape}")
#     print(f"yVal: {y.shape}")
#     break
# for batch, (X, y) in enumerate(test_dataloader):
#     print(f"Batch: {batch+1}")
#     print(f"XTest: {X.shape}")
#     print(f"yTest: {y.shape}")
#     break

In [22]:
import pytorchtrainer as ptt


# Select the device again; note that `device` is a plain string here, whereas the
# model-definition cell bound it to a torch.device object.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Move the model's parameters to the selected device (in place for nn.Module).
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = CrossEntropyLoss()

# Build a pytorchtrainer trainer around the model, optimizer and loss.
trainer = ptt.create_default_trainer(model, optimizer, criterion, verbose=1)
# Validation-loss callback registered as a post-iteration callback with frequency=200.
trainer.register_post_iteration_callback(ptt.callback.ValidationCallback(val_dataloader, metric=ptt.metric.TorchLoss(criterion)), frequency=200)
# A second, separately named validation-loss callback registered per epoch; this is the
# instance whose value is shown in the progress bar below.
validation_callback = ptt.callback.ValidationCallback(val_dataloader, metric=ptt.metric.TorchLoss(criterion))
trainer.register_post_epoch_callback(validation_callback, frequency=1)
# Accuracy metric; predictions are the arg-max over dimension 1 of the model output.
accuracy_callback = ptt.callback.MetricCallback(metric=ptt.metric.Accuracy(prediction_transform=lambda x: x.argmax(dim=1, keepdim=False)))
trainer.register_post_iteration_callback(accuracy_callback, frequency=1)
# NOTE(review): presumably the two format placeholders are filled from the two callbacks in order -- confirm against pytorchtrainer.
trainer.add_progressbar_metric("validation loss %.4f | accuracy %.4f", [validation_callback, accuracy_callback])

# Short run; the trailing note records that 15 epochs were used at some point.
trainer.train(train_dataloader, max_epochs=2) #max_epochs=15


In [23]:
# Persist only the model's state_dict (parameters and buffers), not the module object itself.
torch.save(model.state_dict(), 'pretrainedWeights_E71.pth')

In [7]:
def test_inference(model, test_dataloader):
    model.eval()

    loss, total, correct = 0.0, 0.0, 0.0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    criterion = nn.CrossEntropyLoss().to(device)

    for batch_idx, (xtest, ytest) in enumerate(test_dataloader):
        xtest, ytest = xtest.to(device), ytest.to(device)
        # Inference
        outputs = model(xtest)
        batch_loss = criterion(outputs, ytest)
        loss += batch_loss.item()
        # Prediction
        #print(outputs)
        _, pred_labels = torch.max(outputs, 1)
        pred_labels = pred_labels.view(-1)
        correct += torch.sum(torch.eq(pred_labels, ytest)).item()
        total += len(ytest)

    accuracy = correct/total
    loss = loss/total
    return accuracy, loss

#%time test_inference(model, test_dataloader)

In [None]:
import brevitas.onnx as bo

# Reload the saved weights into the existing model; map_location maps the stored tensors to the CPU.
# NOTE(review): this does not move `model` itself -- if it is still on the GPU from
# training, confirm the exporter copes with that.
model.load_state_dict(torch.load('pretrainedWeights_E71.pth', map_location=torch.device('cpu')))
# Export the network with the Brevitas FINN exporter for a single 1x14x14 input
# (batch dimension of 1). The trailing ';' suppresses the cell's output.
bo.export_finn_onnx(model, (1, 1, 14, 14), "DPFL_AIAccel12_export.onnx");


In [None]:
import onnx
from finn.util.visualization import showInNetron
from finn.util.test import get_test_model_trained
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.fold_constants import FoldConstants
from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs

# Load the exported ONNX file into a QONNX ModelWrapper.
# NOTE: from here on `model` is the ONNX wrapper, no longer the PyTorch CNV module.
model = ModelWrapper("DPFL_AIAccel12_export.onnx")
# Tidy-up passes: shape inference followed by constant folding.
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
# Checkpoint the tidied graph so later cells can start from this file.
model.save("DPFL_AIAccel12_tidy.onnx")

# Open the checkpoint in the Netron viewer.
showInNetron("DPFL_AIAccel12_tidy.onnx")

In [None]:
from finn.util.pytorch import ToTensor
from qonnx.transformation.merge_onnx_models import MergeONNXModels
from qonnx.core.datatype import DataType

# Restart from the tidied checkpoint.
model = ModelWrapper("DPFL_AIAccel12_tidy.onnx")
# Name of the graph's first (global) input tensor.
global_inp_name = model.graph.input[0].name
# NOTE(review): `ishape` is not used again in the visible cells.
ishape = model.get_tensor_shape(global_inp_name)

# Annotate the global input tensor with the UINT8 datatype.
model.set_tensor_datatype(global_inp_name, DataType["UINT8"])

# Checkpoint and inspect the annotated graph.
model.save("DPFL_AIAccel12_preproc2.onnx")
showInNetron("DPFL_AIAccel12_preproc2.onnx")

In [None]:
from qonnx.transformation.insert_topk import InsertTopK
from qonnx.transformation.infer_datatypes import InferDataTypes

# Append a TopK(k=1) node to the graph.
# NOTE: this operates on the in-memory `model` left by the previous cell, not on a
# file reloaded from disk, so it depends on that cell having just been run.
model = model.transform(InsertTopK(k=1))
chkpt_name = "DPFL_AIAccel12_pre_post.onnx"
# tidy-up again
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(InferDataTypes())
# Checkpoint the graph under the name chosen above.
model.save(chkpt_name)

In [None]:
# Inspect the graph after the TopK insertion and tidy-up passes.
showInNetron("DPFL_AIAccel12_pre_post.onnx")

In [None]:
from finn.transformation.streamline import Streamline
from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
import finn.transformation.streamline.absorb as absorb
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from qonnx.transformation.infer_data_layouts import InferDataLayouts
from qonnx.transformation.general import RemoveUnusedTensors

# Streamlining stage: start from the pre/post-processed checkpoint and apply the
# FINN/QONNX transformations below in order. The order matters; each pass works on
# the result of the previous one.
model = ModelWrapper("DPFL_AIAccel12_pre_post.onnx")
model = model.transform(MoveScalarLinearPastInvariants())
model = model.transform(Streamline())
# Going by its name, this rewrites convolutions as matrix multiplications.
model = model.transform(LowerConvsToMatMul())


model = model.transform(MakeMaxPoolNHWC())
model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
# Going by its name, this converts bipolar MatMuls to XNOR-popcount form.
model = model.transform(ConvertBipolarMatMulToXnorPopcount())
# Second streamlining pass after the conversions above.
model = model.transform(Streamline())
# absorb final add-mul nodes into TopK
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())
# Checkpoint the streamlined graph for the HLS-conversion cell.
model.save("DPFL_AIAccel12_streamlined.onnx")

In [None]:
# Inspect the streamlined graph.
showInNetron("DPFL_AIAccel12_streamlined.onnx")

In [None]:
import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.infer_data_layouts import InferDataLayouts

# choose the memory mode for the MVTU units, decoupled or const
# Memory mode passed to the two matrix-vector-activation inference passes below.
mem_mode = "decoupled"

# Start from the streamlined checkpoint and convert nodes to FINN HLS layers.
# NOTE: `absorb` is not imported in this cell; it comes from the import in the
# streamlining cell, so that cell must have been run first.
model = ModelWrapper("DPFL_AIAccel12_streamlined.onnx")
model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))
model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
# TopK to LabelSelect
model = model.transform(to_hls.InferLabelSelectLayer())
# input quantization (if any) to standalone thresholding
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(to_hls.InferConvInpGen())
model = model.transform(to_hls.InferStreamingMaxPool())
# get rid of Reshape(-1, 1) operation between hlslib nodes
model = model.transform(RemoveCNVtoFCFlatten())
# get rid of Transpose -> Transpose identity seq
model = model.transform(absorb.AbsorbConsecutiveTransposes())
# infer tensor data layouts
model = model.transform(InferDataLayouts())

# Extra max-pool passes: layout conversion, then StreamingMaxPool inference is run
# again before and after absorbing consecutive transposes.
# NOTE(review): presumably needed because the first InferStreamingMaxPool pass did
# not convert the pool node -- confirm by inspecting the graph.
model = model.transform(MakeMaxPoolNHWC())
model = model.transform(to_hls.InferStreamingMaxPool())
model = model.transform(absorb.AbsorbConsecutiveTransposes())
model = model.transform(to_hls.InferStreamingMaxPool())



# Split the graph into a parent model plus dataflow partition(s) and save the parent.
parent_model = model.transform(CreateDataflowPartition())
parent_model.save("DPFL_AIAccel12_dataflow_parent.onnx")
# Take the first StreamingDataflowPartition node and read the file name stored in
# its "model" attribute.
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
sdp_node = getCustomOp(sdp_node)
dataflow_model_filename = sdp_node.get_nodeattr("model")
# save the dataflow partition with a different name for easier access
dataflow_model = ModelWrapper(dataflow_model_filename)
dataflow_model.save("DPFL_AIAccel12_dataflow_model.onnx")

In [2]:
from finn.util.visualization import showInNetron
# Inspect the parent graph that contains the StreamingDataflowPartition node.
showInNetron("DPFL_AIAccel12_dataflow_parent.onnx")

Serving 'v60qManualfifo_dataflow_parent.onnx' at http://0.0.0.0:8081


In [3]:
# Inspect the extracted dataflow partition itself.
showInNetron("DPFL_AIAccel12_dataflow_model.onnx")

Stopping http://0.0.0.0:8081
Serving 'v60qManualfifo_dataflow_model.onnx' at http://0.0.0.0:8081


In [2]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# Dataflow partition produced by the HLS-conversion cell; this is the build input.
model_file = "DPFL_AIAccel12_dataflow_model.onnx"

# Directory that receives the build outputs.
rtlsim_output_dir = "output_AI-Accel-1"

#Delete previous run results if exist
# NOTE: this removes the whole directory tree, including any earlier bitfiles/reports.
if os.path.exists(rtlsim_output_dir):
    shutil.rmtree(rtlsim_output_dir)
    print("Previous run results deleted!")

# Build configuration: explicit step list plus the requested output products.
cfg_stitched_ip = build.DataflowBuildConfig(
    output_dir          = rtlsim_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = 1000000,
    # 10 ns clock period, i.e. a 100 MHz synthesis clock.
    synth_clk_period_ns = 10.0,
    fpga_part           = "xczu9eg-ffvb1156-2-e",
    board               = "ZCU102",
    shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,

    # Folding settings are read from this JSON file, which must exist in the working directory.
    folding_config_file = "AI-Accel1_hw_config.json", 
    
    # The 11 build steps to run.
    steps=["step_apply_folding_config",
           "step_generate_estimate_reports",
           "step_hls_codegen",
           "step_hls_ipgen",
           "step_set_fifo_depths",
           "step_create_stitched_ip",
           "step_measure_rtlsim_performance",
           "step_out_of_context_synthesis",
           "step_synthesize_bitfile",
           "step_make_pynq_driver",
           "step_deployment_package",
          ],
    # Output products requested from the build.
    generate_outputs=[
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE]
)


In [3]:
%%time
# Run the configured build on the dataflow model; %%time reports how long the cell took.
# The recorded run below took roughly 43 minutes of wall time.
build.build_dataflow_cfg(model_file, cfg_stitched_ip)

Building dataflow accelerator from v60qManualfifo_dataflow_model.onnx
Intermediate outputs will be generated in /tmp/finn_dev_shakeelarkam00
Final outputs will be generated in output_ipstitch_ooc_rtlsim_unit8v60qW
Build log is at output_ipstitch_ooc_rtlsim_unit8v60qW/build_dataflow.log
Running step: step_apply_folding_config [1/11]
Running step: step_generate_estimate_reports [2/11]
Running step: step_hls_codegen [3/11]
Running step: step_hls_ipgen [4/11]
Running step: step_set_fifo_depths [5/11]
Running step: step_create_stitched_ip [6/11]
Running step: step_measure_rtlsim_performance [7/11]
Running step: step_out_of_context_synthesis [8/11]
Running step: step_synthesize_bitfile [9/11]
Running step: step_make_pynq_driver [10/11]
Running step: step_deployment_package [11/11]
Completed successfully
CPU times: user 6.79 s, sys: 991 ms, total: 7.78 s
Wall time: 43min 21s


0