# Train a Resnet on CIFAR10

In [1]:
pip install --upgrade netron

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn.functional as F
from tqdm import tqdm
from torch.optim.lr_scheduler import OneCycleLR, ReduceLROnPlateau
%matplotlib inline

# Load Dataset

In [3]:
from torchvision import transforms
import numpy as np

class GetTransforms():
    '''Returns a list of transformations when type as requested amongst train/test
       Transforms('train') = list of transforms to apply on training data
       Transforms('test') = list of transforms to apply on testing data'''

    def __init__(self):
        pass

    def trainparams(self):
        train_transformations = [ #resises the image so it can be perfect for our model.
            transforms.RandomHorizontalFlip(), # FLips the image w.r.t horizontal axis
            transforms.RandomRotation((-7,7)),     #Rotates the image to a specified angel
            transforms.RandomAffine(0, shear=10, scale=(0.8,1.2)), #Performs actions like zooms, change shear angles.
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2), # Set the color params
            transforms.ToTensor(), # comvert the image to tensor so that it can work with torch
            transforms.Normalize((0.491, 0.482, 0.446), (0.247, 0.243, 0.261)) #Normalize all the images
            ]

        return train_transformations

    def testparams(self):
        test_transforms = [
            transforms.ToTensor(),
            transforms.Normalize((0.491, 0.482, 0.446), (0.247, 0.243, 0.261))
        ]
        return test_transforms

In [4]:
transformations = GetTransforms()
train_transforms = transforms.Compose(transformations.trainparams())
test_transforms = transforms.Compose(transformations.testparams())


class GetCIFAR10_TrainData():
    def __init__(self, dir_name:str):
        self.dirname = dir_name

    def download_train_data(self):
        return datasets.CIFAR10('resnet18/data', train=True, download=True, transform=train_transforms)

    def download_test_data(self):
        return datasets.CIFAR10('resnet18/data', train=False, download=True, transform=test_transforms)

In [5]:
import os
data = GetCIFAR10_TrainData(os.chdir(".."))
trainset = data.download_train_data()
testset = data.download_test_data()
trainloader = torch.utils.data.DataLoader(trainset, batch_size=592,
                                          shuffle=True, num_workers=0)
testloader = torch.utils.data.DataLoader(testset, batch_size=592,
                                         shuffle=False, num_workers=0)



Files already downloaded and verified
Files already downloaded and verified



# Define a PyTorch Device

GPUs can significantly speed-up training of deep neural networks. We check for availability of a GPU and if so define it as target device.


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Target device: " + str(device))

Target device: cpu


# Define the Model

In [7]:
from brevitas.nn import QuantConv2d, QuantLinear, QuantReLU

torch.manual_seed(0)

weight_bit_width = 2
act_bit_width = 2

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, dropout=0.0):
        super(BasicBlock, self).__init__()
        self.conv1 = QuantConv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=True, weight_bit_width=weight_bit_width,quant_type="int")
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu1 = QuantReLU(bit_width=act_bit_width, quant_type="int")
        self.conv2 = QuantConv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=True, weight_bit_width=weight_bit_width,quant_type="int")
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu2 = QuantReLU(bit_width=act_bit_width, quant_type="int")

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                QuantConv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=True, weight_bit_width=weight_bit_width,quant_type="int"),
                nn.BatchNorm2d(self.expansion*planes)
            )
        self.dropout = dropout

    def forward(self, x):
        out = self.relu1(self.bn1(self.conv1(x)))
        out = F.dropout(out, p=self.dropout)
        out = self.bn2(self.conv2(out))
        out = F.dropout(out, p=self.dropout)
        out += self.shortcut(x)
        out = self.relu2(out)
        out = F.dropout(out, p=self.dropout)
        return out

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = QuantConv2d(in_planes, planes, kernel_size=1, bias=True, weight_bit_width=weight_bit_width,quant_type="int")
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu1 = QuantReLU(bit_width=act_bit_width, quant_type="int")
        self.conv2 = QuantConv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=True, weight_bit_width=weight_bit_width,quant_type="int")
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu2 = QuantReLU(bit_width=act_bit_width, quant_type="int")
        self.conv3 = QuantConv2d(planes, self.expansion*planes, kernel_size=1, bias=True, weight_bit_width=weight_bit_width,quant_type="int")
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)
        self.relu3 = QuantReLU(bit_width=act_bit_width, quant_type="int")

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                QuantConv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=True, weight_bit_width=weight_bit_width,quant_type="int"),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = self.relu1(self.bn1(self.conv1(x)))
        out = self.relu2(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = self.relu3(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=200, dropout=0.0):
        super(ResNet, self).__init__()
        self.in_planes = 64
        self.dropout = dropout

        self.conv1 = QuantConv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=True, weight_bit_width=weight_bit_width,quant_type="int")
        self.bn1 = nn.BatchNorm2d(64)
        self.relu1 = QuantReLU(bit_width=act_bit_width, quant_type="int")
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = QuantLinear(512*block.expansion, num_classes, bias=True, weight_bit_width=weight_bit_width,quant_type="int")

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride, dropout=self.dropout))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.relu1(self.bn1(self.conv1(x)))
        out = F.dropout(out, p=self.dropout)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18(num_classes=10, dropout=0.0):
    return ResNet(BasicBlock, [2,2,2,2], num_classes=num_classes, dropout=dropout)


print("Model define")

Model define


# Train and Test

In [8]:
classes = ["%s" % i for i in range(10)]

In [9]:
def test(model, device, test_loader, criterion, classes, test_losses, test_accs,
         misclassified_imgs, correct_imgs, is_last_epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss +=criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            is_correct = pred.eq(target.view_as(pred))
            if is_last_epoch:
              misclassified_inds = (is_correct==0).nonzero()[:,0]
              for mis_ind in misclassified_inds:
                if len(misclassified_imgs) == 25:
                  break
                misclassified_imgs.append({
                    "target": target[mis_ind].cpu().numpy(),
                    "pred": pred[mis_ind][0].cpu().numpy(),
                    "img": data[mis_ind]
                })
              
              correct_inds = (is_correct==1).nonzero()[:,0]
              for ind in correct_inds:
                if len(correct_imgs) == 25:
                  break
                correct_imgs.append({
                    "target": target[ind].cpu().numpy(),
                    "pred": pred[ind][0].cpu().numpy(),
                    "img": data[ind]
                })
            correct += is_correct.sum().item()

    test_loss /= len(test_loader)
    test_losses.append(test_loss)
    
    test_acc = 100. * correct / len(test_loader.dataset)
    test_accs.append(test_acc)

    if test_acc >= 90.0:
        classwise_acc(model, device, test_loader, classes)

    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset), test_acc))


In [10]:
def classwise_acc(model, device, test_loader, classes):
    class_correct = list(0. for i in range(10))
    class_total = list(0. for i in range(10))
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            c = (predicted == labels).squeeze()
            for i in range(4):
                label = labels[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1
    # print class-wise test accuracies
    print()
    for i in range(10):
      print('Accuracy of %5s : %2d %%' % (
          classes[i], 100 * class_correct[i] / class_total[i]))
    print()


In [11]:
import os
import torch
from finn.util.visualization import showSrc, showInNetron
from finn.util.basic import make_build_dir

In [12]:
showInNetron("./models/quentresnet18_weight2.pth")

Serving './models/quentresnet18_weight2.pth' at http://0.0.0.0:8081


In [13]:
# --- Helper function ---
def remove_export_handlers(model):
    count = 0
    for module in model.modules():
        if hasattr(module, "export_handler"):
            module.export_handler = None
            count += 1
    print(f"✅ Removed export_handler from {count} Quant layers.")


In [14]:
# --- Main export pipeline ---
# Step 1: Construct model
model = ResNet18(num_classes=10)

# Step 2: Load weights
trained_state_dict = torch.load("./models/quentresnet18_weight2.pth", map_location='cpu')
model.load_state_dict(trained_state_dict, strict=False)

# Step 3: Remove export_handler from all quant layers
#remove_export_handlers(model)

# Step 4: Prepare for export
model.eval()
model.cpu()

print("model to cpu")

model to cpu


In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum= 0.9)
test_losses, train_losses, test_accs, train_accs = [], [], [], []
misclassified_imgs, correct_imgs = [], []

In [16]:
#Test for accuracy
exact_acc = test(model, device, testloader, criterion, classes, test_losses,
                 test_accs, misclassified_imgs, correct_imgs,False)

  return super(Tensor, self).rename(names)


Test set: Average loss: 2.6973, Accuracy: 1070/10000 (10.70%)



# Export to QONNX and Conversion to FINN-ONNX

ONNX is an open format built to represent machine learning models, and the FINN compiler expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this here.

You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. The conversion of the FINN-ONNX format is a FINN compiler transformation and to be able to apply it to our model, we will need to wrap it into ModelWrapper. This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format.

In [17]:
from brevitas.export import export_qonnx
from qonnx.util.cleanup import cleanup as qonnx_cleanup
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.core.datatype import DataType
from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN

ready_model_filename = "hardware/quantresnet18_weight2_files/quantresnet18_weight2.onnx"
input_shape = (1, 3, 32, 32)

# create a QuantTensor instance to mark input as bipolar during export
input_a = np.random.randint(0, 1, size=input_shape).astype(np.float32)
input_a = 2 * input_a - 1
scale = 1.0
input_t = torch.from_numpy(input_a * scale)

#Move to CPU before export
model.cpu()

# Export to ONNX
export_qonnx(
    model, export_path=ready_model_filename, input_t=input_t
)

# clean-up
qonnx_cleanup(ready_model_filename, out_file=ready_model_filename)

# ModelWrapper
model = ModelWrapper(ready_model_filename)
# Setting the input datatype explicitly because it doesn't get derived from the export function
model.set_tensor_datatype(model.graph.input[0].name, DataType["BIPOLAR"])
model = model.transform(ConvertQONNXtoFINN())
model.save(ready_model_filename)

print("Model saved to %s" % ready_model_filename)

[W NNPACK.cpp:53] Could not initialize NNPACK! Reason: Unsupported hardware.


Model saved to hardware/quantresnet18_weight2_files/quantresnet18_weight2.onnx


In [19]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2.onnx' at http://0.0.0.0:8081


# Tidy-up transformations <a id='basic_trafo'></a>
This section deals with some basic transformations, which are applied to the model like a kind of "tidy-up" to make it easier to be processed. They do not appear in the diagram above, but they are applied in many steps in the FINN flow to postprocess the model after a transformation and/or to prepare it for the next transformation.

These transformations are:
* GiveUniqueNodeNames
* GiveReadableTensorNames
* InferShapes
* InferDataTypes
* FoldConstants
* RemoveStaticGraphInputs

In the first two transformations (GiveUniqueNodeNames, GiveReadableTensorNames) the nodes in the graph are first given unique (by enumeration) names, then the tensors are given human-readable names (based on the node names). The following two transformations (InferShapes, InferDataTypes) derive the shapes and data types of the tensors from the model properties and set them in the ValueInfo of the model. These transformations can almost always be applied without negative effects and do not affect the structure of the graph, ensuring that all the information needed is available.

The next listed transformation is FoldConstants, which performs constant folding. It identifies a node with constant inputs and determines its output. The result is then set as constant-only inputs for the following node and the old node is removed. Although this transformation changes the structure of the model, it is a transformation that is usually always desired and can be applied to any model. And finally, we have RemoveStaticGraphInputs to remove any top-level graph inputs that already have ONNX initializers associated with them.

These transformations can be imported and applied as follows.

In [20]:
from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.infer_datatypes import InferDataTypes
from qonnx.transformation.fold_constants import FoldConstants

model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
model = model.transform(RemoveStaticGraphInputs())

model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_tidy.onnx")

In [22]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_tidy.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_tidy.onnx' at http://0.0.0.0:8081


# Adding Pre- and Postprocessing

In many cases, it's common to apply some preprocessing to the raw data in a machine learning framework prior to training. For image classification networks, this may include conversion of raw 8-bit RGB values into floating point values between 0 and 1. Similarly, at the output of the network some postprocessing may be performed during deployment, such as extracting the indices of the classifications with the largest value (top-K indices).

In FINN, we can bake some of these pre/postprocessing operatings into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing.

We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with torchvision.transforms.ToTensor() prior to training, which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as finn.util.pytorch.ToTensor and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use.

In [23]:
from finn.util.pytorch import ToTensor
from qonnx.transformation.merge_onnx_models import MergeONNXModels
from qonnx.core.datatype import DataType

model = ModelWrapper("hardware/quantresnet18_weight2_files/quantresnet18_weight2_tidy.onnx")
global_inp_name = model.graph.input[0].name
ishape = model.get_tensor_shape(global_inp_name)
# preprocessing: torchvision's ToTensor divides uint8 inputs by 255
totensor_pyt = ToTensor()
chkpt_preproc_name = "hardware/quantresnet18_weight2_files/quantresnet18_weight2_preproc.onnx"
export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name)
qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name)
pre_model = ModelWrapper(chkpt_preproc_name)
pre_model = pre_model.transform(ConvertQONNXtoFINN())

# join preprocessing and core model
model = model.transform(MergeONNXModels(pre_model))
# add input quantization annotation: UINT8 for all BNN-PYNQ models
global_inp_name = model.graph.input[0].name
model.set_tensor_datatype(global_inp_name, DataType["UINT8"])

model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_preproc.onnx")




In [25]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_preproc.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_preproc.onnx' at http://0.0.0.0:8081


You can observe two changes in the graph above: a Div node has appeared in the beginning to perform the input preprocessing, and the global_in tensor now has a quantization annotation to mark it as an unsigned 8-bit value.

For the postprocessing we'll insert a TopK node for k=1 at the end of our graph. This will extract the index (class number) for the largest-valued output.


In [26]:
from qonnx.transformation.insert_topk import InsertTopK

# postprocessing: insert Top-1 node at the end
model = model.transform(InsertTopK(k=1))
chkpt_name = "hardware/quantresnet18_weight2_files/quantresnet18_weight2_pre_post.onnx"
# tidy-up again
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
model = model.transform(RemoveStaticGraphInputs())
model.save(chkpt_name)

In [28]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_pre_post.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_pre_post.onnx' at http://0.0.0.0:8081


# Streamlining

Streamlining is a transformation containing several sub-transformations. The goal of streamlining is to eliminate floating point operations by moving them around, then collapsing them into one operation and in the last step transform them into multi-thresholding nodes. For more information on the theoretical background of this, see this paper.

Let's have a look at which sub-transformations Streamline consists of:

In [29]:
from finn.transformation.streamline import Streamline
showSrc(Streamline)

class Streamline(Transformation):
    """Apply the streamlining transform, see arXiv:1709.04060."""

    def apply(self, model):
        streamline_transformations = [
            ConvertSubToAdd(),
            ConvertDivToMul(),
            BatchNormToAffine(),
            ConvertSignToThres(),
            MoveMulPastMaxPool(),
            MoveScalarLinearPastInvariants(),
            AbsorbSignBiasIntoMultiThreshold(),
            MoveAddPastMul(),
            MoveScalarAddPastMatMul(),
            MoveAddPastConv(),
            MoveScalarMulPastMatMul(),
            MoveScalarMulPastConv(),
            MoveAddPastMul(),
            CollapseRepeatedAdd(),
            CollapseRepeatedMul(),
            MoveMulPastMaxPool(),
            AbsorbAddIntoMultiThreshold(),
            FactorOutMulSignMagnitude(),
            AbsorbMulIntoMultiThreshold(),
            Absorb1BitMulIntoMatMul(),
            Absorb1BitMulIntoConv(),
            RoundAndClipThresholds(),
        ]
        for tr

In [30]:
from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants
import finn.transformation.streamline.absorb as absorb

model = ModelWrapper("hardware/quantresnet18_weight2_files/quantresnet18_weight2_pre_post.onnx")
# move initial Mul (from preproc) past the Reshape
model = model.transform(MoveScalarLinearPastInvariants())
# streamline
model = model.transform(Streamline())
model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_streamlined.onnx")

In [32]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_streamlined.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_streamlined.onnx' at http://0.0.0.0:8081


You can see that the network has become simplified considerably compared to the previous step -- a lot of nodes have disappeared between the `MatMul` layers. 

**The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different than the example network here. We hope to rectify this in future releases.**

Our example network is a quantized network with 1-bit bipolar (-1, +1 values) precision, and we want FINN to implement them as XNOR-popcount operations [as described in the original FINN paper](https://arxiv.org/pdf/1612.07119). For this reason, after streamlining, the resulting bipolar matrix multiplications are converted into xnorpopcount operations. This transformation produces operations that are again collapsed and converted into thresholds. This procedure is shown below. 

In [33]:
from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
from qonnx.transformation.infer_data_layouts import InferDataLayouts
from qonnx.transformation.general import RemoveUnusedTensors

model = model.transform(ConvertBipolarMatMulToXnorPopcount())
model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
model = model.transform(absorb.AbsorbMulIntoMultiThreshold())
# absorb final add-mul nodes into TopK
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
model = model.transform(RoundAndClipThresholds())

# bit of tidy-up
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())

model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_for_hw_conversion.onnx")

In [35]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_for_hw_conversion.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_for_hw_conversion.onnx' at http://0.0.0.0:8081


Observe the pairs of `XnorPopcountmatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to hardware (HW) layers.

# Conversion to HW layers

Converts the nodes to HW layers, these layers are abstraction layers that do not directly correspond to an HLS or Verilog implementation but they will be converted in either one later in the flow. In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MVAU layers (matrix vector activation unit). Any immediately following MultiThreshold layers will also be absorbed into the MVAU.

Below is the code for the transformation and the network is visualized using netron to create the new structure with MVAU nodes.

In [36]:
import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
model = ModelWrapper("hardware/quantresnet18_weight2_files/quantresnet18_weight2_for_hw_conversion.onnx")
model = model.transform(to_hw.InferBinaryMatrixVectorActivation())
# TopK to LabelSelect
model = model.transform(to_hw.InferLabelSelectLayer())
# input quantization (if any) to standalone thresholding
model = model.transform(to_hw.InferThresholdingLayer())
model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_for_layers.onnx")

In [38]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_for_layers.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_for_layers.onnx' at http://0.0.0.0:8081


# Creating a Dataflow Partition

In the graph above, you can see that there is a mixture of FINN HW layers (MVAU and Thresholding) with one regular ONNX layers (Reshape). To create a bitstream, FINN needs a model with only HW layers. In order to achieve this, we will use the CreateDataflowPartition transformation to create a "dataflow partition" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition.

In [39]:
from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition

model = ModelWrapper("hardware/quantresnet18_weight2_files/quantresnet18_weight2_for_layers.onnx")
parent_model = model.transform(CreateDataflowPartition())
parent_model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_dataflow_parent.onnx")

In [41]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_dataflow_parent.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_dataflow_parent.onnx' at http://0.0.0.0:8081


We can see that the MVAU instances and the Thresholding in the beginning have all been replaced with a single StreamingDataflowPartition, which has an attribute model that points to the extracted, HW dataflow-only graph:

In [42]:
from qonnx.custom_op.registry import getCustomOp
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
sdp_node = getCustomOp(sdp_node)
dataflow_model_filename = sdp_node.get_nodeattr("model")

In [44]:
showInNetron(dataflow_model_filename)

Serving '/tmp/finn_dev_pamela/dataflow_partition_27flq59v/partition_0.onnx' at http://0.0.0.0:8081


We can see all the extracted MVAU instances and the Thresholding have been moved to the child (dataflow) model. We will load the child model with ModelWrapper and continue working on it.


In [45]:
model = ModelWrapper(dataflow_model_filename)

# Specialize layers

The network is converted to HW abstraction layers and we have excluded the non-HW layers to continue with the processing of the model. HW abstraction layers are abstract (placeholder) layers that can be either implemented in HLS or as an RTL module using FINN. In the next flow step, we convert each of these layers to either an HLS or RTL variant by calling the SpecializeLayers transformation. It is possible to let the FINN flow know a preference for the implementation style {"hls", "rtl"} and depending on the layer type this wish will be fulfilled or it will be set to a reasonable default. In the tfc example, we will set all layers to their HLS variants. To showcase how to set the preferred implementation, we will set the node attribute in the Thresholding layer to "hls", for the MVAUs and the LabelSelect we will leave this node attribute empty and in this case by default it will be set to HLS.


In [46]:
thresh_node = model.get_nodes_by_op_type("Thresholding")[0]
thresh_node_inst = getCustomOp(thresh_node)
thresh_node_inst.set_nodeattr("preferred_impl_style", "hls")

We'll define two helper variables that describe the Xilinx FPGA part name and the PYNQ board name that we are targeting.

In [47]:
# print the names of the supported PYNQ boards
from finn.util.basic import pynq_part_map
print(pynq_part_map.keys())

dict_keys(['Ultra96', 'Ultra96-V2', 'Pynq-Z1', 'Pynq-Z2', 'ZCU102', 'ZCU104', 'ZCU111', 'RFSoC2x2', 'RFSoC4x2', 'KV260_SOM'])


In [48]:
# change this if you have a different PYNQ board, see list above
pynq_board = "Pynq-Z1"
fpga_part = pynq_part_map[pynq_board]
target_clk_ns = 10

Then we will call SpecializeLayers to convert each HW abstraction layer to (in this case) an HLS variant.

In [49]:
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
model = model.transform(SpecializeLayers(fpga_part))

model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_specialize_layers.onnx")

In [51]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_specialize_layers.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_specialize_layers.onnx' at http://0.0.0.0:8081


Each node type has now a suffix (_hls) and the module ( finn.custom_op.fpgadataflow.hls also indicates that that the HLS variant of the layer is selected. We can now proceed by adjusting the parallelism of each node to customize the performance and resource usage.)

#  Folding: Adjusting the Parallelism

Folding in FINN describes how much a layer is time-multiplexed in terms of execution resources. There are several folding factors for each layer, controlled by the PE (parallelization over outputs) and SIMD (parallelization over inputs) parameters as described by the original FINN paper. The higher the PE and SIMD values are set, the faster the generated accelerator will run, and the more FPGA resources it will consume.

Each MVAU_hls node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance.

Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the ModelWrapper. But first we take a closer look at one of the nodes that implement a Matrix-Vector-Activation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four MVAUs. So as an example we extract the second node of the graph.

We can use the higher-level CustomOp wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Above, we have already used this abstraction to set the node attribute of the Thresholding HW layer. Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes.

In [52]:
fc0 = model.graph.node[0]
fc0w = getCustomOp(fc0)

print("CustomOp wrapper is of class " + fc0w.__class__.__name__)

fc0w.get_nodeattr_types()

CustomOp wrapper is of class Thresholding_hls


{'mem_mode': ('s',
  False,
  'internal_decoupled',
  {'internal_decoupled', 'internal_embedded'}),
 'ram_style': ('s', False, 'distributed', {'block', 'distributed'}),
 'runtime_writeable_weights': ('i', False, 0, {0, 1}),
 'PE': ('i', True, 0),
 'NumChannels': ('i', True, 0),
 'numSteps': ('i', True, 1),
 'inputDataType': ('s', True, ''),
 'weightDataType': ('s', True, ''),
 'outputDataType': ('s', True, ''),
 'numInputVectors': ('ints', False, [1]),
 'ActVal': ('i', False, 0),
 'backend': ('s', True, 'fpgadataflow'),
 'preferred_impl_style': ('s', False, '', {'', 'hls', 'rtl'}),
 'code_gen_dir_ipgen': ('s', False, ''),
 'ipgen_path': ('s', False, ''),
 'ip_path': ('s', False, ''),
 'ip_vlnv': ('s', False, ''),
 'exec_mode': ('s', False, '', {'', 'cppsim', 'rtlsim'}),
 'cycles_rtlsim': ('i', False, 0),
 'cycles_estimate': ('i', False, 0),
 'rtlsim_trace': ('s', False, ''),
 'res_estimate': ('s', False, ''),
 'res_synth': ('s', False, ''),
 'rtlsim_so': ('s', False, ''),
 'slr': ('i',

We can see that the PE and SIMD are listed as node attributes, as well as the depths of the FIFOs that will be inserted between consecutive layers, and all can be adjusted using set_nodeattr subject to certain constraints. There are also a lot of additional attributes that can be set for this node type. In this notebook we are setting the folding factors and FIFO depths manually but it is possible to use FINN transformations for this (SetFolding and InsertAndSetFIFODepths).

In [67]:
fc_layers = model.get_nodes_by_op_type("MVAU_hls")
# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
config = [
    (16, 49, [16], [64], "block"),
    (8, 8, [64], [64], "auto"),
    (8, 8, [64], [64], "auto"),
    (10, 8, [64], [10], "distributed"),
]
for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("inFIFODepths", ififo)
    fcl_inst.set_nodeattr("outFIFODepths", ofifo)
    fcl_inst.set_nodeattr("ram_style", ramstyle)
    
# set parallelism for input quantizer to be same as first layer's SIMD
inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0]
inp_qnt = getCustomOp(inp_qnt_node)
inp_qnt.set_nodeattr("PE", 16)

We are setting PE and SIMD so that each layer has a total folding of 16.

Besides PE and SIMD three other node attributes are set. ram_style specifies how the weights are to be stored (BRAM, LUTRAM, and so on). It can be selected explicitly or with the option auto you can let Vivado decide. inFIFODepths and outFIFODepths specifies the FIFO depths that is needed by the node from the surrounding FIFOs. These attributes are used in the transformation 'InsertFIFO' to insert the appropriate FIFOs between the nodes, which will be automatically called as part of the hardware build process.

In previous versions of FINN we had to call transformations to insert data width converters, FIFOs and TLastMarker manually at this step. This is no longer needed, as all this is taken care of by the ZynqBuild or VitisBuild transformations.


In [58]:
inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0]
inp_qnt = getCustomOp(inp_qnt_node)
print("NumChannels:", inp_qnt.get_nodeattr("NumChannels"))
print("PE:", inp_qnt.get_nodeattr("PE"))

NumChannels: 64
PE: 49


In [65]:
inp_qnt.set_nodeattr("PE", 16)

In [66]:
print("PE:", inp_qnt.get_nodeattr("PE"))

PE: 16


In [68]:
model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_set_folding_factors.onnx")

In [70]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_set_folding_factors.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_set_folding_factors.onnx' at http://0.0.0.0:8081


# Hardware Build

We're finally ready to start generating hardware from our network. Depending on whether you want to target a Zynq or Alveo platform, FINN offers two transformations to build the accelerator, integrate into an appropriate shell and build a bitfile. These are ZynqBuild and VitisBuild for Zynq and Alveo, respectively. In this notebook we'll demonstrate the ZynqBuild as these boards are more common and it's much faster to complete bitfile generation for the smaller FPGAs found on them.

In previous versions of FINN, we had to manually go through several steps to generate HLS/RTL code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**

In [71]:
from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
model = ModelWrapper("hardware/quantresnet18_weight2_files/quantresnet18_weight2_set_folding_factors.onnx")
model = model.transform(ZynqBuild(platform = pynq_board, period_ns = target_clk_ns))

                        be created. This may cause RTL simulation issues.
                        
                        be created. This may cause RTL simulation issues.
                        
                You may experience incorrect stitched-IP rtlsim or hardware
                behavior. It is strongly recommended to insert FIFOs prior to
                calling CreateStitchedIP.


In [73]:
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
model = model.transform(MakePYNQDriver("zynq-iodma"))

In [74]:
model.save("hardware/quantresnet18_weight2_files/quantresnet18_weight2_post_synthesis.onnx")

In [76]:
showInNetron("hardware/quantresnet18_weight2_files/quantresnet18_weight2_post_synthesis.onnx")

Serving 'hardware/quantresnet18_weight2_files/quantresnet18_weight2_post_synthesis.onnx' at http://0.0.0.0:8081


# Examining the generated outputs

Let's start by viewing the post-synthesis model in Netron:

In [78]:
model = ModelWrapper("hardware/quantresnet18_weight2_files/quantresnet18_weight2_post_synthesis.onnx")
sdp_node_middle = getCustomOp(model.graph.node[1])
postsynth_layers = sdp_node_middle.get_nodeattr("model")

showInNetron(postsynth_layers)

Serving '/tmp/finn_dev_pamela/dataflow_partition_e9yym5a5/partition_2.onnx' at http://0.0.0.0:8081


In [79]:
model = ModelWrapper(postsynth_layers)
model.model.metadata_props

[key: "floorplan_json"
value: "/tmp/finn_dev_pamela/vitis_floorplan_xhyg7if4/floorplan.json"
, key: "vivado_stitch_proj"
value: "/tmp/finn_dev_pamela/vivado_stitch_proj_5vub_qyk"
, key: "clk_ns"
value: "10"
, key: "wrapper_filename"
value: "/tmp/finn_dev_pamela/vivado_stitch_proj_5vub_qyk/finn_vivado_stitch_proj.gen/sources_1/bd/StreamingDataflowPartition_1/hdl/StreamingDataflowPartition_1_wrapper.v"
, key: "vivado_stitch_vlnv"
value: "xilinx_finn:finn:StreamingDataflowPartition_1:1.0"
, key: "vivado_stitch_ifnames"
value: "{\"clk\": [\"ap_clk\"], \"rst\": [\"ap_rst_n\"], \"s_axis\": [[\"s_axis_0\", 512]], \"m_axis\": [[\"m_axis_0\", 32]], \"aximm\": [], \"axilite\": []}"
, key: "platform"
value: "zynq-iodma"
]

In [80]:
model = ModelWrapper("hardware/quantresnet18_weight2_files/quantresnet18_weight2_post_synthesis.onnx")
model.model.metadata_props

[key: "floorplan_json"
value: "/tmp/finn_dev_pamela/vitis_floorplan_xhyg7if4/floorplan.json"
, key: "vivado_pynq_proj"
value: "/tmp/finn_dev_pamela/vivado_zynq_proj_mkohv0_9"
, key: "bitfile"
value: "/tmp/finn_dev_pamela/vivado_zynq_proj_mkohv0_9/resizer.bit"
, key: "hw_handoff"
value: "/tmp/finn_dev_pamela/vivado_zynq_proj_mkohv0_9/resizer.hwh"
, key: "vivado_synth_rpt"
value: "/tmp/finn_dev_pamela/vivado_zynq_proj_mkohv0_9/synth_report.xml"
, key: "platform"
value: "zynq-iodma"
, key: "pynq_driver_dir"
value: "/tmp/finn_dev_pamela/pynq_driver_pftt734i"
]

In [81]:
! ls {model.get_metadata_prop("vivado_pynq_proj")}

finn_zynq_link.cache	      finn_zynq_link.srcs  resizer.hwh
finn_zynq_link.gen	      finn_zynq_link.xpr   synth_project.sh
finn_zynq_link.hw	      ip_config.tcl	   synth_report.xml
finn_zynq_link.ip_user_files  NA		   vivado.jou
finn_zynq_link.runs	      resizer.bit	   vivado.log
