In [None]:
# Launch a build: estimate reports only
# Target board: Avnet Ultra96-V2
# NN model: LeNet-5
# Dataset: CIFAR-10
# Adapted from the FINN cybersecurity end-to-end example:
# https://github.com/Xilinx/finn/blob/main/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb


In [29]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# ONNX model file that the build is launched on.
model_file = "finn_lenet.onnx"

# Folder that receives the outputs of this estimate-only build.
estimates_output_dir = "output_estimates_only"

# Delete the results of a previous run, if any, so the build starts from an empty folder.
if os.path.exists(estimates_output_dir):
    shutil.rmtree(estimates_output_dir)
    print("Previous run results deleted!")


# Build configuration limited to the estimate-only step list; the only requested
# output product is the set of estimate reports.
cfg_estimates = build.DataflowBuildConfig(
    output_dir          = estimates_output_dir,
    mvau_wwidth_max     = 12, # 80 in the original example; set to 12 to constrain owidth to a multiple of 12, as iwidth = 12
    target_fps          = 1000000, # requested throughput target
    synth_clk_period_ns = 10.0, # 10 ns clock period, i.e. 100 MHz
    fpga_part           = "xczu3eg-sbva484-1-i", # presumably the part on the target board -- confirm against the board documentation
    steps               = build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
    ]
)

Previous run results deleted!


In [30]:
%%time
# Launch the build for model_file with the estimate-only configuration;
# the %%time cell magic reports how long the cell took to run.
build.build_dataflow_cfg(model_file, cfg_estimates)

Building dataflow accelerator from finn_lenet.onnx
Intermediate outputs will be generated in /home/rstar900/finn/my_builds
Final outputs will be generated in output_estimates_only
Build log is at output_estimates_only/build_dataflow.log
Running step: step_qonnx_to_finn [1/8]
Running step: step_tidy_up [2/8]
Running step: step_streamline [3/8]


                i.e. domain=finn to domain=qonnx.custom_op.<general|fpgadataflow|...>


Running step: step_convert_to_hls [4/8]
Running step: step_create_dataflow_partition [5/8]
Running step: step_target_fps_parallelization [6/8]
Running step: step_apply_folding_config [7/8]
Running step: step_generate_estimate_reports [8/8]
Completed successfully
CPU times: user 1.12 s, sys: 0 ns, total: 1.12 s
Wall time: 1.12 s


0

In [31]:
# We'll now examine the generated outputs from this build
# If we look under the outputs directory, we'll find a subfolder with the generated estimate reports.

In [32]:
# List the top-level contents of the estimate build's output folder.
! ls {estimates_output_dir}

auto_folding_config.json  intermediate_models  report  time_per_step.json


In [33]:
# List the report files inside the output folder's report subfolder.
! ls {estimates_output_dir}/report

estimate_layer_config_alternatives.json  estimate_network_performance.json
estimate_layer_cycles.json		 op_and_param_counts.json
estimate_layer_resources.json


In [34]:
# Let's start by examining the contents of estimate_network_performance.json.
# It contains the analytical estimates for the performance and latency of the network.
! cat {estimates_output_dir}/report/estimate_network_performance.json

{
  "critical_path_cycles": 88805,
  "max_cycles": 19760,
  "max_cycles_node_name": "ConvolutionInputGenerator_0",
  "estimated_throughput_fps": 5060.728744939272,
  "estimated_latency_ns": 888050.0
}

In [36]:
# The estimate_layer_resources.json report contains the layer-by-layer resource estimates.
# Use it to check whether the layers will fit on the FPGA; if the estimates are too high, consider lowering target_fps.
import json
def read_json_dict(filename):
    """Parse the JSON file at `filename` and return the decoded content.

    The file is opened in text mode for reading and is closed again
    before the function returns.
    """
    with open(filename, "r") as json_file:
        return json.load(json_file)

read_json_dict(estimates_output_dir + "/report/estimate_layer_resources.json")

{'ConvolutionInputGenerator_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 372,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'MatrixVectorActivation_0': {'BRAM_18K': 2,
  'BRAM_efficiency': 0.03662109375,
  'LUT': 3811,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingMaxPool_Batch_0': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 0,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'ConvolutionInputGenerator_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 348,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'MatrixVectorActivation_1': {'BRAM_18K': 2,
  'BRAM_efficiency': 0.1953125,
  'LUT': 5254,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'StreamingMaxPool_Batch_1': {'BRAM_18K': 0,
  'BRAM_efficiency': 1,
  'LUT': 0,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'MatrixVectorActivation_2': {'BRAM_18K': 12,
  'BRAM_efficiency': 0.6510416666666666,
  'LUT': 1255,
  'URAM': 0,
  'URAM_efficiency': 1,
  'DSP': 0},
 'MatrixVectorActivation_3

In [37]:
# Have a look at estimate_layer_cycles.json
read_json_dict(estimates_output_dir + "/report/estimate_layer_cycles.json")

{'ConvolutionInputGenerator_0': 19760,
 'MatrixVectorActivation_0': 19600,
 'StreamingMaxPool_Batch_0': 980,
 'ConvolutionInputGenerator_1': 15420,
 'MatrixVectorActivation_1': 10000,
 'StreamingMaxPool_Batch_1': 125,
 'MatrixVectorActivation_2': 12000,
 'MatrixVectorActivation_3': 10080,
 'MatrixVectorActivation_4': 840}

In [7]:
# Launch a Build: Stitched IP, out-of-context synth and rtlsim Performance

In [1]:
# Check numpy version (should be 1.22.0)
! pip freeze | grep numpy

numpy==1.24.0


In [2]:
# if it is not, then run this cell
! pip uninstall numpy -y
! pip install numpy==1.22.0

Found existing installation: numpy 1.24.0
Uninstalling numpy-1.24.0:
[31mERROR: Exception:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/shutil.py", line 788, in move
    os.rename(src, real_dst)
PermissionError: [Errno 13] Permission denied: '/opt/conda/bin/f2py' -> '/tmp/pip-uninstall-cnllz0z7/f2py'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/pip/_internal/cli/base_command.py", line 228, in _main
    status = self.run(options, args)
  File "/opt/conda/lib/python3.8/site-packages/pip/_internal/commands/uninstall.py", line 89, in run
    uninstall_pathset = req.uninstall(
  File "/opt/conda/lib/python3.8/site-packages/pip/_internal/req/req_install.py", line 686, in uninstall
    uninstalled_pathset.remove(auto_confirm, verbose)
  File "/opt/conda/lib/python3.8/site-packages/pip/_internal/req/req_uninstall.py", line 394, in remove
    moved.stash(path)
  Fil

In [3]:
# Check numpy version again (should be 1.22.0)
! pip freeze | grep numpy

numpy==1.22.0


In [19]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# ONNX model file that the build is launched on.
model_file = "finn_lenet.onnx"

# Folder that receives the outputs of the stitched-IP / OOC synthesis / rtlsim build.
rtlsim_output_dir = "output_ipstitch_ooc_rtlsim"

# Delete the results of a previous run, if any, so the build starts from an empty folder.
if os.path.exists(rtlsim_output_dir):
    shutil.rmtree(rtlsim_output_dir)
    print("Previous run results deleted!")

# Build configuration requesting three output products: the stitched IP,
# the rtlsim performance measurement and out-of-context synthesis.
# No explicit `steps` list is passed here.
cfg_stitched_ip = build.DataflowBuildConfig(
    output_dir          = rtlsim_output_dir,
    mvau_wwidth_max     = 12, # 80 in the original example; set to 12 to constrain owidth to a multiple of 12, as iwidth = 12
    target_fps          = 1000000, # requested throughput target
    synth_clk_period_ns = 10.0, # 10 ns clock period, i.e. 100 MHz
    fpga_part           = "xczu3eg-sbva484-1-i", # presumably the part on the target board -- confirm against the board documentation
    generate_outputs=[
        build_cfg.DataflowOutputType.STITCHED_IP,
        build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        build_cfg.DataflowOutputType.OOC_SYNTH,
    ]
)

Previous run results deleted!


In [20]:
%%time
# Launch the build for model_file with the stitched-IP configuration;
# the %%time cell magic reports how long the cell took to run.
build.build_dataflow_cfg(model_file, cfg_stitched_ip)

Building dataflow accelerator from finn_lenet.onnx
Intermediate outputs will be generated in /home/rstar900/finn/my_builds
Final outputs will be generated in output_ipstitch_ooc_rtlsim
Build log is at output_ipstitch_ooc_rtlsim/build_dataflow.log
Running step: step_qonnx_to_finn [1/17]
Running step: step_tidy_up [2/17]
Running step: step_streamline [3/17]
Running step: step_convert_to_hls [4/17]
Running step: step_create_dataflow_partition [5/17]
Running step: step_target_fps_parallelization [6/17]
Running step: step_apply_folding_config [7/17]
Running step: step_generate_estimate_reports [8/17]
Running step: step_hls_codegen [9/17]
Running step: step_hls_ipgen [10/17]
Running step: step_set_fifo_depths [11/17]
Running step: step_create_stitched_ip [12/17]
Running step: step_measure_rtlsim_performance [13/17]
Running step: step_out_of_context_synthesis [14/17]
Running step: step_synthesize_bitfile [15/17]
Running step: step_make_pynq_driver [16/17]
Running step: step_deployment_package

0

In [23]:
# Among the output products, we will find the accelerator exported as a stitched IP block design:
! ls {rtlsim_output_dir}/stitched_ip

all_verilog_srcs.txt		       finn_vivado_stitch_proj.xpr
data				       ip
finn_vivado_stitch_proj.cache	       make_project.sh
finn_vivado_stitch_proj.gen	       make_project.tcl
finn_vivado_stitch_proj.hw	       vivado.jou
finn_vivado_stitch_proj.ip_user_files  vivado.log
finn_vivado_stitch_proj.srcs


In [24]:
# We also have a few reports generated by these output products, different from the ones generated by ESTIMATE_REPORTS.
! ls {rtlsim_output_dir}/report

estimate_layer_resources_hls.json  rtlsim_performance.json
ooc_synth_and_timing.json


In [25]:
# In ooc_synth_and_timing.json we can find the post-synthesis resource utilization and the maximum clock frequency estimate for the accelerator.
# Note that the clock frequency estimate here tends to be optimistic, since out-of-context synthesis is less constrained.
! cat {rtlsim_output_dir}/report/ooc_synth_and_timing.json

{
  "vivado_proj_folder": "/home/rstar900/finn/my_builds/synth_out_of_context_j_hy7xco/results_finn_design_wrapper",
  "LUT": 8591.0,
  "LUTRAM": 644.0,
  "FF": 7921.0,
  "DSP": 0.0,
  "BRAM": 10.0,
  "BRAM_18K": 4.0,
  "BRAM_36K": 8.0,
  "URAM": 0.0,
  "Carry": 413.0,
  "WNS": 4.2,
  "Delay": 4.2,
  "vivado_version": 2022.1,
  "vivado_build_no": 3526262.0,
  "": 0,
  "fmax_mhz": 172.41379310344828,
  "estimated_throughput_fps": 8725.394387826329
}

In [27]:
# In rtlsim_performance.json we can find the steady-state throughput and latency for the accelerator, as obtained by rtlsim.
# If the DRAM bandwidth numbers reported here are below what the hardware platform is capable of
# (i.e. the accelerator is not memory-bound),
# you can expect the same steady-state throughput (excluding any software/driver overheads) in real hardware.

! cat {rtlsim_output_dir}/report/rtlsim_performance.json

{
  "cycles": 46299,
  "runtime[ms]": 0.46299,
  "throughput[images/s]": 2159.8738633663793,
  "DRAM_in_bandwidth[MB/s]": 3.3175662541307585,
  "DRAM_out_bandwidth[MB/s]": 0.04319747726732758,
  "fclk[mhz]": 100.0,
  "N": 1,
  "latency_cycles": 46299
}

In [28]:
# Finally, let's have a look at final_hw_config.json. 
# This is the node-by-node hardware configuration determined by the FINN compiler, 
# including FIFO depths, parallelization settings (PE/SIMD) and others. 
# If you want to optimize your build further (the "advanced" method we mentioned under "Configuring the performance"), 
# you can use this .json file as the folding_config_file 
# for a new run to use it as a starting point for further exploration and optimizations.

! cat {rtlsim_output_dir}/final_hw_config.json

{
  "Defaults": {},
  "StreamingFIFO_0": {
    "ram_style": "auto",
    "depth": 256,
    "impl_style": "rtl"
  },
  "ConvolutionInputGenerator_0": {
    "SIMD": 3,
    "ram_style": "distributed"
  },
  "MatrixVectorActivation_0": {
    "PE": 6,
    "SIMD": 3,
    "ram_style": "auto",
    "resType": "lut",
    "mem_mode": "decoupled",
    "runtime_writeable_weights": 0
  },
  "StreamingMaxPool_Batch_0": {
    "PE": 1
  },
  "StreamingFIFO_3": {
    "ram_style": "auto",
    "depth": 32,
    "impl_style": "rtl"
  },
  "StreamingDataWidthConverter_Batch_0": {
    "impl_style": "hls"
  },
  "StreamingFIFO_4": {
    "ram_style": "auto",
    "depth": 256,
    "impl_style": "rtl"
  },
  "ConvolutionInputGenerator_1": {
    "SIMD": 1,
    "ram_style": "distributed"
  },
  "StreamingDataWidthConverter_Batch_1": {
    "impl_style": "hls"
  },
  "StreamingFIFO_6": {
    "ram_style": "auto",
    "depth": 32,
    "impl_style": "rtl"
  },
  "MatrixVecto