In [1]:
import finn
from finn.util.visualization import showSrc, showInNetron
from finn.util.basic import make_build_dir
import os
import onnx

# Location of the exported QONNX model. `file_name` carries its own leading "/"
# so that a path can be formed by plain concatenation with `file_path`.
file_path = '../workspace/jh_fpga_amr/src/py/models'
file_name = '/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx'  # Change as needed.

# Load the model with onnx and open the same file in the Netron viewer.
source_model_path = file_path + file_name
qonnx_model = onnx.load(source_model_path)
showInNetron(source_model_path)

Serving '../workspace/jh_fpga_amr/src/py/models/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx' at http://0.0.0.0:8081


In [2]:
from qonnx.util.cleanup import cleanup

# Run the QONNX cleanup utility on the source model and write the result
# to <file_path>/cleanup, keeping the original file name.
export_onnx_path_cleaned = file_path + '/cleanup' + file_name
cleanup_dir = os.path.dirname(export_onnx_path_cleaned)
os.makedirs(cleanup_dir, exist_ok=True)  # tolerate an already existing directory
cleanup(file_path + file_name, out_file=export_onnx_path_cleaned)
showInNetron(export_onnx_path_cleaned)

Stopping http://0.0.0.0:8081
Serving '../workspace/jh_fpga_amr/src/py/models/cleanup/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx' at http://0.0.0.0:8081


In [3]:
from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
from qonnx.core.modelwrapper import ModelWrapper

# Wrap the cleaned model in a ModelWrapper and apply the QONNX -> FINN conversion.
cleaned_wrapper = ModelWrapper(export_onnx_path_cleaned)
model = cleaned_wrapper.transform(ConvertQONNXtoFINN())

# Save the converted model to <file_path>/conv and open it in Netron.
export_onnx_path_converted = file_path + '/conv' + file_name
converted_dir = os.path.dirname(export_onnx_path_converted)
os.makedirs(converted_dir, exist_ok=True)
model.save(export_onnx_path_converted)
showInNetron(export_onnx_path_converted)

Stopping http://0.0.0.0:8081
Serving '../workspace/jh_fpga_amr/src/py/models/conv/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx' at http://0.0.0.0:8081


In [4]:
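# NOTE: Everything in this cell is commented out, so running it has no effect.
# It is a disabled, manually assembled transformation sequence kept for reference only.
#from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs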
#from qonnx.transformation.infer_shapes import InferShapes
#from qonnx.transformation.infer_datatypes import InferDataTypes
#from qonnx.transformation.fold_constants import FoldConstants
#from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors
#import finn.transformation.streamline.absorb as absorb
#import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
#from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
#from finn.transformation.fpgadataflow.convert_to_hw_layers import InferChannelwiseLinearLayer, InferLabelSelectLayer, InferStreamingMaxPool
#from finn.transformation.streamline import Streamline
#from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
#from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
#import finn.transformation.streamline.absorb as absorb
#from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
#from qonnx.transformation.infer_data_layouts import InferDataLayouts

#from qonnx.transformation.base import Transformation
#from finn.transformation.qonnx.qonnx_activation_handlers import QuantReluHandler

#model = model.transform(InferShapes())
#model = model.transform(FoldConstants())
#model = model.transform(GiveUniqueNodeNames())
#model = model.transform(InferChannelwiseLinearLayer())
#model = model.transform(InferLabelSelectLayer())
#model = model.transform(MoveScalarLinearPastInvariants())

#model = model.transform(MakeMaxPoolNHWC())
#model = model.transform(absorb.AbsorbConsecutiveTransposes())
#model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
#model = model.transform(ConvertBipolarMatMulToXnorPopcount())
#model = model.transform(Streamline())
#model = model.transform(InferStreamingMaxPool())
#model = model.transform(GiveReadableTensorNames())
#model = model.transform(InferDataTypes())
#model = model.transform(RemoveStaticGraphInputs())

In [5]:
from finn.transformation.streamline import Streamline
from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors
from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.general import RemoveUnusedTensors, GiveUniqueNodeNames
from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition

# Transformations applied to `model`, in this exact order. Each one is
# instantiated right before it is applied, and the model returned by
# transform() is the input of the next step.
transformation_sequence = (
    Streamline,               # Only single model path supported.
    Change3DTo4DTensors,      # Necessary, FINN doesn't like 1d.
    LowerConvsToMatMul,       # Also necessary for build, need the 4D conversion first.
    # Disabled steps, kept for reference:
    #   absorb.AbsorbScalarMulAddIntoTopK
    #   to_hw.InferChannelwiseLinearLayer
    #   to_hw.InferLabelSelectLayer
    GiveUniqueNodeNames,
    RemoveUnusedTensors,
    CreateDataflowPartition,
)
for transformation_cls in transformation_sequence:
    model = model.transform(transformation_cls())

# Save the transformed model to <file_path>/verif and open it in Netron.
verif_model_filename = file_path + '/verif' + file_name
verif_dir = os.path.dirname(verif_model_filename)
os.makedirs(verif_dir, exist_ok=True)
model.save(verif_model_filename)
showInNetron(verif_model_filename)

Stopping http://0.0.0.0:8081
Serving '../workspace/jh_fpga_amr/src/py/models/verif/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx' at http://0.0.0.0:8081


In [6]:
## Runs a resource estimate build.
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# Model to build: the file stored under <file_path>/verif.
model_file = verif_model_filename = file_path + '/verif' + file_name

# Directory that receives the estimate reports.
estimates_output_dir = file_path + '/output_estimates_new' + file_name

# Start from a clean slate: remove the output of an earlier run, if any.
previous_results_exist = os.path.exists(estimates_output_dir)
if previous_results_exist:
    shutil.rmtree(estimates_output_dir)
    print("Previous run results deleted!")

# Build options for an estimate-only run (no synthesis outputs requested).
estimate_build_options = dict(
    output_dir=estimates_output_dir,
    mvau_wwidth_max=80,
    target_fps=1000000,
    synth_clk_period_ns=10.0,
    fpga_part="xc7z020clg400-1",
    steps=build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[build_cfg.DataflowOutputType.ESTIMATE_REPORTS],
)
cfg_estimates = build.DataflowBuildConfig(**estimate_build_options)

In [7]:
%%time
# Run the estimate-only build. build_dataflow_cfg signals the outcome through
# its integer return value (0 on success; see the FINN builder documentation),
# so check it here instead of discarding it. Otherwise a failed build would only
# surface later, when the report files cannot be opened.
estimate_build_status = build.build_dataflow_cfg(model_file, cfg_estimates)
if estimate_build_status != 0:
    raise RuntimeError(
        f"Estimate build failed with status {estimate_build_status}; "
        f"see {estimates_output_dir}/build_dataflow.log"
    )

Building dataflow accelerator from ../workspace/jh_fpga_amr/src/py/models/verif/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx
Intermediate outputs will be generated in /tmp/finn_dev_rothej
Final outputs will be generated in ../workspace/jh_fpga_amr/src/py/models/output_estimates_new/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx
Build log is at ../workspace/jh_fpga_amr/src/py/models/output_estimates_new/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx/build_dataflow.log
Running step: step_qonnx_to_finn [1/10]
Running step: step_tidy_up [2/10]
Running step: step_streamline [3/10]
Running step: step_convert_to_hw [4/10]
Running step: step_create_dataflow_partition [5/10]
Running step: step_specialize_layers [6/10]
Running step: step_target_fps_parallelization [7/10]
Running step: step_apply_folding_config [8/10]
Running step: step_minimize_bit_width [9/10]
Running step: step_generate_estimate_reports [10/10]
Completed successfully
CPU times: user 4.46 s, sys: 651 ms, total: 5.11 s
Wall time: 4.34 s


In [8]:
import json

# Layer resource estimate report written by the estimate build.
json_file_path = f"{estimates_output_dir}/report/estimate_layer_resources.json"

with open(json_file_path) as report_file:
    data = json.load(report_file)

# Only the report's 'total' entry is printed below.
total_resources = data['total']

# Print utilization details, one line per resource type.
print("Resource Utilization:")
for resource in ("BRAM_18K", "LUT", "URAM", "DSP"):
    print(f"{resource}: {total_resources[resource]}")

Resource Utilization:
BRAM_18K: 22.0
LUT: 3580.0
URAM: 0.0
DSP: 24.0


In [9]:
# Network performance estimate report written by the estimate build.
json_file_path = f"{estimates_output_dir}/report/estimate_network_performance.json"

with open(json_file_path) as report_file:
    data = json.load(report_file)

# Print network performance estimates. These tend to over-estimate because the
# effects of various synth optimizations cannot be captured.
print("Network Performance:")
for metric, estimate in data.items():
    print(f"{metric}: {estimate}")

Network Performance:
critical_path_cycles: 260
max_cycles: 98
max_cycles_node_name: ConvolutionInputGenerator_rtl_0
estimated_throughput_fps: 1020408.1632653062
estimated_latency_ns: 2600.0


In [10]:
# Per-layer cycle estimate report written by the estimate build.
json_file_path = f"{estimates_output_dir}/report/estimate_layer_cycles.json"

with open(json_file_path) as report_file:
    data = json.load(report_file)

# All layers run in parallel, so the slowest layer determines overall throughput.
# FINN tries to parallelize each layer so that they all take a similar number of cycles.
# Summing up all layer cycle estimates gives the overall network latency.
print("Layer Cycles:")
for layer_name, cycle_count in data.items():
    print(f"{layer_name}: {cycle_count}")

Layer Cycles:
FMPadding_rtl_0: 34
ConvolutionInputGenerator_rtl_0: 98
MVAU_rtl_0: 64
Thresholding_rtl_0: 64


In [11]:
## Runs a synth build to view rtlsim performance. All pulled from 3-build-accelerator-with-finn.
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# Model to build: the file saved earlier as `verif_model_filename`.
model_file = verif_model_filename

# Directory that receives the stitched-IP / rtlsim / synthesis results.
rtlsim_output_dir = file_path + '/output_rtl' + file_name

# Start from a clean slate: remove the output of an earlier run, if any.
previous_results_exist = os.path.exists(rtlsim_output_dir)
if previous_results_exist:
    shutil.rmtree(rtlsim_output_dir)
    print("Previous run results deleted!")

# Outputs requested from the build.
requested_outputs = [
    build_cfg.DataflowOutputType.STITCHED_IP,
    build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
    build_cfg.DataflowOutputType.OOC_SYNTH,
]

# Build options; no `steps` entry is passed here, unlike the estimate-only config.
stitched_ip_build_options = dict(
    output_dir=rtlsim_output_dir,
    mvau_wwidth_max=80,
    target_fps=1000000,
    synth_clk_period_ns=10.0,
    fpga_part="xc7z020clg400-1",
    generate_outputs=requested_outputs,
)
cfg_stitched_ip = build.DataflowBuildConfig(**stitched_ip_build_options)

Previous run results deleted!


In [12]:
## Note: This will take ~ 10 mins to complete, uses Vivado for build.
#%%time
# build_dataflow_cfg signals the outcome through its integer return value
# (0 on success; see the FINN builder documentation). Check it here so a failed
# build stops the notebook, instead of the following cells trying to read
# synthesis/rtlsim reports that were never written.
stitched_ip_build_status = build.build_dataflow_cfg(model_file, cfg_stitched_ip)
if stitched_ip_build_status != 0:
    raise RuntimeError(
        f"Stitched-IP build failed with status {stitched_ip_build_status}; "
        f"see {rtlsim_output_dir}/build_dataflow.log"
    )

Building dataflow accelerator from ../workspace/jh_fpga_amr/src/py/models/verif/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx
Intermediate outputs will be generated in /tmp/finn_dev_rothej
Final outputs will be generated in ../workspace/jh_fpga_amr/src/py/models/output_rtl/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx
Build log is at ../workspace/jh_fpga_amr/src/py/models/output_rtl/vgglike_6f_6c_5re_5mp_pr0.3_quant8.onnx/build_dataflow.log
Running step: step_qonnx_to_finn [1/19]
Running step: step_tidy_up [2/19]
Running step: step_streamline [3/19]
Running step: step_convert_to_hw [4/19]
Running step: step_create_dataflow_partition [5/19]
Running step: step_specialize_layers [6/19]
Running step: step_target_fps_parallelization [7/19]
Running step: step_apply_folding_config [8/19]
Running step: step_minimize_bit_width [9/19]
Running step: step_generate_estimate_reports [10/19]
Running step: step_hw_codegen [11/19]
Running step: step_hw_ipgen [12/19]
Running step: step_set_fifo_depths [13/19]
Runni

0

In [13]:
# Out-of-context synthesis and timing report written by the build.
json_file_path = f"{rtlsim_output_dir}/report/ooc_synth_and_timing.json"

with open(json_file_path) as report_file:
    data = json.load(report_file)

## Print hardware utilization estimates post-synthesis.
# LUT    - number of LUTs used.
# LUTRAM - number of LUTs configured as RAM.
# FF     - number of FFs used.
# DSP    - number of DSP blocks used. Synth usually tries to conserve these
#          when not needed because they are valuable.
# BRAM   - total block RAM tiles used.
# Carry  - carry chains used (arithmetic operations).
# WNS    - worst negative slack; a positive value means timing is met.
print("RTL Hardware Util:")
for field, reported in data.items():
    print(f"{field}: {reported}")

RTL Hardware Util:
vivado_proj_folder: /tmp/finn_dev_rothej/synth_out_of_context_i65c8lvo/results_finn_design_wrapper
LUT: 4471.0
LUTRAM: 220.0
FF: 9346.0
DSP: 0.0
BRAM: 35.0
BRAM_18K: 48.0
BRAM_36K: 11.0
URAM: 0.0
Carry: 590.0
WNS: 0.466
Delay: 0.466
vivado_version: 2022.2
vivado_build_no: 3671981.0
: 0
fmax_mhz: 104.88777008600796
estimated_throughput_fps: 1070283.368224571


In [14]:
# RTL simulation performance report written by the build.
json_file_path = f"{rtlsim_output_dir}/report/rtlsim_performance.json"

with open(json_file_path) as report_file:
    data = json.load(report_file)

## Print performance estimates post-synthesis.
# N_IN_TXNS      - number of input transactions.
# N_OUT_TXNS     - number of output transactions.
# cycles         - total number of clk cycles for the process.
# N              - number of operations (batch size) handled in a single cycle.
# latency_cycles - total cycle count.
print("RTL Performance Est:")
for field, measured in data.items():
    print(f"{field}: {measured}")

RTL Performance Est:
N_IN_TXNS: 32
N_OUT_TXNS: 64
cycles: 131
N: 1
latency_cycles: 131
runtime[ms]: 0.0013100000000000002
throughput[images/s]: 763358.7786259541
fclk[mhz]: 100.0
stable_throughput[images/s]: 763358.7786259541
