diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py index 90095527364..ceabeaf0f54 100644 --- a/backends/nxp/backend/neutron_converter_manager.py +++ b/backends/nxp/backend/neutron_converter_manager.py @@ -67,6 +67,7 @@ def convert( target: str, delegation_tag: str, fetch_constants_to_sram: bool = False, + use_new_flow_neutron_c: bool = False, ) -> bytes: """ Call Neutron Converter. @@ -75,6 +76,7 @@ def convert( :param target: The target platform. :param delegation_tag: The delegation tag of model partition. :param fetch_constants_to_sram: Add microcode that fetches weights from external memory. + :param use_new_flow_neutron_c: Enable experimental MLIR-based flow for Neutron-C with improved INT8 operator support. This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers). :return: TFLite model with Neutron microcode as bytes. @@ -90,6 +92,7 @@ def convert( ) cctx.compilationOpts.fetchConstantsToSRAM = fetch_constants_to_sram cctx.compilationOpts.dumpKernelSelectionCode = self.dump_kernel_selection_code + cctx.compilationOpts.useNewFlowNeutronC = use_new_flow_neutron_c # Try to use multiprocessing for isolation, but fall back to direct execution # if the environment doesn't support it (e.g., in sandcastle/build environments) diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py index 38878465d58..f7ecb8a908e 100644 --- a/backends/nxp/nxp_backend.py +++ b/backends/nxp/nxp_backend.py @@ -50,6 +50,7 @@ def __init__(self): self.use_neutron_for_format_conversion = True self.fetch_constants_to_sram = False self.dump_kernel_selection_code = False + self.use_new_flow_neutron_c = False def _replace_colons(self, operator: str) -> str: """ @@ -65,20 +66,21 @@ def neutron_compile_spec( use_neutron_for_format_conversion: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, - ): - """ - Generate compile spec for Neutron 
NPU - - Args: - config: Neutron accelerator configuration, e.g. "imxrt700" - extra_flags: Extra flags for the Neutron compiler - operators_not_to_delegate: List of operators that should not be delegated - use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to + use_new_flow_neutron_c: bool = False, + ) -> "NeutronCompileSpecBuilder": + """Generate compile spec for Neutron NPU + + :param config: Neutron accelerator configuration, e.g. "imxrt700" + :param extra_flags: Extra flags for the Neutron compiler + :param operators_not_to_delegate: List of operators that should not be delegated + :param use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to ensure that the IO matches the executorch partition, which will be delegated to Neutron. - fetch_constants_to_sram: If True, the Neutron Converter will insert microinstructions to prefetch weights + :param fetch_constants_to_sram: If True, the Neutron Converter will insert microinstructions to prefetch weights from FLASH to SRAM. This should be used when the whole model does not fit into SRAM. - dump_kernel_selection_code: Whether Neutron converter dumps kernel selection code. + :param dump_kernel_selection_code: Whether Neutron converter dumps kernel selection code. + :param use_new_flow_neutron_c: Enable experimental MLIR-based flow for Neutron-C with improved INT8 operator support. 
+ :return: self for method chaining """ self.config = NeutronTargetSpec(config) @@ -100,6 +102,7 @@ def neutron_compile_spec( self.use_neutron_for_format_conversion = use_neutron_for_format_conversion self.fetch_constants_to_sram = fetch_constants_to_sram self.dump_kernel_selection_code = dump_kernel_selection_code + self.use_new_flow_neutron_c = use_new_flow_neutron_c return self @@ -128,6 +131,10 @@ def build(self): "dump_kernel_selection_code", f"{self.dump_kernel_selection_code}".encode(), ), + CompileSpec( + "use_new_flow_neutron_c", + f"{self.use_new_flow_neutron_c}".encode(), + ), ] return self.compile_spec @@ -141,6 +148,7 @@ def generate_neutron_compile_spec( use_neutron_for_format_conversion: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_new_flow_neutron_c: bool = False, ) -> List[CompileSpec]: return ( NeutronCompileSpecBuilder() @@ -151,6 +159,7 @@ def generate_neutron_compile_spec( use_neutron_for_format_conversion=use_neutron_for_format_conversion, fetch_constants_to_sram=fetch_constants_to_sram, dump_kernel_selection_code=dump_kernel_selection_code, + use_new_flow_neutron_c=use_new_flow_neutron_c, ) .build() ) @@ -175,6 +184,7 @@ def preprocess( # noqa C901 use_neutron_for_format_conversion = None fetch_constants_to_sram = False dump_kernel_selection_code = None + use_new_flow_neutron_c = False for spec in compile_spec: if spec.key == "output_format": output_format = spec.value.decode() @@ -188,6 +198,8 @@ def preprocess( # noqa C901 fetch_constants_to_sram = spec.value.decode() == "True" if spec.key == "dump_kernel_selection_code": dump_kernel_selection_code = spec.value.decode() == "True" + if spec.key == "use_new_flow_neutron_c": + use_new_flow_neutron_c = spec.value.decode() == "True" # Check that the output format is set in the compile spec if not output_format: @@ -220,7 +232,11 @@ def preprocess( # noqa C901 ) neutron_model = NeutronConverterManager(dump_kernel_selection_code).convert( - 
tflite_model, target, delegation_tag, fetch_constants_to_sram + tflite_model, + target, + delegation_tag, + fetch_constants_to_sram, + use_new_flow_neutron_c, ) # Dump the tflite file if logging level is enabled diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py index f1afbd2d0f7..bfe7aca0e27 100644 --- a/backends/nxp/tests/executorch_pipeline.py +++ b/backends/nxp/tests/executorch_pipeline.py @@ -130,6 +130,7 @@ def to_quantized_edge_program( use_quant_state_dict: bool = True, fetch_constants_to_sram: bool = False, dump_kernel_selection_code: bool = False, + use_new_flow_neutron_c: bool = False, ) -> EdgeProgramManager: _neutron_target_spec = NeutronTargetSpec(target) if get_quantizer_fn is None: @@ -160,6 +161,7 @@ def to_quantized_edge_program( use_neutron_for_format_conversion=use_neutron_for_format_conversion, fetch_constants_to_sram=fetch_constants_to_sram, dump_kernel_selection_code=dump_kernel_selection_code, + use_new_flow_neutron_c=use_new_flow_neutron_c, ) post_quant_state_dict = ( exir_program_aten__module_quant.state_dict() if use_quant_state_dict else None diff --git a/backends/nxp/tests/test_neutron_converter_manager.py b/backends/nxp/tests/test_neutron_converter_manager.py index aea0afe0c6f..a787df5897c 100644 --- a/backends/nxp/tests/test_neutron_converter_manager.py +++ b/backends/nxp/tests/test_neutron_converter_manager.py @@ -3,7 +3,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import multiprocessing + import torch +from eiq_neutron_sdk.neutron_converter.neutron_converter import CompilationContext from executorch import exir from executorch.backends.nxp.backend.edge_program_converter import ( @@ -56,3 +59,17 @@ def test_conv2d_neutron_conversion__prefetching(mocker): assert len(neutron_model_prefetch) != len( neutron_model_regular ), "The weight prefetching flag does not make a difference!" 
+ + +def test_neutron_converter_with_experimental_mlir_flow(mocker): + model = LinearModule(True) + input_shape = (1, 1, 32, 32) + + process_spy = mocker.spy(multiprocessing, "Process") + to_quantized_edge_program( + model, input_shape, use_new_flow_neutron_c=True + ).exported_program() + + compilation_context = process_spy.call_args.kwargs["args"][2] + assert isinstance(compilation_context, CompilationContext) + assert compilation_context.compilationOpts.useNewFlowNeutronC diff --git a/backends/nxp/tests_models/executors.py b/backends/nxp/tests_models/executors.py index dd1aca29125..6cd8e8d1987 100644 --- a/backends/nxp/tests_models/executors.py +++ b/backends/nxp/tests_models/executors.py @@ -68,6 +68,7 @@ def _run_delegated_executorch_program( mocker, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_new_flow_neutron_c: bool = False, ) -> ExportedProgram: if len(input_spec) == 1: # Single input, use --dataset @@ -116,6 +117,7 @@ def wrapper(*args, **kwargs): delegate_to_npu=True, use_qat=use_qat, train_fn=train_fn, + use_new_flow_neutron_c=use_new_flow_neutron_c, ) except RuntimeError as e: if "Model converted with neutron-converter has" in str(e): @@ -375,6 +377,7 @@ def convert_run_compare( reference_model: ReferenceModel = ReferenceModel.QUANTIZED_EXECUTORCH_CPP, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_new_flow_neutron_c: bool = False, ): """ Run provided program twice with neutron-test and check if results correspond. At first, @@ -391,6 +394,7 @@ def convert_run_compare( :param mocker: Mocker instance used by visualizer. :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training). :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`. + :param use_new_flow_neutron_c: Enable experimental MLIR-based flow for Neutron-C with improved INT8 operator support. 
""" assert_NSYS() @@ -432,6 +436,7 @@ def convert_run_compare( mocker, use_qat=use_qat, train_fn=train_fn, + use_new_flow_neutron_c=use_new_flow_neutron_c, ) output_spec = _get_program_output_spec(delegated_program) diff --git a/backends/nxp/tests_models/utils.py b/backends/nxp/tests_models/utils.py index a27f05adb33..06e09827746 100644 --- a/backends/nxp/tests_models/utils.py +++ b/backends/nxp/tests_models/utils.py @@ -67,6 +67,7 @@ def to_quantized_edge_program( delegate_to_npu=True, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_new_flow_neutron_c: bool = False, ) -> EdgeProgramManager: assert isinstance(input_spec, list) and all( isinstance(spec, ModelInputSpec) for spec in input_spec @@ -157,7 +158,9 @@ def to_quantized_edge_program( ( [ NeutronPartitioner( - generate_neutron_compile_spec("imxrt700"), + generate_neutron_compile_spec( + "imxrt700", use_new_flow_neutron_c=use_new_flow_neutron_c + ), neutron_target_spec=neutron_target_spec, post_quantization_state_dict=exir_program_aten_quant.state_dict(), ) @@ -186,6 +189,7 @@ def to_quantized_executorch_program( delegate_to_npu=True, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_new_flow_neutron_c: bool = False, ) -> ExecutorchProgramManager: edge_program_manager = to_quantized_edge_program( model, @@ -194,6 +198,7 @@ def to_quantized_executorch_program( delegate_to_npu, use_qat=use_qat, train_fn=train_fn, + use_new_flow_neutron_c=use_new_flow_neutron_c, ) return edge_program_manager.to_executorch( diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py index 385e811fdaa..dda223c5650 100644 --- a/examples/nxp/aot_neutron_compile.py +++ b/examples/nxp/aot_neutron_compile.py @@ -240,6 +240,13 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): action="store_true", help="This feature allows running models which do not fit into SRAM by offloading them to an 
external memory.", ) + parser.add_argument( "--use_new_flow_neutron_c", + required=False, + default=False, + action="store_true", + help="Enable experimental MLIR-based flow for Neutron-C with improved INT8 operator support.", + ) args = parser.parse_args() @@ -323,6 +330,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool): operators_not_to_delegate=args.operators_not_to_delegate, fetch_constants_to_sram=args.fetch_constants_to_sram, dump_kernel_selection_code=args.dump_kernel_selection_code, + use_new_flow_neutron_c=args.use_new_flow_neutron_c, ) partitioners = ( [