Add a flag to indicate whether to run mixed precision according to resource utilization provided by user. #1052

Merged (6 commits, May 6, 2024)
Changes from 3 commits
model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py
@@ -13,26 +13,30 @@
# limitations under the License.
# ==============================================================================
import numpy as np
from typing import Callable, Any
from typing import Callable, Any, Dict, Tuple

from model_compression_toolkit.constants import FLOAT_BITWIDTH
from model_compression_toolkit.constants import FLOAT_BITWIDTH, BITS_TO_BYTES
from model_compression_toolkit.core import FrameworkInfo, ResourceUtilization, CoreConfig
from model_compression_toolkit.core.common import Graph
from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation
from model_compression_toolkit.core.common.graph.edge import EDGE_SINK_INDEX
from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner
from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities
from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities, \
QuantizationConfigOptions


def compute_resource_utilization_data(in_model: Any,
representative_data_gen: Callable,
core_config: CoreConfig,
tpc: TargetPlatformCapabilities,
fw_info: FrameworkInfo,
fw_impl: FrameworkImplementation) -> ResourceUtilization:
fw_impl: FrameworkImplementation,
transformed_graph: Graph = None,
mixed_precision_enable: bool = True) -> ResourceUtilization:
"""
Compute Resource Utilization information that can be relevant for defining target ResourceUtilization for mixed precision search.
Calculates maximal activation tensor, sum of weights' parameters and total (sum of both).
Calculates the maximal activation tensor size, the sum of the model's weight parameters, and the total memory
combining both the weights memory and the maximal activation tensor size.

Args:
in_model: Model to build a graph from (the model that is intended to be quantized).
@@ -42,27 +46,37 @@ def compute_resource_utilization_data(in_model: Any,
the attached framework operator's information.
fw_info: Information needed for quantization about the specific framework.
fw_impl: FrameworkImplementation object with a specific framework methods implementation.
transformed_graph: An internal graph representation of the input model. Defaults to None.
If no graph is provided, a graph will be automatically generated
using the specified model.
mixed_precision_enable: Indicates if mixed precision is enabled, defaults to True.
If disabled, computes resource utilization using base quantization
configurations across all layers.

Returns:
ResourceUtilization: An object encapsulating the calculated resource utilization computations.

Returns: A ResourceUtilization object with the results.

"""

# We assume that the resource_utilization_data API is used to compute the model resource utilization for
# mixed precision scenario, so we run graph preparation under the assumption of enabled mixed precision.
transformed_graph = graph_preparation_runner(in_model,
representative_data_gen,
core_config.quantization_config,
fw_info,
fw_impl,
tpc,
mixed_precision_enable=True)
if transformed_graph is None:
transformed_graph = graph_preparation_runner(in_model,
representative_data_gen,
core_config.quantization_config,
fw_info,
fw_impl,
tpc,
mixed_precision_enable=mixed_precision_enable)

# Compute parameters sum
weights_params = compute_nodes_weights_params(graph=transformed_graph, fw_info=fw_info)
weights_memory_bytes, weights_params = compute_nodes_weights_params(graph=transformed_graph, fw_info=fw_info)
total_weights_params = 0 if len(weights_params) == 0 else sum(weights_params)

# Compute max activation tensor
activation_output_sizes = compute_activation_output_sizes(graph=transformed_graph)
activation_output_sizes_bytes, activation_output_sizes = compute_activation_output_sizes(graph=transformed_graph)
max_activation_tensor_size = 0 if len(activation_output_sizes) == 0 else max(activation_output_sizes)

# Compute total memory utilization - parameters sum + max activation tensor
@@ -78,20 +92,23 @@ def compute_resource_utilization_data(in_model: Any,
bops=bops_count)
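
A rough standalone sketch of how the per-node arrays are folded into totals (the per-node numbers are invented; the construction of the ResourceUtilization object itself is collapsed in this diff, so only the reductions visible above are reproduced):

import numpy as np

# Hypothetical per-node parameter counts, as returned by the two helper functions in this file
weights_params = np.array([4_718_592, 73_728, 2_359_296])
activation_output_sizes = np.array([802_816, 200_704, 100_352])

total_weights_params = 0 if len(weights_params) == 0 else sum(weights_params)                          # 7,151,616
max_activation_tensor_size = 0 if len(activation_output_sizes) == 0 else max(activation_output_sizes)  # 802,816
total_memory = total_weights_params + max_activation_tensor_size  # parameters sum + max activation tensor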


def compute_nodes_weights_params(graph: Graph, fw_info: FrameworkInfo) -> np.ndarray:
def compute_nodes_weights_params(graph: Graph, fw_info: FrameworkInfo) -> Tuple[np.ndarray, np.ndarray]:
"""
Computes a vector with the respective weights' parameters size for each node.
Calculates the memory usage in bytes and the number of weight parameters for each node within a graph.
Memory calculations are based on the maximum bit-width used for quantization per node.

Args:
graph: Finalized Graph object.
fw_info: FrameworkInfo object about the specific framework
(e.g., attributes of different layers' weights to quantize).

Returns: A vector of node's weights memory sizes.

graph: A finalized Graph object, representing the model structure.
fw_info: FrameworkInfo object containing details about the specific framework's
quantization attributes for different layers' weights.

Returns:
A tuple containing two arrays:
- The first array represents the memory in bytes for each node's weights when quantized at the maximal bit-width.
- The second array represents the total number of weight parameters for each node.
"""

weights_params = []
weights_memory_bytes = []
for n in graph.nodes:
# TODO: when enabling multiple attribute quantization by default (currently,
# only kernel quantization is enabled) we should include other attributes memory in the sum of all
@@ -100,36 +117,51 @@ def compute_nodes_weights_params(graph: Graph, fw_info: FrameworkInfo) -> np.ndarray:
kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0]
if kernel_attr is not None and not n.reuse:
kernel_candidates = n.get_all_weights_attr_candidates(kernel_attr)

if len(kernel_candidates) > 0 and any([c.enable_weights_quantization for c in kernel_candidates]):
max_weight_bits = max([kc.weights_n_bits for kc in kernel_candidates])
node_num_weights_params = 0
for attr in fw_info.get_kernel_op_attributes(n.type):
if attr is not None:
node_num_weights_params += n.get_weights_by_keys(attr).flatten().shape[0]

weights_params.append(node_num_weights_params)

return np.array(weights_params)
# multiply num params by num bits and divide by BITS_TO_BYTES to convert from bits to bytes
weights_memory_bytes.append(node_num_weights_params * max_weight_bits / BITS_TO_BYTES)

return np.array(weights_memory_bytes), np.array(weights_params)
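
For reference, a minimal standalone sketch of the per-node byte computation this helper performs (the layer shape and candidate bit-widths are made-up; BITS_TO_BYTES is assumed to equal 8):

# Hypothetical conv kernel of shape 3x3x64x128 with 2/4/8-bit weight candidates
node_num_weights_params = 3 * 3 * 64 * 128      # 73,728 parameters
max_weight_bits = max([2, 4, 8])                # memory is reported for the maximal candidate bit-width
BITS_TO_BYTES = 8                               # assumption: bits-per-byte conversion constant

weights_memory_bytes = node_num_weights_params * max_weight_bits / BITS_TO_BYTES
print(weights_memory_bytes)                     # 73728.0 bytes (~72 KiB)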

def compute_activation_output_sizes(graph: Graph) -> np.ndarray:
def compute_activation_output_sizes(graph: Graph) -> Tuple[np.ndarray, np.ndarray]:
"""
Computes a vector with the respective output tensor size for each node.
Computes, for each node, the activation output tensor size and the activation output tensor size in bytes
(based on the maximal activation bit-width candidate).

Args:
graph: Finalized Graph object.
graph: A finalized Graph object, representing the model structure.

Returns:
A tuple containing two arrays:
- The first array represents each node's activation output tensor size in bytes,
calculated using the maximal bit-width for quantization.
- The second array represents each node's activation output tensor size (number of output parameters).

Returns: A vector of node's activation output size.

"""

activation_outputs = []
# Go over all nodes that have configurable activation.
activation_outputs_bytes = []
for n in graph.nodes:
# Go over all nodes that have configurable activation.
if n.has_activation_quantization_enabled_candidate():
# Fetch maximum bits required for quantizing activations
max_activation_bits = max([qc.activation_quantization_cfg.activation_n_bits for qc in n.candidates_quantization_cfg])
node_output_size = n.get_total_output_params()
activation_outputs.append(node_output_size)
# Calculate activation size in bytes and append to list
activation_outputs_bytes.append(node_output_size * max_activation_bits / BITS_TO_BYTES)

return np.array(activation_outputs)
return np.array(activation_outputs_bytes), np.array(activation_outputs)


def compute_total_bops(graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation) -> np.ndarray:
@@ -162,3 +194,56 @@ def compute_total_bops(graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation) -> np.ndarray:
bops.append(node_bops)

return np.array(bops)


def requires_mixed_precision(in_model: Any,
target_resource_utilization: ResourceUtilization,
representative_data_gen: Callable,
core_config: CoreConfig,
tpc: TargetPlatformCapabilities,
fw_info: FrameworkInfo,
fw_impl: FrameworkImplementation) -> bool:
"""
The function checks whether the model requires mixed precision to meet the requested target resource utilization.
This is determined by whether the target weights memory is smaller than the weights memory of the model when
quantized with the maximal candidate bit-widths, the target activation memory is smaller than the maximal
activation tensor size, the target total memory is smaller than the sum of both, or the target number of BOPs
is smaller than the model's BOPs count. If any of these conditions holds, the function returns True;
otherwise, it returns False.

Args:
in_model: The model to be evaluated.
target_resource_utilization: The target resource utilization requested for the quantized model.
representative_data_gen: A function that generates representative data for the model.
core_config: CoreConfig containing parameters of how the model should be quantized.
tpc: TargetPlatformCapabilities object that models the inference target platform and
the attached framework operator's information.
fw_info: Information needed for quantization about the specific framework.
fw_impl: FrameworkImplementation object with a specific framework methods implementation.

Returns: A boolean indicating if mixed precision is needed.
"""
is_mixed_precision = False
transformed_graph = graph_preparation_runner(in_model,
representative_data_gen,
core_config.quantization_config,
fw_info,
fw_impl,
tpc,
mixed_precision_enable=False)
# Compute max weights memory in bytes
weights_memory_by_layer_bytes, _ = compute_nodes_weights_params(transformed_graph, fw_info)
total_weights_memory_bytes = 0 if len(weights_memory_by_layer_bytes) == 0 else sum(weights_memory_by_layer_bytes)

# Compute max activation tensor in bytes
activation_output_sizes_bytes, _ = compute_activation_output_sizes(transformed_graph)
max_activation_tensor_size_bytes = 0 if len(activation_output_sizes_bytes) == 0 else max(activation_output_sizes_bytes)

# Compute BOPS utilization - total count of bit-operations for all configurable layers with kernel
bops_count = compute_total_bops(graph=transformed_graph, fw_info=fw_info, fw_impl=fw_impl)
bops_count = np.inf if len(bops_count) == 0 else sum(bops_count)

is_mixed_precision |= target_resource_utilization.weights_memory < total_weights_memory_bytes
is_mixed_precision |= target_resource_utilization.activation_memory < max_activation_tensor_size_bytes
is_mixed_precision |= target_resource_utilization.total_memory < total_weights_memory_bytes + max_activation_tensor_size_bytes
is_mixed_precision |= target_resource_utilization.bops < bops_count
return is_mixed_precision
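
To make the decision rule concrete, here is a small numeric sketch of the final comparisons (the utilization numbers are invented; the ResourceUtilization keyword names are assumed to match the attributes accessed above, and unset fields are assumed to default to infinity, i.e., unconstrained):

from model_compression_toolkit.core import ResourceUtilization

# Single-precision (maximal bit-width) utilization computed for a hypothetical model
total_weights_memory_bytes = 250_000
max_activation_tensor_size_bytes = 60_000
bops_count = 1_000_000_000

# Requested target: weights fit as-is, but the activation budget is tighter than single precision allows
target_resource_utilization = ResourceUtilization(weights_memory=300_000,
                                                  activation_memory=40_000)

is_mixed_precision = False
is_mixed_precision |= target_resource_utilization.weights_memory < total_weights_memory_bytes           # False
is_mixed_precision |= target_resource_utilization.activation_memory < max_activation_tensor_size_bytes  # True
is_mixed_precision |= target_resource_utilization.total_memory < (total_weights_memory_bytes
                                                                  + max_activation_tensor_size_bytes)
is_mixed_precision |= target_resource_utilization.bops < bops_count
print(is_mixed_precision)  # True -> mixed precision is needed to meet the activation budget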
13 changes: 12 additions & 1 deletion model_compression_toolkit/core/runner.py
@@ -20,6 +20,8 @@

from model_compression_toolkit.core.common import FrameworkInfo
from model_compression_toolkit.core.common.hessian.hessian_info_service import HessianInfoService
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_data import \
requires_mixed_precision
from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner
from model_compression_toolkit.core.quantization_prep_runner import quantization_preparation_runner
from model_compression_toolkit.logger import Logger
@@ -89,7 +91,16 @@ def core_runner(in_model: Any,
if core_config.mixed_precision_config is None:
Logger.critical("Provided an initialized target_resource_utilization, that means that mixed precision quantization is "
"enabled, but the provided MixedPrecisionQuantizationConfig is None.")
core_config.mixed_precision_config.set_mixed_precision_enable()
# Determine whether to use mixed precision or single precision based on target_resource_utilization.
if requires_mixed_precision(in_model,
target_resource_utilization,
representative_data_gen,
core_config,
tpc,
fw_info,
fw_impl):
core_config.mixed_precision_config.set_mixed_precision_enable()
Logger.info('Mixed precision enabled.')

graph = graph_preparation_runner(in_model,
representative_data_gen,
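
From the user's perspective the API is unchanged; the new check only decides internally whether the mixed precision search actually runs. A hedged sketch of a typical call (the public PTQ entry point and its argument names are assumed from MCT's API at the time, not taken from this diff):

import numpy as np
import model_compression_toolkit as mct
from keras.applications import MobileNetV2

def representative_data_gen():
    yield [np.random.randn(1, 224, 224, 3).astype('float32')]

# A loose target that single precision already meets -> requires_mixed_precision() returns False
# and the model is quantized with the base (single precision) configuration.
# Tightening weights_memory below the 8-bit footprint flips the decision and enables the search.
target_ru = mct.core.ResourceUtilization(weights_memory=10_000_000)

quantized_model, quantization_info = mct.ptq.keras_post_training_quantization(
    MobileNetV2(),
    representative_data_gen,
    target_resource_utilization=target_ru,
    core_config=mct.core.CoreConfig())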
4 changes: 3 additions & 1 deletion tests/common_tests/helpers/generate_test_tp_model.py
@@ -74,7 +74,7 @@ def generate_mixed_precision_test_tp_model(base_cfg, default_config, mp_bitwidth_candidates_list,
name=name)


def generate_tp_model_with_activation_mp(base_cfg, default_config, mp_bitwidth_candidates_list,
def generate_tp_model_with_activation_mp(base_cfg, default_config, mp_bitwidth_candidates_list, custom_opsets=[],
name="activation_mp_model"):
mp_op_cfg_list = []
for weights_n_bits, activation_n_bits in mp_bitwidth_candidates_list:
@@ -99,6 +99,8 @@ def generate_tp_model_with_activation_mp(base_cfg, default_config, mp_bitwidth_candidates_list, custom_opsets=[],
operator_sets_dict = {op_set.name: mixed_precision_configuration_options for op_set in base_tp_model.operator_set
if op_set.name is not "NoQuantization"}
operator_sets_dict["Input"] = mixed_precision_configuration_options
for c_ops in custom_opsets:
operator_sets_dict[c_ops] = mixed_precision_configuration_options

return generate_custom_test_tp_model(name=name,
base_cfg=base_cfg,
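
A brief sketch of how a test could use the new custom_opsets argument (the opset name and bit-width candidates below are illustrative; base_cfg and default_config are assumed to come from the surrounding test setup):

# Attach the mixed precision candidates to an additional, custom-named operator set as well
tp_model = generate_tp_model_with_activation_mp(
    base_cfg=base_cfg,
    default_config=default_config,
    mp_bitwidth_candidates_list=[(8, 8), (8, 4), (8, 2), (4, 8), (4, 4), (4, 2), (2, 8), (2, 4), (2, 2)],
    custom_opsets=['Softmax'])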
@@ -383,7 +383,7 @@ def __init__(self, unit_test):
super().__init__(unit_test, activation_layers_idx=[1, 2, 3])

def get_resource_utilization(self):
return ResourceUtilization(np.inf, np.inf)
return ResourceUtilization(np.inf, 5407)

def create_networks(self):
inputs = layers.Input(shape=self.get_input_shapes()[0][1:])
@@ -397,7 +397,7 @@ def compare(self, quantized_model, float_model, input_x=None, quantization_info=None):
# resource utilization is infinity -> should give best model - 8bits
holder_layers = get_layers_from_model_by_type(quantized_model, KerasActivationQuantizationHolder)
activation_bits = [h.activation_holder_quantizer.get_config()['num_bits'] for h in holder_layers]
self.unit_test.assertTrue((activation_bits == [8, 8, 8]))
self.unit_test.assertTrue((activation_bits == [8, 4, 4]))

self.verify_quantization(quantized_model, input_x,
weights_layers_idx=[2],
@@ -283,7 +283,7 @@ def compare(self, qat_model, finalize=False, input_x=None, quantization_info=None):


class QATWrappersMixedPrecisionCfgTest(MixedPrecisionActivationBaseTest):
def __init__(self, unit_test, ru_weights=np.inf, ru_activation=np.inf, expected_mp_cfg=[0, 0, 0, 0]):
def __init__(self, unit_test, ru_weights=17919, ru_activation=5407, expected_mp_cfg=[0, 4, 0, 0]):
self.ru_weights = ru_weights
self.ru_activation = ru_activation
self.expected_mp_cfg = expected_mp_cfg
@@ -303,7 +303,6 @@ def run_test(self, **kwargs):

def compare(self, qat_ready_model, quantization_info):

# check that MP search returns 8 bits configuration for all layers
self.unit_test.assertTrue(all(quantization_info.mixed_precision_cfg == self.expected_mp_cfg))

# check that quantizer gets multiple bits configuration