Add a flag to indicate whether to run mixed precision according to resource utilization provided by user. #1052

Merged (6 commits, May 6, 2024)
Changes from 3 commits
model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py
@@ -13,26 +13,30 @@
# limitations under the License.
# ==============================================================================
import numpy as np
from typing import Callable, Any
from typing import Callable, Any, Dict, Tuple

from model_compression_toolkit.constants import FLOAT_BITWIDTH
from model_compression_toolkit.constants import FLOAT_BITWIDTH, BITS_TO_BYTES
from model_compression_toolkit.core import FrameworkInfo, ResourceUtilization, CoreConfig
from model_compression_toolkit.core.common import Graph
from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation
from model_compression_toolkit.core.common.graph.edge import EDGE_SINK_INDEX
from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner
from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities
from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities, \
QuantizationConfigOptions


def compute_resource_utilization_data(in_model: Any,
representative_data_gen: Callable,
core_config: CoreConfig,
tpc: TargetPlatformCapabilities,
fw_info: FrameworkInfo,
fw_impl: FrameworkImplementation) -> ResourceUtilization:
fw_impl: FrameworkImplementation,
transformed_graph: Graph = None,
mixed_precision_enable: bool = True) -> ResourceUtilization:
"""
Compute Resource Utilization information that can be relevant for defining target ResourceUtilization for mixed precision search.
Calculates maximal activation tensor, sum of weights' parameters and total (sum of both).
Calculates the maximal activation tensor size, the sum of the model's weight parameters, and the total memory
combining both the weights memory and the maximal activation tensor size.

Args:
in_model: Model to build a graph from (the model that is intended to be quantized).
@@ -42,27 +46,37 @@ def compute_resource_utilization_data(in_model: Any,
the attached framework operator's information.
fw_info: Information needed for quantization about the specific framework.
fw_impl: FrameworkImplementation object with a specific framework methods implementation.
transformed_graph: An internal graph representation of the input model. Defaults to None.
If no graph is provided, a graph will be automatically generated
using the specified model.
mixed_precision_enable: Indicates if mixed precision is enabled, defaults to True.
If disabled, computes resource utilization using base quantization
configurations across all layers.

Returns:
ResourceUtilization: An object encapsulating the calculated resource utilization computations.

Returns: A ResourceUtilization object with the results.

"""

# We assume that the resource_utilization_data API is used to compute the model resource utilization for
# mixed precision scenario, so we run graph preparation under the assumption of enabled mixed precision.
transformed_graph = graph_preparation_runner(in_model,
representative_data_gen,
core_config.quantization_config,
fw_info,
fw_impl,
tpc,
mixed_precision_enable=True)
if transformed_graph is None:
transformed_graph = graph_preparation_runner(in_model,
representative_data_gen,
core_config.quantization_config,
fw_info,
fw_impl,
tpc,
mixed_precision_enable=mixed_precision_enable)

# Compute parameters sum
weights_params = compute_nodes_weights_params(graph=transformed_graph, fw_info=fw_info)
weights_memory_bytes, weights_params = compute_nodes_weights_params(graph=transformed_graph, fw_info=fw_info)
total_weights_params = 0 if len(weights_params) == 0 else sum(weights_params)

# Compute max activation tensor
activation_output_sizes = compute_activation_output_sizes(graph=transformed_graph)
activation_output_sizes_bytes, activation_output_sizes = compute_activation_output_sizes(graph=transformed_graph)
max_activation_tensor_size = 0 if len(activation_output_sizes) == 0 else max(activation_output_sizes)

# Compute total memory utilization - parameters sum + max activation tensor
@@ -78,20 +92,23 @@ def compute_resource_utilization_data(in_model: Any,
bops=bops_count)
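
A rough standalone sketch of how the per-node arrays are folded into totals (the per-node numbers are invented; the construction of the ResourceUtilization object itself is collapsed in this diff, so only the reductions visible above are reproduced):

import numpy as np

# Hypothetical per-node parameter counts, as returned by the two helper functions in this file
weights_params = np.array([4_718_592, 73_728, 2_359_296])
activation_output_sizes = np.array([802_816, 200_704, 100_352])

total_weights_params = 0 if len(weights_params) == 0 else sum(weights_params)                          # 7,151,616
max_activation_tensor_size = 0 if len(activation_output_sizes) == 0 else max(activation_output_sizes)  # 802,816
total_memory = total_weights_params + max_activation_tensor_size  # parameters sum + max activation tensor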


def compute_nodes_weights_params(graph: Graph, fw_info: FrameworkInfo) -> np.ndarray:
def compute_nodes_weights_params(graph: Graph, fw_info: FrameworkInfo) -> Tuple[np.ndarray, np.ndarray]:
"""
Computes a vector with the respective weights' parameters size for each node.
Calculates the memory usage in bytes and the number of weight parameters for each node within a graph.
Memory calculations are based on the maximum bit-width used for quantization per node.

Args:
graph: Finalized Graph object.
fw_info: FrameworkInfo object about the specific framework
(e.g., attributes of different layers' weights to quantize).

Returns: A vector of node's weights memory sizes.

graph: A finalized Graph object, representing the model structure.
fw_info: FrameworkInfo object containing details about the specific framework's
quantization attributes for different layers' weights.

Returns:
A tuple containing two arrays:
- The first array represents the memory in bytes for each node's weights when quantized at the maximal bit-width.
- The second array represents the total number of weight parameters for each node.
"""

weights_params = []
weights_memory_bytes = []
for n in graph.nodes:
# TODO: when enabling multiple attribute quantization by default (currently,
# only kernel quantization is enabled) we should include other attributes memory in the sum of all
@@ -100,36 +117,51 @@ def compute_nodes_weights_params(graph: Graph, fw_info: FrameworkInfo) -> np.ndarray:
kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0]
if kernel_attr is not None and not n.reuse:
kernel_candidates = n.get_all_weights_attr_candidates(kernel_attr)

if len(kernel_candidates) > 0 and any([c.enable_weights_quantization for c in kernel_candidates]):
max_weight_bits = max([kc.weights_n_bits for kc in kernel_candidates])
node_num_weights_params = 0
for attr in fw_info.get_kernel_op_attributes(n.type):
if attr is not None:
node_num_weights_params += n.get_weights_by_keys(attr).flatten().shape[0]

weights_params.append(node_num_weights_params)

return np.array(weights_params)
# multiply num params by num bits and divide by BITS_TO_BYTES to convert from bits to bytes
weights_memory_bytes.append(node_num_weights_params * max_weight_bits / BITS_TO_BYTES)

return np.array(weights_memory_bytes), np.array(weights_params)
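
For reference, a minimal standalone sketch of the per-node byte computation this helper performs (the layer shape and candidate bit-widths are made-up; BITS_TO_BYTES is assumed to equal 8):

# Hypothetical conv kernel of shape 3x3x64x128 with 2/4/8-bit weight candidates
node_num_weights_params = 3 * 3 * 64 * 128      # 73,728 parameters
max_weight_bits = max([2, 4, 8])                # memory is reported for the maximal candidate bit-width
BITS_TO_BYTES = 8                               # assumption: bits-per-byte conversion constant

weights_memory_bytes = node_num_weights_params * max_weight_bits / BITS_TO_BYTES
print(weights_memory_bytes)                     # 73728.0 bytes (~72 KiB)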

def compute_activation_output_sizes(graph: Graph) -> np.ndarray:
def compute_activation_output_sizes(graph: Graph) -> Tuple[np.ndarray, np.ndarray]:
"""
Computes a vector with the respective output tensor size for each node.
Computes, for each node, the activation output tensor size and the activation output tensor size in bytes
(based on the maximal activation bit-width candidate).

Args:
graph: Finalized Graph object.
graph: A finalized Graph object, representing the model structure.

Returns:
A tuple containing two arrays:
- The first array represents each node's activation output tensor size in bytes,
calculated using the maximal bit-width for quantization.
- The second array represents each node's activation output tensor size (number of output parameters).

Returns: A vector of node's activation output size.

"""

activation_outputs = []
# Go over all nodes that have configurable activation.
activation_outputs_bytes = []
for n in graph.nodes:
# Go over all nodes that have configurable activation.
if n.has_activation_quantization_enabled_candidate():
# Fetch maximum bits required for quantizing activations
max_activation_bits = max([qc.activation_quantization_cfg.activation_n_bits for qc in n.candidates_quantization_cfg])
node_output_size = n.get_total_output_params()
activation_outputs.append(node_output_size)
# Calculate activation size in bytes and append to list
activation_outputs_bytes.append(node_output_size * max_activation_bits / BITS_TO_BYTES)

return np.array(activation_outputs)
return np.array(activation_outputs_bytes), np.array(activation_outputs)


def compute_total_bops(graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation) -> np.ndarray:
@@ -162,3 +194,56 @@ def compute_total_bops(graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation) -> np.ndarray:
bops.append(node_bops)

return np.array(bops)


def requires_mixed_precision(in_model: Any,
target_resource_utilization: ResourceUtilization,
representative_data_gen: Callable,
core_config: CoreConfig,
tpc: TargetPlatformCapabilities,
fw_info: FrameworkInfo,
fw_impl: FrameworkImplementation) -> bool:
"""
The function checks whether the model requires mixed precision to meet the requested target resource utilization.
This is determined by whether the target weights memory is smaller than the weights memory of the model when
quantized with the maximal candidate bit-widths, the target activation memory is smaller than the maximal
activation tensor size, the target total memory is smaller than the sum of both, or the target number of BOPs
is smaller than the model's BOPs count. If any of these conditions holds, the function returns True;
otherwise, it returns False.

Args:
in_model: The model to be evaluated.
target_resource_utilization: The target resource utilization requested for the quantized model.
representative_data_gen: A function that generates representative data for the model.
core_config: CoreConfig containing parameters of how the model should be quantized.
tpc: TargetPlatformCapabilities object that models the inference target platform and
the attached framework operator's information.
fw_info: Information needed for quantization about the specific framework.
fw_impl: FrameworkImplementation object with a specific framework methods implementation.

Returns: A boolean indicating if mixed precision is needed.
"""
is_mixed_precision = False
transformed_graph = graph_preparation_runner(in_model,
representative_data_gen,
core_config.quantization_config,
fw_info,
fw_impl,
tpc,
mixed_precision_enable=False)
# Compute max weights memory in bytes
weights_memory_by_layer_bytes, _ = compute_nodes_weights_params(transformed_graph, fw_info)
total_weights_memory_bytes = 0 if len(weights_memory_by_layer_bytes) == 0 else sum(weights_memory_by_layer_bytes)

# Compute max activation tensor in bytes
activation_output_sizes_bytes, _ = compute_activation_output_sizes(transformed_graph)
max_activation_tensor_size_bytes = 0 if len(activation_output_sizes_bytes) == 0 else max(activation_output_sizes_bytes)

# Compute BOPS utilization - total count of bit-operations for all configurable layers with kernel
bops_count = compute_total_bops(graph=transformed_graph, fw_info=fw_info, fw_impl=fw_impl)
bops_count = np.inf if len(bops_count) == 0 else sum(bops_count)

is_mixed_precision |= target_resource_utilization.weights_memory < total_weights_memory_bytes
is_mixed_precision |= target_resource_utilization.activation_memory < max_activation_tensor_size_bytes
is_mixed_precision |= target_resource_utilization.total_memory < total_weights_memory_bytes + max_activation_tensor_size_bytes
is_mixed_precision |= target_resource_utilization.bops < bops_count
return is_mixed_precision
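
To make the decision rule concrete, here is a small numeric sketch of the final comparisons (the utilization numbers are invented; the ResourceUtilization keyword names are assumed to match the attributes accessed above, and unset fields are assumed to default to infinity, i.e., unconstrained):

from model_compression_toolkit.core import ResourceUtilization

# Single-precision (maximal bit-width) utilization computed for a hypothetical model
total_weights_memory_bytes = 250_000
max_activation_tensor_size_bytes = 60_000
bops_count = 1_000_000_000

# Requested target: weights fit as-is, but the activation budget is tighter than single precision allows
target_resource_utilization = ResourceUtilization(weights_memory=300_000,
                                                  activation_memory=40_000)

is_mixed_precision = False
is_mixed_precision |= target_resource_utilization.weights_memory < total_weights_memory_bytes           # False
is_mixed_precision |= target_resource_utilization.activation_memory < max_activation_tensor_size_bytes  # True
is_mixed_precision |= target_resource_utilization.total_memory < (total_weights_memory_bytes
                                                                  + max_activation_tensor_size_bytes)
is_mixed_precision |= target_resource_utilization.bops < bops_count
print(is_mixed_precision)  # True -> mixed precision is needed to meet the activation budget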
13 changes: 12 additions & 1 deletion model_compression_toolkit/core/runner.py
@@ -20,6 +20,8 @@

from model_compression_toolkit.core.common import FrameworkInfo
from model_compression_toolkit.core.common.hessian.hessian_info_service import HessianInfoService
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_data import \
requires_mixed_precision
from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner
from model_compression_toolkit.core.quantization_prep_runner import quantization_preparation_runner
from model_compression_toolkit.logger import Logger
@@ -89,7 +91,16 @@ def core_runner(in_model: Any,
if core_config.mixed_precision_config is None:
Logger.critical("Provided an initialized target_resource_utilization, that means that mixed precision quantization is "
"enabled, but the provided MixedPrecisionQuantizationConfig is None.")
core_config.mixed_precision_config.set_mixed_precision_enable()
# Determine whether to use mixed precision or single precision based on target_resource_utilization.
if requires_mixed_precision(in_model,
target_resource_utilization,
representative_data_gen,
core_config,
tpc,
fw_info,
fw_impl):
core_config.mixed_precision_config.set_mixed_precision_enable()
Logger.info('Mixed precision enabled.')

graph = graph_preparation_runner(in_model,
representative_data_gen,
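
From the user's perspective the API is unchanged; the new check only decides internally whether the mixed precision search actually runs. A hedged sketch of a typical call (the public PTQ entry point and its argument names are assumed from MCT's API at the time, not taken from this diff):

import numpy as np
import model_compression_toolkit as mct
from keras.applications import MobileNetV2

def representative_data_gen():
    yield [np.random.randn(1, 224, 224, 3).astype('float32')]

# A loose target that single precision already meets -> requires_mixed_precision() returns False
# and the model is quantized with the base (single precision) configuration.
# Tightening weights_memory below the 8-bit footprint flips the decision and enables the search.
target_ru = mct.core.ResourceUtilization(weights_memory=10_000_000)

quantized_model, quantization_info = mct.ptq.keras_post_training_quantization(
    MobileNetV2(),
    representative_data_gen,
    target_resource_utilization=target_ru,
    core_config=mct.core.CoreConfig())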
4 changes: 3 additions & 1 deletion tests/common_tests/helpers/generate_test_tp_model.py
@@ -74,7 +74,7 @@ def generate_mixed_precision_test_tp_model(base_cfg, default_config, mp_bitwidth_candidates_list,
name=name)


def generate_tp_model_with_activation_mp(base_cfg, default_config, mp_bitwidth_candidates_list,
def generate_tp_model_with_activation_mp(base_cfg, default_config, mp_bitwidth_candidates_list, custom_opsets=[],
name="activation_mp_model"):
mp_op_cfg_list = []
for weights_n_bits, activation_n_bits in mp_bitwidth_candidates_list:
@@ -99,6 +99,8 @@ def generate_tp_model_with_activation_mp(base_cfg, default_config, mp_bitwidth_candidates_list, custom_opsets=[],
operator_sets_dict = {op_set.name: mixed_precision_configuration_options for op_set in base_tp_model.operator_set
if op_set.name is not "NoQuantization"}
operator_sets_dict["Input"] = mixed_precision_configuration_options
for c_ops in custom_opsets:
operator_sets_dict[c_ops] = mixed_precision_configuration_options

return generate_custom_test_tp_model(name=name,
base_cfg=base_cfg,
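
A brief sketch of how a test could use the new custom_opsets argument (the opset name and bit-width candidates below are illustrative; base_cfg and default_config are assumed to come from the surrounding test setup):

# Attach the mixed precision candidates to an additional, custom-named operator set as well
tp_model = generate_tp_model_with_activation_mp(
    base_cfg=base_cfg,
    default_config=default_config,
    mp_bitwidth_candidates_list=[(8, 8), (8, 4), (8, 2), (4, 8), (4, 4), (4, 2), (2, 8), (2, 4), (2, 2)],
    custom_opsets=['Softmax'])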
@@ -383,7 +383,7 @@ def __init__(self, unit_test):
super().__init__(unit_test, activation_layers_idx=[1, 2, 3])

def get_resource_utilization(self):
return ResourceUtilization(np.inf, np.inf)
return ResourceUtilization(np.inf, 5407)

def create_networks(self):
inputs = layers.Input(shape=self.get_input_shapes()[0][1:])
@@ -397,7 +397,7 @@ def compare(self, quantized_model, float_model, input_x=None, quantization_info=None):
# resource utilization is infinity -> should give best model - 8bits
holder_layers = get_layers_from_model_by_type(quantized_model, KerasActivationQuantizationHolder)
activation_bits = [h.activation_holder_quantizer.get_config()['num_bits'] for h in holder_layers]
self.unit_test.assertTrue((activation_bits == [8, 8, 8]))
self.unit_test.assertTrue((activation_bits == [8, 4, 4]))

self.verify_quantization(quantized_model, input_x,
weights_layers_idx=[2],
@@ -283,7 +283,7 @@ def compare(self, qat_model, finalize=False, input_x=None, quantization_info=None):


class QATWrappersMixedPrecisionCfgTest(MixedPrecisionActivationBaseTest):
def __init__(self, unit_test, ru_weights=np.inf, ru_activation=np.inf, expected_mp_cfg=[0, 0, 0, 0]):
def __init__(self, unit_test, ru_weights=17919, ru_activation=5407, expected_mp_cfg=[0, 4, 0, 0]):
self.ru_weights = ru_weights
self.ru_activation = ru_activation
self.expected_mp_cfg = expected_mp_cfg
@@ -303,7 +303,6 @@ def run_test(self, **kwargs):

def compare(self, qat_ready_model, quantization_info):

# check that MP search returns 8 bits configuration for all layers
self.unit_test.assertTrue(all(quantization_info.mixed_precision_cfg == self.expected_mp_cfg))

# check that quantizer gets multiple bits configuration