diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py
index 375fdf406b2..5084405c468 100644
--- a/backends/apple/coreml/compiler/coreml_preprocess.py
+++ b/backends/apple/coreml/compiler/coreml_preprocess.py
@@ -3,6 +3,7 @@
 # CoreML backend for delegating a EdgeProgram to CoreML.
 
 import json
+import logging
 import shutil
 import uuid
 
@@ -14,6 +15,7 @@
 from typing import Any, Dict, final, List, Optional, Tuple
 
 import coremltools as ct
+import coremltools.optimize as cto
 import executorchcoreml
 
 from executorch.exir.backend.backend_details import (
@@ -23,12 +25,16 @@
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
 
 class COMPILE_SPEC_KEYS(Enum):
     COMPUTE_UNITS = "compute_units"
     MODEL_TYPE = "model_type"
     MIN_DEPLOYMENT_TARGET = "min_deployment_target"
     MODEL_COMPUTE_PRECISION = "model_compute_precision"
+    OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config"
 
 
 class MODEL_PATHS(Enum):
@@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec(
             compute_unit.name.lower().encode("utf-8"),
         )
 
+    @staticmethod
+    def generate_op_linear_quantizer_config_compile_spec(
+        op_linear_quantizer_config: Dict,
+    ) -> CompileSpec:
+        """
+        Returns the compile spec representing the model's post-conversion quantization,
+        given as a dict used to construct cto.coreml.OpLinearQuantizerConfig.
+        """
+        str_representation = json.dumps(op_linear_quantizer_config)
+        byte_representation = str_representation.encode("utf-8")
+        return CompileSpec(
+            COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value,
+            byte_representation,
+        )
+
+    @staticmethod
+    def op_linear_quantizer_config_from_compile_specs(
+        compile_specs: List[CompileSpec],
+    ) -> Optional[cto.coreml.OpLinearQuantizerConfig]:
+        """
+        Returns the post-conversion quantization config parsed from the compile specs, or None.
+        """
+        for compile_spec in compile_specs:
+            if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value:
+                config_dict_str = compile_spec.value.decode("utf-8")
+                config_dict = json.loads(config_dict_str)
+                config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict)
+                return config
+
+        return None
+
     @staticmethod
     def generate_compile_specs(
         compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL,
         minimum_deployment_target: ct.target = ct.target.iOS15,
         compute_precision: ct.precision = ct.precision.FLOAT16,
         model_type: MODEL_TYPE = MODEL_TYPE.MODEL,
+        op_linear_quantizer_config: Optional[Dict] = None,
     ) -> List[CompileSpec]:
         """
         Returns the list of compile specs that's used by CoreMLBackend to lower the module.
@@ -192,6 +230,12 @@ def generate_compile_specs(
             CoreMLBackend.generate_compute_precision_compile_spec(compute_precision)
         )
         compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type))
+        if op_linear_quantizer_config is not None:
+            compile_specs.append(
+                CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
+                    op_linear_quantizer_config
+                )
+            )
 
         return compile_specs
 
@@ -368,18 +412,18 @@ def preprocess(
                 compile_specs,
             )
         )
-
         model_compute_precision: ct.precision = (
             CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs)
         )
-
         minimum_deployment_target: ct.target = (
             CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs)
         )
-
         compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs(
             compile_specs
         )
+        op_linear_quantizer_config = (
+            CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs)
+        )
 
         mlmodel = ct.convert(
             model=edge_program,
@@ -392,4 +436,15 @@ def preprocess(
             compute_units=compute_units,
         )
 
+        if op_linear_quantizer_config is not None:
+            logger.warning(
+                "Core ML Backend op_linear_quantizer_config API is experimental"
+            )
+            config = cto.coreml.OptimizationConfig(
+                global_config=op_linear_quantizer_config,
+                # skip quantizing embedding ops (lowered to gather)
+                op_type_configs={"gather": None},
+            )
+            mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)
+
         return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type)
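The two new helpers round-trip the quantizer config through JSON so it can cross the CompileSpec boundary. Below is a minimal standalone sketch of that round trip, using only the calls the patch itself relies on; the sample dict mirrors the "b4w" preset defined later in partitioner_lib.py.

```python
import json

# Same alias as in coreml_preprocess.py; requires coremltools to be installed.
import coremltools.optimize as cto

# Illustrative config dict; values mirror the "b4w" preset below.
config_dict = {
    "mode": "linear_symmetric",
    "dtype": "int4",
    "granularity": "per_block",
    "block_size": 32,
    "weight_threshold": 512,
}

# Serialize, as generate_op_linear_quantizer_config_compile_spec does ...
payload = json.dumps(config_dict).encode("utf-8")

# ... then parse back, as op_linear_quantizer_config_from_compile_specs does
# on the other side of the compile-spec boundary.
config = cto.coreml.OpLinearQuantizerConfig._from_dict(
    json.loads(payload.decode("utf-8"))
)
```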
diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py
index dd5822c23f6..97228bb5c5d 100644
--- a/examples/models/llama2/export_llama_lib.py
+++ b/examples/models/llama2/export_llama_lib.py
@@ -304,6 +304,12 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="This option is only for coreml, and is only supported for MacOS15+/iOS18+",
     )
+    parser.add_argument(
+        "--coreml-quantize",
+        default=None,
+        choices=["b4w"],
+        help="This option is only for coreml: use Core ML quantization, e.g. b4w (blockwise 4-bit weight)",
+    )
     parser.add_argument(
         "--qnn",
         action="store_true",
@@ -523,6 +529,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
             args.use_kv_cache and args.coreml_enable_state,
             args.embedding_quantize,
             args.pt2e_quantize,
+            args.coreml_quantize,
         )
         partitioners.append(coreml_partitioner)
         modelname = f"coreml_{modelname}"
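A self-contained illustration of how the new flag parses; the standalone parser here is hypothetical, but the argument definition matches the patch, and the parsed value is what _export_llama forwards to get_coreml_partitioner.

```python
import argparse

# Hypothetical standalone parser; the argument definition matches the patch.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--coreml-quantize",
    default=None,
    choices=["b4w"],  # only blockwise 4-bit weight quantization for now
    help="This option is only for coreml: use Core ML quantization, e.g. b4w (blockwise 4-bit weight)",
)

args = parser.parse_args(["--coreml-quantize", "b4w"])
assert args.coreml_quantize == "b4w"  # forwarded to get_coreml_partitioner(...)
```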
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
index f5cc04ead48..eca78bc9346 100644
--- a/extension/llm/export/partitioner_lib.py
+++ b/extension/llm/export/partitioner_lib.py
@@ -59,6 +59,7 @@ def get_coreml_partitioner(
     enable_state: bool = False,
     embedding_quantize: Optional[str] = None,
     pt2e_quantize: Optional[str] = None,
+    coreml_quantize: Optional[str] = None,
 ):
     try:
         import coremltools as ct
@@ -87,16 +88,29 @@ def get_coreml_partitioner(
         minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17)
     # In Core ML, 4-bit weight compression is introduced in iOS 18
     if (
-        embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4
-    ) or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"):
+        (embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4)
+        or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w")
+        or coreml_quantize == "b4w"
+    ):
         minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18)
 
+    op_linear_quantizer_config = None
+    if coreml_quantize == "b4w":
+        op_linear_quantizer_config = {
+            "mode": "linear_symmetric",
+            "dtype": "int4",
+            "granularity": "per_block",
+            "block_size": 32,
+            "weight_threshold": 512,
+        }
+
     compile_specs = CoreMLBackend.generate_compile_specs(  # pyre-fixme[16]
         minimum_deployment_target=minimum_deployment_target,
         compute_precision=ct.precision(ct.precision.FLOAT16.value),
         # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
         compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
         model_type=CoreMLBackend.MODEL_TYPE.MODEL,  # pyre-fixme[16]
+        op_linear_quantizer_config=op_linear_quantizer_config,
     )
     return CoreMLPartitioner(  # pyre-fixme[16]
         compile_specs=compile_specs,
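Putting the pieces together, here is a sketch of the post-conversion step that CoreMLBackend.preprocess performs once the b4w compile spec arrives. The helper name quantize_b4w is hypothetical and mlmodel stands in for the ct.convert output, but the calls themselves are the ones the patch uses.

```python
import coremltools.optimize as cto


def quantize_b4w(mlmodel):
    """Hypothetical helper mirroring the post-conversion step in preprocess():
    apply blockwise 4-bit linear weight quantization to a converted mlmodel."""
    op_config = cto.coreml.OpLinearQuantizerConfig._from_dict(
        {
            "mode": "linear_symmetric",
            "dtype": "int4",
            "granularity": "per_block",
            "block_size": 32,
            "weight_threshold": 512,  # only quantize weights with >= 512 elements
        }
    )
    config = cto.coreml.OptimizationConfig(
        global_config=op_config,
        # Leave embedding lookups (gather ops) unquantized, as the patch does.
        op_type_configs={"gather": None},
    )
    return cto.coreml.linear_quantize_weights(mlmodel, config=config)
```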