diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 5e5ed588a2d..6d009ebad51 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -110,6 +110,12 @@ else
   COREML=OFF
 fi
 
+if [[ "${MODE}" =~ .*quantize_kv.* ]]; then
+  QUANTIZE_KV_CACHE=ON
+else
+  QUANTIZE_KV_CACHE=OFF
+fi
+
 echo "COREML option ${COREML}"
 
 if [[ "${MODE}" =~ .*qnn.* ]]; then
@@ -249,6 +255,9 @@ if [[ "${QNN}" == "ON" ]]; then
     EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
   fi
 fi
+if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --quantize_kv_cache"
+fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 6d7205611e7..41b8c06407d 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -86,7 +86,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        mode: [portable, xnnpack+custom, xnnpack+custom+qe]
+        mode: [portable, xnnpack+custom, xnnpack+custom+qe, xnnpack+custom+quantize_kv, xnnpack+quantize_kv]
         include:
           - dtype: bf16
             mode: portable
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 18c91691e92..365c7564fef 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -225,7 +225,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        mode: [portable, xnnpack+kv+custom, mps, coreml]
+        mode: [portable, xnnpack+kv+custom, mps, coreml, xnnpack+custom+quantize_kv]
         include:
           - dtype: bf16
             mode: portable
diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py
index c8e567e8d6c..a0c8c2fd93b 100644
--- a/examples/models/llama/source_transformation/quantized_kv_cache.py
+++ b/examples/models/llama/source_transformation/quantized_kv_cache.py
@@ -10,6 +10,7 @@
 
 import torch
 import torch.nn as nn
 from executorch.examples.models.llama.llama_transformer import KVCache
+
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
 
@@ -221,6 +222,33 @@ def from_float(cls, kv_cache, cache_type: QuantizedCacheType):
 
 
 def replace_kv_cache_with_quantized_kv_cache(module):
+    try:
+        op = torch.ops.quantized_decomposed.quantize_per_token.out
+        assert op is not None
+    except AttributeError:
+        import glob
+
+        import executorch
+        from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
+
+        # Ideally the package is installed in only one location, but use of
+        # PYTHONPATH can result in multiple locations.
+        # ATM this is mainly used in CI for the qnn runner. Will need to revisit this.
+        executorch_package_path = executorch.__path__[-1]
+        libs = list(
+            glob.glob(
+                f"{executorch_package_path}/**/libquantized_ops_aot_lib.*",
+                recursive=True,
+            )
+        )
+        assert len(libs) == 1, f"Expected 1 library but got {len(libs)}"
+        logging.info(f"Loading custom ops library: {libs[0]}")
+        torch.ops.load_library(libs[0])
+        op = torch.ops.quantized_decomposed.quantize_per_token.out
+        assert op is not None
+
+    # This is needed to ensure that custom ops are registered
+    from executorch.extension.llm.custom_ops import custom_ops  # noqa: F401
+
     logging.warning(
         "Replacing KVCache with QuantizedKVCache. This modifies the model in place."
     )
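Note: the hunk above registers the quantized ops lazily, falling back to loading the AOT shared library out of the installed executorch package when the op is not yet available. A minimal standalone sketch of that pattern follows; it is not part of the patch, ensure_quantized_ops_registered is a hypothetical helper name, and the glob pattern simply mirrors the diff.

import glob
import logging

import torch
import executorch


def ensure_quantized_ops_registered() -> None:
    try:
        # Fast path: the op is already registered in this process (e.g. the
        # torch.ao._decomposed import above already ran).
        assert torch.ops.quantized_decomposed.quantize_per_token.out is not None
        return
    except AttributeError:
        pass

    # Mirrors the diff: importing the pybindings ensures the runtime's own
    # custom ops are registered before we load the AOT library.
    from executorch.extension.pybindings import portable_lib  # noqa: F401

    # The package may be importable from more than one location when
    # PYTHONPATH is involved; the diff picks the last entry on __path__.
    package_path = executorch.__path__[-1]
    libs = glob.glob(
        f"{package_path}/**/libquantized_ops_aot_lib.*", recursive=True
    )
    assert len(libs) == 1, f"Expected 1 library but got {len(libs)}"
    logging.info("Loading custom ops library: %s", libs[0])
    torch.ops.load_library(libs[0])

As in the patch, the helper fails loudly if zero or multiple copies of the library are found, rather than guessing which one to load.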
diff --git a/examples/models/llama/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py
index 65c5b68f7ad..59bfbe6f951 100644
--- a/examples/models/llama/source_transformation/sdpa.py
+++ b/examples/models/llama/source_transformation/sdpa.py
@@ -56,7 +56,7 @@ def forward(
         k_cache = self.kv_cache.k_cache
         v_cache = self.kv_cache.v_cache
 
-        if isinstance(self.kv_cache, QuantizedKVCache):
+        if hasattr(self.kv_cache, "quantized_cache_dtype"):
             # updated quantize cache, scale and zero points
             # returns dequantized kv cache
             # Not most optimal. Optimizations to follow next
diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py
index 3570e34d192..b3b05db68fb 100644
--- a/extension/llm/custom_ops/custom_ops.py
+++ b/extension/llm/custom_ops/custom_ops.py
@@ -26,7 +26,13 @@
 
     import executorch
 
-    executorch_package_path = executorch.__path__[0]
+    # This is needed to ensure that custom ops are registered
+    from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
+
+    # Ideally the package is installed in only one location, but use of
+    # PYTHONPATH can result in multiple locations.
+    # ATM this is mainly used in CI for the qnn runner. Will need to revisit this.
+    executorch_package_path = executorch.__path__[-1]
     logging.info(f"Looking for libcustom_ops_aot_lib.so in {executorch_package_path}")
     libs = list(
         glob.glob(
diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt
index 7e49f73b09f..6b01ba4fc27 100644
--- a/kernels/quantized/CMakeLists.txt
+++ b/kernels/quantized/CMakeLists.txt
@@ -60,14 +60,17 @@ if(NOT CMAKE_GENERATOR STREQUAL "Xcode"
   set(_quantized_aot_ops
       "quantized_decomposed::add.out"
       "quantized_decomposed::choose_qparams.Tensor_out"
+      "quantized_decomposed::choose_qparams_per_token_asymmetric.out"
       "quantized_decomposed::dequantize_per_channel.out"
       "quantized_decomposed::dequantize_per_tensor.out"
       "quantized_decomposed::dequantize_per_tensor.Tensor_out"
+      "quantized_decomposed::dequantize_per_token.out"
       "quantized_decomposed::mixed_linear.out"
       "quantized_decomposed::mixed_mm.out"
       "quantized_decomposed::quantize_per_channel.out"
       "quantized_decomposed::quantize_per_tensor.out"
       "quantized_decomposed::quantize_per_tensor.Tensor_out"
+      "quantized_decomposed::quantize_per_token.out"
   )
   gen_selected_ops(
     LIB_NAME "quantized_ops_aot_lib" ROOT_OPS ${_quantized_aot_ops}
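For context on the three per-token ops added to the selected-ops list above, here is a minimal round-trip sketch; it is not part of the patch, and the shapes are illustrative of how a quantized KV cache writes and reads values.

import torch

# Importing this module registers the quantized_decomposed ops in Python,
# mirroring the import the diff adds to quantized_kv_cache.py.
from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401

# Toy stand-in for one cache update: (batch, tokens, head_dim) activations.
x = torch.randn(1, 8, 64)

# Asymmetric per-token qparams; this op targets int8 ranges upstream.
scales, zero_points = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric(
    x, torch.int8
)

# Quantize on write, dequantize on read, as the SDPA path above does.
xq = torch.ops.quantized_decomposed.quantize_per_token(
    x, scales, zero_points, -128, 127, torch.int8
)
xdq = torch.ops.quantized_decomposed.dequantize_per_token(
    xq, scales, zero_points, -128, 127, torch.int8, torch.float32
)

print((x - xdq).abs().max())  # round-trip quantization error

choose_qparams_per_token_asymmetric returns one (scale, zero_point) pair per token, which quantize_per_token broadcasts across the last dimension; that per-token granularity is what the new .out variants expose to the AOT library built by the CMake change.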