From 0b5a9a709a410dca334670a697e737e302a8eb2a Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 30 Sep 2024 12:53:03 -0700 Subject: [PATCH 01/10] Add kwarg example inputs to eager model base --- examples/models/llama2/model.py | 13 ++++++++----- examples/models/model_base.py | 5 +++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 23f1c1b4898..a3532a41ca1 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -224,25 +224,27 @@ def get_eager_model(self) -> torch.nn.Module: # switch all to FP32 return self.model_.to(torch.float32) - def get_example_inputs(self): + def get_example_inputs(self) -> Tuple[Tuple, Dict]: if self.use_kv_cache: return self.get_example_inputs_kvcache_sdpa() else: - return ( + positional_inputs = ( torch.tensor( [[1, 2, 3]], dtype=torch.long ), # tokens, with kv cache our input token length is always just 1 token. ) + return (positional_inputs, {}) # assumption is the custom op doesnt support dynamic shape right now. It might but its untested so lets first get static shape working - def get_example_inputs_kvcache_sdpa(self): + def get_example_inputs_kvcache_sdpa(self) -> Tuple[Tuple, Dict]: if self.enable_dynamic_shape: - return ( + positional_inputs = ( torch.tensor([[2, 3, 4]], dtype=torch.long), torch.tensor([0], dtype=torch.long), ) + return (positional_inputs, {}) else: - return ( + positional_inputs = ( torch.tensor( [[1]], dtype=torch.long ), # tokens, with kv cache our input token length is always just 1 token. @@ -250,6 +252,7 @@ def get_example_inputs_kvcache_sdpa(self): [0], dtype=torch.long ), # start_pos, what token of output are we on. ) + return (positional_inputs, {}) def _transform_for_pre_quantization(self, checkpoint): assert hasattr(self.args, "preq_mode"), "preq_mode must be specified" diff --git a/examples/models/model_base.py b/examples/models/model_base.py index a1e639cf323..478f1e2d65f 100644 --- a/examples/models/model_base.py +++ b/examples/models/model_base.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. from abc import ABC, abstractmethod +from typing import Dict, Tuple import torch @@ -37,11 +38,11 @@ def get_eager_model(self) -> torch.nn.Module: raise NotImplementedError("get_eager_model") @abstractmethod - def get_example_inputs(self): + def get_example_inputs(self) -> Tuple[Tuple, Dict]: """ Abstract method to provide example inputs for the model. Returns: - Any: Example inputs that can be used for testing and tracing. + Tuple[Tuple, Dict]: The positional inputs (Tuple) and the kwarg inputs (Dict). 
""" raise NotImplementedError("get_example_inputs") From a9647d2068a539173f1203bee7cbfca5572db421 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 7 Oct 2024 15:41:23 -0700 Subject: [PATCH 02/10] Create create new method for example kwarg inputs instead --- examples/models/llama2/model.py | 13 +++++-------- examples/models/model_base.py | 5 ++--- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index a3532a41ca1..23f1c1b4898 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -224,27 +224,25 @@ def get_eager_model(self) -> torch.nn.Module: # switch all to FP32 return self.model_.to(torch.float32) - def get_example_inputs(self) -> Tuple[Tuple, Dict]: + def get_example_inputs(self): if self.use_kv_cache: return self.get_example_inputs_kvcache_sdpa() else: - positional_inputs = ( + return ( torch.tensor( [[1, 2, 3]], dtype=torch.long ), # tokens, with kv cache our input token length is always just 1 token. ) - return (positional_inputs, {}) # assumption is the custom op doesnt support dynamic shape right now. It might but its untested so lets first get static shape working - def get_example_inputs_kvcache_sdpa(self) -> Tuple[Tuple, Dict]: + def get_example_inputs_kvcache_sdpa(self): if self.enable_dynamic_shape: - positional_inputs = ( + return ( torch.tensor([[2, 3, 4]], dtype=torch.long), torch.tensor([0], dtype=torch.long), ) - return (positional_inputs, {}) else: - positional_inputs = ( + return ( torch.tensor( [[1]], dtype=torch.long ), # tokens, with kv cache our input token length is always just 1 token. @@ -252,7 +250,6 @@ def get_example_inputs_kvcache_sdpa(self) -> Tuple[Tuple, Dict]: [0], dtype=torch.long ), # start_pos, what token of output are we on. ) - return (positional_inputs, {}) def _transform_for_pre_quantization(self, checkpoint): assert hasattr(self.args, "preq_mode"), "preq_mode must be specified" diff --git a/examples/models/model_base.py b/examples/models/model_base.py index 478f1e2d65f..a1e639cf323 100644 --- a/examples/models/model_base.py +++ b/examples/models/model_base.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. from abc import ABC, abstractmethod -from typing import Dict, Tuple import torch @@ -38,11 +37,11 @@ def get_eager_model(self) -> torch.nn.Module: raise NotImplementedError("get_eager_model") @abstractmethod - def get_example_inputs(self) -> Tuple[Tuple, Dict]: + def get_example_inputs(self): """ Abstract method to provide example inputs for the model. Returns: - Tuple[Tuple, Dict]: The positional inputs (Tuple) and the kwarg inputs (Dict). + Any: Example inputs that can be used for testing and tracing. 
""" raise NotImplementedError("get_example_inputs") From fa3b1d253796ce12f957d103a55650884f53e99c Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Mon, 30 Sep 2024 12:53:03 -0700 Subject: [PATCH 03/10] Add kwarg example inputs to eager model base --- examples/models/llama2/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 23f1c1b4898..8ea42ae98c6 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -315,3 +315,4 @@ def _transform_for_pre_quantization(self, checkpoint): int(embedding_bit_width), embedding_group_size, ) + From e8715ba9e3c7d67f81889b8ddbfa2a401657a01e Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 8 Oct 2024 00:20:44 -0700 Subject: [PATCH 04/10] Lint --- examples/models/llama2/model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 8ea42ae98c6..23f1c1b4898 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -315,4 +315,3 @@ def _transform_for_pre_quantization(self, checkpoint): int(embedding_bit_width), embedding_group_size, ) - From a6f96a2fd9f050177ac9f7307428e17bbb7c0c97 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 4 Oct 2024 20:37:08 -0700 Subject: [PATCH 05/10] Accept model type parameter in export_llama --- examples/models/llama2/README.md | 26 ++++++-- examples/models/llama2/export_llama.py | 3 +- examples/models/llama2/export_llama_lib.py | 75 ++++++++++++++-------- 3 files changed, 71 insertions(+), 33 deletions(-) diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 1a6fe99fc41..8686b87c2a1 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -142,6 +142,7 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama2.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -kv \ @@ -162,6 +163,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama2.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ --use_sdpa_with_kv_cache \ @@ -185,7 +187,19 @@ You can export and run the original Llama 3 8B instruct model. 2. Export model and generate `.pte` file ``` - python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + python -m examples.models.llama2.export_llama + --model llama3 + --checkpoint + -p + -kv + --use_sdpa_with_kv_cache + -X + -qmode 8da4w + --group_size 128 + -d fp32 + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + --embedding-quantize 4,32 + --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. @@ -205,7 +219,7 @@ If you want to deploy and run a smaller model for educational purposes. From `ex ``` 3. Export model and generate `.pte` file. 
``` - python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -X -kv + python -m examples.models.llama2.export_llama --model llama2 --checkpoint stories110M.pt --params params.json -X -kv ``` ### Option D: Download and export Llama 2 7B model @@ -218,7 +232,7 @@ You can export and run the original Llama 2 7B model. 3. Export model and generate `.pte` file: ``` - python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + python -m examples.models.llama2.export_llama --model llama2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` 4. Create tokenizer.bin. ``` @@ -432,9 +446,9 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is -- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` +- Lower to CoreML: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` +- MPS: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` +- QNN: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. diff --git a/examples/models/llama2/export_llama.py b/examples/models/llama2/export_llama.py index 3d0d1b7bcfb..5f382bf50cf 100644 --- a/examples/models/llama2/export_llama.py +++ b/examples/models/llama2/export_llama.py @@ -20,10 +20,9 @@ def main() -> None: seed = 42 torch.manual_seed(seed) - modelname = "llama2" parser = build_args_parser() args = parser.parse_args() - export_llama(modelname, args) + export_llama(args) if __name__ == "__main__": diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 0d292b11e7b..5735b4cc30c 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -78,6 +78,10 @@ verbosity_setting = None +EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"] +TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] + + class WeightType(Enum): LLAMA = "LLAMA" FAIRSEQ2 = "FAIRSEQ2" @@ -113,11 +117,11 @@ def build_model( else: output_dir_path = "." 
- argString = f"--checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}" + argString = f"--model {modelname} --checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}" parser = build_args_parser() args = parser.parse_args(shlex.split(argString)) # pkg_name = resource_pkg_name - return export_llama(modelname, args) + return export_llama(args) def build_args_parser() -> argparse.ArgumentParser: @@ -127,6 +131,12 @@ def build_args_parser() -> argparse.ArgumentParser: # parser.add_argument( # "-q", "--quantized_ckpt", default=None, help="quantized checkpoint file" # ) + parser.add_argument( + "--model", + default="llama2", + choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, + help="The Lllama model to export. llama2, llama3, llama3_1, llama3_2 share the same architecture, so they are technically interchangeable, given you provide the checkpoint file for the desired version.", + ) parser.add_argument( "-E", "--embedding-quantize", @@ -458,13 +468,13 @@ def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str: return return_val -def export_llama(modelname, args) -> str: +def export_llama(args) -> str: if args.profile_path is not None: try: from executorch.util.python_profiler import CProfilerFlameGraph with CProfilerFlameGraph(args.profile_path): - builder = _export_llama(modelname, args) + builder = _export_llama(args) assert ( filename := builder.get_saved_pte_filename() ) is not None, "Fail to get file name from builder" @@ -475,14 +485,14 @@ def export_llama(modelname, args) -> str: ) return "" else: - builder = _export_llama(modelname, args) + builder = _export_llama(args) assert ( filename := builder.get_saved_pte_filename() ) is not None, "Fail to get file name from builder" return filename -def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: +def _prepare_for_llama_export(args) -> LLMEdgeManager: """ Helper function for export_llama. Loads the model from checkpoint and params, and sets up a LLMEdgeManager with initial transforms and dtype conversion. 
@@ -508,7 +518,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: return ( _load_llama_model( - modelname=modelname, + args.model, checkpoint=checkpoint_path, checkpoint_dir=checkpoint_dir, params_path=params_path, @@ -530,7 +540,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager: args=args, ) .set_output_dir(output_dir_path) - .source_transform(_get_source_transforms(modelname, dtype_override, args)) + .source_transform(_get_source_transforms(args.model, dtype_override, args)) ) @@ -574,13 +584,13 @@ def _validate_args(args): raise ValueError("Model shard is only supported with qnn backend now.") -def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901 +def _export_llama(args) -> LLMEdgeManager: # noqa: C901 _validate_args(args) pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) # export_to_edge builder_exported_to_edge = ( - _prepare_for_llama_export(modelname, args) + _prepare_for_llama_export(args) .capture_pre_autograd_graph() .pt2e_quantize(quantizers) .export_to_edge() @@ -748,8 +758,8 @@ def _load_llama_model_metadata( def _load_llama_model( + modelname: str, *, - modelname: str = "llama2", checkpoint: Optional[str] = None, checkpoint_dir: Optional[str] = None, params_path: str, @@ -776,26 +786,41 @@ def _load_llama_model( Returns: An instance of LLMEdgeManager which contains the eager mode model. """ + assert ( checkpoint or checkpoint_dir ) and params_path, "Both checkpoint/checkpoint_dir and params can't be empty" logging.info( f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}" ) - model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model( - "llama2", - "Llama2Model", - checkpoint=checkpoint, - checkpoint_dir=checkpoint_dir, - params=params_path, - use_kv_cache=use_kv_cache, - use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, - generate_full_logits=generate_full_logits, - fairseq2=weight_type == WeightType.FAIRSEQ2, - max_seq_len=max_seq_len, - enable_dynamic_shape=enable_dynamic_shape, - output_prune_map_path=output_prune_map_path, - args=args, + + if modelname in EXECUTORCH_DEFINED_MODELS: + # Set to llama2 because all models in EXECUTORCH_DEFINED_MODELS share the same archteciture as + # defined in example/models/llama2. 
+ modelname = "llama2" + model_class_name = "Llama2Model" + elif modelname in TORCHTUNE_DEFINED_MODELS: + if modelname == "llama3_2_vision": + model_class_name = "Llama3_2Decoder" + else: + raise ValueError(f"{modelname} is not a valid Llama model.") + + model, example_inputs, example_kwarg_inputs, _ = ( + EagerModelFactory.create_model( + modelname, + model_class_name, + checkpoint=checkpoint, + checkpoint_dir=checkpoint_dir, + params=params_path, + use_kv_cache=use_kv_cache, + use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, + generate_full_logits=generate_full_logits, + fairseq2=weight_type == WeightType.FAIRSEQ2, + max_seq_len=max_seq_len, + enable_dynamic_shape=enable_dynamic_shape, + output_prune_map_path=output_prune_map_path, + args=args, + ) ) if dtype_override: assert isinstance( From 328c72c8cf7554184cee927178acfb33bf43338d Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 4 Oct 2024 20:49:42 -0700 Subject: [PATCH 06/10] Remove future implementation --- examples/models/llama2/export_llama_lib.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 5735b4cc30c..49bf5d8da83 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -79,7 +79,7 @@ EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"] -TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] +TORCHTUNE_DEFINED_MODELS = [] class WeightType(Enum): @@ -800,8 +800,7 @@ def _load_llama_model( modelname = "llama2" model_class_name = "Llama2Model" elif modelname in TORCHTUNE_DEFINED_MODELS: - if modelname == "llama3_2_vision": - model_class_name = "Llama3_2Decoder" + raise NotImplementedError("Torchtune Llama models are not yet supported in ExecuTorch export.") else: raise ValueError(f"{modelname} is not a valid Llama model.") From ec80bba43acd0553062d4485a3897acd2b156b45 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 15 Oct 2024 14:59:41 -0700 Subject: [PATCH 07/10] Lint --- examples/models/llama2/export_llama_lib.py | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 49bf5d8da83..40378f0b73a 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -800,26 +800,26 @@ def _load_llama_model( modelname = "llama2" model_class_name = "Llama2Model" elif modelname in TORCHTUNE_DEFINED_MODELS: - raise NotImplementedError("Torchtune Llama models are not yet supported in ExecuTorch export.") + raise NotImplementedError( + "Torchtune Llama models are not yet supported in ExecuTorch export." 
+ ) else: raise ValueError(f"{modelname} is not a valid Llama model.") - model, example_inputs, example_kwarg_inputs, _ = ( - EagerModelFactory.create_model( - modelname, - model_class_name, - checkpoint=checkpoint, - checkpoint_dir=checkpoint_dir, - params=params_path, - use_kv_cache=use_kv_cache, - use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, - generate_full_logits=generate_full_logits, - fairseq2=weight_type == WeightType.FAIRSEQ2, - max_seq_len=max_seq_len, - enable_dynamic_shape=enable_dynamic_shape, - output_prune_map_path=output_prune_map_path, - args=args, - ) + model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model( + modelname, + model_class_name, + checkpoint=checkpoint, + checkpoint_dir=checkpoint_dir, + params=params_path, + use_kv_cache=use_kv_cache, + use_sdpa_with_kv_cache=use_sdpa_with_kv_cache, + generate_full_logits=generate_full_logits, + fairseq2=weight_type == WeightType.FAIRSEQ2, + max_seq_len=max_seq_len, + enable_dynamic_shape=enable_dynamic_shape, + output_prune_map_path=output_prune_map_path, + args=args, ) if dtype_override: assert isinstance( From 1825972af1fa6fc49e8ffe38e06cb26112d3667b Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 25 Oct 2024 15:27:08 -0700 Subject: [PATCH 08/10] Update READMEs --- backends/vulkan/docs/android_demo.md | 1 + ...llama3-qualcomm-ai-engine-direct-backend.md | 2 +- .../docs/delegates/qualcomm_README.md | 6 +++--- .../LlamaDemo/docs/delegates/xnnpack_README.md | 10 +++++----- .../LLaMA/docs/delegates/mps_README.md | 4 ++-- .../LLaMA/docs/delegates/xnnpack_README.md | 8 ++++---- examples/models/llama/README.md | 18 ++++++++++++++++-- examples/models/llama/UTILS.md | 8 ++++---- examples/models/llama2/README.md | 2 +- 9 files changed, 37 insertions(+), 22 deletions(-) diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index 2a4faacc0c8..1314a6503aa 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -58,6 +58,7 @@ partially lower the Llama model to Vulkan. ```shell # The files will usually be downloaded to ~/.llama python -m examples.models.llama.export_llama \ + --model llama3_2 --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index d928377ff28..90dc7dd0ad8 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -39,7 +39,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure ```bash # Please note that calibration_data must include the prompt template for special tokens. 
-python -m examples.models.llama.export_llama -t +python -m examples.models.llama.export_llama --model llama3 -t llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index 8308da6d840..7d28288bfed 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -101,12 +101,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B) Examples: ``` # 4 bits weight only quantize -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. Here is an example: ``` # 8 bits quantization with 4 shards -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` Note: if you encountered issues below ``` @@ -158,7 +158,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure * 8B models might need 16GB RAM on the device to run. ``` # Please note that calibration_data must include the prompt template for special tokens. 
-python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +python -m examples.models.llama.export_llama --model llama3 -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index 2a6ddbbfe09..4ee52bd1b99 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -56,14 +56,14 @@ In this demo app, we support text-only inference with up-to-date Llama models an Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" ``` ### For Llama 3.2 1B and 3B QAT+LoRA models Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" ``` ### For Llama 3.2 1B and 3B BF16 models @@ -72,7 +72,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). @@ -87,7 +87,7 @@ To safeguard your application, you can use our Llama Guard models for prompt cla * We prepared this model using the following command ``` -python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" ``` @@ -97,7 +97,7 @@ python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" ``` You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. 
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index eb3c244dee7..8aeed59cab9 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -45,9 +45,9 @@ Install the required packages to export the model sh examples/models/llama/install_requirements.sh ``` -Export the model +Export the model (Llama 3 in this case) ``` -python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +python -m examples.models.llama.export_llama --model llama3 --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index 201a2934470..63dfd334a10 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -48,14 +48,14 @@ sh examples/models/llama/install_requirements.sh Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte" ``` ### For Llama 3.2 1B and 3B QAT+LoRA models Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend. 
* Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte" ``` ### For Llama 3.2 1B and 3B BF16 models @@ -64,7 +64,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). @@ -73,7 +73,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl Export the model ``` -python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +python -m examples.models.llama.export_llama --model llama3_2 --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` ### For LLaVA model diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 1ae6796b575..d06c0c031d2 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -166,6 +166,7 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -kv \ @@ -187,6 +188,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth LLAMA_PARAMS=path/to/spinquant/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ --use_sdpa_with_kv_cache \ @@ -212,6 +214,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/checkpoint.pth LLAMA_PARAMS=path/to/qlora/params.json python -m examples.models.llama.export_llama \ + --model llama3_2 --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ --params "${LLAMA_PARAMS:?}" \ -qat \ @@ -237,9 +240,20 @@ You can export and run the original Llama 3 8B instruct model. 2. 
Export model and generate `.pte` file ``` - python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + python -m examples.models.llama.export_llama + --model llama3 + --checkpoint + -p + -kv + --use_sdpa_with_kv_cache + -X + -qmode 8da4w + --group_size 128 + -d fp32 + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + --embedding-quantize 4,32 + --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` - Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. ## Step 3: Run on your computer to validate diff --git a/examples/models/llama/UTILS.md b/examples/models/llama/UTILS.md index c2ae26e4835..d26362e3853 100644 --- a/examples/models/llama/UTILS.md +++ b/examples/models/llama/UTILS.md @@ -19,7 +19,7 @@ From `executorch` root: ``` 3. Export model and generate `.pte` file. ``` - python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv + python -m examples.models.llama.export_llama --model llama3 -c stories110M.pt -p params.json -X -kv ``` ## Smaller model delegated to other backends @@ -27,9 +27,9 @@ From `executorch` root: Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is -- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` +- Lower to CoreML: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` +- MPS: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` +- QNN: `python -m examples.models.llama.export_llama --model llama3 -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index f7e308a4321..6e0b3794a74 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -37,7 +37,7 @@ You can export and run the original Llama 2 7B model. 3. Export model and generate `.pte` file: ``` - python -m examples.models.llama.export_llama --model llama3_2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + python -m examples.models.llama.export_llama --model llama2 --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 ``` 4. 
Create tokenizer.bin. ``` From 196499af8535bc900e993339a97ce68ceb95a6a6 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 25 Oct 2024 15:29:03 -0700 Subject: [PATCH 09/10] Change model default arg --- examples/models/llama/export_llama_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 39cb169be34..12598eac365 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -134,9 +134,9 @@ def build_args_parser() -> argparse.ArgumentParser: # ) parser.add_argument( "--model", - default="llama2", + default="llama3", choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS, - help="The Lllama model to export. llama2, llama3, llama3_1, llama3_2 share the same architecture, so they are technically interchangeable, given you provide the checkpoint file for the desired version.", + help="The Lllama model to export. llama2, llama3, llama3_1, llama3_2 share the same architecture, so they are technically interchangeable given you provide the checkpoint file for the desired version.", ) parser.add_argument( "-E", From 96ba40b01a6003512ba3427a49075d2995bafa14 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Fri, 25 Oct 2024 15:32:39 -0700 Subject: [PATCH 10/10] Update eager runner and eval llama --- examples/models/llama/eval_llama_lib.py | 2 +- examples/models/llama/runner/eager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py index 285d2f874df..d9591d4ed1e 100644 --- a/examples/models/llama/eval_llama_lib.py +++ b/examples/models/llama/eval_llama_lib.py @@ -191,7 +191,7 @@ def gen_eval_wrapper( pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) # GPTFastEvalWrapper: Create a wrapper around a pre-exported model - manager: LLMEdgeManager = _prepare_for_llama_export(model_name, args) + manager: LLMEdgeManager = _prepare_for_llama_export(args) if len(quantizers) != 0: manager = manager.export().pt2e_quantize(quantizers) diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py index e116e08a099..e68b85fac2a 100644 --- a/examples/models/llama/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -38,7 +38,7 @@ def __init__(self, args): model_args=model_args, device="cuda" if torch.cuda.is_available() else "cpu", ) - manager: LLMEdgeManager = _prepare_for_llama_export("llama", args) + manager: LLMEdgeManager = _prepare_for_llama_export(args) self.model = manager.model.eval().to(device=self.device) def forward(
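
Taken together, patches 05-09 above replace the hard-coded "llama2" model name with a --model argument that is resolved inside _load_llama_model. The sketch below only restates that routing logic outside the diff context so it can be read in one place; the helper name resolve_model_source is illustrative and not part of the patches, where the equivalent branching lives inline in _load_llama_model.

# Illustrative sketch, not part of the patch series: standalone restatement of the
# --model routing introduced in export_llama_lib.py (patches 05-07). The helper
# name resolve_model_source is hypothetical.
from typing import Tuple

EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"]
TORCHTUNE_DEFINED_MODELS = []  # torchtune-defined Llama models are not yet exportable


def resolve_model_source(modelname: str) -> Tuple[str, str]:
    """Map a --model value to the (module, class) pair given to EagerModelFactory.create_model."""
    if modelname in EXECUTORCH_DEFINED_MODELS:
        # llama2, llama3, llama3_1 and llama3_2 share the architecture defined in
        # examples/models/llama2, so they all load through Llama2Model.
        return "llama2", "Llama2Model"
    if modelname in TORCHTUNE_DEFINED_MODELS:
        raise NotImplementedError(
            "Torchtune Llama models are not yet supported in ExecuTorch export."
        )
    raise ValueError(f"{modelname} is not a valid Llama model.")


# Example: resolve_model_source("llama3_2") returns ("llama2", "Llama2Model").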