@@ -39,7 +39,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure

```bash
# Please note that calibration_data must include the prompt template for special tokens.
python -m examples.models.llama.export_llama -t <path_to_tokenizer.model>
python -m examples.models.llama.export_llama -t <path_to_tokenizer.model>
llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p <path_to_params.json> -c <path_to_checkpoint_for_Meta-Llama-3-8B-Instruct> --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path <path_to_optimized_matrix> --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
```
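As a rough illustration of that note (not part of this change; `build_calibration_prompt` is a hypothetical helper), the calibration string simply wraps the system and user messages in the same Llama 3 instruct template used in the command above:

```python
def build_calibration_prompt(system: str, user: str) -> str:
    """Assemble a Llama 3 instruct-style prompt; a sketch, not an ExecuTorch API."""
    return (
        f"<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

# Reproduces the --calibration_data value shown above.
print(build_calibration_prompt("You are a funny chatbot.", "Could you tell me about Facebook?"))
```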

@@ -158,7 +158,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure
* 8B models might need 16GB RAM on the device to run.
```
# Please note that calibration_data must include the prompt template for special tokens.
python -m examples.models.llama.export_llama -t <path_to_tokenizer.model> -p <path_to_params.json> -c <path_to_checkpoint_for_Meta-Llama-3-8B-Instruct> --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path <path_to_optimized_matrix> --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
python -m examples.models.llama.export_llama -t <path_to_tokenizer.model> -p <path_to_params.json> -c <path_to_checkpoint_for_Meta-Llama-3-8B-Instruct> --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path <path_to_optimized_matrix> --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
```

## Pushing Model and Tokenizer
14 changes: 12 additions & 2 deletions examples/models/llama/README.md
@@ -239,9 +239,19 @@ You can export and run the original Llama 3 8B instruct model.

2. Export model and generate `.pte` file
```
python -m examples.models.llama.export_llama --checkpoint <consolidated.00.pth> -p <params.json> -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
python -m examples.models.llama.export_llama \
--checkpoint <consolidated.00.pth> \
-p <params.json> \
-kv \
--use_sdpa_with_kv_cache \
-X \
-qmode 8da4w \
--group_size 128 \
-d fp32 \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--embedding-quantize 4,32 \
--output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
```

Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size.
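
For intuition about why this helps (a simplified sketch assuming symmetric per-group scales, not the actual ExecuTorch kernel), 4-bit group-wise quantization stores each group of 32 embedding weights as int4 values plus one scale, which shrinks Llama 3's large vocabulary table by roughly 8x relative to fp32:

```python
import torch

def quantize_embedding_groupwise(weight: torch.Tensor, group_size: int = 32):
    # Illustrative approximation of --embedding-quantize 4,32: per-group scale + int4 values.
    vocab, dim = weight.shape
    w = weight.reshape(vocab, dim // group_size, group_size)
    scale = (w.abs().amax(dim=-1, keepdim=True) / 7.0).clamp(min=1e-8)  # int4 range ~[-8, 7]
    q = torch.clamp(torch.round(w / scale), -8, 7).to(torch.int8)
    return q, scale  # ~4 bits per weight plus one scale per 32 weights

# Small stand-in table; Llama 3's real embedding is 128256 x 4096.
q, scale = quantize_embedding_groupwise(torch.randn(1024, 4096))
```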


2 changes: 1 addition & 1 deletion examples/models/llama/eval_llama_lib.py
@@ -190,7 +190,7 @@ def gen_eval_wrapper(

pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
# GPTFastEvalWrapper: Create a wrapper around a pre-exported model
manager: LLMEdgeManager = _prepare_for_llama_export(model_name, args)
manager: LLMEdgeManager = _prepare_for_llama_export(args)

if len(quantizers) != 0:
manager = manager.export().pt2e_quantize(quantizers)
3 changes: 1 addition & 2 deletions examples/models/llama/export_llama.py
@@ -23,10 +23,9 @@
def main() -> None:
seed = 42
torch.manual_seed(seed)
modelname = "llama2"
parser = build_args_parser()
args = parser.parse_args()
export_llama(modelname, args)
export_llama(args)


if __name__ == "__main__":
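
To make the signature change above concrete, here is a hedged sketch (flag values and paths are placeholders, and the import path assumes the layout shown in this diff) of driving the export programmatically now that the architecture is picked by the `--model` flag added in `export_llama_lib.py` below, instead of a positional `modelname`:

```python
from examples.models.llama.export_llama_lib import build_args_parser, export_llama

parser = build_args_parser()
args = parser.parse_args(
    [
        "--model", "llama3",                    # one of EXECUTORCH_DEFINED_MODELS
        "--checkpoint", "consolidated.00.pth",  # placeholder path
        "-p", "params.json",                    # placeholder path
        "-kv",
        "-X",
    ]
)
print(export_llama(args))  # prints the saved .pte filename
```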
50 changes: 36 additions & 14 deletions examples/models/llama/export_llama_lib.py
@@ -81,6 +81,10 @@
verbosity_setting = None


EXECUTORCH_DEFINED_MODELS = ["stories110m", "llama2", "llama3", "llama3_1", "llama3_2"]
TORCHTUNE_DEFINED_MODELS = []


class WeightType(Enum):
LLAMA = "LLAMA"
FAIRSEQ2 = "FAIRSEQ2"
@@ -105,7 +109,7 @@ def verbose_export():


def build_model(
modelname: str = "model",
modelname: str = "llama3",
extra_opts: str = "",
*,
par_local_output: bool = False,
@@ -116,11 +120,11 @@
else:
output_dir_path = "."

argString = f"--checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}"
argString = f"--model {modelname} --checkpoint par:model_ckpt.pt --params par:model_params.json {extra_opts} --output-dir {output_dir_path}"
parser = build_args_parser()
args = parser.parse_args(shlex.split(argString))
# pkg_name = resource_pkg_name
return export_llama(modelname, args)
return export_llama(args)


def build_args_parser() -> argparse.ArgumentParser:
@@ -130,6 +134,12 @@ def build_args_parser() -> argparse.ArgumentParser:
# parser.add_argument(
# "-q", "--quantized_ckpt", default=None, help="quantized checkpoint file"
# )
parser.add_argument(
"--model",
default="llama3",
choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS,
help="The Lllama model architecture to use. stories110M, llama2, llama3, llama3_1, and llama3_2 use the same underlying LlamaTransformer architecture defined in ExecuTorch. All other models use TorchTune model definitions.",
)
parser.add_argument(
"-E",
"--embedding-quantize",
@@ -480,13 +490,13 @@ def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str:
return return_val


def export_llama(modelname, args) -> str:
def export_llama(args) -> str:
if args.profile_path is not None:
try:
from executorch.util.python_profiler import CProfilerFlameGraph

with CProfilerFlameGraph(args.profile_path):
builder = _export_llama(modelname, args)
builder = _export_llama(args)
assert (
filename := builder.get_saved_pte_filename()
) is not None, "Fail to get file name from builder"
@@ -497,14 +507,14 @@ def export_llama(modelname, args) -> str:
)
return ""
else:
builder = _export_llama(modelname, args)
builder = _export_llama(args)
assert (
filename := builder.get_saved_pte_filename()
) is not None, "Fail to get file name from builder"
return filename


def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
def _prepare_for_llama_export(args) -> LLMEdgeManager:
"""
Helper function for export_llama. Loads the model from checkpoint and params,
and sets up a LLMEdgeManager with initial transforms and dtype conversion.
@@ -530,7 +540,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:

return (
_load_llama_model(
modelname=modelname,
args.model,
checkpoint=checkpoint_path,
checkpoint_dir=checkpoint_dir,
params_path=params_path,
@@ -553,7 +563,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
args=args,
)
.set_output_dir(output_dir_path)
.source_transform(_get_source_transforms(modelname, dtype_override, args))
.source_transform(_get_source_transforms(args.model, dtype_override, args))
)


@@ -627,12 +637,12 @@ def _validate_args(args):
)


def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901
def _export_llama(args) -> LLMEdgeManager: # noqa: C901
_validate_args(args)
pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)

# export_to_edge
builder_exported = _prepare_for_llama_export(modelname, args).export()
builder_exported = _prepare_for_llama_export(args).export()

if args.export_only:
exit()
@@ -830,8 +840,8 @@ def _load_llama_model_metadata(


def _load_llama_model(
modelname: str = "llama3",
*,
modelname: str = "llama2",
checkpoint: Optional[str] = None,
checkpoint_dir: Optional[str] = None,
params_path: str,
@@ -859,15 +869,27 @@
Returns:
An instance of LLMEdgeManager which contains the eager mode model.
"""

assert (
checkpoint or checkpoint_dir
) and params_path, "Both checkpoint/checkpoint_dir and params can't be empty"
logging.info(
f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}"
)

if modelname in EXECUTORCH_DEFINED_MODELS:
module_name = "llama"
model_class_name = "Llama2Model" # TODO: Change to "LlamaModel" in examples/models/llama/model.py.
elif modelname in TORCHTUNE_DEFINED_MODELS:
raise NotImplementedError(
"Torchtune Llama models are not yet supported in ExecuTorch export."
)
else:
raise ValueError(f"{modelname} is not a valid Llama model.")

model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model(
module_name="llama",
model_class_name="Llama2Model",
module_name,
model_class_name,
checkpoint=checkpoint,
checkpoint_dir=checkpoint_dir,
params=params_path,
2 changes: 1 addition & 1 deletion examples/models/llama/runner/eager.py
@@ -37,7 +37,7 @@ def __init__(self, args):
model_args=model_args,
device="cuda" if torch.cuda.is_available() else "cpu",
)
manager: LLMEdgeManager = _prepare_for_llama_export("llama", args)
manager: LLMEdgeManager = _prepare_for_llama_export(args)
self.model = manager.model.eval().to(device=self.device)

def forward(