diff --git a/examples/qualcomm/oss_scripts/albert.py b/examples/qualcomm/oss_scripts/albert.py index 3be48215ac6..d529e5db734 100644 --- a/examples/qualcomm/oss_scripts/albert.py +++ b/examples/qualcomm/oss_scripts/albert.py @@ -30,6 +30,9 @@ def main(args): + if args.compile_only and args.pre_gen_pte: + raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true") + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) os.makedirs(args.artifact, exist_ok=True) @@ -60,26 +63,32 @@ def main(args): module = AutoModelForMaskedLM.from_pretrained(model_name, config=config).eval() pte_filename = "albert_qnn_q16" - # lower to QNN - passes_job = get_capture_program_passes() - build_executorch_binary( - module, - inputs[0], - args.model, - f"{args.artifact}/{pte_filename}", - dataset=inputs, - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, - quant_dtype=QuantDtype.use_16a16w, - passes_job=passes_job, - shared_buffer=args.shared_buffer, - ) + # Skip lowering/compilation if using pre-generated PTE + if not args.pre_gen_pte: + # lower to QNN + passes_job = get_capture_program_passes() + build_executorch_binary( + module, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_16a16w, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) if args.compile_only: return workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" - pte_path = f"{args.artifact}/{pte_filename}.pte" + pte_path = ( + f"{args.pre_gen_pte}/{pte_filename}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filename}.pte" + ) adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/bert.py b/examples/qualcomm/oss_scripts/bert.py index 0f9255cefdb..aa41df6ff4d 100644 --- a/examples/qualcomm/oss_scripts/bert.py +++ b/examples/qualcomm/oss_scripts/bert.py @@ -30,6 +30,9 @@ def main(args): + if args.compile_only and args.pre_gen_pte: + raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true") + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) os.makedirs(args.artifact, exist_ok=True) @@ -57,26 +60,32 @@ def main(args): ).eval() pte_filename = "bert_qnn_q16" - # lower to QNN - passes_job = get_capture_program_passes() - build_executorch_binary( - module, - inputs[0], - args.model, - f"{args.artifact}/{pte_filename}", - dataset=inputs, - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, - quant_dtype=QuantDtype.use_16a8w, - passes_job=passes_job, - shared_buffer=args.shared_buffer, - ) + # Skip lowering/compilation if using pre-generated PTE + if not args.pre_gen_pte: + # lower to QNN + passes_job = get_capture_program_passes() + build_executorch_binary( + module, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_16a8w, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) if args.compile_only: return workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" - pte_path = f"{args.artifact}/{pte_filename}.pte" + pte_path = ( + f"{args.pre_gen_pte}/{pte_filename}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filename}.pte" + ) adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/distilbert.py b/examples/qualcomm/oss_scripts/distilbert.py index 7ca05181645..ce88f61ca5c 100644 --- a/examples/qualcomm/oss_scripts/distilbert.py +++ b/examples/qualcomm/oss_scripts/distilbert.py @@ -31,6 +31,9 @@ def main(args): + if args.compile_only and args.pre_gen_pte: + raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true") + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) os.makedirs(args.artifact, exist_ok=True) @@ -58,26 +61,32 @@ def main(args): ).eval() pte_filename = "distilbert_qnn_q16" - # lower to QNN - passes_job = get_capture_program_passes() - build_executorch_binary( - module, - inputs[0], - args.model, - f"{args.artifact}/{pte_filename}", - dataset=inputs, - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, - quant_dtype=QuantDtype.use_16a8w, - passes_job=passes_job, - shared_buffer=args.shared_buffer, - ) + # Skip lowering/compilation if using pre-generated PTE + if not args.pre_gen_pte: + # lower to QNN + passes_job = get_capture_program_passes() + build_executorch_binary( + module, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_16a8w, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) if args.compile_only: return workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" - pte_path = f"{args.artifact}/{pte_filename}.pte" + pte_path = ( + f"{args.pre_gen_pte}/{pte_filename}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filename}.pte" + ) adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), diff --git a/examples/qualcomm/oss_scripts/eurobert.py b/examples/qualcomm/oss_scripts/eurobert.py index a856616bcf2..5e133aed0d1 100644 --- a/examples/qualcomm/oss_scripts/eurobert.py +++ b/examples/qualcomm/oss_scripts/eurobert.py @@ -35,6 +35,9 @@ def main(args): + if args.compile_only and args.pre_gen_pte: + raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true") + assert ( transformers.__version__ >= TRANSFORMERS_VERSION ), f"Please ensure transformers version >= {TRANSFORMERS_VERSION}, current version is {transformers.__version__}" @@ -88,33 +91,40 @@ def replace_rms_norm_with_native_rms_norm(module: torch.nn.Module): pte_filename = "eurobert_qnn_q16" - # lower to QNN - passes_job = get_capture_program_passes() - quantizer = make_quantizer( - quant_dtype=QuantDtype.use_16a16w, - ) - quantizer.add_custom_quant_annotations((annotate_eurobert,)) - with torch.no_grad(): - build_executorch_binary( - model, - inputs[0], - args.model, - f"{args.artifact}/{pte_filename}", - dataset=inputs, - skip_node_id_set=skip_node_id_set, - skip_node_op_set=skip_node_op_set, - custom_quantizer=quantizer, - passes_job=passes_job, - shared_buffer=args.shared_buffer, + # Skip lowering/compilation if using pre-generated PTE + if not args.pre_gen_pte: + # lower to QNN + passes_job = get_capture_program_passes() + quantizer = make_quantizer( + quant_dtype=QuantDtype.use_16a16w, ) + quantizer.add_custom_quant_annotations((annotate_eurobert,)) + with torch.no_grad(): + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + custom_quantizer=quantizer, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) if args.compile_only: return + pte_path = ( + f"{args.pre_gen_pte}/{pte_filename}.pte" + if args.pre_gen_pte + else f"{args.artifact}/{pte_filename}.pte" + ) adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), build_path=f"{args.build_folder}", - pte_path=f"{args.artifact}/{pte_filename}.pte", + pte_path=pte_path, workspace=f"/data/local/tmp/executorch/{pte_filename}", device_id=args.device, host_id=args.host, diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 91d82531654..8b54b8e150c 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -1023,7 +1023,8 @@ def post_process(): runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner", ) # No pregen inputs, input_list is not required - adb.push(inputs=[], files=[runtime_tokenizer_path]) + if not args.skip_push: + adb.push(inputs=[], files=[runtime_tokenizer_path]) adb.execute(custom_runner_cmd=runner_cmd) adb.pull(output_path=args.artifact, callback=post_process)