diff --git a/.ci/scripts/test_qnn_static_llama.sh b/.ci/scripts/test_qnn_static_llama.sh
index 8aab21846f1..5df74bddef4 100644
--- a/.ci/scripts/test_qnn_static_llama.sh
+++ b/.ci/scripts/test_qnn_static_llama.sh
@@ -34,11 +34,11 @@ $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o to
 
 set +e
 # Compile only as weight sharing is not applicable on x86
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only
 exit_code1=$?
 
 # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
 exit_code2=$?
 
 # Check the exit codes and print messages
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 8757e9ce728..6242558189e 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -2991,6 +2991,173 @@ def test_qnn_backend_draw_graph(self):
         ), "Generated .dot file does not match the golden file."
 
 
+class TestExampleLLMScript(TestQNN):
+    def required_envs(self, conditions=None) -> bool:
+        conditions = [] if conditions is None else conditions
+        return all(
+            [
+                self.executorch_root,
+                self.artifact_dir,
+                *conditions,
+            ]
+        )
+
+    def test_llama3_2_1b(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "What is the meaning of life?"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/consolidated.00.pth",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "llama3_2",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "512",
+            "--num_sharding",
+            "4",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                # Inference speed on x86 is slow, so we only check when running on Android
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 1300000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
+
+    def test_llama_stories_110m(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories110M.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "stories110m",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 130000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
+
+
 class TestExampleOssScript(TestQNN):
     def required_envs(self, conditions=None) -> bool:
         conditions = [] if conditions is None else conditions
@@ -3886,72 +4053,6 @@ def test_deeplab_v3(self):
             self.assertGreaterEqual(msg["MPA"], 0.70)
             self.assertGreaterEqual(msg["MIoU"], 0.55)
 
-    def test_stories_single_llama(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--checkpoint",
-            f"{self.artifact_dir}/stories110M.pt",
-            "--params",
-            f"{self.artifact_dir}/params.json",
-            "--tokenizer_model",
-            f"{self.artifact_dir}/tokenizer.model",
-            "--tokenizer_bin",
-            f"{self.artifact_dir}/tokenizer.bin",
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            "Once",
-            "--ptq",
-            "16a4w",
-            "--temperature",
-            "0",
-            "--llama_model",
-            "stories110m",
-            "--model_mode",
-            "hybrid",
-            "--prefill_seq_len",
-            "32",
-            "--kv_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-
-        golden_start_with = "Once upon a time,"
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.compile_only:
-                    model_out = msg["result"][0]
-                    self.assertTrue(model_out.startswith(golden_start_with))
-                # x86 does not allow weight sharing, so we don't check pte size
-                if not self.enable_x86_64:
-                    pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 130000000)
-
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
         if not self.required_envs([self.pretrained_weight]):
@@ -4156,6 +4257,18 @@ def setup_environment():
         type=str,
     )
 
+    parser.add_argument(
+        "--pre_gen_pte",
+        help="Run the pre-generated pte in the given directory.",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--llama_artifacts",
+        help="A folder that contains: weight, tokenizer, and params.",
+        type=str,
+    )
+
     args, ns_args = parser.parse_known_args(namespace=unittest)
     TestQNN.host = args.host
     TestQNN.device = args.device
@@ -4174,6 +4287,8 @@ def setup_environment():
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
     TestQNN.compile_only = args.compile_only
+    TestQNN.pre_gen_pte = args.pre_gen_pte
+    TestQNN.llama_artifacts = args.llama_artifacts
     return sys.argv[:1] + ns_args
 
 
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index b2b9561b632..ce88bca97d6 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -188,6 +188,8 @@ class TestQNN(unittest.TestCase):
     shared_buffer: bool = False
     enable_x86_64: bool = False
     compile_only: bool = False
+    pre_gen_pte: str = ""
+    llama_artifacts: str = ""
 
     def _assert_outputs_equal(self, model_output, ref_output):
         self.assertTrue(len(ref_output) == len(model_output))
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index ab27714ae1f..48353d3ee6b 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -881,6 +881,10 @@ def post_process():
     adb.pull(output_path=args.artifact, callback=post_process)
 
     if args.ip and args.port != -1:
+        inference_speed = 0
+        with open(f"{args.artifact}/outputs/inference_speed.txt", "r") as f:
+            inference_speed = float(f.read())
+
         pte_size = os.path.getsize(pte_path)
         with Client((args.ip, args.port)) as conn:
             conn.send(
@@ -888,6 +892,7 @@ def post_process():
                     {
                         "result": outputs,
                         "pte_size": pte_size,
+                        "inference_speed": inference_speed,
                     }
                 )
             )
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index 4b45863147e..70ba25a0972 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include <fstream>
 #include 
 
 using executorch::aten::Tensor;
@@ -518,6 +519,19 @@ void printReport(const Runner::Stats& stats) {
       stats.num_generated_tokens,
       (double)stats.aggregate_sampling_time_ms /
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
+
+  // For now, we just print the total inference time for CI, can save more info
+  // in future if needed.
+  std::ofstream outfile("outputs/inference_speed.txt");
+  if (outfile.is_open()) {
+    double num_tok = (stats.num_generated_tokens) /
+        (double)(stats.inference_end_ms - stats.inference_start_ms) *
+        stats.SCALING_FACTOR_UNITS_PER_SECOND;
+    outfile << num_tok;
+    outfile.close();
+  } else {
+    ET_CHECK_MSG(false, "Error saving the inference speed file");
+  }
 }
 
 std::string statsToJsonString(const Runner::Stats& stats) {