From b03e7f2c1487b6b8da207f8ccdc1c5aff3aa9583 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Mon, 26 Aug 2024 20:00:20 -0700
Subject: [PATCH] enable parallel prefill again (#4893)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4893

The parallel prefill enablement logic accidentally fell through the cracks.

before
{F1823815458}

after
{F1823818585}

Observe the calls to the 4x8 kernels.

Reviewed By: swolchok, larryliu0820

Differential Revision: D61751873
---
 examples/models/llama2/runner/runner.cpp | 2 +-
 examples/models/llama2/runner/runner.h   | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp
index dd0a305a371..7a2fa676628 100644
--- a/examples/models/llama2/runner/runner.cpp
+++ b/examples/models/llama2/runner/runner.cpp
@@ -126,7 +126,7 @@ Error Runner::load() {
       tokenizer_.get(),
       text_decoder_runner_.get(),
       metadata_.at(kUseKVCache),
-      enable_parallel_prefill_);
+      metadata_.at(kEnableDynamicShape));
   text_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(),
diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h
index 12fb63c6f34..4e3c1daef7b 100644
--- a/examples/models/llama2/runner/runner.h
+++ b/examples/models/llama2/runner/runner.h
@@ -45,7 +45,6 @@ class Runner {
  private:
   float temperature_;
-  bool enable_parallel_prefill_;
   bool shouldStop_{false};

   // model
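
Note: the sketch below is a minimal, self-contained illustration of the fix: the
parallel-prefill flag is now sourced from model metadata instead of a member
that was never wired up, presumably because parallel prefill (running the whole
prompt through the decoder in one forward pass) requires dynamic sequence
lengths. The Tokenizer, TextDecoderRunner, and TextPrefiller shapes below are
stand-ins inferred from the call site in the diff, not the real ExecuTorch
headers.

    #include <cstdio>
    #include <map>
    #include <memory>
    #include <string>

    // Stand-in types; the real ones live in the ExecuTorch LLM runner.
    struct Tokenizer {};
    struct TextDecoderRunner {};

    // Assumed constructor shape, inferred from the call site in runner.cpp:
    // the last argument toggles parallel prefill.
    class TextPrefiller {
     public:
      TextPrefiller(
          Tokenizer* /*tokenizer*/,
          TextDecoderRunner* /*decoder_runner*/,
          bool /*use_kv_cache*/,
          bool enable_parallel_prefill)
          : enable_parallel_prefill_(enable_parallel_prefill) {}

      bool parallel_prefill_enabled() const {
        return enable_parallel_prefill_;
      }

     private:
      bool enable_parallel_prefill_;
    };

    int main() {
      Tokenizer tokenizer;
      TextDecoderRunner decoder_runner;
      // Hypothetical metadata map mirroring the runner's metadata_.at(...)
      // lookups; after the patch, parallel prefill is keyed off the
      // dynamic-shape flag rather than the deleted member.
      std::map<std::string, bool> metadata = {
          {"use_kv_cache", true},
          {"enable_dynamic_shape", true},
      };
      auto text_prefiller = std::make_unique<TextPrefiller>(
          &tokenizer,
          &decoder_runner,
          metadata.at("use_kv_cache"),
          metadata.at("enable_dynamic_shape"));
      std::printf(
          "parallel prefill: %s\n",
          text_prefiller->parallel_prefill_enabled() ? "on" : "off");
      return 0;
    }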