From b03e7f2c1487b6b8da207f8ccdc1c5aff3aa9583 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Mon, 26 Aug 2024 20:00:20 -0700
Subject: [PATCH] enable parallel prefill again (#4893)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4893

The parallel prefill enablement logic accidentally fell through the cracks.

before
{F1823815458}

after
{F1823818585}

Observe the calls to the 4x8 kernels.

Reviewed By: swolchok, larryliu0820

Differential Revision: D61751873
---
 examples/models/llama2/runner/runner.cpp | 2 +-
 examples/models/llama2/runner/runner.h   | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp
index dd0a305a371..7a2fa676628 100644
--- a/examples/models/llama2/runner/runner.cpp
+++ b/examples/models/llama2/runner/runner.cpp
@@ -126,7 +126,7 @@ Error Runner::load() {
       tokenizer_.get(),
       text_decoder_runner_.get(),
       metadata_.at(kUseKVCache),
-      enable_parallel_prefill_);
+      metadata_.at(kEnableDynamicShape));
   text_token_generator_ = std::make_unique<TextTokenGenerator>(
       tokenizer_.get(),
diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h
index 12fb63c6f34..4e3c1daef7b 100644
--- a/examples/models/llama2/runner/runner.h
+++ b/examples/models/llama2/runner/runner.h
@@ -45,7 +45,6 @@ class Runner {
  private:
   float temperature_;
-  bool enable_parallel_prefill_;
   bool shouldStop_{false};

   // model
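
Note: the sketch below is a minimal, self-contained illustration of the fix: the
parallel-prefill flag is now sourced from model metadata instead of a member
that was never wired up, presumably because parallel prefill (running the whole
prompt through the decoder in one forward pass) requires dynamic sequence
lengths. The Tokenizer, TextDecoderRunner, and TextPrefiller shapes below are
stand-ins inferred from the call site in the diff, not the real ExecuTorch
headers.

    #include <cstdio>
    #include <map>
    #include <memory>
    #include <string>

    // Stand-in types; the real ones live in the ExecuTorch LLM runner.
    struct Tokenizer {};
    struct TextDecoderRunner {};

    // Assumed constructor shape, inferred from the call site in runner.cpp:
    // the last argument toggles parallel prefill.
    class TextPrefiller {
     public:
      TextPrefiller(
          Tokenizer* /*tokenizer*/,
          TextDecoderRunner* /*decoder_runner*/,
          bool /*use_kv_cache*/,
          bool enable_parallel_prefill)
          : enable_parallel_prefill_(enable_parallel_prefill) {}

      bool parallel_prefill_enabled() const {
        return enable_parallel_prefill_;
      }

     private:
      bool enable_parallel_prefill_;
    };

    int main() {
      Tokenizer tokenizer;
      TextDecoderRunner decoder_runner;
      // Hypothetical metadata map mirroring the runner's metadata_.at(...)
      // lookups; after the patch, parallel prefill is keyed off the
      // dynamic-shape flag rather than the deleted member.
      std::map<std::string, bool> metadata = {
          {"use_kv_cache", true},
          {"enable_dynamic_shape", true},
      };
      auto text_prefiller = std::make_unique<TextPrefiller>(
          &tokenizer,
          &decoder_runner,
          metadata.at("use_kv_cache"),
          metadata.at("enable_dynamic_shape"));
      std::printf(
          "parallel prefill: %s\n",
          text_prefiller->parallel_prefill_enabled() ? "on" : "off");
      return 0;
    }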