From e15665721136a81e231e312b0d7c7535a8fb27fd Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Thu, 26 Sep 2024 23:20:25 -0500
Subject: [PATCH 1/4] update gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index bd3528a4c4b..176edf9300b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 .hypothesis
 buck-out/
-cmake-out/
+cmake-out*
+.DS_Store
 cmake-android-out/
 cmake-out-android/
 cmake-ios-out/

From f61d801f4250fb5b1fcec11b790eae89e8d739a3 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Thu, 26 Sep 2024 23:21:11 -0500
Subject: [PATCH 2/4] [llm] add stat reset()

---
 extension/llm/runner/stats.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h
index 902ba892966..94b5129bbaf 100644
--- a/extension/llm/runner/stats.h
+++ b/extension/llm/runner/stats.h
@@ -52,6 +52,19 @@ struct Stats {
     aggregate_sampling_timer_start_timestamp = 0;
   }

+  void reset() {
+    model_load_start_ms = 0;
+    model_load_end_ms = 0;
+    inference_start_ms = 0;
+    prompt_eval_end_ms = 0;
+    first_token_ms = 0;
+    inference_end_ms = 0;
+    aggregate_sampling_time_ms = 0;
+    num_prompt_tokens = 0;
+    num_generated_tokens = 0;
+    aggregate_sampling_timer_start_timestamp = 0;
+  }
+
  private:
  long aggregate_sampling_timer_start_timestamp = 0;
 };
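A minimal sketch, not part of the patch, of how the new reset() is meant to be used: a warmup pass fills in the counters, and reset() clears them so the measured pass reports from zero. The include path and namespace here are assumed from where stats.h lives in the tree.

    #include <executorch/extension/llm/runner/stats.h>  // assumed include path

    int main() {
      // Namespace assumed from the ExecuTorch LLM runner extension.
      ::executorch::extension::llm::Stats stats;

      // A warmup generation populates timing and token counters...
      stats.num_prompt_tokens = 4;
      stats.num_generated_tokens = 16;

      // ...and reset() returns every counter to zero so the next,
      // measured run starts from a clean slate.
      stats.reset();
      return 0;
    }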
From 5e5c3fe255a698a84de388630edab8904d58adb8 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Thu, 26 Sep 2024 23:25:15 -0500
Subject: [PATCH 3/4] [llama] Add warmup

---
 examples/models/llama2/main.cpp          |  7 +++
 examples/models/llama2/runner/runner.cpp | 57 +++++++++++++++++++-----
 examples/models/llama2/runner/runner.h   |  6 ++-
 3 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/examples/models/llama2/main.cpp b/examples/models/llama2/main.cpp
index c7ad96fac9d..30aef95f580 100644
--- a/examples/models/llama2/main.cpp
+++ b/examples/models/llama2/main.cpp
@@ -39,6 +39,8 @@ DEFINE_int32(
     -1,
     "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

+DEFINE_bool(warmup, false, "Whether to run a warmup run.");
+
 int32_t main(int32_t argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);

@@ -57,6 +59,8 @@ int32_t main(int32_t argc, char** argv) {

   int32_t cpu_threads = FLAGS_cpu_threads;

+  bool warmup = FLAGS_warmup;
+
 #if defined(ET_USE_THREADPOOL)
   uint32_t num_performant_cores = cpu_threads == -1
       ? torch::executorch::cpuinfo::get_num_performant_cores()
@@ -71,6 +75,9 @@ int32_t main(int32_t argc, char** argv) {

   // create llama runner
   example::Runner runner(model_path, tokenizer_path, temperature);
+  if (warmup) {
+    runner.warmup(prompt, seq_len);
+  }

   // generate
   runner.generate(prompt, seq_len);

diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp
index 764f4f7f68c..7d314ae902c 100644
--- a/examples/models/llama2/runner/runner.cpp
+++ b/examples/models/llama2/runner/runner.cpp
@@ -146,12 +146,21 @@ Error Runner::load() {
   return Error::Ok;
 }

+// Don't print with the same priority during warmup
+#define RUNNER_ET_LOG(warmup, format, ...) \
+  if (warmup) {                            \
+    ET_LOG(Debug, format, __VA_ARGS__);    \
+  } else {                                 \
+    ET_LOG(Info, format, __VA_ARGS__);     \
+  }
+
 Error Runner::generate(
     const std::string& prompt,
     int32_t seq_len,
     std::function<void(const std::string&)> token_callback,
     std::function<void(const llm::Stats&)> stats_callback,
-    bool echo) {
+    bool echo,
+    bool warmup) {
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
@@ -161,16 +170,22 @@ Error Runner::generate(
     stats_.model_load_end_ms = llm::time_in_ms();
   }

-  ET_LOG(
-      Info,
+  if (warmup) {
+    ET_LOG(Info, "Doing a warmup run...");
+  }
+
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after loading model: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);

   // Wrap the token_callback with print function
   std::function<void(const std::string&)> wrapped_callback =
-      [token_callback](const std::string& piece) {
-        llm::safe_printf(piece.c_str());
-        fflush(stdout);
+      [token_callback, warmup](const std::string& piece) {
+        if (!warmup) {
+          llm::safe_printf(piece.c_str());
+          fflush(stdout);
+        }
         if (token_callback) {
           token_callback(piece);
         }
       };
@@ -228,8 +243,8 @@ Error Runner::generate(

   // print the first token from prefill. No prev_token so use cur_token for it.
   wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
-  ET_LOG(
-      Info,
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after prompt prefill: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);

@@ -240,18 +255,24 @@ Error Runner::generate(
   stats_.inference_end_ms = llm::time_in_ms();
   printf("\n");

-  ET_LOG(
-      Info,
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after finishing text generation: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);

   if (num_prompt_tokens + num_generated_tokens == seq_len) {
-    ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
+    RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len);
   }

   stats_.num_prompt_tokens = num_prompt_tokens;
   stats_.num_generated_tokens = num_generated_tokens;
-  ::executorch::llm::print_report(stats_);
+
+  if (warmup) {
+    ET_LOG(Info, "Warmup run finished!");
+  } else {
+    // Do not print report during warmup
+    ::executorch::llm::print_report(stats_);
+  }
   if (stats_callback) {
     stats_callback(stats_);
   }
@@ -259,6 +280,18 @@ Error Runner::generate(
   return Error::Ok;
 }

+Error Runner::warmup(const std::string& prompt, int32_t seq_len) {
+  Error err = generate(
+      prompt,
+      seq_len,
+      /*token_callback=*/nullptr,
+      /*stats_callback=*/nullptr,
+      /*echo=*/false,
+      /*warmup=*/true);
+  stats_.reset();
+  return err;
+}
+
 void Runner::stop() {
   if (is_loaded()) {
     text_token_generator_->stop();

diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h
index 72d0ea12a50..ca843427a7b 100644
--- a/examples/models/llama2/runner/runner.h
+++ b/examples/models/llama2/runner/runner.h
@@ -41,7 +41,11 @@ class Runner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const ::executorch::extension::llm::Stats&)>
           stats_callback = {},
-      bool echo = true);
+      bool echo = true,
+      bool warmup = false);
+  ::executorch::runtime::Error warmup(
+      const std::string& prompt,
+      int32_t seq_len = 128);
   void stop();

 private:
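One note on the RUNNER_ET_LOG macro above: because it expands to a bare if/else, it will not compose cleanly if a call site ever uses it as the body of an unbraced if or else. The call sites in this patch are all safe, but the conventional hardening, shown here only as a sketch and not part of the patch, is the usual do/while(0) wrapper:

    // Sketch: same behavior, but RUNNER_ET_LOG(warmup, ...); always acts as a
    // single statement, even inside an unbraced if/else at the call site.
    #define RUNNER_ET_LOG(warmup, format, ...)  \
      do {                                      \
        if (warmup) {                           \
          ET_LOG(Debug, format, __VA_ARGS__);   \
        } else {                                \
          ET_LOG(Info, format, __VA_ARGS__);    \
        }                                       \
      } while (0)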
From 3668b269fb787d65b4d027ebc474c19dcdce1e49 Mon Sep 17 00:00:00 2001
From: Digant Desai
Date: Mon, 30 Sep 2024 13:18:51 -0500
Subject: [PATCH 4/4] add warmup option for test

---
 .ci/scripts/test_llama.sh                | 2 +-
 examples/models/llama2/runner/runner.cpp | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 12ba9ba52db..a912d565a3e 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -213,7 +213,7 @@ echo "Creating tokenizer.bin"

 $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin

-RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10"
+RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10 --warmup=1"
 # Check build tool.
 echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
 if [[ "${BUILD_TOOL}" == "buck2" ]]; then

diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp
index 7d314ae902c..499bfbedf15 100644
--- a/examples/models/llama2/runner/runner.cpp
+++ b/examples/models/llama2/runner/runner.cpp
@@ -254,7 +254,9 @@ Error Runner::generate(
       prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback));
   stats_.inference_end_ms = llm::time_in_ms();
-  printf("\n");
+  if (!warmup) {
+    printf("\n");
+  }

   RUNNER_ET_LOG(
       warmup,
       "RSS after finishing text generation: %f MiB (0 if unsupported)",
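Taken together, the series gives callers an optional quiet warmup pass before the measured run. A caller-side sketch follows; the include path is assumed from the file locations above, and the model and tokenizer paths are hypothetical placeholders:

    #include <executorch/examples/models/llama2/runner/runner.h>  // assumed path

    #include <cstdint>
    #include <string>

    int main() {
      // Hypothetical artifacts; substitute your exported model and tokenizer.
      const std::string model_path = "llama2.pte";
      const std::string tokenizer_path = "tokenizer.bin";
      const std::string prompt = "Once";
      const int32_t seq_len = 10;

      example::Runner runner(model_path, tokenizer_path, /*temperature=*/0.0f);

      // Quiet pass: generate() runs with warmup=true, so nothing is printed
      // and stats_ is reset afterwards instead of being reported.
      runner.warmup(prompt, seq_len);

      // Measured pass: prints tokens and the usual stats report.
      runner.generate(prompt, seq_len);
      return 0;
    }

This mirrors what main.cpp does when the new --warmup flag is set, and it is the path the CI script now exercises via --warmup=1.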