Add warmup for Llama #5756
Changes from all commits
```diff
@@ -1,6 +1,7 @@
 .hypothesis
 buck-out/
-cmake-out/
+cmake-out*
 .DS_Store
 cmake-android-out/
+cmake-out-android/
 cmake-ios-out/
```
```diff
@@ -146,12 +146,21 @@ Error Runner::load() {
   return Error::Ok;
 }

+// Don't print with the same priority during warmup
+#define RUNNER_ET_LOG(warmup, format, ...) \
+  if (warmup) {                            \
+    ET_LOG(Debug, format, __VA_ARGS__);    \
+  } else {                                 \
+    ET_LOG(Info, format, __VA_ARGS__);     \
+  }
+
 Error Runner::generate(
     const std::string& prompt,
     int32_t seq_len,
     std::function<void(const std::string&)> token_callback,
     std::function<void(const llm::Stats&)> stats_callback,
-    bool echo) {
+    bool echo,
+    bool warmup) {
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
```
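For reference, a minimal standalone sketch of what the new macro does (the `ET_LOG` stub below is only a stand-in so the snippet compiles on its own; the real macro comes from ExecuTorch's logging header): messages from a warmup run are demoted to Debug priority, everything else stays at Info.

```cpp
#include <cstdio>

// Stand-in for ExecuTorch's ET_LOG, just so this sketch is self-contained.
#define ET_LOG(level, format, ...) \
  std::printf("[" #level "] " format "\n", __VA_ARGS__)

// Same shape as the macro added in this PR.
#define RUNNER_ET_LOG(warmup, format, ...) \
  if (warmup) {                            \
    ET_LOG(Debug, format, __VA_ARGS__);    \
  } else {                                 \
    ET_LOG(Info, format, __VA_ARGS__);     \
  }

int main() {
  double rss_mib = 123.4;  // placeholder value for the example
  RUNNER_ET_LOG(true, "RSS after loading model: %f MiB (0 if unsupported)", rss_mib);   // Debug
  RUNNER_ET_LOG(false, "RSS after loading model: %f MiB (0 if unsupported)", rss_mib);  // Info
  return 0;
}
```

Since the macro expands to a bare if/else, a `do { ... } while (0)` wrapper would be the usual guard against dangling-else surprises; the call sites in this diff are plain statements, so the simpler form works for them.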
```diff
@@ -161,16 +170,22 @@ Error Runner::generate(
     stats_.model_load_end_ms = llm::time_in_ms();
   }

-  ET_LOG(
-      Info,
+  if (warmup) {
+    ET_LOG(Info, "Doing a warmup run...");
+  }
```
Comment on lines +173 to +175

Contributor: maybe add this before generate in Runner::warmup?
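One hypothetical reading of that suggestion (not what the PR ships): log from `Runner::warmup` itself, right before it delegates to `generate`, so `generate()` doesn't need to special-case the message:

```cpp
// Hypothetical rearrangement from the review comment above, not the PR's code.
Error Runner::warmup(const std::string& prompt, int32_t seq_len) {
  ET_LOG(Info, "Doing a warmup run...");
  Error err = generate(
      prompt,
      seq_len,
      /*token_callback=*/nullptr,
      /*stats_callback=*/nullptr,
      /*echo=*/false,
      /*warmup=*/true);
  stats_.reset();
  return err;
}
```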
```diff
+
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after loading model: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);

   // Wrap the token_callback with print function
   std::function<void(const std::string&)> wrapped_callback =
-      [token_callback](const std::string& piece) {
-        llm::safe_printf(piece.c_str());
-        fflush(stdout);
+      [token_callback, warmup](const std::string& piece) {
+        if (!warmup) {
+          llm::safe_printf(piece.c_str());
+          fflush(stdout);
+        }
         if (token_callback) {
           token_callback(piece);
         }

@@ -228,8 +243,8 @@ Error Runner::generate(
   // print the first token from prefill. No prev_token so use cur_token for it.
   wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
-  ET_LOG(
-      Info,
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after prompt prefill: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);
```
```diff
@@ -239,26 +254,46 @@ Error Runner::generate(
       prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback));

   stats_.inference_end_ms = llm::time_in_ms();
-  printf("\n");
-  ET_LOG(
-      Info,
+  if (!warmup) {
+    printf("\n");
```
Contributor: safe_printf?
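Presumably the suggestion is to use the same UTF-8-safe print helper that the token callback already uses instead of raw `printf`; a hypothetical drop-in for the lines above (not what the PR does):

```cpp
if (!warmup) {
  llm::safe_printf("\n");  // hypothetical: safe_printf instead of printf, per the comment
}
```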
```diff
+  }
+  RUNNER_ET_LOG(
+      warmup,
       "RSS after finishing text generation: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);

   if (num_prompt_tokens + num_generated_tokens == seq_len) {
-    ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
+    RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len);
   }

   stats_.num_prompt_tokens = num_prompt_tokens;
   stats_.num_generated_tokens = num_generated_tokens;
-  ::executorch::llm::print_report(stats_);
+
+  if (warmup) {
+    ET_LOG(Info, "Warmup run finished!");
+  } else {
+    // Do not print report during warmup
+    ::executorch::llm::print_report(stats_);
+  }
   if (stats_callback) {
     stats_callback(stats_);
   }

   return Error::Ok;
 }

+Error Runner::warmup(const std::string& prompt, int32_t seq_len) {
+  Error err = generate(
```
Contributor: It should be prefill, right, not generate? Or does generate call prefill? And you are warming up with the entire sequence generation, not just the prefill? Also, how do you enable this for llava?
```diff
+      prompt,
+      seq_len,
+      /*token_callback=*/nullptr,
+      /*stats_callbak=*/nullptr,
+      /*echo=*/false,
+      /*warmup=*/true);
+  stats_.reset();
+  return err;
+}
+
 void Runner::stop() {
   if (is_loaded()) {
     text_token_generator_->stop();
```
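For context, a rough sketch of the intended call sequence from a runner binary (illustrative only; `runner`, `prompt`, and `seq_len` are assumed to be set up elsewhere, and return values are ignored for brevity):

```cpp
// Illustrative only: one silent warmup pass, then the real generation.
runner.load();                     // load program and tokenizer once
runner.warmup(prompt, seq_len);    // warmup=true inside: no token printing,
                                   // logs demoted to Debug, stats_ reset after
runner.generate(prompt, seq_len);  // real run: streams tokens, prints the stats report
```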
```diff
@@ -41,7 +41,11 @@ class Runner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const ::executorch::extension::llm::Stats&)>
           stats_callback = {},
-      bool echo = true);
+      bool echo = true,
+      bool warming = false);
+  ::executorch::runtime::Error warmup(
+      const std::string& prompt,
+      int32_t seq_len = 128);
```
Contributor: should we move seq_len out to a constant DEFAULT_SEQ_LEN that is shared by both generate and warmup?

Contributor (Author): Warmup is done with the same prompt/seq_len as a real run today; the default is just something I carried over from generate().
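If that suggestion were adopted, the header might look roughly like this (hypothetical shape only, not part of this PR; includes and the rest of the class are elided):

```cpp
// Hypothetical refactor from the review thread: one named default shared by
// generate() and warmup() instead of the literal 128 in both declarations.
class Runner {
 public:
  static constexpr int32_t DEFAULT_SEQ_LEN = 128;

  ::executorch::runtime::Error generate(
      const std::string& prompt,
      int32_t seq_len = DEFAULT_SEQ_LEN,
      std::function<void(const std::string&)> token_callback = {},
      std::function<void(const ::executorch::extension::llm::Stats&)>
          stats_callback = {},
      bool echo = true,
      bool warming = false);

  ::executorch::runtime::Error warmup(
      const std::string& prompt,
      int32_t seq_len = DEFAULT_SEQ_LEN);
  // ...
};
```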
```diff
   void stop();

  private:
```
Contributor: why warm up in CI?

Contributor (Author): Yeah, good question. This is so we don't break things with warmup. I debated this and considered doing two runs and comparing outputs with and without warmup, but CI is expensive, so I just ran with warmup and compared the output afterwards.