2 changes: 1 addition & 1 deletion .ci/scripts/test_llama.sh
@@ -213,7 +213,7 @@ echo "Creating tokenizer.bin"
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin


RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10"
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10 --warmup=1"
Contributor:
why warm up in CI?

digantdesai (Contributor, Author), Oct 1, 2024:
Yeah, good question. This is so we don't break things with warmup. I debated this and considered doing two runs and comparing outputs with and without warmup, but CI is expensive, so I just did the run with warmup and compared the output after.

# Check build tool.
echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
if [[ "${BUILD_TOOL}" == "buck2" ]]; then
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,6 +1,7 @@
.hypothesis
buck-out/
cmake-out/
cmake-out*
.DS_Store
cmake-android-out/
cmake-out-android/
cmake-ios-out/
7 changes: 7 additions & 0 deletions examples/models/llama2/main.cpp
@@ -39,6 +39,8 @@ DEFINE_int32(
-1,
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

DEFINE_bool(warmup, false, "Whether to run a warmup run.");

int32_t main(int32_t argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

@@ -57,6 +59,8 @@ int32_t main(int32_t argc, char** argv) {

int32_t cpu_threads = FLAGS_cpu_threads;

bool warmup = FLAGS_warmup;

#if defined(ET_USE_THREADPOOL)
uint32_t num_performant_cores = cpu_threads == -1
? torch::executorch::cpuinfo::get_num_performant_cores()
@@ -71,6 +75,9 @@
// create llama runner
example::Runner runner(model_path, tokenizer_path, temperature);

if (warmup) {
runner.warmup(prompt, seq_len);
}
// generate
runner.generate(prompt, seq_len);

61 changes: 48 additions & 13 deletions examples/models/llama2/runner/runner.cpp
@@ -146,12 +146,21 @@ Error Runner::load() {
return Error::Ok;
}

// Don't print with the same priority during warmup
#define RUNNER_ET_LOG(warmup, format, ...) \
if (warmup) { \
ET_LOG(Debug, format, __VA_ARGS__); \
} else { \
ET_LOG(Info, format, __VA_ARGS__); \
}

Error Runner::generate(
const std::string& prompt,
int32_t seq_len,
std::function<void(const std::string&)> token_callback,
std::function<void(const llm::Stats&)> stats_callback,
bool echo) {
bool echo,
bool warmup) {
// Prepare the inputs.
// Use ones-initialized inputs.
ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
@@ -161,16 +170,22 @@ Error Runner::generate(
stats_.model_load_end_ms = llm::time_in_ms();
}

ET_LOG(
Info,
if (warmup) {
ET_LOG(Info, "Doing a warmup run...");
}
Contributor, on lines +173 to +175:
maybe add this before generate in Runner::warmup?
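
A minimal sketch of this suggestion, for illustration only: the warmup log could move into Runner::warmup() itself (the body added later in this diff), so generate() would not need the extra branch.

Error Runner::warmup(const std::string& prompt, int32_t seq_len) {
  // Announce the warmup here instead of inside generate().
  ET_LOG(Info, "Doing a warmup run...");
  Error err = generate(
      prompt,
      seq_len,
      /*token_callback=*/nullptr,
      /*stats_callback=*/nullptr,
      /*echo=*/false,
      /*warmup=*/true);
  // Keep warmup timings out of the reported stats.
  stats_.reset();
  return err;
}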


RUNNER_ET_LOG(
warmup,
"RSS after loading model: %f MiB (0 if unsupported)",
llm::get_rss_bytes() / 1024.0 / 1024.0);

// Wrap the token_callback with print function
std::function<void(const std::string&)> wrapped_callback =
[token_callback](const std::string& piece) {
llm::safe_printf(piece.c_str());
fflush(stdout);
[token_callback, warmup](const std::string& piece) {
if (!warmup) {
llm::safe_printf(piece.c_str());
fflush(stdout);
}
if (token_callback) {
token_callback(piece);
}
@@ -228,8 +243,8 @@ Error Runner::generate(

// print the first token from prefill. No prev_token so use cur_token for it.
wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
ET_LOG(
Info,
RUNNER_ET_LOG(
warmup,
"RSS after prompt prefill: %f MiB (0 if unsupported)",
llm::get_rss_bytes() / 1024.0 / 1024.0);

@@ -239,26 +254,46 @@
prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback));

stats_.inference_end_ms = llm::time_in_ms();
printf("\n");
ET_LOG(
Info,
if (!warmup) {
printf("\n");
mcr229 (Contributor), Sep 30, 2024:
safe_printf?
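
For illustration, a minimal sketch of what the reviewer appears to suggest, reusing the llm::safe_printf helper already used for token output earlier in this function:

if (!warmup) {
  // Same behavior, routed through the sanitizing print helper used for tokens.
  llm::safe_printf("\n");
}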

}
RUNNER_ET_LOG(
warmup,
"RSS after finishing text generation: %f MiB (0 if unsupported)",
llm::get_rss_bytes() / 1024.0 / 1024.0);

if (num_prompt_tokens + num_generated_tokens == seq_len) {
ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len);
}

stats_.num_prompt_tokens = num_prompt_tokens;
stats_.num_generated_tokens = num_generated_tokens;
::executorch::llm::print_report(stats_);

if (warmup) {
ET_LOG(Info, "Warmup run finished!");
} else {
// Do not print report during warmup
::executorch::llm::print_report(stats_);
}
if (stats_callback) {
stats_callback(stats_);
}

return Error::Ok;
}

Error Runner::warmup(const std::string& prompt, int32_t seq_len) {
Error err = generate(
Contributor:
It should be prefill, right, not generate? Or does generate call prefill? And you are warming up with the entire sequence generation, not just prefill?

Also, how do you enable this for llava?

prompt,
seq_len,
/*token_callback=*/nullptr,
/*stats_callbak=*/nullptr,
/*echo=*/false,
/*warmup=*/true);
stats_.reset();
return err;
}
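
On the reviewer's question above about prefill versus generate: as written, the PR warms up with a full generate() pass. A prefill-only warmup might look roughly like the hypothetical sketch below; the member names (tokenizer_, text_prefiller_) and their exact signatures are assumptions for illustration, not taken from this diff.

Error Runner::warmup_prefill_only(const std::string& prompt) {
  // Hypothetical: run only the prompt-processing (prefill) step and skip
  // the token-by-token decode loop.
  ET_CHECK_OK_OR_RETURN_ERROR(load());
  std::vector<uint64_t> prompt_tokens =
      ET_UNWRAP(tokenizer_->encode(prompt, /*bos=*/1, /*eos=*/0));
  int64_t start_pos = 0;
  ET_UNWRAP(text_prefiller_->prefill(prompt_tokens, start_pos));
  stats_.reset();
  return Error::Ok;
}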

void Runner::stop() {
if (is_loaded()) {
text_token_generator_->stop();
6 changes: 5 additions & 1 deletion examples/models/llama2/runner/runner.h
@@ -41,7 +41,11 @@ class Runner {
std::function<void(const std::string&)> token_callback = {},
std::function<void(const ::executorch::extension::llm::Stats&)>
stats_callback = {},
bool echo = true);
bool echo = true,
bool warming = false);
::executorch::runtime::Error warmup(
const std::string& prompt,
int32_t seq_len = 128);
Contributor:
should we move seq_len out to a constant variable DEFAULT_SEQ_LEN that is shared by both generate and warmup?

digantdesai (Contributor, Author):
So warmup is done with the same prompt/seq_len as a real run today. The default is just something I followed from generate().
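
For illustration, a minimal sketch of the reviewer's DEFAULT_SEQ_LEN idea; the constant name comes from the comment above, while the stripped-down class shape and void return types are illustrative rather than the actual header:

#include <cstdint>
#include <string>

class Runner {
 public:
  // One shared default instead of repeating the literal in each signature.
  static constexpr int32_t DEFAULT_SEQ_LEN = 128;

  void generate(const std::string& prompt, int32_t seq_len = DEFAULT_SEQ_LEN);
  void warmup(const std::string& prompt, int32_t seq_len = DEFAULT_SEQ_LEN);
};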

void stop();

private:
13 changes: 13 additions & 0 deletions extension/llm/runner/stats.h
@@ -52,6 +52,19 @@ struct Stats {
aggregate_sampling_timer_start_timestamp = 0;
}

void reset() {
model_load_start_ms = 0;
model_load_end_ms = 0;
inference_start_ms = 0;
prompt_eval_end_ms = 0;
first_token_ms = 0;
inference_end_ms = 0;
aggregate_sampling_time_ms = 0;
num_prompt_tokens = 0;
num_generated_tokens = 0;
aggregate_sampling_timer_start_timestamp = 0;
}

private:
long aggregate_sampling_timer_start_timestamp = 0;
};