From 95c889a81de1f8e43c04619e5b630203027cfb11 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Thu, 3 Oct 2024 16:53:37 -0700
Subject: [PATCH] use --use_sdpa_with_kv_cache for 1B/3B bf16

We should use this option when exporting 1B/3B models as bf16 because
KVCache is always fp32. Otherwise, we see regressed performance for
1B/3B in bf16 format.

Differential Revision: [D63871048](https://our.internmc.facebook.com/intern/diff/D63871048/)

[ghstack-poisoned]
---
 examples/models/llama2/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md
index bcca1b82ba4..93d9274d613 100644
--- a/examples/models/llama2/README.md
+++ b/examples/models/llama2/README.md
@@ -142,7 +142,9 @@ LLAMA_PARAMS=path/to/params.json
 python -m examples.models.llama2.export_llama \
   --checkpoint "${LLAMA_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
-  -kv -X \
+  -kv \
+  --use_sdpa_with_kv_cache \
+  -X \
   -d bf16 \
   --metadata '{"append_eos_to_prompt": 0, "get_bos_id":128000, "get_eos_ids":[128009, 128001], "get_n_bos": 0, "get_n_eos": 0}' \
   --output_name="llama3_2.pte"