From 95c889a81de1f8e43c04619e5b630203027cfb11 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Thu, 3 Oct 2024 16:53:37 -0700
Subject: [PATCH] use --use_sdpa_with_kv_cache for 1B/3B bf16

We should use this option when exporting 1B/3B models as bf16 because
KVCache is always fp32. Otherwise, we see regressed performance for
1B/3B in bf16 format.

Differential Revision: [D63871048](https://our.internmc.facebook.com/intern/diff/D63871048/)

[ghstack-poisoned]
---
 examples/models/llama2/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md
index bcca1b82ba4..93d9274d613 100644
--- a/examples/models/llama2/README.md
+++ b/examples/models/llama2/README.md
@@ -142,7 +142,9 @@ LLAMA_PARAMS=path/to/params.json
 python -m examples.models.llama2.export_llama \
   --checkpoint "${LLAMA_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
-  -kv -X \
+  -kv \
+  --use_sdpa_with_kv_cache \
+  -X \
   -d bf16 \
   --metadata '{"append_eos_to_prompt": 0, "get_bos_id":128000, "get_eos_ids":[128009, 128001], "get_n_bos": 0, "get_n_eos": 0}' \
   --output_name="llama3_2.pte"