From 19f94f222f8a9ccccad82a96834ddd309bd47d0a Mon Sep 17 00:00:00 2001
From: zjing14 <zhangjing14@gmail.com>
Date: Sat, 22 Oct 2022 21:01:04 -0500
Subject: [PATCH] Fix BERT benchmark for 2 gcd (#6)

* fix batch_size > 1 in the Stable Diffusion examples

* load the prebuilt .so file when benchmarking BERT with a fixed batch size and sequence length
---
 3rdparty/composable_kernel                |  2 +-
 examples/03_bert/benchmark_ait.py         | 62 +++++++++++++++--------
 examples/03_bert/benchmark_mi250.sh       |  8 +--
 examples/05_stable_diffusion/benchmark.py |  6 +--
 examples/05_stable_diffusion/demo.py      |  9 ++--
 5 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel
index d8b41e1c9..40942b909 160000
--- a/3rdparty/composable_kernel
+++ b/3rdparty/composable_kernel
@@ -1 +1 @@
-Subproject commit d8b41e1c96d864569a2f2b59a3fbf14912a4e317
+Subproject commit 40942b909801dd721769834fc61ad201b5795446
diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
index 9847cb910..eafe8adcf 100644
--- a/examples/03_bert/benchmark_ait.py
+++ b/examples/03_bert/benchmark_ait.py
@@ -184,6 +184,7 @@ def compile_module(
     use_fp16_acc: bool,
     encoders_only: bool,
     pt_model: torch.nn.Module,
+    benchmark: bool
 ) -> None:
     model_name = f"BERT_{activation}_{batch_size}_{seq_length}"
     target = detect_target(use_fp16_acc=use_fp16_acc)
@@ -207,7 +208,10 @@ def compile_module(
 
     params = map_pt_params(model, pt_model, batch_size, seq_length)
 
-    mod = compile_model(y, target, "./tmp", model_name)
+    if benchmark:
+        mod = Model(os.path.join("./tmp", model_name, "test.so"))
+    else:
+        mod = compile_model(y, target, "./tmp", model_name)
 
     for k, v in params.items():
         mod.set_constant_with_tensor(k, v)
@@ -267,30 +271,47 @@ def compile_and_benchmark(
     pt_model.eval()
     hidden_size = pt_model.config.hidden_size
 
-    if batch_size < 1:
-        batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
+    if batch_size >= 1 and seq_length >= 1:
+        # Both sizes are fixed: pass benchmark=True so compile_module loads the
+        # existing ./tmp/<model_name>/test.so instead of compiling again.
+        mod = compile_module(
+            batch_size,
+            seq_length,
+            hidden_size,
+            activation,
+            use_fp16_acc,
+            encoders_only,
+            pt_model,
+            True,
+        )
+        benchmark(batch_size, seq_length, hidden_size, mod, graph_mode, encoders_only)
     else:
-        batch_sizes = [batch_size]
+        if batch_size < 1:
+            batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
+        else:
+            batch_sizes = [batch_size]
 
-    if seq_length < 1:
-        seq_lengths = (
-            [64, 128, 384, 512, 1024, 4096] if encoders_only else [64, 128, 384, 512]
-        )
-    else:
-        seq_lengths = [seq_length]
-
-    for seq_length in seq_lengths:
-        for bs in batch_sizes:
-            mod = compile_module(
-                bs,
-                seq_length,
-                hidden_size,
-                activation,
-                use_fp16_acc,
-                encoders_only,
-                pt_model,
+        if seq_length < 1:
+            seq_lengths = (
+                [64, 128, 384, 512, 1024, 4096] if encoders_only else [64, 128, 384, 512]
             )
-            benchmark(bs, seq_length, hidden_size, mod, graph_mode, encoders_only)
+        else:
+            seq_lengths = [seq_length]
+
+        for sq in seq_lengths:
+            for bs in batch_sizes:
+                mod = compile_module(
+                    bs,
+                    sq,
+                    hidden_size,
+                    activation,
+                    use_fp16_acc,
+                    encoders_only,
+                    pt_model,
+                    False,
+                )
+                # Use the loop value `sq`: `seq_length` is < 1 while sweeping.
+                benchmark(bs, sq, hidden_size, mod, graph_mode, encoders_only)
 
 
 if __name__ == "__main__":
diff --git a/examples/03_bert/benchmark_mi250.sh b/examples/03_bert/benchmark_mi250.sh
index dab4ae50c..dcd1e5157 100644
--- a/examples/03_bert/benchmark_mi250.sh
+++ b/examples/03_bert/benchmark_mi250.sh
@@ -1,11 +1,13 @@
 #!/bin/bash
 
 #profile
-HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 benchmark_ait.py
+#HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 benchmark_ait.py
 
 #1GCD
-HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $1
+#HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $1 --seq-length $2
 
 #2GCD
-HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $1 &
-HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size $1 && fg
+# Usage: benchmark_mi250.sh <batch-size> <seq-length>
+# `wait` instead of `fg`: job control is disabled in non-interactive shells.
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $1 --seq-length $2 &
+HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size $1 --seq-length $2 && wait
diff --git a/examples/05_stable_diffusion/benchmark.py b/examples/05_stable_diffusion/benchmark.py
index bda7da289..a8de6853e 100644
--- a/examples/05_stable_diffusion/benchmark.py
+++ b/examples/05_stable_diffusion/benchmark.py
@@ -68,7 +68,7 @@ def benchmark_unet(
 
     latent_model_input_pt = torch.randn(batch_size, 4, hh, ww).cuda().half()
     text_embeddings_pt = torch.randn(batch_size, 64, 768).cuda().half()
-    timesteps_pt = torch.Tensor([1, 1]).cuda().half()
+    timesteps_pt = torch.Tensor([1] * batch_size).cuda().half()
 
     with autocast("cuda"):
         pt_ys = pt_mod(
@@ -148,7 +148,7 @@ def benchmark_clip(
 
     tokenizer = CLIPTokenizer.from_pretrained(version)
     text_input = tokenizer(
-        ["a photo of an astronaut riding a horse on mars"],
+        ["a photo of an astronaut riding a horse on mars"] * batch_size,
         padding="max_length",
         max_length=seqlen,
         truncation=True,
@@ -278,7 +278,7 @@ def benchmark_vae(batch_size=1, height=64, width=64, benchmark_pt=False, verify=
 @click.option("--verify", type=bool, default=False, help="verify correctness")
 @click.option("--benchmark-pt", type=bool, default=False, help="run pt benchmark")
 def benchmark_diffusers(token, batch_size, verify, benchmark_pt):
-    assert batch_size == 1, "batch size must be 1 for submodule verification"
+    #assert batch_size == 1, "batch size must be 1 for submodule verification"
     logging.getLogger().setLevel(logging.INFO)
     np.random.seed(0)
     torch.manual_seed(4896)
diff --git a/examples/05_stable_diffusion/demo.py b/examples/05_stable_diffusion/demo.py
index 5a7b8b79e..de48f8783 100644
--- a/examples/05_stable_diffusion/demo.py
+++ b/examples/05_stable_diffusion/demo.py
@@ -21,11 +21,12 @@
 
 @click.command()
 @click.option("--token", default="", help="access token")
+@click.option("--batch-size", default=1, help="batch size")
 @click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt")
 @click.option(
     "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
 )
-def run(token, prompt, benchmark):
+def run(token, batch_size, prompt, benchmark):
     pipe = StableDiffusionAITPipeline.from_pretrained(
         "CompVis/stable-diffusion-v1-4",
         revision="fp16",
@@ -33,10 +34,12 @@ def run(token, prompt, benchmark):
         use_auth_token=token,
     ).to("cuda")
 
+    prompts = [prompt] * batch_size
+
     with torch.autocast("cuda"):
-        image = pipe(prompt).images[0]
+        image = pipe(prompts).images[0]
         if benchmark:
-            t = benchmark_torch_function(10, pipe, prompt)
+            t = benchmark_torch_function(10, pipe, prompts)
             print(f"sd e2e: {t} ms")
 
     image.save("example_ait.png")