Made some minor improvements to flexattention perf + added more autot…
Chillee authored and pytorchmergebot committed May 25, 2024
1 parent 9f11fc6 commit 84e59f0
Showing 2 changed files with 8 additions and 12 deletions.
1 change: 0 additions & 1 deletion test/inductor/test_flex_attention.py
@@ -738,7 +738,6 @@ def score_mod(score, b, h, m, n):
         self.run_test(score_mod)
 
     @supported_platform
-    @skip("TODO: Figure out why this is erroring")
     @patch.object(torch._inductor.config, "max_autotune", True)
     def test_max_autotune_with_captured(self):
         head_scale = torch.randn(H, device="cuda")
19 changes: 8 additions & 11 deletions torch/_inductor/kernel/flex_attention.py
@@ -241,10 +241,8 @@ def build_subgraph_buffer(
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- load k, v --
         k = tl.load(K_block_ptr)
-        v = tl.load(V_block_ptr)
         # -- compute qk ---
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk = tl.dot(q, k.to(MATMUL_PRECISION), acc=qk)
+        qk = tl.dot(q, k)
         # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~
         m = offs_m[:, None]
         n = start_n + offs_n[None, :]
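The deleted pair of lines pre-allocated a float32 tile and threaded it through tl.dot's acc argument. For computing qk that is redundant: tl.dot accumulates fp16/bf16 inputs in float32 by default, which is what the one-line replacement relies on. A minimal standalone sketch of that default (illustrative names, not from the commit):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def qk_kernel(Q, K, Out, BLOCK: tl.constexpr):
        offs = tl.arange(0, BLOCK)
        idx = offs[:, None] * BLOCK + offs[None, :]
        q = tl.load(Q + idx)  # fp16 tile
        k = tl.load(K + idx)  # fp16 tile
        # fp16 x fp16 -> fp32: tl.dot's default accumulator dtype is float32,
        # so no explicit tl.zeros(..., dtype=tl.float32) accumulator is needed.
        qk = tl.dot(q, k)
        tl.store(Out + idx, qk)

    q = torch.randn(16, 16, device="cuda", dtype=torch.float16)
    k = torch.randn(16, 16, device="cuda", dtype=torch.float16)
    out = torch.empty(16, 16, device="cuda")  # float32
    qk_kernel[(1,)](q, k, out, BLOCK=16)
    torch.testing.assert_close(out, q.float() @ k.float(), atol=1e-2, rtol=1e-2)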
@@ -277,7 +275,8 @@ def build_subgraph_buffer(
         # -- scale and update acc --
         acc_scale = l_i * 0 + alpha # workaround some compiler bug
         acc *= acc_scale[:, None]
-        acc = tl.dot(p.to(MATMUL_PRECISION), v.to(MATMUL_PRECISION), acc)
+        v = tl.load(V_block_ptr)
+        acc = tl.dot(p.to(MATMUL_PRECISION), v, acc)
         # -- update m_i and l_i --
         l_i = l_i * alpha + tl.sum(p, 1)
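This is the counterpart to the early load deleted in the previous hunk: v is now loaded immediately before its only use in the second dot. Sinking the load shortens v's live range across the score-mod and softmax computation, which can lower register pressure in the main loop. A small sketch of the pattern (illustrative names; the softmax line is a stand-in for the real dependency chain):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def pv_kernel(QK, V, Out, BLOCK: tl.constexpr):
        offs = tl.arange(0, BLOCK)
        idx = offs[:, None] * BLOCK + offs[None, :]
        qk = tl.load(QK + idx)
        # Long dependency chain on qk first (stand-in for score_mod + softmax)...
        p = tl.exp(qk - tl.max(qk, 1)[:, None])
        # ...and only then load v: its first consumer is the dot below, so v
        # does not occupy registers while p is being computed.
        v = tl.load(V + idx)
        acc = tl.dot(p.to(tl.float16), v)
        tl.store(Out + idx, acc)

    qk = torch.randn(16, 16, device="cuda")
    v = torch.randn(16, 16, device="cuda", dtype=torch.float16)
    out = torch.empty(16, 16, device="cuda")
    pv_kernel[(1,)](qk, v, out, BLOCK=16)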
@@ -402,13 +401,11 @@ def flex_attention(*args, **kwargs):
     configs: List[Tuple[int, int, int, int]] = []
     configs.append(_get_default_config_fwd(query))
     if config.max_autotune:
-        configs += [
-            (128, 64, 4, 3),
-            (128, 128, 4, 3),
-            (128, 128, 8, 2),
-            (64, 128, 4, 3),
-            (64, 64, 4, 3),
-        ]
+        for BM in [64, 128]:
+            for BN in [64, 128]:
+                for s in [3, 4, 7]:
+                    for w in [4, 8]:
+                        configs.append((BM, BN, w, s))
 
     # Note, we don't need to pass in the captured buffers explicitly
     # because they're implicitly added by the score_mod function
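The nested loops sweep the full cross product of tile sizes, pipeline stages, and warp counts: 2 × 2 × 3 × 2 = 24 autotuning candidates instead of the 5 hand-picked tuples removed above. Assuming the tuple fields are (BLOCK_M, BLOCK_N, num_warps, num_stages), as the loop variable names suggest, the loops are equivalent to:

    from itertools import product

    configs = [
        (BM, BN, w, s)  # (BLOCK_M, BLOCK_N, num_warps, num_stages)
        for BM, BN, s, w in product([64, 128], [64, 128], [3, 4, 7], [4, 8])
    ]
    assert len(configs) == 24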

1 comment on commit 84e59f0

@pytorchmergebot
Collaborator


Reverted #126811 on behalf of https://github.com/PaliC due to breaking on V100s / internal tests (comment)
