Skip to content

Commit

Permalink
[cpu] [inductor] decompose bmm for memory bound in lowering (#124826)
Browse files Browse the repository at this point in the history
Fixes #124697. Resolves a large performance regression in GPT-FAST MOE when `coordinate_descent_tuning` is disabled.

To get better performance in the memory-bound case, we decompose bmm into smaller ops during lowering.

Pull Request resolved: #124826
Approved by: https://github.com/jgong5, https://github.com/jansel
  • Loading branch information
Valentine233 authored and pytorchmergebot committed Apr 27, 2024
1 parent ebb8905 commit 368f521
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions torch/_inductor/kernel/bmm.py
Expand Up @@ -2,8 +2,7 @@

import torch

from .. import ir
from ..lowering import register_lowering
from .. import ir, lowering as L
from ..select_algorithm import (
autotune_select_algorithm,
ExternKernelChoice,
Expand Down Expand Up @@ -97,9 +96,14 @@ def bmm_grid(b, m, n, meta):
aten_baddbmm = ExternKernelChoice(torch.baddbmm, "at::baddbmm_out")


@register_lowering(aten.bmm)
@L.register_lowering(aten.bmm)
def tuned_bmm(mat1, mat2, *, layout=None):
if all(x.get_device().type == "cpu" for x in [mat1, mat2]):
# decompose to small ops when memory bound
if mat1.get_size()[1] == 1 or mat2.get_size()[2] == 1:
mat1 = L.unsqueeze(mat1, -1)
mat2 = L.unsqueeze(mat2, 1)
return L.sum_(L.mul(mat1, mat2), axis=2)

def is_valid_to_require_contiguous(t):
if not ir.is_storage_and_layout(t):
Expand Down Expand Up @@ -157,7 +161,7 @@ def may_require_contiguous(t, meta_t):


# Don't register this since it is slower than decomposing it
# @register_lowering(aten.baddbmm)
# @L.register_lowering(aten.baddbmm)
def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
m, n, k, layout, mat1, mat2, inp = mm_args(mat1, mat2, inp, layout=layout)

Expand Down

0 comments on commit 368f521

Please sign in to comment.