Skip to content

Commit

Permalink
[cpu] [inductor] decompose bmm for memory bound in lowering (#124826)
Browse files Browse the repository at this point in the history
Fixes #124697. Resolves a large performance regression in GPT-FAST MOE when `coordinate_descent_tuning` is disabled.

To get better performance in the memory-bound case, we decompose bmm into smaller ops during lowering.

Pull Request resolved: #124826
Approved by: https://github.com/jgong5, https://github.com/jansel
  • Loading branch information
Valentine233 authored and pytorchmergebot committed Apr 27, 2024
1 parent ebb8905 commit 368f521
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions torch/_inductor/kernel/bmm.py
Expand Up @@ -2,8 +2,7 @@

import torch

from .. import ir
from ..lowering import register_lowering
from .. import ir, lowering as L
from ..select_algorithm import (
autotune_select_algorithm,
ExternKernelChoice,
Expand Down Expand Up @@ -97,9 +96,14 @@ def bmm_grid(b, m, n, meta):
aten_baddbmm = ExternKernelChoice(torch.baddbmm, "at::baddbmm_out")


@register_lowering(aten.bmm)
@L.register_lowering(aten.bmm)
def tuned_bmm(mat1, mat2, *, layout=None):
if all(x.get_device().type == "cpu" for x in [mat1, mat2]):
# decompose to small ops when memory bound
if mat1.get_size()[1] == 1 or mat2.get_size()[2] == 1:
mat1 = L.unsqueeze(mat1, -1)
mat2 = L.unsqueeze(mat2, 1)
return L.sum_(L.mul(mat1, mat2), axis=2)

def is_valid_to_require_contiguous(t):
if not ir.is_storage_and_layout(t):
Expand Down Expand Up @@ -157,7 +161,7 @@ def may_require_contiguous(t, meta_t):


# Don't register this since it is slower than decomposing it
# @register_lowering(aten.baddbmm)
# @L.register_lowering(aten.baddbmm)
def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
m, n, k, layout, mat1, mat2, inp = mm_args(mat1, mat2, inp, layout=layout)

Expand Down

0 comments on commit 368f521

Please sign in to comment.