
Commit 244ebc0

mcremon-meta authored and facebook-github-bot committed
Fix SDPA decomp problem
Summary: As titled. The new `_safe_softmax` function is meant to avoid NaN issues, mostly in training. For inference we shouldn't need it, so we swap it with the regular softmax, which prevents the decomposition that introduces the unsupported ops (`eq`, `logical_not`, and `any`). See https://www.internalfb.com/code/fbsource/fbcode/caffe2/torch/_decomp/decompositions.py?lines=425.

Note that this needed some changes to `run_and_verify`, since we now make some aten IR changes. I will fix that in another diff, where `run_and_verify` will use a nop quantizer instead; this way the code path will be the same for fp32 and quantized models. But let's make CI green first!

We will also need to better formalize how to apply passes on the initial graph module (aten IR passes, as opposed to edge IR passes). It seems like lifted constants and other things like that can create issues, but unless we see errors, let's wait until the IR changes from PT/ET are in.

Reviewed By: hsharma35

Differential Revision: D61639074
1 parent 87b38cf commit 244ebc0
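
For context, here is a rough, hedged sketch in plain PyTorch (not the exact upstream decomposition referenced above) of what `_safe_softmax` computes and why decomposing it pulls comparison/reduction ops into the graph:

import torch

# Approximate sketch of _safe_softmax semantics: rows that are entirely
# -inf (fully masked) would produce NaNs under a plain softmax, so they
# are detected and forced to zero instead. That detection is what brings
# ops like aten.eq, aten.logical_not, and aten.any into the decomposed
# graph; the exact upstream decomposition may differ from this sketch.
def safe_softmax_sketch(x: torch.Tensor, dim: int) -> torch.Tensor:
    out = torch.softmax(x, dim=dim)
    fully_masked = torch.eq(x, float("-inf")).all(dim=dim, keepdim=True)
    return torch.where(fully_masked, torch.zeros_like(out), out)

x = torch.full((2, 4), float("-inf"))
x[0] = torch.randn(4)
print(torch.softmax(x, dim=-1))        # second row is all NaN
print(safe_softmax_sketch(x, dim=-1))  # second row is all zeros

For inference inputs where no row is fully masked, the extra masking is dead weight, which is why the regular softmax is sufficient.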

File tree

backends/cadence/aot/compiler.py
backends/cadence/aot/passes.py
backends/cadence/aot/utils.py

3 files changed: +50 −5 lines changed


backends/cadence/aot/compiler.py

Lines changed: 13 additions & 5 deletions
@@ -18,12 +18,13 @@
     ReplaceLogicalNotBooleanWhereWithWherePass,
     ReplacePT2DequantWithCadenceDequantPass,
     ReplacePT2QuantWithCadenceQuantPass,
+    ReplaceSafeSoftmaxWithSoftmax,
     ReplaceScalarTensorWithFullPass,
     ReplaceSqueezeAndUnsqueezeWithViewPass,
 )
 from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion
 from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer
-from executorch.backends.cadence.aot.utils import model_is_quantized
+from executorch.backends.cadence.aot.utils import model_gm_has_SDPA, model_is_quantized
 from executorch.backends.transforms.decompose_sdpa import (
     DecomposeScaledDotProductAttention,
 )

@@ -57,13 +58,20 @@ def convert_pt2(
     """

     # Export with dynamo
-    model_exp = capture_pre_autograd_graph(model, inputs)
+    model_gm = capture_pre_autograd_graph(model, inputs)

-    # Decompose SDPA
-    DecomposeScaledDotProductAttention(False)(model_exp)
+    if model_gm_has_SDPA(model_gm):
+        # Decompose SDPA
+        DecomposeScaledDotProductAttention(False)(model_gm)
+
+        # Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882
+        # for details).
+        result = ReplaceSafeSoftmaxWithSoftmax()(model_gm)
+        assert result is not None
+        model_gm = result.graph_module

     # Prepare
-    prepared_model = prepare_pt2e(model_exp, quantizer)
+    prepared_model = prepare_pt2e(model_gm, quantizer)

     # Calibrate
     prepared_model(*inputs)
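
As a sanity check on the swap done above, a small hedged sketch (it assumes a PyTorch build recent enough to ship `aten._safe_softmax`) showing that for ordinary, non-fully-masked inputs the two ops agree, with `False` filling the `half_to_float` argument of `_softmax`:

import torch

# Hedged sketch: for inputs where no row is entirely -inf, _safe_softmax
# and _softmax produce the same values; the trailing False is the
# half_to_float argument that the new pass appends.
x = torch.randn(2, 4)
safe = torch.ops.aten._safe_softmax.default(x, -1)
plain = torch.ops.aten._softmax.default(x, -1, False)
print(torch.allclose(safe, plain))  # expected: True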

backends/cadence/aot/passes.py

Lines changed: 26 additions & 0 deletions
@@ -266,3 +266,29 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         result = SpecPropPass()(graph_module)
         assert result is not None
         return result
+
+
+class ReplaceSafeSoftmaxWithSoftmax(ExportPass):
+    """
+    Replace _safe_softmax with _softmax
+    """
+
+    def call_operator(
+        self,
+        op,  # pyre-ignore
+        args: tuple[Argument, ...],
+        kwargs: dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        if op != torch.ops.aten._safe_softmax.default:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Add False for the half_to_float argument of softmax
+        softmax_args = list(args) + [False]
+
+        return super().call_operator(
+            torch.ops.aten._softmax.default,
+            tuple(softmax_args),
+            kwargs,
+            meta,
+        )
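
A hedged usage sketch of the new pass, mirroring how `convert_pt2` applies it above. `CallsSafeSoftmax` is a made-up module used only to get a graph containing the op; whether `_safe_softmax` survives `torch.export.export` untouched depends on the PyTorch version, so treat this as a sketch rather than a guaranteed repro. It also assumes an ExecuTorch install that provides the Cadence AoT passes.

import torch
from executorch.backends.cadence.aot.passes import ReplaceSafeSoftmaxWithSoftmax

# Hypothetical module that calls the op directly, just to obtain a graph
# containing aten._safe_softmax.default.
class CallsSafeSoftmax(torch.nn.Module):
    def forward(self, x):
        return torch.ops.aten._safe_softmax.default(x, -1)

ep = torch.export.export(CallsSafeSoftmax(), (torch.randn(2, 4),))
result = ReplaceSafeSoftmaxWithSoftmax()(ep.graph_module)
assert result is not None
# After the pass, no _safe_softmax nodes should remain in the graph.
assert not any(
    node.op == "call_function"
    and node.target == torch.ops.aten._safe_softmax.default
    for node in result.graph_module.graph.nodes
)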

backends/cadence/aot/utils.py

Lines changed: 11 additions & 0 deletions
@@ -177,3 +177,14 @@ def print_ops_info(
             tablefmt="outline",
         )
     )
+
+
+def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool:
+    for node in model_gm.graph.nodes:
+        if node.op == "call_function":
+            if (
+                node.target
+                == torch.ops.aten.scaled_dot_product_attention.default
+            ):
+                return True
+    return False
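
A hedged usage sketch of the new helper. `TinySDPA` and `NoSDPA` are made-up modules for illustration; the sketch assumes an ExecuTorch install that provides the Cadence AoT utils, and that export keeps `aten.scaled_dot_product_attention` as a single node (typical for recent PyTorch versions, but not guaranteed across releases).

import torch
import torch.nn.functional as F
from executorch.backends.cadence.aot.utils import model_gm_has_SDPA

# Hypothetical modules: one that calls SDPA, one that doesn't.
class TinySDPA(torch.nn.Module):
    def forward(self, q, k, v):
        return F.scaled_dot_product_attention(q, k, v)

class NoSDPA(torch.nn.Module):
    def forward(self, x):
        return torch.softmax(x, dim=-1)

q = k = v = torch.randn(1, 2, 4, 8)
gm_sdpa = torch.export.export(TinySDPA(), (q, k, v)).graph_module
gm_plain = torch.export.export(NoSDPA(), (torch.randn(2, 4),)).graph_module
print(model_gm_has_SDPA(gm_sdpa))   # expected: True
print(model_gm_has_SDPA(gm_plain))  # False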
