[Cutlass 3.3.0 submodule upgrade]
Cutlass 3.3 offers the following improvements:

- Support for mixed-precision GEMMs on Hopper and Ampere
- Support for GEMMs with < 16B alignment on Hopper
- Enhancements to EVT (Epilogue Visitor Trees)
- Enhancements to the Python interface
- Enhancements to sub-byte type handling in CuTe
- Several other bug fixes and performance improvements

Also includes a minor doc update.
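As an illustration of the mixed-precision GEMM pattern mentioned above (a NumPy sketch with illustrative dtypes, not CUTLASS code): an int8 operand multiplied against an fp16 operand, accumulated in fp32, mirroring the operand/accumulator type split these kernels support.

```python
import numpy as np

# Mixed-precision GEMM sketch: A is fp16, B is int8; both are upcast and
# the product is accumulated in fp32, mirroring the operand/accumulator
# type split of a mixed-precision CUTLASS kernel.
A = (np.arange(12).reshape(3, 4) / 8.0).astype(np.float16)
B = np.arange(8).reshape(4, 2).astype(np.int8)
acc = A.astype(np.float32) @ B.astype(np.float32)  # fp32 accumulator
```

The chosen values (multiples of 1/8) are exactly representable in fp16, so the fp32 result is exact.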
Test Plan:

CI (ciflow/trunk, ciflow/inductor)
pytest test/inductor/test_max_autotune.py

ghstack-source-id: 4956e5d00692fcf9ec3048085c798ca334808679
Pull Request resolved: #112861
kadeng committed Dec 7, 2023
1 parent bc4bba4 commit 27d3206
Showing 3 changed files with 3 additions and 5 deletions.
2 changes: 1 addition & 1 deletion third_party/cutlass
Submodule cutlass updated 306 files
@@ -52,7 +52,7 @@ def __init__(self, operation_suffix=""):
   >;
 using ADDMM_EVT =  // alpha * acc + beta * C
-  cutlass::epilogue::fusion::Sm90EVT<cutlass::epilogue::fusion::Sm90Compute<cutlass::multiply_add,
+  cutlass::epilogue::fusion::Sm90EVT<cutlass::epilogue::fusion::Sm90Compute<cutlass::homogeneous_multiply_add,
     ElementD, ElementAcc, RoundStyle>,  // beta * C + (alpha * acc)
   cutlass::epilogue::fusion::Sm90ScalarBroadcast<ElementAcc>,  // beta
   cutlass::epilogue::fusion::Sm90SrcFetch,  // C
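For reference, the EVT in this hunk fuses the addmm scaling into the GEMM epilogue. A minimal NumPy sketch of the math it computes (function and variable names are illustrative, not CUTLASS API):

```python
import numpy as np

def addmm_reference(acc, C, alpha, beta):
    """Reference semantics of the ADDMM epilogue: D = alpha * acc + beta * C."""
    return alpha * acc + beta * C

A = np.random.rand(4, 8).astype(np.float32)
B = np.random.rand(8, 3).astype(np.float32)
C = np.random.rand(4, 3).astype(np.float32)
acc = A @ B  # the GEMM accumulator that the epilogue consumes
D = addmm_reference(acc, C, alpha=2.0, beta=0.5)
```

In the EVT above, `Sm90ScalarBroadcast` supplies the scalars alpha/beta and `Sm90SrcFetch` supplies C, while the multiply-add node combines them with the accumulator.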
4 changes: 1 addition & 3 deletions torch/_inductor/codegen/cuda/cutlass_utils.py
@@ -141,11 +141,9 @@ def _gen_ops_cached(arch, version) -> List[Any]:

     # Import cutlass python scripts.
     assert try_import_cutlass()
-    import torch._inductor.codegen.cuda.cutlass_lib_extensions.generator_extended_v322 as cutlass_generator  # type: ignore[import]
+    import torch._inductor.codegen.cuda.cutlass_lib_extensions.generator_extended as cutlass_generator  # type: ignore[import]
     import cutlass_library.manifest as cutlass_manifest  # type: ignore[import]
 
-    import torch._inductor.codegen.cuda.cutlass_lib_extensions.generator_extended_v322 as cutlass_generator  # type: ignore[import]
-
     if arch is None or version is None:
         log.error(
             "Cannot detect cuda arch %s or cuda version %s. "
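The cached-generation pattern that `_gen_ops_cached` follows can be sketched as below; the function body and op names here are hypothetical stand-ins, not the actual inductor implementation.

```python
import functools
from typing import Any, List

@functools.lru_cache(maxsize=None)
def gen_ops_cached(arch: str, version: str) -> List[Any]:
    # Hypothetical stand-in for the expensive CUTLASS op enumeration;
    # lru_cache ensures it runs at most once per (arch, version) pair.
    if arch is None or version is None:
        raise ValueError(f"Cannot detect cuda arch {arch} or cuda version {version}")
    return [f"op_for_sm{arch}_cuda{version}"]

ops = gen_ops_cached("90", "12.1")
repeat = gen_ops_cached("90", "12.1")  # second call returns the cached object
```

Keying the cache on `(arch, version)` means a process targeting multiple devices or toolkits still pays the enumeration cost only once per configuration.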
