pytorch · nikitaved · May 15, 2023 · ngimel · May 15, 2023 · nikitaved
diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp
@@ -58,7 +58,7 @@ void apply_triu_tril_single(
     parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
       for (int64_t i : c10::irange(start, end)) {
         for (int64_t j = 0; j < std::min(m, i + k); j++) {
-          result[i * res_row_stride + j * res_col_stride] = 0;
+          result[i * res_row_stride + j * res_col_stride] = static_cast<scalar_t>(0);
         }
         if (!inplace) {  // copy the rest of the self if not inplace
           for (int64_t j = std::max(zero, i + k); j < m; j++) {
@@ -71,7 +71,7 @@ void apply_triu_tril_single(
     parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
       for (int64_t i : c10::irange(start, end)) {
         for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
-          result[i * res_row_stride + j * res_col_stride] = 0;
+          result[i * res_row_stride + j * res_col_stride] = static_cast<scalar_t>(0);
         }
         if (!inplace) {  // copy the rest of the self if not inplace
           for (int64_t j = zero; j < std::min(m, i + k + 1); j++) {
@@ -155,7 +155,8 @@ void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) {
     result_c = result;
   }
 
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
+      ScalarType::ComplexHalf,
       ScalarType::BFloat16,
       ScalarType::Half,
       ScalarType::Bool,

diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu
@@ -69,8 +69,12 @@ void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k
   int64_t N = self.numel();
   dim3 dim_block = cuda::getApplyBlock();
   dim3 dim_grid((N + dim_block.x - 1) / dim_block.x);
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kComplexHalf, at::ScalarType::Half, at::ScalarType::Bool,
-                                         self.scalar_type(), "triu_tril_cuda_template", [&]{
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
+      at::ScalarType::ComplexHalf,
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Bool,
+      self.scalar_type(), "triu_tril_cuda_template", [&] {
     if (cuda::detail::canUse32BitIndexMath(result) && cuda::detail::canUse32BitIndexMath(self)) {
       auto result_info = cuda::detail::getTensorInfo<scalar_t, int32_t>(result);
       auto self_info = cuda::detail::getTensorInfo<scalar_t, int32_t>(self);

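diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
@@ -16510,15 +16510,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1):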
                DecorateInfo(unittest.expectedFailure, "TestJit", "test_variant_consistency_jit"),),
            sample_inputs_func=sample_inputs_adjoint),
     OpInfo('tril',
-           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
-           dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half),
+           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            error_inputs_func=error_inputs_tril_triu,
            sample_inputs_func=sample_inputs_tril_triu),
     OpInfo('triu',
-           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
-           dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half),
+           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            error_inputs_func=error_inputs_tril_triu,