Changes from all commits (31 commits)
7808339  Remove special casing for 0-sized inputs (IvanYashchuk, Jun 13, 2022)
1703f58  Add OpInfo for native_layer_norm (IvanYashchuk, Jun 13, 2022)
e357a7a  Remove special casing for 0-sized in decompositions.py (IvanYashchuk, Jun 13, 2022)
09f6a13  Skip CPU bfloat16 for native_layer_norm (IvanYashchuk, Jun 13, 2022)
fc89dd3  Add refs for rsqrt, native_layer_norm (IvanYashchuk, Jun 13, 2022)
90b7bdf  Add samples with None arguments for weight and bias (IvanYashchuk, Jun 13, 2022)
3d68aba  Three cases for optional weight and bias (IvanYashchuk, Jun 13, 2022)
8b95bda  Add error_inputs_native_layer_norm (IvanYashchuk, Jun 13, 2022)
9b5db3f  Add input checks (IvanYashchuk, Jun 13, 2022)
f323b3e  Remove native_layer_norm decomp (IvanYashchuk, Jun 13, 2022)
f246459  formatting (IvanYashchuk, Jun 13, 2022)
dfe8a1d  Skip mypy (IvanYashchuk, Jun 13, 2022)
87cdca6  Remove test_comprehensive skip (IvanYashchuk, Jun 13, 2022)
472ddf6  Merge remote-tracking branch 'upstream/viable/strict' into native-lay… (IvanYashchuk, Jun 14, 2022)
0ecffe6  Revert "Remove test_comprehensive skip" (IvanYashchuk, Jun 14, 2022)
0e47507  Remove test_comprehensive skip (IvanYashchuk, Jun 14, 2022)
1013fe0  Add rsqrt to __all__ (IvanYashchuk, Jun 14, 2022)
7f836d3  Use torch.rsqrt per Horace's request (IvanYashchuk, Jun 14, 2022)
df7d6b5  Merge remote-tracking branch 'upstream/master' into native-layer-norm (IvanYashchuk, Jun 14, 2022)
5730c09  Add a comment to layer_norm_kernel.cu (IvanYashchuk, Jun 15, 2022)
8d95161  Use _maybe_convert_to_dtype (IvanYashchuk, Jun 15, 2022)
a11bda3  Add docstring to _normalize (IvanYashchuk, Jun 15, 2022)
6cd5a74  Change TensorLikeType->Tensor (IvanYashchuk, Jun 15, 2022)
5901865  Fix typo (IvanYashchuk, Jun 16, 2022)
2aaf514  Add refs.nn.functional.layer_norm (IvanYashchuk, Jun 16, 2022)
9198804  Merge remote-tracking branch 'upstream/viable/strict' into native-lay… (IvanYashchuk, Jun 16, 2022)
0027287  mypy fixes (IvanYashchuk, Jun 16, 2022)
e875261  Remove supports_expanded_weight (IvanYashchuk, Jun 16, 2022)
eeccfe4  xfail gradgrad test and link the issue (IvanYashchuk, Jun 16, 2022)
58b18bc  Skip jit test because gradgrad is failing (IvanYashchuk, Jun 16, 2022)
f7307e1  Skip test_correctness_with_reusing_ir (IvanYashchuk, Jun 16, 2022)
26 changes: 14 additions & 12 deletions aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -850,23 +850,25 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_cuda(
auto acc_type = at::toAccumulateType(input.scalar_type(), /*is_cuda=*/true);
Tensor mean = at::empty({M}, X->options().dtype(acc_type));
Tensor rstd = at::empty({M}, X->options().dtype(acc_type));
// Calling the kernel for M==0 gives a CUDA error
// See: https://github.com/pytorch/pytorch/pull/28614
if (M > 0) {
LayerNormKernelImpl(*X, *gamma, *beta, M, N, eps, &Y, &mean, &rstd);
}
const auto input_shape = input.sizes();
Collaborator:
Let's add a comment for this block -- is this a BC-breaking change?

IvanYashchuk (Collaborator, Author), Jun 14, 2022, replying to "is this a BC-breaking change?":

Yes, it does change the shape of var and mean returned for input tensors with 0s in the shape. It's a bug fix.

IvanYashchuk (Collaborator, Author):
Behavior on master:

In [1]: import torch

In [2]: a = torch.randn(2, 0, 3, 3, device='cuda') # note 0 in the 2nd dim

In [3]: [x.shape for x in torch.native_layer_norm(a, (3, 3), None, None, 1e-5)]
Out[3]: [torch.Size([2, 0, 3, 3]), torch.Size([0]), torch.Size([0])] # shapes for var and mean are wrong

In [4]: a = torch.randn(2, 1, 3, 3, device='cuda') # note non-zero in the 2nd dim

In [5]: [x.shape for x in torch.native_layer_norm(a, (3, 3), None, None, 1e-5)]
Out[5]: [torch.Size([2, 1, 3, 3]), torch.Size([2, 1, 1, 1]), torch.Size([2, 1, 1, 1])]
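
For comparison, a minimal sketch of what this patch is expected to produce for the zero-sized case (the shapes below are inferred from the stat_shape construction in this diff, not copied from a run):

import torch

a = torch.randn(2, 0, 3, 3, device='cuda')  # note 0 in the 2nd dim
out, mean, rstd = torch.native_layer_norm(a, (3, 3), None, None, 1e-5)
# Expected with this change: mean and rstd take the shape
# input.shape[:axis] + (1,) * len(normalized_shape), i.e. (2, 0, 1, 1),
# matching the non-zero-sized case above.
print(out.shape, mean.shape, rstd.shape)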

const size_t axis = input.dim() - normalized_shape.size();
std::vector<int64_t> stat_shape;
for (size_t idx = 0; idx < axis; ++idx) {
  stat_shape.push_back(input_shape[idx]);
}
for (size_t idx = axis; idx < input.dim(); ++idx) {
  stat_shape.push_back(1);
}
mean = mean.view(stat_shape);
rstd = rstd.view(stat_shape);
return std::make_tuple(std::move(Y), std::move(mean), std::move(rstd));
}

4 changes: 0 additions & 4 deletions aten/src/ATen/native/layer_norm.cpp
@@ -29,10 +29,6 @@ void layer_norm_with_mean_rstd_out(
double eps,
int64_t M,
int64_t N) {
if (M <= 0) {
return;
}

LayerNormKernel(kCPU, input, gamma, beta, M, N, eps, &out, &mean, &rstd);
const auto input_shape = input.sizes();
const size_t axis = input.dim() - normalized_shape.size();
3 changes: 3 additions & 0 deletions test/test_decomp.py
@@ -450,6 +450,9 @@ def check_decomposed(aten_name):
func = op.get_op()
for sample_input in samples:
if requires_grad:
if None in sample_input.args:
Collaborator:
Why is this needed?

IvanYashchuk (Collaborator, Author):
Because this test cannot handle None as a positional argument to native_layer_norm. It has the signature: native_layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor). Both weight and bias can be None and I added sample inputs for these cases.
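
For illustration, a minimal sketch (shapes chosen arbitrarily, not taken from the actual OpInfo samples) of a call where both optional tensors are passed positionally as None:

import torch

x = torch.randn(2, 3, 4, requires_grad=True)
# weight and bias are the optional arguments this test cannot handle as None.
out, mean, rstd = torch.native_layer_norm(x, (3, 4), None, None, 1e-5)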

continue

fn, primals = normalize_op_input_output(func, sample_input)
primals = tree_map(
lambda x: x if isinstance(x, torch.Tensor) else x, primals
32 changes: 0 additions & 32 deletions torch/_decomp/decompositions.py
@@ -815,38 +815,6 @@ def normalize(input, norm_dims, eps):
return out, mean, rstd


@register_decomposition(aten.native_layer_norm.default)
def native_layer_norm(
input: Tensor,
normalized_shape: List[int],
weight: Optional[Tensor],
bias: Optional[Tensor],
eps: float,
) -> Tuple[Tensor, Tensor, Tensor]:
computation_dtype = utils.get_computation_dtype(input.dtype)

axis = input.dim() - len(normalized_shape)
if prod(list(input.shape[:axis])) == 0:
mean = input.new_zeros((0,), dtype=computation_dtype)
rstd = input.new_zeros((0,), dtype=computation_dtype)
out = input
else:
reduction_dims = list(range(axis, input.dim()))
out, mean, rstd = normalize(input, reduction_dims, eps)

if weight is not None:
out = out * weight
if bias is not None:
out = out + bias

out = out.to(dtype=input.dtype)

if input.device.type == 'cpu':
mean = mean.to(dtype=input.dtype)
rstd = rstd.to(dtype=input.dtype)
return (out, mean, rstd)


@register_decomposition(aten.native_group_norm.default, disable_meta=True)
def native_group_norm(
input: Tensor,
99 changes: 96 additions & 3 deletions torch/_refs/__init__.py
@@ -192,11 +192,13 @@
"hsplit",
"hstack",
"narrow",
"native_layer_norm",
"permute",
"ravel",
"reshape",
"roll",
"rot90",
"rsqrt",
"stack",
"swap_axes", # alias for transpose
"squeeze",
@@ -626,6 +628,11 @@ def round(a):
return prims.round(a)


@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT)
def rsqrt(a):
return prims.rsqrt(a)


@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT)
def sigmoid(a: TensorLikeType) -> TensorLikeType:
return true_divide(1, add(1, exp(neg(a))))
@@ -1426,7 +1433,7 @@ def _set_correction(
@out_wrapper
def var(
a: TensorLikeType,
dim: Union[Optional[int], Optional[List[int]]] = None,
dim: Optional[DimsType] = None,
unbiased: Optional[bool] = None,
keepdim: bool = False,
*,
@@ -1484,7 +1491,7 @@ def std(

def mean(
a: TensorLikeType,
dim: Union[Optional[int], Optional[List[int]]] = None,
dim: Optional[DimsType] = None,
keepdim: bool = False,
*,
dtype=None,
@@ -1539,7 +1546,7 @@ def std_mean(

def var_mean(
a: TensorLikeType,
dim: Union[Optional[int], Optional[List[int]]] = None,
dim: Optional[DimsType] = None,
unbiased: Optional[bool] = None,
keepdim: bool = False,
*,
@@ -1797,6 +1804,92 @@ def narrow(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeTy
return prims.slice_in_dim(a, start, start + length, axis=dim)


def _normalize(
a: Tensor, norm_dims: DimsType, eps: float
) -> Tuple[Tensor, Tensor, Tensor]:
"""Computes mean and 1/std of a tensor along norm_dims.

Used as a helper function for normalization layers.

Args:
a (Tensor): input tensor
norm_dims (DimsType): dimensions to normalize over
eps (float): epsilon for numerical stability

Returns:
out (Tensor): normalized tensor.
mean (Tensor): mean of the tensor along norm_dims.
rstd (Tensor): 1/std of the tensor along norm_dims.
"""
computation_dtype = utils.get_computation_dtype(a.dtype)
a_acc = _maybe_convert_to_dtype(a, computation_dtype)
assert isinstance(a_acc, TensorLike) # to avoid mypy error for var_mean
biased_var, mean = var_mean(a_acc, dim=norm_dims, unbiased=False, keepdim=True)
Collaborator:
currently nvfuser doesn't understand var_mean, so this decomp is not readily usable

Collaborator:
Same here.

Also, the reason the previous decomp used separate calls to var and mean was that it more closely matched the numerics of eager mode. @ngimel's opinion was that the discrepancy shouldn't matter, but just noting in case you need to adjust tolerances on any tests.

IvanYashchuk (Collaborator, Author): I will make mean work with the nvFuser executor in a separate PR. var will be implemented later, as it's currently a "prim" in PyTorch but a composite in nvFuser.

It is separate calls to var and mean under the hood; take a look at the var_mean function:

def var_mean(
    a: TensorLikeType,
    dim: Union[Optional[int], Optional[List[int]]] = None,
    unbiased: Optional[bool] = None,
    keepdim: bool = False,
    *,
    correction: Optional[int] = None,
):
    v = var(a, dim, unbiased, keepdim, correction=correction)
    m = mean(a, dim, keepdim)
    return v, m

IvanYashchuk (Collaborator, Author):
PR for mean: #79444
PR for var: #79517
And together they make nvfuser understand var_mean.

lezcano (Collaborator), Jun 16, 2022:
I believe @peterbell10 once told me that var_mean had some speed and stability problems in PyTorch (?). I do not remember very well so I'll let him discuss this, but perhaps this is relevant here.

Collaborator:
On CPU, the mean from var_mean is less accurate than calling mean separately. mean is implemented roughly as sum(x, dim) / x.size(dim), where sum uses a low-error summation algorithm. var_mean on the other hand computes both the mean and var in a single pass, but with a naive summation that is less accurate.

Collaborator: var_mean here calls into the reference decomposition, which in turn computes mean and var separately (mean via sum and division).

Collaborator:
For similar reasons, can we change this call to a torch.var_mean as well?

Collaborator: Well then, unless you decompose torch.var_mean further, you'd get an inaccurate CPU result.

Collaborator:
Then I think var_mean should probably call into torch.var and torch.mean.
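
As a rough, hedged illustration of the accuracy point above (not a benchmark from this PR; the gap is hardware- and size-dependent):

import torch

x = torch.rand(10_000_000, dtype=torch.float32) + 1000.0  # large offset stresses naive summation
_, mean_fused = torch.var_mean(x, dim=0, unbiased=False)   # single-pass mean
mean_separate = torch.mean(x, dim=0)                       # mean computed separately (lower-error summation per the comment above)
reference = torch.mean(x.double(), dim=0)
# On CPU the fused mean is typically further from the float64 reference than
# the separately computed mean.
print((mean_fused.double() - reference).abs().item(),
      (mean_separate.double() - reference).abs().item())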

rstd = torch.rsqrt(biased_var + eps)
out = (a - mean) * rstd
return out, mean, rstd


@register_decomposition(torch.ops.aten.native_layer_norm)
def native_layer_norm(
input: Tensor,
normalized_shape: ShapeType,
weight: Optional[Tensor],
bias: Optional[Tensor],
eps: float,
) -> Tuple[Tensor, Tensor, Tensor]:
normalized_ndim = len(normalized_shape)
utils.check(
normalized_ndim >= 1,
lambda: "Expected normalized_shape to be at least 1-dimensional, i.e., "
+ "containing at least one element, but got normalized_shape = "
+ str(normalized_shape),
)
# torch.Size([1, 2, 3]) == [1, 2, 3] evaluates to False
# while torch.Size([1, 2, 3]) == (1, 2, 3) is True
# therefore we use tuple(normalized_shape)
utils.check(
weight is None or weight.shape == tuple(normalized_shape),
lambda: "Expected weight to be of same shape as normalized_shape, but got "
+ "weight of shape "
+ str(weight.shape) # type: ignore[union-attr]
+ " and normalized_shape = "
+ str(normalized_shape),
)
utils.check(
bias is None or bias.shape == tuple(normalized_shape),
lambda: "Expected bias to be of same shape as normalized_shape, but got "
+ "bias of shape "
+ str(bias.shape) # type: ignore[union-attr]
+ " and normalized_shape = "
+ str(normalized_shape),
)
utils.check(
input.ndim >= normalized_ndim
and input.shape[(input.ndim - normalized_ndim) :] == tuple(normalized_shape),
lambda: "Given normalized_shape="
+ str(normalized_shape)
+ ", expected input with shape "
+ str(normalized_shape)
+ ", but got input of size "
+ str(input.shape),
)
axis = input.ndim - normalized_ndim
reduction_dims = list(range(axis, input.ndim))
out, mean, rstd = _normalize(input, reduction_dims, eps)
if weight is None and bias is not None:
out = out + bias
elif weight is not None and bias is None:
out = out * weight
elif weight is not None and bias is not None:
out = out * weight + bias
out = prims.convert_element_type(out, input.dtype)
if input.device.type == "cpu":
mean = prims.convert_element_type(mean, input.dtype)
rstd = prims.convert_element_type(rstd, input.dtype)
return (out, mean, rstd)
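
A hedged usage sketch of this reference (shapes chosen arbitrarily here), checking it against the eager op:

import torch
import torch._refs as refs

x = torch.randn(5, 3, 4)
weight = torch.randn(3, 4)
bias = torch.randn(3, 4)
out, mean, rstd = refs.native_layer_norm(x, (3, 4), weight, bias, 1e-5)
expected = torch.native_layer_norm(x, (3, 4), weight, bias, 1e-5)
# Expected to agree with eager up to normal floating-point tolerance.
print(all(torch.allclose(a, b, atol=1e-6, rtol=1e-5) for a, b in zip((out, mean, rstd), expected)))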


# TODO: Adding this as a meta function causes functorch tests to fail when compiled with debug mode.
# test/test_eager_transforms.py::TestFunctionalizeCPU::test_functionalize_fx_transpose_simple_cpu
@register_decomposition(torch.ops.aten.permute, disable_meta=True)
16 changes: 16 additions & 0 deletions torch/_refs/nn/functional/__init__.py
@@ -2,6 +2,7 @@

import torch._prims.utils as utils
from torch._prims.utils import (
ShapeType,
TensorLike,
TensorLikeType,
NumberType,
@@ -35,6 +36,8 @@
"tanhshrink",
]

Tensor = torch.Tensor

# celu is implemented specially because it has an alpha argument
# celu is very similar to elu
@register_decomposition(torch.ops.aten.celu)
@@ -146,6 +149,19 @@ def relu(a: TensorLikeType, inplace: bool = False) -> TensorLikeType:
return torch.where(torch.le(a, 0), 0, a)


def layer_norm(
input: Tensor,
normalized_shape: ShapeType,
weight: Optional[Tensor] = None,
bias: Optional[Tensor] = None,
eps: float = 1e-5,
) -> Tensor:
"""
Reference implementation of :func:`torch.nn.functional.layer_norm`.
"""
return torch.native_layer_norm(input, normalized_shape, weight, bias, eps)[0]
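
For context, a minimal hedged example of calling this reference wrapper (module path as added in this PR; input shape is arbitrary):

import torch
import torch._refs.nn.functional as refs_nn_f

x = torch.randn(2, 3, 4)
y = refs_nn_f.layer_norm(x, (4,))  # weight/bias default to None, eps to 1e-5
y_eager = torch.nn.functional.layer_norm(x, (4,))
print(torch.allclose(y, y_eager, atol=1e-6))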


@register_decomposition(torch.ops.aten.leaky_relu)
@elementwise_type_promotion_wrapper(
type_promoting_args=("a",),