mul(sparse, sparse): extend autograd support to cases with broadcasted dense dims. #84929
Changes from all commits
@@ -505,9 +505,62 @@ Tensor masked_fill_backward(const Tensor& grad, const Tensor& mask) {
       : grad.masked_select(mask).sum();
 }
 
-Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) {
-  auto out = grad * other.conj();
-  return handle_r_to_c(self_st, out);
+Tensor mul_tensor_backward(
+    const Tensor& grad,
+    const Tensor& other,
+    const Tensor& self,
+    ScalarType self_st) {
+  // If a.is_sparse and b.is_sparse, then mul(a, b) expects:
+  // 1. a.dim() == b.dim(),
+  // 2. a.shape[:a.sparse_dim] == b.shape[:b.sparse_dim].
+  // Note, however, that mul(a, b) will handle broadcasting in dense dims.
+  // Autograd, however, will have issues with that and will try running
+  // either a.sum(d=...) or b.sum(d=...) to propagate sparse grads of the
+  // same shape as the inputs'. However, sum(d=...) is not implemented
+  // for sparse tensors. So, instead, we explicitly reduce over grads'
+  // dense dims and create new sparse gradient tensors that now match
+  // the shape of the inputs.
+  const auto handle_sparse_sparse_case = [](Tensor& self_grad,
+                                            const Tensor& self) -> void {
+    // No broadcasting in dense dims, no need to modify anything.
+    if (self_grad.sizes() == self.sizes()) {
+      return;
+    }
+    // Here self dense dims broadcast over self_grad dense dims.
+    // This means we need to reduce over broadcasted dims in self_grad.values
+    // such that self_grad.values.sum(...).shape == self.values.shape,
+    // otherwise autograd would try running self_grad.sum(dim=...), and
+    // sum(dim=...) is not implemented for sparse tensors.
+    const auto self_sparse_dim = self.sparse_dim();
+    const auto self_grad_indices = self_grad._indices();
+    const auto self_grad_values = self_grad._values();
+    auto values_reduction_dims = at::DimVector();
+    // Find which dense dims broadcast
+    for (const auto d : c10::irange(self_sparse_dim, self.dim())) {
+      // If d broadcasts...
+      if (self.sizes()[d] == 1) {
+        // ... then map d to a dim relative to values to reduce over.
+        values_reduction_dims.push_back(d - self_sparse_dim + 1);
+      }
+    }
+    // Produce a "reduced" grad now.
+    self_grad = at::_sparse_coo_tensor_unsafe(
+        self_grad_indices,
+        // Need to specify dtype because sum can promote.
+        self_grad_values.sum(
+            values_reduction_dims,
+            /*keepdim=*/true,
+            self_grad_values.scalar_type()),
+        self.sizes());
Comment on lines +547 to +554

Could this be related to the failing autograd tests in the CI? If so, we could look into ways to change …

The failing tests are because …

No, it cannot be used. Layout is not inlined yet, so I had to modify that part. Also, sizes are not implemented for nested tensors, hence the separate dispatch branches in derivatives.yaml. The changes are not in this PR, unfortunately; they are in the mul(sparse, sparse) PR, where we decided to revert all grad changes :(

OK, so this PR is blocked atm, right?

Yes, because it is probably best to upgrade autograd to handle shape reductions and the propagation of grads whose layouts differ from the inputs'.
+  };
+
+  auto self_grad = grad * other.conj();
+  // Handle sparse only case
+  if (other.is_sparse() && self.is_sparse()) {
+    // NOTE: Modifies self_grad
+    handle_sparse_sparse_case(self_grad, self);
+  }
+  return handle_r_to_c(self_st, self_grad);
+}
 
 Tensor div_tensor_self_backward(
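To make the intent of handle_sparse_sparse_case concrete, here is a minimal Python sketch of the same reduction, written against the public sparse COO API (torch.sparse_coo_tensor rather than the internal at::_sparse_coo_tensor_unsafe). The helper name reduce_sparse_grad and the concrete shapes are invented for illustration; they are not part of this PR.

import torch

def reduce_sparse_grad(inp_grad, inp):
    # Mirrors handle_sparse_sparse_case: `inp_grad` has the same sparse dims
    # as `inp`, but its dense dims carry the broadcasted sizes.
    if inp_grad.shape == inp.shape:
        # No broadcasting in dense dims, nothing to do.
        return inp_grad
    sparse_dim = inp.sparse_dim()
    grad_indices = inp_grad._indices()
    grad_values = inp_grad._values()
    # Dense dims of `inp` with size 1 broadcast; map them to dims of the
    # values tensor (dim 0 of values is the nnz dim, hence the "+ 1").
    reduction_dims = [
        d - sparse_dim + 1
        for d in range(sparse_dim, inp.dim())
        if inp.shape[d] == 1
    ]
    # Pass dtype explicitly, mirroring the diff's note that sum can promote.
    reduced_values = grad_values.sum(
        reduction_dims, keepdim=True, dtype=grad_values.dtype)
    return torch.sparse_coo_tensor(grad_indices, reduced_values, inp.shape)

# `inp`: sparse dims (2, 3), dense dims (1, 5); its dense part broadcasts
# against dense dims (4, 5) during mul(sparse, sparse).
indices = torch.tensor([[0, 1], [2, 0]])
inp = torch.sparse_coo_tensor(indices, torch.randn(2, 1, 5), (2, 3, 1, 5))

# A gradient carrying the broadcasted dense shape at the same indices.
inp_grad = torch.sparse_coo_tensor(indices, torch.randn(2, 4, 5), (2, 3, 4, 5))

reduced = reduce_sparse_grad(inp_grad, inp)
print(reduced.shape)            # torch.Size([2, 3, 1, 5])
print(reduced._values().shape)  # torch.Size([2, 1, 5])

The keepdim=True is what makes this work: the reduced values keep a size-1 dim wherever inp broadcasts, so the rebuilt COO tensor has exactly inp's shape and can be handed back to autograd as its gradient.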
We should be careful not to save an extra tensor for backward!
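The concern here is that the new signature also receives self, so autograd would have to keep an extra tensor alive for the backward pass even in cases that do not need it. As a rough, hypothetical illustration of that trade-off (not how torch.mul's derivative is actually declared in derivatives.yaml), a custom autograd Function can make the saving conditional:

import torch

class MulExample(torch.autograd.Function):
    # Toy illustration only: save a tensor for backward just when the
    # backward formula will actually use it.

    @staticmethod
    def forward(ctx, a, b):
        ctx.b_needs_grad = b.requires_grad
        if ctx.b_needs_grad:
            ctx.save_for_backward(a, b)   # `a` is only needed for grad_b
        else:
            ctx.save_for_backward(b)      # avoid keeping `a` alive
        return a * b

    @staticmethod
    def backward(ctx, grad_out):
        if ctx.b_needs_grad:
            a, b = ctx.saved_tensors
            return grad_out * b, grad_out * a
        (b,) = ctx.saved_tensors
        return grad_out * b, None

a = torch.randn(3, requires_grad=True)
b = torch.randn(3)  # requires_grad=False, so `a` is never saved
MulExample.apply(a, b).sum().backward()
print(a.grad)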