Update on "fx quant: hook up ConvTranspose{n}d"
Summary:

Quantization of `ConvTranspose{n}d` is supported in Eager mode. This PR
adds support for FX graph mode.

Note: this currently only works with the `qnnpack` backend, because per-channel
weights are not supported by quantized conv transpose. Until this is fixed, a
future PR should throw an error when someone tries to quantize a ConvTranspose
model with per-channel weight observers.
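
For illustration, a minimal sketch of the FX graph mode flow this enables, assuming the prototype `prepare_fx`/`convert_fx` API with a `qconfig_dict` and the `qnnpack` backend; the module and shapes are made up for the example:

```
# Hypothetical usage sketch (not part of this PR): quantize a ConvTranspose2d
# module with FX graph mode, using the qnnpack backend so that the weight
# observer is per-tensor (see the note above about per-channel weights).
import torch
import torch.nn as nn
from torch.quantization import get_default_qconfig
from torch.quantization.quantize_fx import prepare_fx, convert_fx

torch.backends.quantized.engine = "qnnpack"

model = nn.Sequential(nn.ConvTranspose2d(4, 4, kernel_size=2)).eval()
qconfig_dict = {"": get_default_qconfig("qnnpack")}

prepared = prepare_fx(model, qconfig_dict)   # insert observers
prepared(torch.randn(1, 4, 8, 8))            # calibrate on representative data
quantized = convert_fx(prepared)             # lower to quantized modules
print(quantized)                             # expect a quantized ConvTranspose2d
```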

Test Plan:

```
python test/test_quantization.py TestQuantizeFxOps.test_conv_transpose_1d
python test/test_quantization.py TestQuantizeFxOps.test_conv_transpose_2d
```

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D25674636](https://our.internmc.facebook.com/intern/diff/D25674636)

[ghstack-poisoned]
vkuzo committed Dec 22, 2020
2 parents 36e0f69 + 8630901 commit 1b60e51
Showing 124 changed files with 4,756 additions and 2,013 deletions.
113 changes: 113 additions & 0 deletions aten/src/ATen/SparseTensorUtils.cpp
@@ -0,0 +1,113 @@
#include <ATen/SparseTensorUtils.h>

#include <ATen/ATen.h>
#include <ATen/SparseTensorImpl.h>
#include <ATen/Parallel.h>

namespace at { namespace sparse {

// NOTE [ Flatten Sparse Indices ]
// This helper function flattens a sparse indices tensor (a Tensor) into a 1D
// indices tensor. E.g.,
// input = [[2, 4, 0],
// [3, 1, 10]]
// full_size = [2, 12]
// output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10]
//
// In other words, assuming that each `indices[i, :]` is a valid index into a
// tensor `t` of shape `full_size`, this returns the corresponding indices into
// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`.
// If force_clone is true, the result will be forced to be a clone of self.
Tensor flatten_indices(const Tensor& indices, IntArrayRef full_size, bool force_clone /*= false*/) {
int64_t sparse_dim = indices.size(0);
if (sparse_dim == 1) {
if (force_clone) {
return indices.squeeze(0).clone(at::MemoryFormat::Contiguous);
} else {
return indices.squeeze(0);
}
} else {
// Size the vector up front: the loop below writes through operator[], which
// requires the elements to exist (reserve() alone would not be enough).
std::vector<int64_t> indices_mult_cpu_vec(sparse_dim);
int64_t mult = 1;
for (int64_t i = sparse_dim - 1; i >= 0; i--) {
indices_mult_cpu_vec[i] = mult;
mult *= full_size[i];
}
auto indices_mult_cpu = at::from_blob(
indices_mult_cpu_vec.data(),
/*size=*/{sparse_dim, 1},
indices.options().device(kCPU));
// NB: must be blocking because this blob may be freed after this closure,
// and non_blocking copy will see garbage.
auto indices_mult = indices_mult_cpu.to(indices.device(), /*non_blocking=*/false);
// Ideally we would use matmul, but matmul is slow for Long tensors on CPU and
// not implemented for Long on CUDA, so an elementwise mul + sum is used instead.
return indices.mul(indices_mult).sum(0);
}
}

// Flatten a sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten Sparse Indices ],
// except this one allows partial flattening: only the specified dims are flattened. Note that
// the flattened indices might be uncoalesced if dims_to_flatten.size() < sparse_dim.
// Also, if the input indices are already coalesced, the flattened indices will be sorted.
//
// args:
// indices: sparse tensor indices
// sizes: sparse tensor sizes
// dims_to_flatten: a list of dim indices to flatten
//
// Ex1:
// indices = [[2, 4, 0],
// [3, 1, 3]]
// sizes = [2, 12]
// dims_to_flatten = [0, 1]
// new_indices = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 3 ] = [27, 49, 3]
//
// Ex2:
// dims_to_flatten = [1]
// new_indices = [ 3, 1, 3 ] # uncoalesced
Tensor flatten_indices_by_dims(const Tensor& indices, const IntArrayRef& sizes, const IntArrayRef& dims_to_flatten){
Tensor new_indices = at::zeros({indices.size(1)}, indices.options());
for (auto d : dims_to_flatten) {
new_indices.mul_(sizes[d]);
new_indices.add_(indices.select(0, d));
}
return new_indices;
}

Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz) {
/*
Compute the CSR row-pointer array for the row `indices` of a COO matrix.
Inputs:
`indices` is the array of row indices from the COO format (assumed sorted)
`dim` is the number of rows
`nnz` is the number of non-zeros
Output:
`csr` is the row-pointer array of the CSR format
*/
Tensor csr = at::zeros({dim + 1}, kLong);

// TODO: eliminate this conditional when zero-size dims supported correctly
if (nnz > 0) {
auto csr_accessor = csr.accessor<int64_t, 1>();
// Convert the sparse matrix to CSR format
at::parallel_for(0, nnz, 10000, [&](int64_t start, int64_t end) {
int64_t h, hp0, hp1;
for (auto i = start; i < end; i++) {
hp0 = indices[i];
hp1 = (i+1 == nnz) ? dim : indices[i+1];
if (hp0 != hp1) {
for (h = hp0; h < hp1; h++) {
csr_accessor[h+1] = i+1;
}
}
}
});
}
return csr;
}

}} // namespace at::sparse
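
As a cross-check of the index arithmetic described in the comments above, the sketch below (illustration only, not part of the diff; the values are made up) mirrors what `flatten_indices` and `coo_to_csr` compute using plain tensor ops:

```
# Illustration only: reproduce the arithmetic of flatten_indices and
# coo_to_csr with ordinary torch ops, using made-up values.
import torch

# flatten_indices: scale each sparse dim by its stride in the flattened
# view, then sum over the sparse dims.
indices = torch.tensor([[0, 1, 1],
                        [3, 1, 10]])     # shape (sparse_dim, nnz)
full_size = [2, 12]
mult = torch.tensor([[12], [1]])         # strides: [full_size[1], 1]
flat = indices.mul(mult).sum(0)          # tensor([ 3, 13, 22])

# coo_to_csr: turn sorted COO row indices into CSR row offsets.
row = torch.tensor([0, 0, 1, 1, 1, 3])   # sorted row indices, nnz = 6
dim = 4                                  # number of rows
csr = torch.zeros(dim + 1, dtype=torch.long)
csr[1:] = torch.bincount(row, minlength=dim).cumsum(0)
# csr == tensor([0, 2, 5, 5, 6]), matching the loop in coo_to_csr above
```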
53 changes: 11 additions & 42 deletions aten/src/ATen/SparseTensorUtils.h
@@ -2,36 +2,36 @@

#include <ATen/ATen.h>
#include <ATen/SparseTensorImpl.h>
#include <ATen/Parallel.h>

namespace at { namespace sparse {

// Just for documentary purposes
using SparseTensor = Tensor;
using LongTensor = Tensor;
using IntTensor = Tensor;
using SparseType = Type;


// This is an internal utility function for getting at the SparseTensorImpl,
// so that we can write sparse tensor specific accessors for special fields
// in SparseTensor. You should only use this for writing low level
// setters/getters for SparseTensorImpl fields; otherwise, you should use
// the low level setters/getters that were implemented using this.
//
// This may be called repeatedly, so make sure it's pretty cheap.
inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) {
AT_ASSERTM(self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor");
return static_cast<SparseTensorImpl*>(self.unsafeGetTensorImpl());
}

// Takes indices and values and directly puts them into the sparse tensor, no
// copy. This used to be called THSTensor_(_move)
inline void alias_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) {
inline void alias_into_sparse(const SparseTensor& self, const Tensor& indices, const Tensor& values) {
get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values);
}

// Take indices and values and makes a (data) copy of them to put into the sparse
// indices/values. This used to be called THSTensor_(_set)
inline void copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) {
inline void copy_into_sparse(const SparseTensor& self, const Tensor& indices, const Tensor& values, bool non_blocking) {
alias_into_sparse(
self,
indices.to(self._indices().options(), non_blocking, /*copy=*/true),
@@ -58,7 +58,7 @@ inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) {
}

// NOTE [ Flatten Sparse Indices ]
// This helper function flattens a sparse indices tensor (a LongTensor) into a 1D
// This helper function flattens a sparse indices tensor (a Tensor) into a 1D
// indices tensor. E.g.,
// input = [[2, 4, 0],
// [3, 1, 10]]
@@ -70,34 +70,7 @@ inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) {
// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`.
// If force_clone is true, the result will be forced to be a clone of self.
inline LongTensor flatten_indices(const Tensor& indices, IntArrayRef full_size, bool force_clone = false) {
int64_t sparse_dim = indices.size(0);
if (sparse_dim == 1) {
if (force_clone) {
return indices.squeeze(0).clone(at::MemoryFormat::Contiguous);
} else {
return indices.squeeze(0);
}
} else {
std::vector<int64_t> indices_mult_cpu_vec;
indices_mult_cpu_vec.reserve(sparse_dim);
int64_t mult = 1;
for (int64_t i = sparse_dim - 1; i >= 0; i--) {
indices_mult_cpu_vec[i] = mult;
mult *= full_size[i];
}
auto indices_mult_cpu = at::from_blob(
indices_mult_cpu_vec.data(),
/*size=*/{sparse_dim, 1},
indices.options().device(kCPU));
// NB: must be blocking because this blob may be freed after this closure,
// and non_blocking copy will see garbage.
auto indices_mult = indices_mult_cpu.to(indices.device(), /*non_blocking=*/false);
// Ideally we want matmul but matmul is slow on CPU Long and not implemented
// on CUDA Long. So mul is faster.
return indices.mul(indices_mult).sum(0);
}
}
TORCH_API Tensor flatten_indices(const Tensor& indices, IntArrayRef full_size, bool force_clone = false);

// Flatten a sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten Sparse Indices ],
// except this one allows partial flattening: only the specified dims are flattened. Note that
@@ -119,13 +92,9 @@ inline LongTensor flatten_indices(const Tensor& indices, IntArrayRef full_size,
// Ex2:
// dims_to_flatten = [1]
// new_indices = [ 3, 1, 3 ] # uncoalesced
inline LongTensor flatten_indices_by_dims(const LongTensor& indices, const IntArrayRef& sizes, const IntArrayRef& dims_to_flatten){
LongTensor new_indices = at::zeros({indices.size(1)}, indices.options());
for (auto d : dims_to_flatten) {
new_indices.mul_(sizes[d]);
new_indices.add_(indices.select(0, d));
}
return new_indices;
}
TORCH_API Tensor flatten_indices_by_dims(const Tensor& indices, const IntArrayRef& sizes, const IntArrayRef& dims_to_flatten);

// Find the CSR representation for a row `indices` from the COO format
TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz);

}} // namespace at::sparse
19 changes: 19 additions & 0 deletions aten/src/ATen/TensorUtils.cpp
@@ -19,6 +19,25 @@ std::ostream& operator<<(std::ostream & out, TensorGeometryArg t) {
return out;
}

void checkDim(
CheckedFrom c,
const Tensor& tensor,
const char* name,
int pos, // 1-indexed
int64_t dim) {
TORCH_CHECK(
tensor.dim() == dim,
"Expected ",
dim,
"-dimensional tensor, but got ",
tensor.dim(),
"-dimensional tensor for ",
TensorGeometryArg(TensorArg({tensor, name, pos})),
" (while checking arguments for ",
c,
")");
}

void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim) {
TORCH_CHECK(t->dim() == dim,
"Expected ", dim, "-dimensional tensor, but got ", t->dim(),
6 changes: 6 additions & 0 deletions aten/src/ATen/TensorUtils.h
@@ -50,6 +50,12 @@ using CheckedFrom = const char*;
// conversion will blow up if you have undefined tensors.

TORCH_API std::ostream& operator<<(std::ostream& out, TensorGeometryArg t);
TORCH_API void checkDim(
CheckedFrom c,
const Tensor& tensor,
const char* name,
int pos, // 1-indexed
int64_t dim);
TORCH_API void checkDim(
CheckedFrom c,
const TensorGeometryArg& t,
1 change: 1 addition & 0 deletions aten/src/ATen/core/aten_interned_strings.h
@@ -220,6 +220,7 @@ _(aten, blackman_window) \
_(aten, block_diag) \
_(aten, bmm) \
_(aten, broadcast_tensors) \
_(aten, broadcast_to) \
_(aten, cartesian_prod) \
_(aten, cat) \
_(aten, cauchy) \
