Update base for Update on "Construct CppSignatureGroup from NativeFunction"

This will make it easier to implement the POC in
peterbell10@d534f7d
see also #45666

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Differential Revision: [D25594005](https://our.internmc.facebook.com/intern/diff/D25594005)

[ghstack-poisoned]
ezyang committed Jan 4, 2021
2 parents 7a43b27 + e44b2b7 commit 53d4160
Showing 503 changed files with 11,952 additions and 4,597 deletions.
2 changes: 1 addition & 1 deletion .circleci/cimodel/data/dimensions.py
@@ -8,8 +8,8 @@
]

ROCM_VERSIONS = [
"3.9",
"3.10",
"4.0",
]

ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]
208 changes: 104 additions & 104 deletions .circleci/config.yml

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion BUILD.bazel
@@ -373,7 +373,6 @@ filegroup(
filegroup(
name = "thc_srcs_cu",
srcs = [
"aten/src/THC/THCBlas.cu.cc",
"aten/src/THC/THCReduceApplyUtils.cu.cc",
"aten/src/THC/THCSleep.cu.cc",
"aten/src/THC/THCSortUtils.cu.cc",
10 changes: 10 additions & 0 deletions CONTRIBUTING.md
@@ -903,6 +903,16 @@ You'll need to install an appropriately configured flake8; see
[Lint as you type](https://github.com/pytorch/pytorch/wiki/Lint-as-you-type)
for documentation on how to do this.

If you haven't set up the pre-commit hook and have already committed files and
CI reports `flake8` errors, you can run the check locally in your PR branch with:
```bash
flake8 $(git diff --name-only $(git merge-base --fork-point master))
```
Fix the code so that no errors are reported when you re-run the check, then
commit the fix.

## Building PyTorch with ASAN
[ASAN](https://github.com/google/sanitizers/wiki/AddressSanitizer) is very
1 change: 0 additions & 1 deletion android/gradle/android_tasks.gradle
@@ -1,4 +1,3 @@

import java.nio.file.Files
import java.nio.file.Paths
import java.io.FileOutputStream
1 change: 0 additions & 1 deletion android/pytorch_android/host/build.gradle
@@ -38,4 +38,3 @@ dependencies {
}

apply from: rootProject.file('gradle/release.gradle')

1 change: 0 additions & 1 deletion android/settings.gradle
@@ -4,4 +4,3 @@ project(':pytorch_android_torchvision').projectDir = file('pytorch_android_torch

project(':pytorch_host').projectDir = file('pytorch_android/host')
project(':test_app').projectDir = file('test_app/app')

1 change: 0 additions & 1 deletion aten/src/ATen/LegacyTHFunctionsCUDA.h
@@ -75,7 +75,6 @@ Tensor & _thnn_log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad
Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
Tensor & _thnn_rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
Tensor & _thnn_rrelu_with_noise_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training);
Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training);
Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding);
3 changes: 3 additions & 0 deletions aten/src/ATen/MemoryOverlap.cpp
@@ -48,6 +48,9 @@ MemOverlapStatus get_overlap_status(TensorImpl* a, TensorImpl* b) {
if (!a->is_contiguous() || !b->is_contiguous()) {
return MemOverlapStatus::TOO_HARD;
}
if (!a->has_storage() || !b->has_storage()) {
return MemOverlapStatus::NO;
}
if (a->storage().data() == b->storage().data()) {
const auto a_begin = static_cast<char*>(a->data());
const auto a_end = a_begin + a->numel() * a->itemsize();
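For readers skimming the hunk above: the newly added check says that tensors without storage can never overlap. Below is a rough Python sketch of the classification, using an illustrative `TensorInfo` record rather than any ATen type; the FULL/PARTIAL interval test is spelled out here only because the rest of the function is collapsed in this view.

```python
from collections import namedtuple
from enum import Enum

# Illustrative stand-ins, not ATen types.
TensorInfo = namedtuple("TensorInfo", "contiguous has_storage base_ptr offset_bytes nbytes")
MemOverlapStatus = Enum("MemOverlapStatus", "FULL PARTIAL NO TOO_HARD")

def overlap_status(a, b):
    if not a.contiguous or not b.contiguous:
        return MemOverlapStatus.TOO_HARD
    # The check added in this hunk: storage-less tensors cannot alias anything.
    if not a.has_storage or not b.has_storage:
        return MemOverlapStatus.NO
    if a.base_ptr == b.base_ptr:
        a_begin, a_end = a.offset_bytes, a.offset_bytes + a.nbytes
        b_begin, b_end = b.offset_bytes, b.offset_bytes + b.nbytes
        if (a_begin, a_end) == (b_begin, b_end):
            return MemOverlapStatus.FULL
        if a_begin < b_end and b_begin < a_end:
            return MemOverlapStatus.PARTIAL
    return MemOverlapStatus.NO

print(overlap_status(TensorInfo(True, False, 0, 0, 0),
                     TensorInfo(True, True, 0x1000, 0, 16)))  # MemOverlapStatus.NO
```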
113 changes: 113 additions & 0 deletions aten/src/ATen/SparseTensorUtils.cpp
@@ -0,0 +1,113 @@
#include <ATen/SparseTensorUtils.h>

#include <ATen/ATen.h>
#include <ATen/SparseTensorImpl.h>
#include <ATen/Parallel.h>

namespace at { namespace sparse {

// NOTE [ Flatten Sparse Indices ]
// This helper function flattens a sparse indices tensor (a Tensor) into a 1D
// indices tensor. E.g.,
// input = [[2, 4, 0],
// [3, 1, 10]]
// full_size = [2, 12]
// output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10]
//
// In other words, assuming that each `indices[i, :]` is a valid index into a
// tensor `t` of shape `full_size`, this returns the corresponding indices into
// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`.
// If force_clone is true, the result is forced to be a clone of self.
Tensor flatten_indices(const Tensor& indices, IntArrayRef full_size, bool force_clone /*= false*/) {
int64_t sparse_dim = indices.size(0);
if (sparse_dim == 1) {
if (force_clone) {
return indices.squeeze(0).clone(at::MemoryFormat::Contiguous);
} else {
return indices.squeeze(0);
}
} else {
std::vector<int64_t> indices_mult_cpu_vec;
indices_mult_cpu_vec.reserve(sparse_dim);
int64_t mult = 1;
for (int64_t i = sparse_dim - 1; i >= 0; i--) {
indices_mult_cpu_vec[i] = mult;
mult *= full_size[i];
}
auto indices_mult_cpu = at::from_blob(
indices_mult_cpu_vec.data(),
/*size=*/{sparse_dim, 1},
indices.options().device(kCPU));
// NB: must be blocking because this blob may be freed after this closure,
// and non_blocking copy will see garbage.
auto indices_mult = indices_mult_cpu.to(indices.device(), /*non_blocking=*/false);
// Ideally we want matmul but matmul is slow on CPU Long and not implemented
// on CUDA Long. So mul is faster.
return indices.mul(indices_mult).sum(0);
}
}
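To make NOTE [ Flatten Sparse Indices ] concrete, here is a small reference sketch in Python/PyTorch that reproduces the worked example above; it is only an editorial illustration, not the ATen helper itself.

```python
import torch

def flatten_indices_ref(indices, full_size):
    # indices: (sparse_dim, nnz) integer tensor; full_size: sizes of the tensor being indexed.
    sparse_dim = indices.size(0)
    if sparse_dim == 1:
        return indices.squeeze(0)
    # Row-major multipliers over the first sparse_dim sizes, e.g. [12, 1] for full_size = [2, 12].
    mult = torch.ones(sparse_dim, 1, dtype=indices.dtype)
    for i in range(sparse_dim - 2, -1, -1):
        mult[i, 0] = mult[i + 1, 0] * full_size[i + 1]
    return (indices * mult).sum(0)

indices = torch.tensor([[2, 4, 0],
                        [3, 1, 10]])
print(flatten_indices_ref(indices, [2, 12]))  # tensor([27, 49, 10])
```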

// Flatten sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten Sparse Indices ],
// except this one allows partial flatten: only flatten on specified dims. Note that
// the flattened indices might be uncoalesced if dims_to_flatten.size() < sparse_dim.
// Also, if the input indices are already coalesced, the flattened indices will be sorted.
//
// args:
// indices: sparse tensor indices
// sizes: sparse tensor sizes
// dims_to_flatten: a list of dim index to flatten
//
// Ex1:
// indices = [[2, 4, 0],
// [3, 1, 3]]
// sizes = [2, 12]
// dims_to_flatten = [0, 1]
// new_indices = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 3 ] = [27, 49, 3]
//
// Ex2:
// dims_to_flatten = [1]
// new_indices = [ 3, 1, 3 ] # uncoalesced
Tensor flatten_indices_by_dims(const Tensor& indices, const IntArrayRef& sizes, const IntArrayRef& dims_to_flatten){
Tensor new_indices = at::zeros({indices.size(1)}, indices.options());
for (auto d : dims_to_flatten) {
new_indices.mul_(sizes[d]);
new_indices.add_(indices.select(0, d));
}
return new_indices;
}
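The partial flattening in Ex1/Ex2 can be checked the same way; again an illustrative sketch, not the ATen implementation.

```python
import torch

def flatten_indices_by_dims_ref(indices, sizes, dims_to_flatten):
    new_indices = torch.zeros(indices.size(1), dtype=indices.dtype)
    for d in dims_to_flatten:
        new_indices = new_indices * sizes[d] + indices[d]
    return new_indices

indices = torch.tensor([[2, 4, 0],
                        [3, 1, 3]])
sizes = [2, 12]
print(flatten_indices_by_dims_ref(indices, sizes, [0, 1]))  # Ex1: tensor([27, 49,  3])
print(flatten_indices_by_dims_ref(indices, sizes, [1]))     # Ex2: tensor([3, 1, 3])
```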

Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz) {
/*
Find the CSR representation for a row `indices` from the COO format
Inputs:
`indices` is the row pointer from COO indices
`dim` is the row dimensionality
`nnz` is the number of non-zeros
Output:
`csr` is a compressed row array in a CSR format
*/
Tensor csr = at::zeros({dim + 1}, kLong);

// TODO: eliminate this conditional when zero-size dims supported correctly
if (nnz > 0) {
auto csr_accessor = csr.accessor<int64_t, 1>();
// Convert the sparse matrix to CSR format
at::parallel_for(0, nnz, 10000, [&](int64_t start, int64_t end) {
int64_t h, hp0, hp1;
for (auto i = start; i < end; i++) {
hp0 = indices[i];
hp1 = (i+1 == nnz) ? dim : indices[i+1];
if (hp0 != hp1) {
for (h = hp0; h < hp1; h++) {
csr_accessor[h+1] = i+1;
}
}
}
});
}
return csr;
}
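A sequential Python sketch of the same COO-row-to-CSR conversion is handy for checking the pointer layout; it mirrors the loop body above, which the real code runs via at::parallel_for. The sample input is illustrative.

```python
import torch

def coo_to_csr_ref(row_indices, dim):
    # row_indices: sorted COO row indices, one per nonzero; dim: number of rows.
    nnz = len(row_indices)
    csr = torch.zeros(dim + 1, dtype=torch.int64)
    for i in range(nnz):
        hp0 = row_indices[i]
        hp1 = dim if i + 1 == nnz else row_indices[i + 1]
        for h in range(hp0, hp1):
            csr[h + 1] = i + 1
    return csr

# Nonzeros in rows 0, 0, 1, 3 of a 4-row matrix:
print(coo_to_csr_ref([0, 0, 1, 3], dim=4))  # tensor([0, 2, 3, 3, 4])
```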

}} // namespace at::sparse
53 changes: 11 additions & 42 deletions aten/src/ATen/SparseTensorUtils.h
@@ -2,36 +2,36 @@

#include <ATen/ATen.h>
#include <ATen/SparseTensorImpl.h>
#include <ATen/Parallel.h>

namespace at { namespace sparse {

// Just for documentary purposes
using SparseTensor = Tensor;
using LongTensor = Tensor;
using IntTensor = Tensor;
using SparseType = Type;


// This is an internal utility function for getting at the SparseTensorImpl,
// so that we can write sparse tensor specific accessors for special fields
// in SparseTensor. You should only use this for writing low level
// setters/getters for SparseTensorImpl fields; otherwise, you should use
// the low level setters/getters that were implemented using this.
//
// This may be called repeatedly, so make sure it's pretty cheap.
inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) {
inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) {
AT_ASSERTM(self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor");
return static_cast<SparseTensorImpl*>(self.unsafeGetTensorImpl());
}

// Takes indices and values and directly puts them into the sparse tensor, no
// copy. This used to be called THSTensor_(_move)
inline void alias_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) {
inline void alias_into_sparse(const SparseTensor& self, const Tensor& indices, const Tensor& values) {
get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values);
}

// Take indices and values and makes a (data) copy of them to put into the sparse
// indices/values. This used to be called THSTensor_(_set)
inline void copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) {
inline void copy_into_sparse(const SparseTensor& self, const Tensor& indices, const Tensor& values, bool non_blocking) {
alias_into_sparse(
self,
indices.to(self._indices().options(), non_blocking, /*copy=*/true),
@@ -58,7 +58,7 @@ inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) {
}

// NOTE [ Flatten Sparse Indices ]
// This helper function flattens a sparse indices tensor (a LongTensor) into a 1D
// This helper function flattens a sparse indices tensor (a Tensor) into a 1D
// indices tensor. E.g.,
// input = [[2, 4, 0],
// [3, 1, 10]]
@@ -70,34 +70,7 @@ inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) {
// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`.
// If force_clone is true, the result is forced to be a clone of self.
inline LongTensor flatten_indices(const Tensor& indices, IntArrayRef full_size, bool force_clone = false) {
int64_t sparse_dim = indices.size(0);
if (sparse_dim == 1) {
if (force_clone) {
return indices.squeeze(0).clone(at::MemoryFormat::Contiguous);
} else {
return indices.squeeze(0);
}
} else {
std::vector<int64_t> indices_mult_cpu_vec;
indices_mult_cpu_vec.reserve(sparse_dim);
int64_t mult = 1;
for (int64_t i = sparse_dim - 1; i >= 0; i--) {
indices_mult_cpu_vec[i] = mult;
mult *= full_size[i];
}
auto indices_mult_cpu = at::from_blob(
indices_mult_cpu_vec.data(),
/*size=*/{sparse_dim, 1},
indices.options().device(kCPU));
// NB: must be blocking because this blob may be freed after this closure,
// and non_blocking copy will see garbage.
auto indices_mult = indices_mult_cpu.to(indices.device(), /*non_blocking=*/false);
// Ideally we want matmul but matmul is slow on CPU Long and not implemented
// on CUDA Long. So mul is faster.
return indices.mul(indices_mult).sum(0);
}
}
TORCH_API Tensor flatten_indices(const Tensor& indices, IntArrayRef full_size, bool force_clone = false);

// Flatten sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten Sparse Indices ],
// except this one allows partial flatten: only flatten on specified dims. Note that
@@ -119,13 +92,9 @@ inline LongTensor flatten_indices(const Tensor& indices, IntArrayRef full_size,
// Ex2:
// dims_to_flatten = [1]
// new_indices = [ 3, 1, 3 ] # uncoalesced
inline LongTensor flatten_indices_by_dims(const LongTensor& indices, const IntArrayRef& sizes, const IntArrayRef& dims_to_flatten){
LongTensor new_indices = at::zeros({indices.size(1)}, indices.options());
for (auto d : dims_to_flatten) {
new_indices.mul_(sizes[d]);
new_indices.add_(indices.select(0, d));
}
return new_indices;
}
TORCH_API Tensor flatten_indices_by_dims(const Tensor& indices, const IntArrayRef& sizes, const IntArrayRef& dims_to_flatten);

// Find the CSR representation for a row `indices` from the COO format
TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz);

}} // namespace at::sparse
19 changes: 19 additions & 0 deletions aten/src/ATen/TensorUtils.cpp
@@ -19,6 +19,25 @@ std::ostream& operator<<(std::ostream & out, TensorGeometryArg t) {
return out;
}

void checkDim(
CheckedFrom c,
const Tensor& tensor,
const char* name,
int pos, // 1-indexed
int64_t dim) {
TORCH_CHECK(
tensor.dim() == dim,
"Expected ",
dim,
"-dimensional tensor, but got ",
tensor.dim(),
"-dimensional tensor for ",
TensorGeometryArg(TensorArg({tensor, name, pos})),
" (while checking arguments for ",
c,
")");
}

void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim) {
TORCH_CHECK(t->dim() == dim,
"Expected ", dim, "-dimensional tensor, but got ", t->dim(),
6 changes: 6 additions & 0 deletions aten/src/ATen/TensorUtils.h
@@ -50,6 +50,12 @@ using CheckedFrom = const char*;
// conversion will blow up if you have undefined tensors.

TORCH_API std::ostream& operator<<(std::ostream& out, TensorGeometryArg t);
TORCH_API void checkDim(
CheckedFrom c,
const Tensor& tensor,
const char* name,
int pos, // 1-indexed
int64_t dim);
TORCH_API void checkDim(
CheckedFrom c,
const TensorGeometryArg& t,
5 changes: 5 additions & 0 deletions aten/src/ATen/core/Formatting.cpp
@@ -292,6 +292,11 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
stream << ", axis: " << tensor_.q_per_channel_axis();
}
}

auto& fw_grad = tensor.fw_grad(/* level */ 0);
if (fw_grad.defined()) {
stream << ", tangent:" << std::endl << fw_grad;
}
stream << " ]";
}
return stream;
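The effect of this hunk shows up once a tensor carries a forward-mode gradient. A sketch from the Python side, assuming the torch.autograd.forward_ad API is available in the build:

```python
import torch
import torch.autograd.forward_ad as fwAD

with fwAD.dual_level():
    primal = torch.tensor([1.0, 2.0, 3.0])
    tangent = torch.tensor([0.1, 0.2, 0.3])
    dual = fwAD.make_dual(primal, tangent)
    # The hunk above teaches the C++ tensor printer to append this tangent
    # (", tangent: ...") when a level-0 forward gradient is set.
    p, t = fwAD.unpack_dual(dual)
    print(p, t)
```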
2 changes: 2 additions & 0 deletions aten/src/ATen/core/NamedRegistrations.cpp
@@ -462,6 +462,7 @@ TORCH_LIBRARY_IMPL(aten, Named, m) {
m.impl("tanh_", CppFunction::makeFallthrough());
m.impl("tensor_split.indices", CppFunction::makeFallthrough());
m.impl("tensor_split.sections", CppFunction::makeFallthrough());
m.impl("tensor_split.tensor_indices_or_sections", CppFunction::makeFallthrough());
m.impl("threshold", CppFunction::makeFallthrough());
m.impl("threshold.out", CppFunction::makeFallthrough());
m.impl("threshold_", CppFunction::makeFallthrough());
@@ -509,4 +510,5 @@ TORCH_LIBRARY_IMPL(aten, Named, m) {
m.impl("_version", CppFunction::makeFallthrough());
m.impl("requires_grad_", CppFunction::makeFallthrough());
m.impl("retain_grad", CppFunction::makeFallthrough());
m.impl("_fw_primal", CppFunction::makeFallthrough());
}
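The new `tensor_split.tensor_indices_or_sections` fallthrough corresponds to the overload that takes the split points as a tensor rather than a Python list; a quick hedged illustration of that call from Python:

```python
import torch

x = torch.arange(10)
# Split points given as a tensor: cut before indices 2 and 6.
chunks = torch.tensor_split(x, torch.tensor([2, 6]))
print([c.tolist() for c in chunks])  # [[0, 1], [2, 3, 4, 5], [6, 7, 8, 9]]
```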
3 changes: 3 additions & 0 deletions aten/src/ATen/core/aten_interned_strings.h
@@ -220,6 +220,7 @@ _(aten, blackman_window) \
_(aten, block_diag) \
_(aten, bmm) \
_(aten, broadcast_tensors) \
_(aten, broadcast_to) \
_(aten, cartesian_prod) \
_(aten, cat) \
_(aten, cauchy) \
@@ -435,6 +436,7 @@ _(aten, logdet) \
_(aten, logit) \
_(aten, logspace) \
_(aten, logsumexp) \
_(aten, xlogy) \
_(aten, lstm) \
_(aten, lstm_cell) \
_(aten, lstsq) \
@@ -551,6 +553,7 @@ _(aten, permute) \
_(aten, pin_memory) \
_(aten, pinverse) \
_(aten, pixel_shuffle) \
_(aten, pixel_unshuffle) \
_(aten, poisson) \
_(aten, polygamma) \
_(aten, pow) \
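The new interned symbols track operators landing around this time (broadcast_to, xlogy, pixel_unshuffle). A brief illustration of the Python-level counterparts, assuming a build that already ships them:

```python
import torch

x = torch.tensor([1.0, 2.0, 3.0])
print(torch.broadcast_to(x, (2, 3)).shape)                    # torch.Size([2, 3])

# xlogy computes x * log(y), with the convention that the result is 0 wherever x == 0.
print(torch.xlogy(torch.tensor([0.0, 2.0]), torch.tensor([0.0, 3.0])))

img = torch.randn(1, 4, 2, 2)                                 # (N, C*r*r, H, W) with r = 2
shuffled = torch.pixel_shuffle(img, 2)                        # -> (1, 1, 4, 4)
print(torch.pixel_unshuffle(shuffled, 2).shape)               # back to torch.Size([1, 4, 2, 2])
```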
2 changes: 1 addition & 1 deletion aten/src/ATen/core/function_schema.h
@@ -107,7 +107,7 @@ struct Argument {
c10::optional<int32_t> N_;

c10::optional<IValue> default_value_;
// is this only specifyable as a keyword argument?
// is this only specifiable as a keyword argument?
bool kwarg_only_;
c10::optional<AliasInfo> alias_info_;
};