
Commit f2cd347

Merge branch 'main' of https://github.com/pytorch/pytorch into peter/cumulativeops

Mr4k committed Jul 16, 2023
2 parents fb24d18 + 9adfaf8
Showing 85 changed files with 2,056 additions and 1,110 deletions.
14 changes: 14 additions & 0 deletions .ci/docker/build.sh
@@ -160,6 +160,20 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3-clang7-asan)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=7
8 changes: 8 additions & 0 deletions .ci/docker/common/install_conda.sh
@@ -105,5 +105,13 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
pip_install -r /opt/conda/requirements-docs.txt
fi

# HACK HACK HACK
# gcc-9 for ubuntu-18.04 from http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu
# Pulls libstdc++6 13.1.0-8ubuntu1~18.04, which is too new for conda
# So remove libstdc++.so.6.0.29 installed by https://anaconda.org/anaconda/libstdcxx-ng/files?version=11.2.0
if grep 18.04.6 /etc/issue >/dev/null; then
rm /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/libstdc++.so.6
fi

popd
fi
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
bb3aae7b2543637191ad9c810f082eae622534b8
29418e34a94e2c43f861a321265f7f21035e7b19
1 change: 1 addition & 0 deletions .github/workflows/docker-builds.yml
@@ -40,6 +40,7 @@ jobs:
- docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7-inductor-benchmarks
- docker-image-name: pytorch-linux-bionic-py3.8-clang9
- docker-image-name: pytorch-linux-bionic-py3.11-clang9
- docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
- docker-image-name: pytorch-linux-focal-rocm-n-1-py3
- docker-image-name: pytorch-linux-focal-rocm-n-py3
- docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12
8 changes: 4 additions & 4 deletions .github/workflows/pull.yml
@@ -305,12 +305,12 @@ jobs:
{ config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
]}
linux-bionic-cuda12_1-py3_10-gcc9-bazel-test:
name: linux-bionic-cuda12.1-py3.10-gcc9-bazel-test
linux-focal-cuda12_1-py3_10-gcc9-bazel-test:
name: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
with:
build-environment: linux-bionic-cuda12.1-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-version: "12.1"
test-matrix: |
{ include: [
22 changes: 20 additions & 2 deletions aten/src/ATen/native/cpu/ReduceUtils.h
@@ -7,6 +7,8 @@
#include <ATen/native/ReductionType.h>
#include <c10/util/irange.h>
#include <ATen/OpMathType.h>
#include <ATen/native/cpu/utils.h>
#include <ATen/OpMathType.h>

namespace at::native {
inline namespace CPU_CAPABILITY {
@@ -104,7 +106,8 @@ inline void _init(scalar_t* self_ptr, at::opmath_type<scalar_t>* buffer_ptr, int
}

template <typename scalar_t>
inline scalar_t _max(const scalar_t& x, const scalar_t& y) {
inline typename std::enable_if<!std::is_same<scalar_t, Vec2>::value, scalar_t>::type
_max(const scalar_t& x, const scalar_t& y) {
return at::_isnan(y) ? y : std::max(x, y);
}

@@ -114,8 +117,16 @@ inline Vectorized<scalar_t> _max(const Vectorized<scalar_t>& x, const Vectorized
return vec::maximum(x, y);
}

template <typename vec_t>
inline typename std::enable_if<std::is_same<vec_t, Vec2>::value, Vec2>::type
_max(const vec_t& x, const vec_t& y) {
// vec::maximum propagates NaN
return maximum(x, y);
}

template <typename scalar_t>
inline scalar_t _min(const scalar_t& x, const scalar_t& y) {
inline typename std::enable_if<!std::is_same<scalar_t, Vec2>::value, scalar_t>::type
_min(const scalar_t& x, const scalar_t& y) {
return at::_isnan(y) ? y : std::min(x, y);
}

@@ -125,6 +136,13 @@ inline Vectorized<scalar_t> _min(const Vectorized<scalar_t>& x, const Vectorized
return vec::minimum(x, y);
}

template <typename vec_t>
inline typename std::enable_if<std::is_same<vec_t, Vec2>::value, Vec2>::type
_min(const vec_t& x, const vec_t& y) {
// vec::minimum propagates NaN
return minimum(x, y);
}

template <typename scalar_t, typename accumut, typename Op,
typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
inline void map_acc(
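The new std::enable_if overloads above route _max and _min for the Vec2 accumulator type (defined in aten/src/ATen/native/cpu/utils.h) to the elementwise maximum/minimum helpers rather than std::max/std::min. A minimal, self-contained sketch of that dispatch pattern follows; Vec2Like and its helpers are simplified stand-ins, not the real ATen types.

#include <algorithm>
#include <cmath>
#include <type_traits>

// Simplified stand-in for at::native::Vec2 (not the actual ATen type).
struct Vec2Like { float lo, hi; };

// NaN-propagating elementwise max, mirroring vec::maximum semantics.
inline Vec2Like maximum(const Vec2Like& a, const Vec2Like& b) {
  auto m = [](float x, float y) { return std::isnan(y) ? y : std::max(x, y); };
  return {m(a.lo, b.lo), m(a.hi, b.hi)};
}

// Scalar overload: selected only when T is NOT the vector-pair type.
template <typename T>
typename std::enable_if<!std::is_same<T, Vec2Like>::value, T>::type
_max(const T& x, const T& y) {
  return std::isnan(y) ? y : std::max(x, y);
}

// Vector-pair overload: selected only for Vec2Like, so the call resolves
// to the elementwise helper above instead of std::max.
template <typename T>
typename std::enable_if<std::is_same<T, Vec2Like>::value, Vec2Like>::type
_max(const T& x, const T& y) {
  return maximum(x, y);
}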
167 changes: 110 additions & 57 deletions aten/src/ATen/native/cpu/SpmmReduceKernel.cpp
@@ -9,18 +9,60 @@
#include <ATen/native/cpu/ReduceUtils.h>
#include <ATen/native/cpu/utils.h>
#include <c10/util/irange.h>
#include <ATen/OpMathType.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/zeros.h>
#endif

namespace at { namespace native {

namespace {

template <typename scalar_t, typename index_t, ReductionType reduce>
inline void _update(at::opmath_type<scalar_t>* out_ptr, int64_t e, int64_t c, const scalar_t val, scalar_t* other_data, int64_t K) {
using opmath_t = at::opmath_type<scalar_t>;
using Vec = vec::Vectorized<scalar_t>;
using aVec = VecType<scalar_t>;
constexpr int64_t kVecSize = Vec::size();
constexpr int64_t kVLEN = kVecSize * 4;

int64_t k = 0;
aVec val_vec = aVec((opmath_t)val);
scalar_t* other_ptr = other_data + c * K;

for (; k < K - (K % kVLEN); k += kVLEN) {
aVec out_vec0 = aVec::loadu(out_ptr + k);
aVec out_vec1 = aVec::loadu(out_ptr + k + kVecSize);
aVec out_vec2 = aVec::loadu(out_ptr + k + kVecSize * 2);
aVec out_vec3 = aVec::loadu(out_ptr + k + kVecSize * 3);

out_vec0 = update<aVec, reduce>(out_vec0, aVec::loadu(other_ptr + k) * val_vec);
out_vec1 = update<aVec, reduce>(out_vec1, aVec::loadu(other_ptr + k + kVecSize) * val_vec);
out_vec2 = update<aVec, reduce>(out_vec2, aVec::loadu(other_ptr + k + kVecSize * 2) * val_vec);
out_vec3 = update<aVec, reduce>(out_vec3, aVec::loadu(other_ptr + k + kVecSize * 3) * val_vec);

out_vec0.store(out_ptr + k);
out_vec1.store(out_ptr + k + kVecSize);
out_vec2.store(out_ptr + k + kVecSize * 2);
out_vec3.store(out_ptr + k + kVecSize * 3);
}
for (; k < K - (K % kVecSize); k += kVecSize) {
aVec out_vec = aVec::loadu(out_ptr + k);
out_vec = update<aVec, reduce>(out_vec, aVec::loadu(other_ptr + k) * val_vec);
out_vec.store(out_ptr + k);
}
for (; k < K; k++) {
opmath_t out_val = opmath_t(out_ptr[k]);
out_val = update<opmath_t, reduce>(out_val, opmath_t(other_ptr[k]) * opmath_t(val));
out_ptr[k] = out_val;
}
}
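
The _update helper above uses a three-tier loop structure: a 4x-unrolled vector pass, a vector-width remainder pass, and a scalar tail. A self-contained sketch of the same pattern for a sum reduction, with plain scalar loops standing in for vec::Vectorized and illustrative lane widths:

#include <cstdint>

constexpr int64_t kVecSize = 8;          // lanes per "vector" (illustrative)
constexpr int64_t kVLEN = kVecSize * 4;  // elements per unrolled iteration

void axpy_tiered(float* out, const float* other, float val, int64_t K) {
  int64_t k = 0;
  // 1) main pass: four "vectors" per iteration to hide load/store latency
  for (; k < K - (K % kVLEN); k += kVLEN)
    for (int64_t j = 0; j < kVLEN; ++j)   // stands in for 4 vector updates
      out[k + j] += other[k + j] * val;
  // 2) vector-sized remainder
  for (; k < K - (K % kVecSize); k += kVecSize)
    for (int64_t j = 0; j < kVecSize; ++j)
      out[k + j] += other[k + j] * val;
  // 3) scalar tail
  for (; k < K; ++k)
    out[k] += other[k] * val;
}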

template <typename scalar_t, typename index_t, ReductionType reduce>
void spmm_reduce_kernel_impl(
const Tensor& out,
@@ -46,69 +88,54 @@ void spmm_reduce_kernel_impl(
int64_t M = crow_indices.numel() - 1;
int64_t K = other.size(-1);

using Vec = vec::Vectorized<scalar_t>;
int num_threads = at::get_num_threads();
using opmath_t = at::opmath_type<scalar_t>;
Tensor buffer;
opmath_t* buffer_data = nullptr;
static constexpr bool need_acc = is_reduced_floating_point_v<scalar_t>;
if constexpr (need_acc) {
auto acc_type = at::toAccumulateType(out.scalar_type(), /*is_cuda=*/true);
buffer = at::zeros({num_threads, K}, out.options().dtype(acc_type));
buffer_data = buffer.data_ptr<opmath_t>();
}

utils::parallel_sparse_csr(csr_data, M, nnz, [&](int64_t begin, int64_t end) {
int64_t row_start, row_end, c;
int tid = at::get_thread_num();
TORCH_CHECK(tid < num_threads,
"expect thread id smaller than ", num_threads, ", got thread id ", tid);
opmath_t* buffer_ptr = nullptr;

int64_t row_start, row_end;
for (const auto m : c10::irange(begin, end)) {
row_start = csr_data[m];
row_end = csr_data[m + 1];

scalar_t* out_ptr = out_data + m * K;

constexpr int64_t kVecSize = Vec::size();
constexpr int64_t kVLEN = kVecSize * 4;
constexpr int64_t CHUNK_SIZE = 16;
if constexpr (need_acc) {
buffer_ptr = buffer_data + tid * K;
} else {
buffer_ptr = reinterpret_cast<opmath_t*>(out_ptr);
}

// step 1: reinit the output row for reduce type 'amax' and 'amin'
int64_t count = row_end - row_start;
if (count != 0) {
init<scalar_t, reduce>(out_ptr, K, /*include_self*/false);
_init<scalar_t, reduce>(out_ptr, buffer_ptr, K, /*include_self*/false);
}

// step 2: reduce, do blocking on rowwise to reduce write memory bandwidth
constexpr int64_t CHUNK_SIZE = 16;
for (int64_t e0 = row_start; e0 < row_end; e0 += CHUNK_SIZE) {
int64_t e1 = std::min(e0 + CHUNK_SIZE, row_end);

int64_t k = 0;
for (; k < K - (K % kVLEN); k += kVLEN) {
Vec out_vec0 = Vec::loadu(out_ptr + k);
Vec out_vec1 = Vec::loadu(out_ptr + k + kVecSize);
Vec out_vec2 = Vec::loadu(out_ptr + k + kVecSize * 2);
Vec out_vec3 = Vec::loadu(out_ptr + k + kVecSize * 3);
for (const auto e : c10::irange(e0, e1)) {
c = col_data[e];
scalar_t val = val_data[e];
scalar_t* other_ptr = other_data + c * K + k;

out_vec0 = update<Vec, reduce>(out_vec0, Vec::loadu(other_ptr) * Vec(val));
out_vec1 = update<Vec, reduce>(out_vec1, Vec::loadu(other_ptr + kVecSize) * Vec(val));
out_vec2 = update<Vec, reduce>(out_vec2, Vec::loadu(other_ptr + kVecSize * 2) * Vec(val));
out_vec3 = update<Vec, reduce>(out_vec3, Vec::loadu(other_ptr + kVecSize * 3) * Vec(val));
}
out_vec0.store(out_ptr + k);
out_vec1.store(out_ptr + k + kVecSize);
out_vec2.store(out_ptr + k + kVecSize * 2);
out_vec3.store(out_ptr + k + kVecSize * 3);
}
for (; k < K - (K % kVecSize); k += kVecSize) {
Vec out_vec = Vec::loadu(out_ptr + k);
for (const auto e : c10::irange(e0, e1)) {
c = col_data[e];
scalar_t val = val_data[e];
scalar_t* other_ptr = other_data + c * K;
out_vec = update<Vec, reduce>(out_vec, Vec::loadu(other_ptr + k) * Vec(val));
}
out_vec.store(out_ptr + k);
for (const auto e : c10::irange(e0, e1)) {
int64_t c = col_data[e];
scalar_t val = val_data[e];
_update<scalar_t, index_t, reduce>(buffer_ptr, e, c, val, other_data, K);
}
for (; k < K; k++) {
scalar_t out_val = out_ptr[k];
for (const auto e : c10::irange(e0, e1)) {
c = col_data[e];
scalar_t val = val_data[e];
scalar_t* other_ptr = other_data + c * K;
out_val = update<scalar_t, reduce>(out_val, other_ptr[k] * val);
}
out_ptr[k] = out_val;
}
if constexpr (need_acc) {
if (count != 0) {
vec::convert(buffer_ptr, out_ptr, K);
}
}

@@ -159,28 +186,54 @@ void spmm_reduce_arg_kernel_impl(
int64_t M = crow_indices.numel() - 1;
int64_t K = other.size(-1);

int num_threads = at::get_num_threads();
using opmath_t = at::opmath_type<scalar_t>;
Tensor buffer;
opmath_t* buffer_data = nullptr;
static constexpr bool need_acc = is_reduced_floating_point_v<scalar_t>;
if constexpr (need_acc) {
auto acc_type = at::toAccumulateType(out.scalar_type(), /*is_cuda=*/true);
buffer = at::zeros({num_threads, K}, out.options().dtype(acc_type));
buffer_data = buffer.data_ptr<opmath_t>();
}

at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) {
int tid = at::get_thread_num();
TORCH_CHECK(tid < num_threads,
"expect thread id smaller than ", num_threads, ", got thread id ", tid);
opmath_t* buffer_ptr = nullptr;

int64_t row_start, row_end, c;
for (const auto m : c10::irange(begin, end)) {
row_start = csr_data[m];
row_end = csr_data[m + 1];

scalar_t* out_ptr = out_data + m * K;
index_t* arg_out_ptr = arg_out_data + m * K;
if constexpr (need_acc) {
buffer_ptr = buffer_data + tid * K;
} else {
buffer_ptr = reinterpret_cast<opmath_t*>(out_ptr);
}

if (row_end != row_start) {
init<scalar_t, reduce>(out_ptr, K, /*include_self*/false);
_init<scalar_t, reduce>(out_ptr, buffer_ptr, K, /*include_self*/false);
for (const auto e : c10::irange(row_start, row_end)) {
c = col_data[e];
scalar_t val = val_data[e];
opmath_t val = opmath_t(val_data[e]);

scalar_t* other_ptr = other_data + c * K;
for (const auto k : c10::irange(K)) {
update_with_index<scalar_t, index_t, reduce>(
&out_ptr[k], val * other_ptr[k], &arg_out_ptr[k], index_t(e));
update_with_index<opmath_t, index_t, reduce>(
&buffer_ptr[k], opmath_t(val * other_ptr[k]), &arg_out_ptr[k], index_t(e));
};
}
}
if constexpr (need_acc) {
if (row_end != row_start) {
vec::convert(buffer_ptr, out_ptr, K);
}
}
}
});
}
@@ -381,14 +434,14 @@ void spmm_reduce_kernel(
const Tensor& values,
const Tensor& other,
ReductionType reduce_op) {
AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, values.scalar_type(), "spmm_reduce_kernel", [&]() {
AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "spmm_reduce_indices", [&]() {
AT_DISPATCH_REDUCTION_TYPES(reduce_op, [&]() {
spmm_reduce_kernel_impl<scalar_t, index_t, reduce>(
out, crow_indices, col_indices, values, other);
AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, values.scalar_type(), "spmm_reduce_kernel", [&]() {
AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "spmm_reduce_indices", [&]() {
AT_DISPATCH_REDUCTION_TYPES(reduce_op, [&]() {
spmm_reduce_kernel_impl<scalar_t, index_t, reduce>(
out, crow_indices, col_indices, values, other);
});
});
});
});
}

void spmm_reduce_arg_kernel(
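Both kernels in this file now share the same reduced-precision strategy: when scalar_t is a reduced floating-point type such as BFloat16, partial results accumulate in a per-thread float buffer (opmath_t) and are converted back to the output dtype once per row via vec::convert, rather than rounding after every update. A minimal sketch of that accumulate-in-float pattern; the bf16 conversion helpers here are illustrative stand-ins, not ATen's vec::convert.

#include <cstdint>
#include <cstring>

// Illustrative bf16 <-> float conversions (stand-ins for ATen's converters).
float bf16_to_float(uint16_t v) {
  uint32_t u = uint32_t(v) << 16;
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}
uint16_t float_to_bf16(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return uint16_t(u >> 16);  // truncating round, for brevity
}

// Sum one sparse row: accumulate in float (the opmath type) and convert
// back to bf16 once at the end, instead of rounding every intermediate.
void reduce_row_sum(const uint16_t* vals, int64_t n, uint16_t* out) {
  float acc = 0.f;
  for (int64_t i = 0; i < n; ++i)
    acc += bf16_to_float(vals[i]);
  *out = float_to_bf16(acc);
}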
11 changes: 11 additions & 0 deletions aten/src/ATen/native/cpu/utils.h
@@ -50,13 +50,24 @@ struct Vec2 {
std::tie(v0, v1) = convert_bfloat16_float(Vectorized<BFloat16>::loadu(ptr));
return {v0, v1};
}
static Vec2 loadu(const float* ptr) {
return {Vectorized<float>::loadu(ptr), Vectorized<float>::loadu(ptr + Vectorized<float>::size())};
}
void store(BFloat16* ptr) const {
Vectorized<BFloat16> val = convert_float_bfloat16(val0, val1);
val.store(ptr);
}
void store(float* ptr) const {
val0.store(ptr);
val1.store(ptr + Vectorized<float>::size());
}
};
inline Vec2 operator+(const Vec2& a, const Vec2& b) { return {a.val0 + b.val0, a.val1 + b.val1}; }
inline Vec2 operator*(const Vec2& a, const Vec2& b) { return {a.val0 * b.val0, a.val1 * b.val1}; }
inline Vec2 operator-(const Vec2& a, const Vec2& b) { return {a.val0 - b.val0, a.val1 - b.val1}; }
inline Vec2 operator/(const Vec2& a, const Vec2& b) { return {a.val0 / b.val0, a.val1 / b.val1}; }
inline Vec2 maximum(const Vec2& a, const Vec2& b) { return {vec::maximum(a.val0, b.val0), vec::maximum(a.val1, b.val1)}; }
inline Vec2 minimum(const Vec2& a, const Vec2& b) { return {vec::minimum(a.val0, b.val0), vec::minimum(a.val1, b.val1)}; }

template <typename scalar_t> struct VectorizedType { using type = Vectorized<scalar_t>; };
template <> struct VectorizedType<BFloat16> { using type = Vec2; };
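With the new float loadu/store overloads and the maximum/minimum helpers, Vec2 behaves like a single register twice as wide: two Vectorized<float> halves covering the same element count as one Vectorized<BFloat16>. A self-contained analogue with illustrative lane widths; std::array stands in for Vectorized<float>, and Vec2Sketch is not the real ATen type.

#include <algorithm>
#include <array>

constexpr int kLanes = 8;              // illustrative, not ATen's lane count
using FVec = std::array<float, kLanes>;

struct Vec2Sketch {
  FVec v0, v1;
  static Vec2Sketch loadu(const float* p) {  // analogue of the float overload
    Vec2Sketch r;
    std::copy(p, p + kLanes, r.v0.begin());
    std::copy(p + kLanes, p + 2 * kLanes, r.v1.begin());
    return r;
  }
  void store(float* p) const {               // analogue of the float overload
    std::copy(v0.begin(), v0.end(), p);
    std::copy(v1.begin(), v1.end(), p + kLanes);
  }
};

// Elementwise max across both halves, mirroring the new maximum() helper.
inline Vec2Sketch maximum(const Vec2Sketch& a, const Vec2Sketch& b) {
  Vec2Sketch r;
  for (int i = 0; i < kLanes; ++i) {
    r.v0[i] = std::max(a.v0[i], b.v0[i]);
    r.v1[i] = std::max(a.v1[i], b.v1[i]);
  }
  return r;
}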
7 changes: 6 additions & 1 deletion c10/core/SymInt.h
@@ -208,10 +208,15 @@ class C10_API SymInt {
if (!is_heap_allocated()) {
return c10::make_optional(data_);
}
int64_t c = toSymNodeImplUnowned()->large_negative_int();
auto* node = toSymNodeImplUnowned();
int64_t c = node->large_negative_int();
if (c != 0) {
return c10::make_optional(c);
}
c10::optional<int64_t> d = node->maybe_as_int();
if (d.has_value()) {
return d;
}
return c10::nullopt;
}

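The SymInt change adds one more fallback: after checking for an inline value and a packed large negative constant, maybe_as_int() now also asks the symbolic node itself via node->maybe_as_int(), so a node that still knows a concrete integer can report it. A simplified sketch of the resulting lookup order; Node is a stand-in for c10::SymNodeImpl, and the placeholder return values are assumptions for illustration.

#include <cstdint>
#include <optional>

struct Node {
  int64_t large_negative_int() const { return 0; }           // placeholder
  std::optional<int64_t> maybe_as_int() const { return 7; }  // placeholder
};

std::optional<int64_t> maybe_as_int(bool heap, int64_t data, const Node& node) {
  if (!heap) return data;                      // small ints stored inline
  if (int64_t c = node.large_negative_int())   // packed large negatives
    return c;
  if (auto d = node.maybe_as_int())            // NEW: ask the node itself
    return d;
  return std::nullopt;
}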
