Update on "[vulkan] Distribute weight prepacking along y dimension for conv2d"

Differential Revision: [D25222752](https://our.internmc.facebook.com/intern/diff/D25222752)

[ghstack-poisoned]
SS-JIA committed Dec 1, 2020
2 parents adb309c + 5f181e2 commit f419c73
Showing 99 changed files with 2,030 additions and 1,066 deletions.
2 changes: 1 addition & 1 deletion .circleci/cimodel/data/pytorch_build_data.py
@@ -93,7 +93,7 @@
]),
]),
("rocm", [
("3.7", [
("3.9", [
("3.6", [
('build_only', [XImportant(True)]),
]),
1 change: 0 additions & 1 deletion .circleci/cimodel/data/simple/docker_definitions.py
@@ -29,7 +29,6 @@
"pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds
"pytorch-linux-xenial-py3.6-gcc7.2",
"pytorch-linux-xenial-py3.6-gcc7",
"pytorch-linux-bionic-rocm3.7-py3.6",
"pytorch-linux-bionic-rocm3.8-py3.6",
"pytorch-linux-bionic-rocm3.9-py3.6",
]
27 changes: 8 additions & 19 deletions .circleci/config.yml
@@ -453,12 +453,8 @@ jobs:
no_output_timeout: "1h"
command: |
set -e
# TODO: Remove this after we figure out why rocm tests are failing
if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then
export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148"
fi
if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then
export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb"
if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then
export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6"
fi
if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}"
@@ -538,12 +534,8 @@ jobs:
command: |
set -e
export PYTHONUNBUFFERED=1
# TODO: Remove this after we figure out why rocm tests are failing
if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then
export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148"
fi
if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then
export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb"
if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then
export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6"
fi
# See Note [Special build images]
output_image=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1}
@@ -7280,9 +7272,6 @@ workflows:
- docker_build_job:
name: "docker-pytorch-linux-xenial-py3.6-gcc7"
image_name: "pytorch-linux-xenial-py3.6-gcc7"
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm3.7-py3.6"
image_name: "pytorch-linux-bionic-rocm3.7-py3.6"
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm3.8-py3.6"
image_name: "pytorch-linux-bionic-rocm3.8-py3.6"
@@ -7713,11 +7702,11 @@ workflows:
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9"
resource_class: large
- pytorch_linux_build:
name: pytorch_linux_bionic_rocm3_7_py3_6_build
name: pytorch_linux_bionic_rocm3_9_py3_6_build
requires:
- "docker-pytorch-linux-bionic-rocm3.7-py3.6"
build_environment: "pytorch-linux-bionic-rocm3.7-py3.6-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.7-py3.6"
- "docker-pytorch-linux-bionic-rocm3.9-py3.6"
build_environment: "pytorch-linux-bionic-rocm3.9-py3.6-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.9-py3.6"
resource_class: xlarge
- pytorch_macos_10_13_py3_build:
name: pytorch_macos_10_13_py3_build
7 changes: 0 additions & 7 deletions .circleci/docker/build.sh
@@ -274,13 +274,6 @@ case "$image" in
VISION=yes
KATEX=yes
;;
pytorch-linux-bionic-rocm3.7-py3.6)
ANACONDA_PYTHON_VERSION=3.6
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=3.7
;;
pytorch-linux-bionic-rocm3.8-py3.6)
ANACONDA_PYTHON_VERSION=3.6
PROTOBUF=yes
16 changes: 4 additions & 12 deletions .circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
@@ -15,12 +15,8 @@ jobs:
no_output_timeout: "1h"
command: |
set -e
# TODO: Remove this after we figure out why rocm tests are failing
if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then
export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148"
fi
if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then
export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb"
if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then
export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6"
fi
if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}"
@@ -100,12 +96,8 @@ jobs:
command: |
set -e
export PYTHONUNBUFFERED=1
# TODO: Remove this after we figure out why rocm tests are failing
if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then
export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148"
fi
if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then
export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb"
if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then
export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6"
fi
# See Note [Special build images]
output_image=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1}
5 changes: 5 additions & 0 deletions .jenkins/pytorch/common.sh
@@ -129,13 +129,18 @@ fi
if [[ "$BUILD_ENVIRONMENT" == *pytorch-xla-linux-bionic* ]] || \
[[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda9-cudnn7-py2* ]] || \
[[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda10.1-cudnn7-py3* ]] || \
[[ "$BUILD_ENVIRONMENT" == *pytorch-*centos* ]] || \
[[ "$BUILD_ENVIRONMENT" == *pytorch-linux-bionic* ]]; then
if ! which conda; then
echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty"
exit 1
else
conda install -q -y cmake
fi
if [[ "$BUILD_ENVIRONMENT" == *pytorch-*centos* ]]; then
# cmake3 package will conflict with conda cmake
sudo yum -y remove cmake3 || true
fi
fi
retry () {
1 change: 0 additions & 1 deletion BUILD.bazel
@@ -457,7 +457,6 @@ filegroup(
name = "aten_srcs_cu",
srcs = [
"aten/src/ATen/cuda/detail/IndexUtils.cu.cc",
"aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc",
"aten/src/ATen/native/cuda/Activation.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc",
2 changes: 1 addition & 1 deletion android/libs/fbjni
17 changes: 15 additions & 2 deletions android/pytorch_android/CMakeLists.txt
@@ -1,5 +1,8 @@
cmake_minimum_required(VERSION 3.4.1)
project(pytorch_jni CXX)

include(GNUInstallDirs)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_VERBOSE_MAKEFILE ON)
message(STATUS "ANDROID_STL:${ANDROID_STL}")
@@ -68,8 +71,8 @@ target_compile_options(pytorch_jni PRIVATE
-fexceptions
)

target_include_directories(pytorch_jni PUBLIC
${libtorch_include_DIR}
target_include_directories(pytorch_jni BEFORE
PUBLIC $<BUILD_INTERFACE:${libtorch_include_DIR}>
)

set(fbjni_DIR ${CMAKE_CURRENT_LIST_DIR}/../libs/fbjni/)
@@ -155,3 +158,13 @@ if(USE_VULKAN)
endif()

target_link_libraries(pytorch_jni ${pytorch_jni_LIBS})

install(TARGETS pytorch_jni
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) #For windows

if(MSVC)
install(FILES $<TARGET_PDB_FILE:pytorch_jni> DESTINATION ${CMAKE_INSTALL_LIBDIR} OPTIONAL)
install(TARGETS pytorch_jni DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
121 changes: 0 additions & 121 deletions aten/src/ATen/CUDAGeneratorImpl.h
@@ -2,122 +2,10 @@

#include <c10/core/GeneratorImpl.h>
#include <ATen/core/Generator.h>
#include <ATen/Tensor.h>
#include <ATen/Context.h>
#include <limits>

// TODO: this file should be in ATen/cuda, not top level

namespace at {
/**
* Note [CUDA Graph-safe RNG states]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Strategy:
* ~~~~~~~~~
* A CUDA graph containing multiple RNG ops behaves like a
* single giant kernel from the perspective of ops external
* to the graph. During graph capture, logic below records
* the total of all offset increments that occur in the graphed
* region, and records the final total as the offset for the
* entire graph.
*
* When the graph reruns, the logic that reruns it
* increments this device's CUDA generator's offset
* by that total.
*
* Meanwhile, within the graph, at capture time, instead of
* populating PhiloxCudaStates with the uint64_t offset pulled
* directly from the global state, PhiloxCudaState instead
* holds a pointer to one-element stream-local int64_t device tensor
* holding an initial offset value, and a uint64_t holding an
* intra-graph offset. (The intra-graph offset starts from zero
* when capture begins.) In each consumer kernel,
* at::cuda::philox::unpack computes the offset to use for this kernel
* as intra-graph offset + *initial offset.
*
* When the graph reruns, the logic that reruns it first
* fill_s the initial offset tensor with this device's
* CUDA generator's current offset.
*
* The control flow above ensures graphed execution is bitwise
* identical to eager execution as long as RNG ops are enqueued
* from a single thread, even if RNG ops and graphs containing
* RNG ops are enqueued and run simultaneously on multiple streams.
*
* Usage:
* ~~~~~~
* PhiloxCudaState in this file, and unpack() in
* cuda/CUDAGraphsUtils.cuh allow non-divergent use of
* CUDAGeneratorImpl whether graph capture is underway or not.
*
* Each PhiloxCudaState instance should be used for one and only one
* consumer kernel.
*
* Example (see e.g. native/cuda/Dropout.cu):
*
* #include <ATen/cuda/CUDAGeneratorImpl.h>
* #include <ATen/cuda/CUDAGraphsUtils.cuh>
*
* __global__ void kernel(..., PhiloxCudaState philox_args) {
* auto seeds = at::cuda::philox::unpack(philox_args);
* IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
* curandStatePhilox4_32_10_t state;
* curand_init(std::get<0>(seeds), // seed
* idx, // per-thread subsequence
* std::get<1>(seeds), // offset in subsequence
* &state);
* ...
* }
*
* host_caller(...) {
* PhiloxCudaState rng_engine_inputs;
* {
* // See Note [Acquire lock when using random generators]
* std::lock_guard<std::mutex> lock(gen->mutex_);
*
* // gen could be HostState or DevState here! No divergent code needed!
* rng_engine_inputs = gen->philox_cuda_state(offset_increment);
* }
* kernel<<<...>>>(..., rng_engine_inputs);
* }
*
*/


// Stores state values. Passed as a kernel argument. See "Usage:" above.
struct PhiloxCudaState {
PhiloxCudaState() = default;
PhiloxCudaState(const PhiloxCudaState&) = default;
// Called if graph capture is not underway
PhiloxCudaState(uint64_t seed,
uint64_t offset) {
seed_ = seed;
offset_.val = offset;
}
// Called if graph capture is underway
PhiloxCudaState(uint64_t seed,
int64_t* offset_extragraph,
uint32_t offset_intragraph) {
seed_ = seed;
offset_.ptr = offset_extragraph;
offset_intragraph_ = offset_intragraph;
captured_ = true;
}

// Public members, directly accessible by at::cuda::philox::unpack.
// If we made them private with getters/setters, the getters/setters
// would have to be __device__, and we can't declare __device__ in ATen.
union Payload {
uint64_t val;
int64_t* ptr;
};

uint64_t seed_;
Payload offset_;
uint32_t offset_intragraph_;
bool captured_ = false;
};

struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
// Constructors
@@ -131,22 +19,13 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
uint64_t seed() override;
void set_philox_offset_per_thread(uint64_t offset);
uint64_t philox_offset_per_thread();
void graph_prologue(int64_t* offset_extragraph_);
uint64_t graph_epilogue();
PhiloxCudaState philox_cuda_state(uint64_t increment);

// Temporarily accommodates call sites that use philox_engine_inputs.
// Allows incremental refactor of call sites to use philox_cuda_state.
std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);

static DeviceType device_type();

private:
CUDAGeneratorImpl* clone_impl() const override;
uint64_t seed_ = default_rng_seed_val;
uint64_t philox_offset_per_thread_ = 0;
int64_t* offset_extragraph_;
uint32_t offset_intragraph_ = 0;
};

namespace cuda {
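
With the graph-capture plumbing above removed, CUDAGeneratorImpl exposes philox_engine_inputs as the interface RNG kernels consume. The sketch below is not part of this diff; it only illustrates, under the declarations shown in the header above, the call-site pattern from the removed comment block. The function, parameter name, and kernel launch are hypothetical placeholders, and the include path assumes the header's current top-level location.

// Minimal sketch of a host-side caller of philox_engine_inputs (hypothetical).
#include <ATen/CUDAGeneratorImpl.h>
#include <mutex>
#include <utility>

void host_caller(at::CUDAGeneratorImpl* gen, uint64_t counter_offset) {
  std::pair<uint64_t, uint64_t> rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen->mutex_);
    // Returns {seed, offset} and advances the generator's per-thread Philox offset.
    rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
  }
  // Both values are then passed by value to the consumer CUDA kernel, e.g.
  // kernel<<<grid, block, 0, stream>>>(..., rng_engine_inputs);
}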
5 changes: 2 additions & 3 deletions aten/src/ATen/ScalarOps.cpp
@@ -29,11 +29,10 @@ Tensor& scalar_fill(Tensor& self, Scalar value) {
return self;
}

Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt) {
at::tracer::impl::NoTracerDispatchMode tracer_guard;
at::AutoNonVariableTypeMode non_var_type_mode(true);
auto result = at::detail::empty_cpu({}, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
auto result = at::detail::empty_cpu({}, dtype_opt, c10::nullopt, device_opt, c10::nullopt, c10::nullopt);
scalar_fill(result, s);
return result;
}
10 changes: 4 additions & 6 deletions aten/src/ATen/ScalarOps.h
@@ -12,9 +12,7 @@ namespace detail {
// but we also want to skip compute_types which in not avoidable
// in TensorIterator for now.
Tensor& scalar_fill(Tensor& self, Scalar value);
TORCH_API Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt,
c10::optional<c10::MemoryFormat> memory_format_opt);
TORCH_API Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt);
} // namespace detail
} // namespace at

@@ -27,12 +25,12 @@ inline at::Tensor scalar_to_tensor(Scalar s, const Device device = at::kCPU) {
// This is the fast track we have for CPU scalar tensors.
if (device == at::kCPU && !s.isComplex()) {
if (s.isFloatingPoint()) {
return at::detail::scalar_tensor_static(s, at::kDouble, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
return at::detail::scalar_tensor_static(s, at::kDouble, at::kCPU);
} else if (s.isBoolean()) {
return at::detail::scalar_tensor_static(s, at::kBool, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
return at::detail::scalar_tensor_static(s, at::kBool, at::kCPU);
} else {
AT_ASSERT(s.isIntegral(false));
return at::detail::scalar_tensor_static(s, at::kLong, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
return at::detail::scalar_tensor_static(s, at::kLong, at::kCPU);
}
}
if (s.isFloatingPoint()) {
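
For reference, a hypothetical usage sketch (not part of this diff) of the simplified scalar_tensor_static signature declared above; the wrapper function is made up, and the call mirrors the fast-path calls inside scalar_to_tensor.

// Hypothetical: build a 0-dim CPU scalar tensor via the fast path shown above.
#include <ATen/ScalarOps.h>

at::Tensor make_scalar_double(double v) {
  // empty_cpu({}) followed by scalar_fill, as in ScalarOps.cpp above.
  return at::detail::scalar_tensor_static(v, at::kDouble, at::kCPU);
}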
6 changes: 0 additions & 6 deletions aten/src/ATen/core/Generator.h
@@ -13,12 +13,6 @@
#include <c10/util/intrusive_ptr.h>
#include <c10/core/Device.h>
#include <c10/core/DispatchKeySet.h>

// For the record I don't think this is a correct pimpl idiom.
// Including Impl header in interface header defeats the purpose
// because you can't change Impl private members without forcing
// everything that included the interface to rebuild.
// Impl should be forward-declared in the interface header instead.
#include <c10/core/GeneratorImpl.h>

/**
1 change: 1 addition & 0 deletions aten/src/ATen/core/aten_interned_strings.h
@@ -531,6 +531,7 @@ _(aten, nll_loss2d_forward) \
_(aten, nll_loss_backward) \
_(aten, nll_loss_forward) \
_(aten, nonzero) \
_(aten, nonzero_numpy) \
_(aten, norm) \
_(aten, norm_except_dim) \
_(aten, normal) \
