Update on "[vulkan] Add mean.dim op for vulkan"

[ghstack-poisoned]
pytorch · Nov 3, 2020 · 9185eb4 · 9185eb4
2 parents 1800c75 + c68c3d0
commit 9185eb4
Show file tree

Hide file tree

Showing 34 changed files with 570 additions and 254 deletions.
diff --git a/.circleci/docker/centos-rocm/Dockerfile b/.circleci/docker/centos-rocm/Dockerfile
@@ -64,7 +64,6 @@ ENV PATH /opt/rocm/hcc/bin:$PATH
 ENV PATH /opt/rocm/hip/bin:$PATH
 ENV PATH /opt/rocm/opencl/bin:$PATH
 ENV PATH /opt/rocm/llvm/bin:$PATH
-ENV HIP_PLATFORM hcc
 ENV LANG en_US.utf8
 ENV LC_ALL en_US.utf8
 

diff --git a/.circleci/docker/ubuntu-rocm/Dockerfile b/.circleci/docker/ubuntu-rocm/Dockerfile
@@ -58,7 +58,6 @@ ENV PATH /opt/rocm/hcc/bin:$PATH
 ENV PATH /opt/rocm/hip/bin:$PATH
 ENV PATH /opt/rocm/opencl/bin:$PATH
 ENV PATH /opt/rocm/llvm/bin:$PATH
-ENV HIP_PLATFORM hcc
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8
 

diff --git a/.jenkins/caffe2/common.sh b/.jenkins/caffe2/common.sh
@@ -13,6 +13,8 @@ if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
 fi
 
 if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
+    # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
+    unset HIP_PLATFORM
     if which sccache > /dev/null; then
         # Save sccache logs to file
         sccache --stop-server || true

diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh
@@ -12,6 +12,8 @@ SCRIPT_DIR="$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )"
 
 # Figure out which Python to use for ROCm
 if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
+  # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
+  unset HIP_PLATFORM
   PYTHON=$(which "python${BASH_REMATCH[1]}")
   # non-interactive bashs do not expand aliases by default
   shopt -s expand_aliases

diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh
@@ -8,23 +8,8 @@ git submodule update --init --recursive
 export CMAKE_PREFIX_PATH=${WORKSPACE_DIR}/miniconda3/
 
 # Build PyTorch
-if [[ "${BUILD_ENVIRONMENT}" == *cuda9.2* ]]; then
-  export CUDA_VERSION=9.2
-  export TORCH_CUDA_ARCH_LIST=5.2
-  export PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/bin${PATH:+:${PATH}}
-  export DYLD_LIBRARY_PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/lib${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}
-  export CUDA_HOME=/Developer/NVIDIA/CUDA-${CUDA_VERSION}
-  export USE_CUDA=1
-
-  if [ -z "${IN_CI}" ]; then
-    # Eigen gives "explicit specialization of class must precede its first use" error
-    # when compiling with Xcode 9.1 toolchain, so we have to use Xcode 8.2 toolchain instead.
-    export DEVELOPER_DIR=/Library/Developer/CommandLineTools
-  fi
-else
-  if [ -z "${IN_CI}" ]; then
-    export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer
-  fi
+if [ -z "${IN_CI}" ]; then
+  export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer
 fi
 
 if which sccache > /dev/null; then
@@ -34,17 +19,10 @@ if which sccache > /dev/null; then
   printf "#!/bin/sh\nexec sccache $(which clang) \$*" > "${WORKSPACE_DIR}/clang"
   chmod a+x "${WORKSPACE_DIR}/clang"
 
-  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
-    printf "#!/bin/sh\nexec sccache $(which nvcc) \$*" > "${WORKSPACE_DIR}/nvcc"
-    chmod a+x "${WORKSPACE_DIR}/nvcc"
-    export CUDA_NVCC_EXECUTABLE="${WORKSPACE_DIR}/nvcc"
-  fi
-
   export PATH="${WORKSPACE_DIR}:$PATH"
 fi
 
-# If we run too many parallel jobs, we will OOM
-MAX_JOBS=2 USE_DISTRIBUTED=1 python setup.py install
+USE_DISTRIBUTED=1 python setup.py install
 
 assert_git_not_dirty
 

diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh
@@ -1,5 +1,5 @@
-#!/bin/bash -ex
-
+#!/bin/bash
+set -ex
 # shellcheck disable=SC2034
 COMPACT_JOB_NAME=pytorch-win-ws2019-cuda10-cudnn7-py3-test
 
@@ -42,28 +42,30 @@ fi
 
 run_tests() {
     if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then
-        $SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \
-        $SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM" && \
-        $SCRIPT_HELPERS_DIR/test_custom_script_ops.bat && \
-        $SCRIPT_HELPERS_DIR/test_custom_backend.bat && \
+        $SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM"
+        $SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM"
+        $SCRIPT_HELPERS_DIR/test_custom_script_ops.bat
+        $SCRIPT_HELPERS_DIR/test_custom_backend.bat
         $SCRIPT_HELPERS_DIR/test_libtorch.bat
     else
         if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then
             export PYTORCH_COLLECT_COVERAGE=1
-            $SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \
+            $SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM"
             $SCRIPT_HELPERS_DIR/test_libtorch.bat
             if [[ "${USE_CUDA}" == "1" ]]; then
               $SCRIPT_HELPERS_DIR/test_python_jit_legacy.bat "$DETERMINE_FROM"
             fi
         elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then
-            $SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM" && \
-            $SCRIPT_HELPERS_DIR/test_custom_backend.bat && \
+            $SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM"
+            $SCRIPT_HELPERS_DIR/test_custom_backend.bat
             $SCRIPT_HELPERS_DIR/test_custom_script_ops.bat
         fi
     fi
 }
 
-run_tests && assert_git_not_dirty && echo "TEST PASSED"
+run_tests
+assert_git_not_dirty
+echo "TEST PASSED"
 
 if [[ "${BUILD_ENVIRONMENT}" == "pytorch-win-vs2019-cuda10-cudnn7-py3" ]] && [[ "${JOB_BASE_NAME}" == *-test1 ]]; then
   pushd $TEST_DIR

diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -1738,5 +1738,71 @@ Tensor chain_matmul(TensorList matrices) {
   }
 }
 
+/*
+Calculates the Kronecker product between two Tensors.
+
+The result has max(self.dim(), other.dim()) dimensions. The leading dimensions
+that only the higher-rank input has are carried over with their sizes unchanged;
+every remaining (trailing) dimension has a size equal to the product of the
+corresponding sizes of `self` and `other`.
+*/
+Tensor kron(const Tensor& self, const Tensor& other) {
+  /*
+  We can obtain the kron result using tensordot or einsum. The implementation below uses tensordot.
+  In einsum notation, suppose we have `self` with dim 4 and `other` with dim 2;
+  the result of the tensordot below is, in einsum terms, 0123, 45 -> 012345.
+  To obtain the correct kron we need to permute and reshape the array.
+  The permutation rule is the following: going from right to left,
+  take one axis of `other` and one axis of `self` in turn to form the permutation;
+  the leading axes left over from the higher-rank input fill the front in order.
+  With our example the correct permutation is 012435 and
+  the kron shape is (shape_self[0], shape_self[1], shape_self[2]*shape_other[0],
+  shape_self[3]*shape_other[1])
+  */
+  std::vector<int64_t> self_sizes = self.sizes().vec();
+  std::vector<int64_t> other_sizes = other.sizes().vec();
+  int64_t self_ndim = self.dim();
+  int64_t other_ndim = other.dim();
+  int64_t min_ndim = std::min(self_ndim, other_ndim);
+  int64_t ndim_diff = std::abs(self_ndim - other_ndim);  // count of leading axes only the higher-rank input has
+
+  std::vector<int64_t> a_axes(self_ndim);
+  std::vector<int64_t> b_axes(other_ndim);
+  std::iota(a_axes.begin(), a_axes.end(), 0);  // axes of `self`: 0 .. self_ndim - 1
+  std::iota(b_axes.begin(), b_axes.end(), 0 + self_ndim);  // axes of `other`: self_ndim .. self_ndim + other_ndim - 1
+
+  bool is_a_larger = self_ndim >= other_ndim;
+  std::vector<int64_t> kron_permutation(self_ndim + other_ndim);
+  for (int64_t i = 0; i < ndim_diff; i++) {  // front of the permutation: unpaired leading axes of the higher-rank input
+    kron_permutation[i] = is_a_larger ? a_axes[i] : b_axes[i];
+  }
+  for (int64_t i = 0, j = 0; i < min_ndim; i++, j += 2) {  // filled from the right: each pair is (self axis, other axis)
+    kron_permutation[self_ndim + other_ndim - 1 - j] = b_axes[other_ndim - 1 - i];
+    kron_permutation[self_ndim + other_ndim - 1 - j - 1] = a_axes[self_ndim - 1 - i];
+  }
+
+  std::vector<int64_t> result_shape(std::max(self_ndim, other_ndim));
+  for (int64_t i = 0; i < ndim_diff; i++) {  // unpaired leading dims keep their size
+    result_shape[i] = is_a_larger ? self_sizes[i] : other_sizes[i];
+  }
+  for (int64_t i = 0; i < min_ndim; i++) {  // each paired dim gets the product of the two sizes
+    result_shape[ndim_diff + i] = is_a_larger
+        ? self_sizes[ndim_diff + i] * other_sizes[i]
+        : other_sizes[ndim_diff + i] * self_sizes[i];
+  }
+
+  Tensor result = at::tensordot(self, other, {}, {});  // Step 1: tensordot with empty dim lists (0123, 45 -> 012345 in the example above)
+  // Step 2: permute so that each paired self/other axis sits next to each other
+  result = result.permute(kron_permutation);
+  // Step 3: reshape, merging each adjacent axis pair into one dimension
+  result = result.reshape(result_shape);
+
+  return result;
+}
+
+Tensor& kron_out(Tensor& result, const Tensor& self, const Tensor& other) {  // out-variant: stores kron(self, other) in `result` and returns `result`
+  TORCH_CHECK(result.scalar_type() == self.scalar_type(),  // `result` must already have the same dtype as `self`
+    "result dtype ", result.scalar_type(), " does not match self dtype ", self.scalar_type());
+
+  Tensor result_tmp = at::kron(self, other);  // computed into a temporary tensor first
+  at::native::resize_output(result, result_tmp.sizes());  // resize `result` to the temporary's sizes
+  result.copy_(result_tmp);  // then copy the values into `result`
+  return result;
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
@@ -2142,6 +2142,15 @@
     CPU: kl_div_backward_cpu
     CUDA: kl_div_backward_cuda
 
+- func: kron(Tensor self, Tensor other) -> Tensor
+  variants: function, method
+  dispatch:
+    Math: kron
+
+- func: kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    Math: kron_out
+
 - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
   use_c10_dispatcher: full
   variants: function, method

diff --git a/aten/src/ATen/native/vulkan/glsl/mean.glsl b/aten/src/ATen/native/vulkan/glsl/mean.glsl
@@ -4,7 +4,7 @@ layout(std430) buffer;
 layout(std430) uniform;
 layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D   uOutput;
 layout(set = 0, binding = 1)          uniform PRECISION                    sampler3D uInput;
-layout(set = 0, binding = 2)          uniform                                        Block {
+layout(set = 0, binding = 2)          uniform PRECISION restrict                     Block {
   int W;
   int H;
 } uBlock;
@@ -13,11 +13,11 @@ layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;
 
 void main() {
   ivec3 pos = ivec3(gl_GlobalInvocationID);
-  vec4 r = vec4(1.0) / float(uBlock.W) / float(uBlock.H);
+  vec4 r = vec4(1.0) / (float(uBlock.W) * float(uBlock.H));
   vec4 acc = vec4(0);
   int xi, yi;
-  for (xi = 0; xi < uBlock.W; ++xi) {
-    for (yi = 0; yi < uBlock.H; ++yi) {
+  for (yi = 0; yi < uBlock.H; ++yi) {
+    for (xi = 0; xi < uBlock.W; ++xi) {
       acc += texelFetch(uInput, ivec3(xi, yi, pos.z), 0);
     }
   }

diff --git a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl
@@ -4,7 +4,7 @@ layout(std430) buffer;
 layout(std430) uniform;
 layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D   uOutput;
 layout(set = 0, binding = 1)          uniform PRECISION                    sampler3D uInput;
-layout(set = 0, binding = 2)          uniform                                        Block {
+layout(set = 0, binding = 2)          uniform PRECISION restrict                     Block {
   int W;
   int H;
 } uBlock;
@@ -13,18 +13,17 @@ layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;
 
 void main() {
   ivec3 pos = ivec3(gl_GlobalInvocationID);
-  vec4 r = vec4(1.0) / float(uBlock.W) / float(uBlock.H);
+  vec4 r = vec4(1.0) / (float(uBlock.W) * float(uBlock.H));
   vec4 acc = vec4(0);
   int xi, yi;
   int zi = (imageSize(uOutput).x*pos.y + pos.x)/4;
   int zo = (imageSize(uOutput).x*pos.y + pos.x)%4;
-  for (xi = 0; xi < uBlock.W; ++xi) {
-    for (yi = 0; yi < uBlock.H; ++yi) {
+  for (yi = 0; yi < uBlock.H; ++yi) {
+    for (xi = 0; xi < uBlock.W; ++xi) {
       acc += texelFetch(uInput, ivec3(xi, yi, zi), 0);
     }
   }
   vec4 outValue = r * acc;
 
-  int test = (imageSize(uOutput).x*pos.x + pos.x);
   imageStore(uOutput, pos, vec4(outValue[zo], 0,0,0));
 }