Update on "split TestAsserts by functionality"
Instead of having one large TestAsserts test case, we split off tests for
self-contained functionality, such as container or complex checking, into
separate test cases. That makes it a lot easier to keep an overview of
what is tested.

[ghstack-poisoned]
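
As a rough illustration only (not taken from this diff; the class and test names below are hypothetical, and the real tests live in PyTorch's test suite), splitting one monolithic assertion test case by functionality could look like this:

import unittest

import torch


class TestAssertsContainer(unittest.TestCase):
    """Container (sequence/mapping) checking gets its own test case ..."""

    def test_matching_sequence(self):
        actual = [torch.tensor(1.0), torch.tensor(2.0)]
        expected = [torch.tensor(1.0), torch.tensor(2.0)]
        torch.testing.assert_close(actual, expected)


class TestAssertsComplex(unittest.TestCase):
    """... while complex checking lives in a separate one."""

    def test_matching_complex(self):
        actual = expected = torch.tensor(complex(1, 2))
        torch.testing.assert_close(actual, expected)


if __name__ == "__main__":
    unittest.main()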
pmeier committed Jun 18, 2021
2 parents 14f2c56 + 39816ba commit 00783ac
Showing 32 changed files with 361 additions and 450 deletions.
3 changes: 1 addition & 2 deletions BUILD.bazel
@@ -421,7 +421,6 @@ filegroup(
"aten/src/THCUNN/LogSigmoid.cu.cc",
"aten/src/THCUNN/MultiLabelMarginCriterion.cu.cc",
"aten/src/THCUNN/MultiMarginCriterion.cu.cc",
"aten/src/THCUNN/RReLU.cu.cc",
"aten/src/THCUNN/SoftMarginCriterion.cu.cc",
"aten/src/THCUNN/SoftPlus.cu.cc",
"aten/src/THCUNN/SoftShrink.cu.cc",
@@ -1729,7 +1728,7 @@ cc_library(
],
[
":aten",
"@tensorpipe//:tensorpipe_cpu",
"@tensorpipe",
],
),
alwayslink = True,
3 changes: 0 additions & 3 deletions aten/src/ATen/LegacyTHFunctionsCUDA.h
@@ -51,10 +51,7 @@ std::tuple<Tensor &,Tensor &> _thnn_log_sigmoid_forward_out(const Tensor & self,
std::tuple<Tensor,Tensor> _thnn_log_sigmoid_forward(const Tensor & self);
Tensor & _thnn_log_sigmoid_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & buffer, Tensor & grad_input);
Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
Tensor & _thnn_rrelu_with_noise_forward_out(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator, Tensor & output);
Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator);
Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training);
Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator);
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output, Tensor & columns, Tensor & ones);
std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const optional<Tensor> & bias, IntArrayRef stride, IntArrayRef padding);
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones);
106 changes: 0 additions & 106 deletions aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
@@ -1285,112 +1285,6 @@ Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & sel
}
return grad_input;
}
Tensor & _thnn_rrelu_with_noise_forward_out(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator, Tensor & output) {
const OptionalDeviceGuard device_guard(device_of(self));
auto dispatch_scalar_type = infer_scalar_type(self);

switch (dispatch_scalar_type) {
case ScalarType::Double: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
case ScalarType::Float: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
case ScalarType::Half: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
default:
AT_ERROR("_thnn_rrelu_with_noise_forward_out not supported on CUDAType for ", dispatch_scalar_type);
}
return output;
}
Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator) {
const OptionalDeviceGuard device_guard(device_of(self));
auto dispatch_scalar_type = infer_scalar_type(self);
auto output_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
auto output = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(output_));
switch (dispatch_scalar_type) {
case ScalarType::Double: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
case ScalarType::Float: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
case ScalarType::Half: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
default:
AT_ERROR("_thnn_rrelu_with_noise_forward not supported on CUDAType for ", dispatch_scalar_type);
}
return output;
}
Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator) {
const OptionalDeviceGuard device_guard(device_of(self));
auto dispatch_scalar_type = infer_scalar_type(self);

switch (dispatch_scalar_type) {
case ScalarType::Double: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator);
break;
}
case ScalarType::Float: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator);
break;
}
case ScalarType::Half: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator);
break;
}
default:
AT_ERROR("_thnn_rrelu_with_noise_forward_ not supported on CUDAType for ", dispatch_scalar_type);
}
return self;
}
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output, Tensor & columns, Tensor & ones) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
177 changes: 177 additions & 0 deletions aten/src/ATen/native/cuda/Activation.cu
@@ -8,13 +8,17 @@

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/CUDAGeneratorImpl.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/Array.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/OffsetCalculator.cuh>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/cuda/DistributionTemplates.h>
#include <c10/cuda/CUDAMathCompat.h>

namespace at {
@@ -243,6 +247,179 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te
return std::tuple<Tensor, Tensor>{input_grad, weight_grad};
}

// -----------------------------------
// rrelu
// -----------------------------------
template <typename scalar_t, int unroll_factor, typename F>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
C10_LAUNCH_BOUNDS_2(256, 4)
#endif
__global__ void rrelu_with_noise_cuda_kernel(
int numel,
PhiloxCudaState philox_args,
scalar_t* output,
scalar_t* input,
scalar_t* noise,
double lower,
double upper,
const F& random_func) {
auto seeds = at::cuda::philox::unpack(philox_args);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
curandStatePhilox4_32_10_t state;
curand_init(std::get<0>(seeds),
idx,
std::get<1>(seeds),
&state);

int grid_stride = blockDim.x * gridDim.x * unroll_factor;
int rounded_size = ((numel - 1) / grid_stride + 1) * grid_stride;
double range = upper - lower;

for (int linear_index = idx; linear_index < rounded_size; linear_index += grid_stride) {
auto rand = random_func(&state);

// ensure that (&rand.x)[ii] is safe
static_assert(sizeof(rand)/sizeof(rand.x) == unroll_factor, "");

#pragma unroll
for (int ii = 0; ii < unroll_factor; ii++) {
int li = linear_index + blockDim.x * gridDim.x * ii;
if (li >= numel) {
continue;
}
scalar_t r = static_cast<scalar_t>((&rand.x)[ii]);
r = r * range + lower;
if (input[li] <= 0) {
output[li] = input[li] * r;
noise[li] = r;
} else {
output[li] = input[li];
noise[li] = static_cast<scalar_t>(0);
}
}
__syncthreads();
}
}

template <typename scalar_t>
inline void _rrelu_with_noise_cuda_train(
Tensor& output,
const Tensor& input_,
const Tensor& noise_,
const Scalar& lower_,
const Scalar& upper_,
c10::optional<Generator> generator) {
auto input = input_.contiguous();
auto noise = noise_.contiguous();
Tensor tmp_output = output.contiguous();

int64_t numel = input.numel();
auto execution_policy = calc_execution_policy(numel);

auto counter_offset = std::get<0>(execution_policy);
auto grid = std::get<1>(execution_policy);
auto block = std::get<2>(execution_policy);

auto gen = get_generator_or_default<CUDAGeneratorImpl>(
generator, cuda::detail::getDefaultCUDAGenerator());
PhiloxCudaState rng_engine_inputs;
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
rng_engine_inputs = gen->philox_cuda_state(counter_offset);
}

scalar_t* input_data = input.data_ptr<scalar_t>();
scalar_t* noise_data = noise.data_ptr<scalar_t>();
scalar_t* output_data = tmp_output.data_ptr<scalar_t>();

double lower = lower_.to<double>();
double upper = upper_.to<double>();

auto stream = at::cuda::getCurrentCUDAStream();

if (std::is_same<scalar_t, double>::value) {
rrelu_with_noise_cuda_kernel<scalar_t, 2><<<grid, block, 0, stream>>>(
numel,
rng_engine_inputs,
output_data,
input_data,
noise_data,
lower,
upper,
[] __device__ (curandStatePhilox4_32_10_t* state) {
return curand_uniform2_double(state);
});
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
// half and float
rrelu_with_noise_cuda_kernel<scalar_t, 4><<<grid, block, 0, stream>>>(
numel,
rng_engine_inputs,
output_data,
input_data,
noise_data,
lower, upper,
[] __device__ (curandStatePhilox4_32_10_t* state) {
return curand_uniform4(state);
});
C10_CUDA_KERNEL_LAUNCH_CHECK();
}

if (!output.is_contiguous()) {
output.copy_(tmp_output);
}
}

Tensor& rrelu_with_noise_out_cuda(const Tensor& self,
const Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
c10::optional<Generator> generator,
Tensor& output) {
TensorArg self_arg{self, "self", 1}, noise_arg{noise, "noise", 2},
output_arg{output, "output", 3};
checkAllSameGPU("rrelu_with_noise_out_cuda", {self_arg, noise_arg, output_arg});

if (training) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
self.scalar_type(), "rrelu_with_noise_out_cuda", [&] {
_rrelu_with_noise_cuda_train<scalar_t>(
output, self, noise, lower, upper, generator);
});
}
else {
auto lower_tensor = lower.to<double>();
auto upper_tensor = upper.to<double>();
Scalar negative_slope = (lower_tensor + upper_tensor) / 2;
at::leaky_relu_out(output, self, negative_slope);
}
return output;
}

Tensor rrelu_with_noise_cuda(
const Tensor& self,
const Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
c10::optional<Generator> generator) {
Tensor output = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
return at::native::rrelu_with_noise_out_cuda(self, noise, lower, upper, training, generator, output);
}

Tensor& rrelu_with_noise_cuda_(
Tensor& self,
const Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
c10::optional<Generator> generator) {
return at::native::rrelu_with_noise_out_cuda(
self, noise, lower, upper, training, generator, self);
}

// -----------------------------------
// hardshrink
// -----------------------------------
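For context (not part of the diff): the kernels added above back the CUDA path of rrelu_with_noise, which torch.nn.functional.rrelu should route to for CUDA tensors. During training, each negative element is scaled by a slope drawn uniformly from [lower, upper] and recorded in the noise tensor; in eval mode the code falls back to leaky_relu with the fixed slope (lower + upper) / 2. A minimal sketch, assuming a CUDA device is available:

import torch
import torch.nn.functional as F

x = torch.randn(8, device="cuda")

# Training mode: negative inputs are multiplied by a random slope in
# [1/8, 1/3] (the functional's default range), exercising the kernel above.
y_train = F.rrelu(x, lower=1.0 / 8, upper=1.0 / 3, training=True)

# Eval mode: deterministic, equivalent to leaky_relu with
# negative_slope = (lower + upper) / 2, mirroring the fallback in
# rrelu_with_noise_out_cuda.
y_eval = F.rrelu(x, lower=1.0 / 8, upper=1.0 / 3, training=False)
expected = F.leaky_relu(x, negative_slope=(1.0 / 8 + 1.0 / 3) / 2)
torch.testing.assert_close(y_eval, expected)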
4 changes: 4 additions & 0 deletions aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm
@@ -118,6 +118,10 @@ - (void)endSynchronization:(NSError*)error {
TORCH_CHECK(_buffer, "Allocate GPU memory failed!");
}
copyToMetalBuffer(_commandBuffer, _buffer, _image);
if (_image.isTemporaryImage && _image.readCount != 0) {
_image =
createStaticImage((MPSTemporaryImage*)_image, _commandBuffer, false);
}
}

void MPSImageWrapper::synchronize() {
30 changes: 18 additions & 12 deletions aten/src/ATen/native/metal/ops/MetalReshape.mm
@@ -32,18 +32,18 @@ Tensor view(const Tensor& input, IntArrayRef size) {
MetalTensorImplStorage mt{inferred_size, stride_value};
mt.texture()->allocateTemporaryStorage(inferred_size, commandBuffer);
MPSImage* Y = mt.texture()->image();
id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
specializedPipelineState:"reshape"
Constants:@[
@(Y.height),
@(Y.width),
@(Y.featureChannels),
@(Y.numberOfImages),
@(X.height),
@(X.width),
@(X.featureChannels),
@(X.numberOfImages),
]];
id<MTLComputePipelineState> state =
[[MPSCNNContext sharedInstance] specializedPipelineState:"reshape"
Constants:@[
@(Y.height),
@(Y.width),
@(Y.featureChannels),
@(Y.numberOfImages),
@(X.height),
@(X.width),
@(X.featureChannels),
@(X.numberOfImages),
]];
id<MTLComputeCommandEncoder> encoder =
[commandBuffer.buffer computeCommandEncoder];
[encoder setComputePipelineState:state];
@@ -95,7 +95,13 @@ Tensor flatten_using_ints(
return input.reshape(shape);
}

Tensor detach(const Tensor& input) {
TORCH_CHECK(input.is_metal());
return input;
}

TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl("detach", TORCH_FN(detach));
m.impl("view", TORCH_FN(view));
m.impl("reshape", TORCH_FN(reshape));
m.impl("flatten.using_ints", TORCH_FN(flatten_using_ints));
