Update on "split TestAsserts by functionality"
Instead of having one large TestAsserts test case, we split off tests for
self-contained functionality, such as container or complex checking, into
separate test cases. That makes it a lot easier to keep an overview of
what is tested.

[ghstack-poisoned]
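
As a rough illustration only (not taken from this diff; the class and test names below are hypothetical, and the real tests live in PyTorch's test suite), splitting one monolithic assertion test case by functionality could look like this:

import unittest

import torch


class TestAssertsContainer(unittest.TestCase):
    """Container (sequence/mapping) checking gets its own test case ..."""

    def test_matching_sequence(self):
        actual = [torch.tensor(1.0), torch.tensor(2.0)]
        expected = [torch.tensor(1.0), torch.tensor(2.0)]
        torch.testing.assert_close(actual, expected)


class TestAssertsComplex(unittest.TestCase):
    """... while complex checking lives in a separate one."""

    def test_matching_complex(self):
        actual = expected = torch.tensor(complex(1, 2))
        torch.testing.assert_close(actual, expected)


if __name__ == "__main__":
    unittest.main()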
pmeier committed Jun 18, 2021
2 parents 14f2c56 + 39816ba commit 00783ac
Showing 32 changed files with 361 additions and 450 deletions.
3 changes: 1 addition & 2 deletions BUILD.bazel
@@ -421,7 +421,6 @@ filegroup(
"aten/src/THCUNN/LogSigmoid.cu.cc",
"aten/src/THCUNN/MultiLabelMarginCriterion.cu.cc",
"aten/src/THCUNN/MultiMarginCriterion.cu.cc",
"aten/src/THCUNN/RReLU.cu.cc",
"aten/src/THCUNN/SoftMarginCriterion.cu.cc",
"aten/src/THCUNN/SoftPlus.cu.cc",
"aten/src/THCUNN/SoftShrink.cu.cc",
@@ -1729,7 +1728,7 @@ cc_library(
],
[
":aten",
"@tensorpipe//:tensorpipe_cpu",
"@tensorpipe",
],
),
alwayslink = True,
3 changes: 0 additions & 3 deletions aten/src/ATen/LegacyTHFunctionsCUDA.h
@@ -51,10 +51,7 @@ std::tuple<Tensor &,Tensor &> _thnn_log_sigmoid_forward_out(const Tensor & self,
std::tuple<Tensor,Tensor> _thnn_log_sigmoid_forward(const Tensor & self);
Tensor & _thnn_log_sigmoid_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & buffer, Tensor & grad_input);
Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
Tensor & _thnn_rrelu_with_noise_forward_out(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator, Tensor & output);
Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator);
Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training);
Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator);
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output, Tensor & columns, Tensor & ones);
std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const optional<Tensor> & bias, IntArrayRef stride, IntArrayRef padding);
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones);
106 changes: 0 additions & 106 deletions aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
@@ -1285,112 +1285,6 @@ Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & sel
}
return grad_input;
}
Tensor & _thnn_rrelu_with_noise_forward_out(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator, Tensor & output) {
const OptionalDeviceGuard device_guard(device_of(self));
auto dispatch_scalar_type = infer_scalar_type(self);

switch (dispatch_scalar_type) {
case ScalarType::Double: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
case ScalarType::Float: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
case ScalarType::Half: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_rrelu_with_noise_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
default:
AT_ERROR("_thnn_rrelu_with_noise_forward_out not supported on CUDAType for ", dispatch_scalar_type);
}
return output;
}
Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator) {
const OptionalDeviceGuard device_guard(device_of(self));
auto dispatch_scalar_type = infer_scalar_type(self);
auto output_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
auto output = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(output_));
switch (dispatch_scalar_type) {
case ScalarType::Double: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
case ScalarType::Float: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
case ScalarType::Half: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, output_, noise_, lower_, upper_, training, false, generator);
break;
}
default:
AT_ERROR("_thnn_rrelu_with_noise_forward not supported on CUDAType for ", dispatch_scalar_type);
}
return output;
}
Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator) {
const OptionalDeviceGuard device_guard(device_of(self));
auto dispatch_scalar_type = infer_scalar_type(self);

switch (dispatch_scalar_type) {
case ScalarType::Double: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaDoubleRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator);
break;
}
case ScalarType::Float: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator);
break;
}
case ScalarType::Half: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto noise_ = checked_dense_tensor_unwrap(noise, "noise", 2, "_thnn_rrelu_with_noise_forward_", false, DeviceType::CUDA, dispatch_scalar_type);
auto lower_ = lower.toDouble();
auto upper_ = upper.toDouble();
THNN_CudaHalfRReLU_updateOutput(globalContext().getTHCState(), self_, self_, noise_, lower_, upper_, training, true, generator);
break;
}
default:
AT_ERROR("_thnn_rrelu_with_noise_forward_ not supported on CUDAType for ", dispatch_scalar_type);
}
return self;
}
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output, Tensor & columns, Tensor & ones) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
177 changes: 177 additions & 0 deletions aten/src/ATen/native/cuda/Activation.cu
@@ -8,13 +8,17 @@

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/CUDAGeneratorImpl.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/Array.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/OffsetCalculator.cuh>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/cuda/DistributionTemplates.h>
#include <c10/cuda/CUDAMathCompat.h>

namespace at {
@@ -243,6 +247,179 @@ std::tuple<Tensor, Tensor> prelu_backward_cuda(const Tensor& grad_out_, const Te
return std::tuple<Tensor, Tensor>{input_grad, weight_grad};
}

// -----------------------------------
// rrelu
// -----------------------------------
template <typename scalar_t, int unroll_factor, typename F>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
C10_LAUNCH_BOUNDS_2(256, 4)
#endif
__global__ void rrelu_with_noise_cuda_kernel(
int numel,
PhiloxCudaState philox_args,
scalar_t* output,
scalar_t* input,
scalar_t* noise,
double lower,
double upper,
const F& random_func) {
auto seeds = at::cuda::philox::unpack(philox_args);
int idx = blockIdx.x * blockDim.x + threadIdx.x;
curandStatePhilox4_32_10_t state;
curand_init(std::get<0>(seeds),
idx,
std::get<1>(seeds),
&state);

int grid_stride = blockDim.x * gridDim.x * unroll_factor;
int rounded_size = ((numel - 1) / grid_stride + 1) * grid_stride;
double range = upper - lower;

for (int linear_index = idx; linear_index < rounded_size; linear_index += grid_stride) {
auto rand = random_func(&state);

// ensure that (&rand.x)[ii] is safe
static_assert(sizeof(rand)/sizeof(rand.x) == unroll_factor, "");

#pragma unroll
for (int ii = 0; ii < unroll_factor; ii++) {
int li = linear_index + blockDim.x * gridDim.x * ii;
if (li >= numel) {
continue;
}
scalar_t r = static_cast<scalar_t>((&rand.x)[ii]);
r = r * range + lower;
if (input[li] <= 0) {
output[li] = input[li] * r;
noise[li] = r;
} else {
output[li] = input[li];
noise[li] = static_cast<scalar_t>(0);
}
}
__syncthreads();
}
}

template <typename scalar_t>
inline void _rrelu_with_noise_cuda_train(
Tensor& output,
const Tensor& input_,
const Tensor& noise_,
const Scalar& lower_,
const Scalar& upper_,
c10::optional<Generator> generator) {
auto input = input_.contiguous();
auto noise = noise_.contiguous();
Tensor tmp_output = output.contiguous();

int64_t numel = input.numel();
auto execution_policy = calc_execution_policy(numel);

auto counter_offset = std::get<0>(execution_policy);
auto grid = std::get<1>(execution_policy);
auto block = std::get<2>(execution_policy);

auto gen = get_generator_or_default<CUDAGeneratorImpl>(
generator, cuda::detail::getDefaultCUDAGenerator());
PhiloxCudaState rng_engine_inputs;
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
rng_engine_inputs = gen->philox_cuda_state(counter_offset);
}

scalar_t* input_data = input.data_ptr<scalar_t>();
scalar_t* noise_data = noise.data_ptr<scalar_t>();
scalar_t* output_data = tmp_output.data_ptr<scalar_t>();

double lower = lower_.to<double>();
double upper = upper_.to<double>();

auto stream = at::cuda::getCurrentCUDAStream();

if (std::is_same<scalar_t, double>::value) {
rrelu_with_noise_cuda_kernel<scalar_t, 2><<<grid, block, 0, stream>>>(
numel,
rng_engine_inputs,
output_data,
input_data,
noise_data,
lower,
upper,
[] __device__ (curandStatePhilox4_32_10_t* state) {
return curand_uniform2_double(state);
});
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
// half and float
rrelu_with_noise_cuda_kernel<scalar_t, 4><<<grid, block, 0, stream>>>(
numel,
rng_engine_inputs,
output_data,
input_data,
noise_data,
lower, upper,
[] __device__ (curandStatePhilox4_32_10_t* state) {
return curand_uniform4(state);
});
C10_CUDA_KERNEL_LAUNCH_CHECK();
}

if (!output.is_contiguous()) {
output.copy_(tmp_output);
}
}

Tensor& rrelu_with_noise_out_cuda(const Tensor& self,
const Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
c10::optional<Generator> generator,
Tensor& output) {
TensorArg self_arg{self, "self", 1}, noise_arg{noise, "noise", 2},
output_arg{output, "output", 3};
checkAllSameGPU("rrelu_with_noise_out_cuda", {self_arg, noise_arg, output_arg});

if (training) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
self.scalar_type(), "rrelu_with_noise_out_cuda", [&] {
_rrelu_with_noise_cuda_train<scalar_t>(
output, self, noise, lower, upper, generator);
});
}
else {
auto lower_tensor = lower.to<double>();
auto upper_tensor = upper.to<double>();
Scalar negative_slope = (lower_tensor + upper_tensor) / 2;
at::leaky_relu_out(output, self, negative_slope);
}
return output;
}

Tensor rrelu_with_noise_cuda(
const Tensor& self,
const Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
c10::optional<Generator> generator) {
Tensor output = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
return at::native::rrelu_with_noise_out_cuda(self, noise, lower, upper, training, generator, output);
}

Tensor& rrelu_with_noise_cuda_(
Tensor& self,
const Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
c10::optional<Generator> generator) {
return at::native::rrelu_with_noise_out_cuda(
self, noise, lower, upper, training, generator, self);
}

// -----------------------------------
// hardshrink
// -----------------------------------
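For context (not part of the diff): the kernels added above back the CUDA path of rrelu_with_noise, which torch.nn.functional.rrelu should route to for CUDA tensors. During training, each negative element is scaled by a slope drawn uniformly from [lower, upper] and recorded in the noise tensor; in eval mode the code falls back to leaky_relu with the fixed slope (lower + upper) / 2. A minimal sketch, assuming a CUDA device is available:

import torch
import torch.nn.functional as F

x = torch.randn(8, device="cuda")

# Training mode: negative inputs are multiplied by a random slope in
# [1/8, 1/3] (the functional's default range), exercising the kernel above.
y_train = F.rrelu(x, lower=1.0 / 8, upper=1.0 / 3, training=True)

# Eval mode: deterministic, equivalent to leaky_relu with
# negative_slope = (lower + upper) / 2, mirroring the fallback in
# rrelu_with_noise_out_cuda.
y_eval = F.rrelu(x, lower=1.0 / 8, upper=1.0 / 3, training=False)
expected = F.leaky_relu(x, negative_slope=(1.0 / 8 + 1.0 / 3) / 2)
torch.testing.assert_close(y_eval, expected)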
4 changes: 4 additions & 0 deletions aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm
@@ -118,6 +118,10 @@ - (void)endSynchronization:(NSError*)error {
TORCH_CHECK(_buffer, "Allocate GPU memory failed!");
}
copyToMetalBuffer(_commandBuffer, _buffer, _image);
if (_image.isTemporaryImage && _image.readCount != 0) {
_image =
createStaticImage((MPSTemporaryImage*)_image, _commandBuffer, false);
}
}

void MPSImageWrapper::synchronize() {
30 changes: 18 additions & 12 deletions aten/src/ATen/native/metal/ops/MetalReshape.mm
@@ -32,18 +32,18 @@ Tensor view(const Tensor& input, IntArrayRef size) {
MetalTensorImplStorage mt{inferred_size, stride_value};
mt.texture()->allocateTemporaryStorage(inferred_size, commandBuffer);
MPSImage* Y = mt.texture()->image();
id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
specializedPipelineState:"reshape"
Constants:@[
@(Y.height),
@(Y.width),
@(Y.featureChannels),
@(Y.numberOfImages),
@(X.height),
@(X.width),
@(X.featureChannels),
@(X.numberOfImages),
]];
id<MTLComputePipelineState> state =
[[MPSCNNContext sharedInstance] specializedPipelineState:"reshape"
Constants:@[
@(Y.height),
@(Y.width),
@(Y.featureChannels),
@(Y.numberOfImages),
@(X.height),
@(X.width),
@(X.featureChannels),
@(X.numberOfImages),
]];
id<MTLComputeCommandEncoder> encoder =
[commandBuffer.buffer computeCommandEncoder];
[encoder setComputePipelineState:state];
@@ -95,7 +95,13 @@ Tensor flatten_using_ints(
return input.reshape(shape);
}

Tensor detach(const Tensor& input) {
TORCH_CHECK(input.is_metal());
return input;
}

TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl("detach", TORCH_FN(detach));
m.impl("view", TORCH_FN(view));
m.impl("reshape", TORCH_FN(reshape));
m.impl("flatten.using_ints", TORCH_FN(flatten_using_ints));
