Add missing period as suggested by svekars on "Fix torch.bucketize docs for "right""


The docs correctly (i.e., matching the actual op behavior) state that

`right = False` means `boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`.

However, they previously stated that
`If 'right' is False (default), then the left boundary is closed.`

which contradicts the `boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]` statement.

This modifies the docs to say `... then the left boundary is OPEN.` and also clarifies that this is the opposite of numpy.digitize's behavior.
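For illustration (not part of this commit), a minimal Python sketch of the convention the docs now describe, and how it differs from numpy.digitize; the boundary values are arbitrary examples:

```python
import numpy as np
import torch

boundaries = torch.tensor([1, 3, 5])
x = torch.tensor([1, 3, 5])

# right=False (default): boundaries[i-1] < x <= boundaries[i], i.e. the left
# boundary is OPEN, so a value equal to a boundary falls into the bucket that
# ends at that boundary.
torch.bucketize(x, boundaries)                  # tensor([0, 1, 2])

# numpy.digitize with right=False uses the opposite convention
# (bins[i-1] <= x < bins[i]), so the same values land one bucket later.
np.digitize(x.numpy(), boundaries.numpy())      # array([1, 2, 3])
```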

Fixes #91580

[ghstack-poisoned]
davidberard98 committed Aug 16, 2023 · 2 parents febb5e8 + 033b8d5 · commit 9159edc
Showing 58 changed files with 1,131 additions and 609 deletions.
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
f2b6f43a85452fe47eaa042ce684183add17fcac
498b9c8662e2322615748aafc321ad4a5bc02afb
1 change: 1 addition & 0 deletions .github/merge_rules.yaml
@@ -10,6 +10,7 @@
- scripts/onnx/**
- test/onnx/**
- tools/onnx/**
- torch/_dynamo/backends/onnxrt.py
- torch/_C/__init__.pyi.in
- torch/_C/_onnx.pyi
- torch/csrc/jit/passes/onnx.*
5 changes: 3 additions & 2 deletions CODEOWNERS
@@ -57,11 +57,12 @@ nn/qat/ @jerryzh168
/torch/testing/_internal/distributed @mrshenli @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol @fegin @fduwjj

# ONNX Export
/torch/_dynamo/backends/onnxrt.py @bowenbao @abock @thiagocrepaldi @wschin
/torch/csrc/jit/passes/onnx.h @bowenbao @abock @thiagocrepaldi
/torch/csrc/jit/passes/onnx.cpp @bowenbao @abock @thiagocrepaldi
/torch/csrc/jit/passes/onnx/ @bowenbao @abock @thiagocrepaldi
/torch/onnx/ @bowenbao @abock @thiagocrepaldi
/test/onnx/ @bowenbao @abock @thiagocrepaldi
/torch/onnx/ @bowenbao @abock @thiagocrepaldi @wschin
/test/onnx/ @bowenbao @abock @thiagocrepaldi @wschin

# Docker
/.ci/docker/ @jeffdaily
111 changes: 1 addition & 110 deletions aten/src/ATen/cuda/CUDABlas.cpp
@@ -5,7 +5,6 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/CUDADataType.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/macros/Export.h>
#include <c10/util/irange.h>
@@ -197,6 +196,7 @@ static size_t _getWorkspaceSize() {
static size_t workspace_size = _parseChosenWorkspaceSize();
return workspace_size;
}

} // anonymous namespace

namespace at::cuda::blas {
Expand Down Expand Up @@ -876,115 +876,6 @@ template void gemm_and_bias(
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);

void scaled_gemm(
char transa,
char transb,
int64_t m,
int64_t n,
int64_t k,
const void* mat1_ptr,
const void* mat1_scale_ptr,
int64_t mat1_ld,
ScalarType mat1_dtype,
const void* mat2_ptr,
const void* mat2_scale_ptr,
int64_t mat2_ld,
ScalarType mat2_dtype,
const void* bias_ptr,
ScalarType bias_dtype,
void* result_ptr,
const void *result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr) {
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa));
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr);
CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't');
CuBlasLtMatrixLayout Bdesc(ScalarTypeToCudaDataType(mat2_dtype), k, n, mat2_ld, transb == 't');
CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(bias_dtype), m, n, result_ld);
CuBlasLtMatrixLayout Ddesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld);
if (bias_ptr) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_RELU_BIAS);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
}
size_t workspaceSize = _getWorkspaceSize();
auto workspace = at::empty(
{static_cast<int64_t>(workspaceSize)},
at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte));

CuBlasLtMatmulPreference preference;
preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize);
cublasLtMatmulHeuristicResult_t heuristicResult = {};
int returnedResult = 0;
cublasLtHandle_t ltHandle =
reinterpret_cast<cublasLtHandle_t>(at::cuda::getCurrentCUDABlasHandle());
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
Bdesc.descriptor(),
Cdesc.descriptor(),
Ddesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
if (returnedResult == 0) {
TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
}
float alpha_val = 1.0;
float beta_val = 0.0;
cublasStatus_t cublasStatus = cublasLtMatmul(
ltHandle,
computeDesc.descriptor(),
&alpha_val,
mat1_ptr,
Adesc.descriptor(),
mat2_ptr,
Bdesc.descriptor(),
&beta_val,
nullptr,
Cdesc.descriptor(),
result_ptr,
Ddesc.descriptor(),
&heuristicResult.algo,
workspace.data_ptr(),
workspaceSize,
at::cuda::getCurrentCUDAStream());
TORCH_CHECK(
cublasStatus == CUBLAS_STATUS_SUCCESS,
"CUDA error: ",
at::cuda::blas::_cublasGetErrorEnum(cublasStatus),
" when calling cublasLtMatmul with transpose_mat1 ",
transa,
" transpose_mat2 ",
transb,
" m ",
m,
" n ",
n,
" k ",
k,
" mat1_ld ",
mat1_ld,
" mat2_ld ",
mat2_ld,
" result_ld ",
result_ld,
" computeType ",
computeType,
" scaleType ",
scaleType);
}

void int8_gemm(
bool transpose_mat1,
bool transpose_mat2,
22 changes: 0 additions & 22 deletions aten/src/ATen/cuda/CUDABlas.h
@@ -100,28 +100,6 @@ void int8_gemm(
int64_t mat2_ld,
int32_t* result_ptr,
int64_t result_ld);

void scaled_gemm(
char transa,
char transb,
int64_t m,
int64_t n,
int64_t k,
const void* mat1_ptr,
const void* mat1_scale_ptr,
int64_t mat1_ld,
ScalarType mat1_dtype,
const void* mat2_ptr,
const void* mat2_scale_ptr,
int64_t mat2_ld,
ScalarType mat2_dtype,
const void* bias,
ScalarType bias_dtype,
void* result_ptr,
const void* result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr);
#endif

#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
3 changes: 2 additions & 1 deletion aten/src/ATen/miopen/Descriptors.h
@@ -111,9 +111,10 @@ struct ConvolutionDescriptor
&miopenCreateConvolutionDescriptor,
&miopenDestroyConvolutionDescriptor>
{
void set(miopenDataType_t dataType, miopenConvolutionMode_t c_mode, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups) {
void set(miopenDataType_t dataType, miopenConvolutionMode_t c_mode, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups, bool deterministic) {
MIOPEN_CHECK(miopenInitConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale, c_mode));
MIOPEN_CHECK(miopenSetConvolutionGroupCount(mut_desc(), groups));
MIOPEN_CHECK(miopenSetConvolutionAttribute(mut_desc(), MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC, deterministic ? 1 : 0));
}
};

2 changes: 1 addition & 1 deletion aten/src/ATen/native/BinaryOps.cpp
@@ -1393,7 +1393,7 @@ Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& othe

template <typename OutImpl>
Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) {
Tensor result = at::empty({0}, self.options().dtype(kBool));
Tensor result = at::empty(self.sizes(), self.options().dtype(kBool).device(self.device()));
return out_impl(result, self, other);
}

111 changes: 0 additions & 111 deletions aten/src/ATen/native/cuda/Blas.cpp
@@ -15,7 +15,6 @@
#else
#include <ATen/ops/_addmm_activation_native.h>
#include <ATen/ops/_efficientzerotensor.h>
#include <ATen/ops/_scaled_mm_native.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/addmv_native.h>
#include <ATen/ops/baddbmm_native.h>
@@ -714,114 +713,4 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
return _int_mm_out_cuda(self, mat2, result);
}

// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
// Scales are only applicable when matrices are of Float8 type and assumbed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed.
// Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
std::tuple<Tensor&, Tensor&>
_scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
const c10::optional<at::Tensor>& bias,
c10::optional<c10::ScalarType> out_dtype,
const c10::optional<at::Tensor>& scale_a,
const c10::optional<at::Tensor>& scale_b,
const c10::optional<at::Tensor>& scale_result,
Tensor& out, Tensor& amax) {
// Check sizes
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
TORCH_CHECK(
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_CHECK(!scale_a || (scale_a->numel() == 1 && scale_a->scalar_type() == kFloat),
"scale_a must be float scalar");
TORCH_CHECK(!scale_b || (scale_b->numel() == 1 && scale_b->scalar_type() == kFloat),
"scale_b must be a float scalar");
TORCH_CHECK(!scale_result || (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat),
"scale_result must be a float scalar");
TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
" but got ", bias->numel());
TORCH_CHECK(mat1.sizes()[0] % 16 == 0 && mat1.sizes()[1] % 16 == 0, "mat1 shape (", mat1.sizes()[0], "x",
mat1.sizes()[1], " must be divisible by 16");
TORCH_CHECK(mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0, "mat2 shape (", mat2.sizes()[0], "x",
mat2.sizes()[1], " must be divisible by 16");
// Check types
TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
TORCH_CHECK(amax.scalar_type() == kFloat, "amax must be a float scalar");
TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat1.scalar_type());
// Type restrictions imposed by CuBLASLt as of CUDA-12.1
TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2,
"Multiplication of two Float8_e5m2 matrices is not supported");
if (bias) {
TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half,
"Bias must be either Half or BFloat16, but got ", bias->scalar_type());
TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
auto scale_a_ = scale_a.value_or(Tensor());
auto scale_b_ = scale_b.value_or(Tensor());
auto scale_result_ = scale_result.value_or(Tensor());
TensorArg targs[]{{out, "out", 0}, {amax, "amax", 1}, {mat1, "mat1", 2}, {mat2, "mat2", 3},
{bias_, "bias", 4}, {scale_a_, "scale_a", 5}, {scale_b_, "scale_b", 6},
{scale_result_, "scale_result", 7}};
checkAllSameGPU(__func__, targs);
}

IntArrayRef mat1_sizes = mat1.sizes();
IntArrayRef mat2_sizes = mat2.sizes();
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
at::native::resize_output(amax, {});

#if !defined(USE_ROCM) && !defined(_MSC_VER)
cublasCommonArgs args(mat1, mat2, out);
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
at::cuda::blas::scaled_gemm(
args.transa,
args.transb,
args.m,
args.n,
args.k,
args.mata->data_ptr(),
scale_a ? scale_a->data_ptr() : nullptr,
args.lda,
args.mata->scalar_type(),
args.matb->data_ptr(),
scale_b ? scale_b->data_ptr() : nullptr,
args.ldb,
args.matb->scalar_type(),
bias ? bias->data_ptr(): nullptr,
bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
args.result->data_ptr(),
scale_result ? scale_result->data_ptr() : nullptr,
args.result_ld,
out_dtype_,
amax.data_ptr());
#else
TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform.");
#endif

return {out, amax};
}

std::tuple<Tensor, Tensor>
_scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
const c10::optional<at::Tensor>& bias,
c10::optional<c10::ScalarType> out_dtype,
const c10::optional<at::Tensor>& scale_a,
const c10::optional<at::Tensor>& scale_b,
const c10::optional<at::Tensor>& scale_result) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
Tensor amax = at::empty({0}, mat_a.options().dtype(ScalarType::Float));
return _scaled_mm_out_cuda(mat_a, mat_b, bias, out_dtype, scale_a, scale_b, scale_result, out ,amax);
}

} // namespace at::native
14 changes: 7 additions & 7 deletions aten/src/ATen/native/miopen/Conv_miopen.cpp
@@ -719,7 +719,7 @@ void raw_miopen_convolution_forward_out(
args.idesc.set(input);
args.wdesc.set(weight, input.suggest_memory_format(), 0);
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvFwdAlgorithm_t fwdAlg;
@@ -827,7 +827,7 @@ void raw_miopen_depthwise_convolution_forward_out(
args.idesc.set(input);
args.wdesc.set(weight, input.suggest_memory_format(), 0);
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvFwdAlgorithm_t fwdAlg;
@@ -982,7 +982,7 @@ void raw_miopen_convolution_backward_weight_out(
args.idesc.set(input);
args.wdesc.set(grad_weight, input.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvBwdWeightsAlgorithm_t bwdFilterAlg;
@@ -1026,7 +1026,7 @@ void raw_miopen_depthwise_convolution_backward_weight_out(
args.idesc.set(input);
args.wdesc.set(grad_weight, input.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvBwdWeightsAlgorithm_t bwdFilterAlg;
@@ -1231,7 +1231,7 @@ void raw_miopen_convolution_backward_input_out(
args.idesc.set(grad_input);
args.wdesc.set(weight, grad_output.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvBwdDataAlgorithm_t bwdDataAlg;
@@ -1340,7 +1340,7 @@ void raw_miopen_depthwise_convolution_backward_input_out(
args.idesc.set(grad_input);
args.wdesc.set(weight, grad_output.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvBwdDataAlgorithm_t bwdDataAlg;
@@ -1502,7 +1502,7 @@ void raw_miopen_convolution_relu_out(
args.idesc.set(input);
args.wdesc.set(weight, input.suggest_memory_format(), 0);
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

TensorDescriptor bdesc;
bdesc.set(bias.expand({1, bias.size(0)}), output.dim());
