Add missing period as suggested by svekars on "Fix torch.bucketize docs for "right""


The docs correctly (i.e., matching the actual op behavior) state that

`right = False` means `boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`.

However, they previously stated that
`If 'right' is False (default), then the left boundary is closed.`

which contradicts the `boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]` statement.

This modifies the docs to say `... then the left boundary is OPEN.` and also clarifies that this is the opposite of numpy.digitize's behavior.
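For illustration (not part of this commit), a minimal Python sketch of the convention the docs now describe, and how it differs from numpy.digitize; the boundary values are arbitrary examples:

```python
import numpy as np
import torch

boundaries = torch.tensor([1, 3, 5])
x = torch.tensor([1, 3, 5])

# right=False (default): boundaries[i-1] < x <= boundaries[i], i.e. the left
# boundary is OPEN, so a value equal to a boundary falls into the bucket that
# ends at that boundary.
torch.bucketize(x, boundaries)                  # tensor([0, 1, 2])

# numpy.digitize with right=False uses the opposite convention
# (bins[i-1] <= x < bins[i]), so the same values land one bucket later.
np.digitize(x.numpy(), boundaries.numpy())      # array([1, 2, 3])
```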

Fixes #91580

[ghstack-poisoned]
davidberard98 committed Aug 16, 2023 · 2 parents febb5e8 + 033b8d5 · commit 9159edc
Showing 58 changed files with 1,131 additions and 609 deletions.
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
f2b6f43a85452fe47eaa042ce684183add17fcac
498b9c8662e2322615748aafc321ad4a5bc02afb
1 change: 1 addition & 0 deletions .github/merge_rules.yaml
@@ -10,6 +10,7 @@
- scripts/onnx/**
- test/onnx/**
- tools/onnx/**
- torch/_dynamo/backends/onnxrt.py
- torch/_C/__init__.pyi.in
- torch/_C/_onnx.pyi
- torch/csrc/jit/passes/onnx.*
5 changes: 3 additions & 2 deletions CODEOWNERS
@@ -57,11 +57,12 @@ nn/qat/ @jerryzh168
/torch/testing/_internal/distributed @mrshenli @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol @fegin @fduwjj

# ONNX Export
/torch/_dynamo/backends/onnxrt.py @bowenbao @abock @thiagocrepaldi @wschin
/torch/csrc/jit/passes/onnx.h @bowenbao @abock @thiagocrepaldi
/torch/csrc/jit/passes/onnx.cpp @bowenbao @abock @thiagocrepaldi
/torch/csrc/jit/passes/onnx/ @bowenbao @abock @thiagocrepaldi
/torch/onnx/ @bowenbao @abock @thiagocrepaldi
/test/onnx/ @bowenbao @abock @thiagocrepaldi
/torch/onnx/ @bowenbao @abock @thiagocrepaldi @wschin
/test/onnx/ @bowenbao @abock @thiagocrepaldi @wschin

# Docker
/.ci/docker/ @jeffdaily
111 changes: 1 addition & 110 deletions aten/src/ATen/cuda/CUDABlas.cpp
@@ -5,7 +5,6 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/CUDADataType.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/macros/Export.h>
#include <c10/util/irange.h>
@@ -197,6 +196,7 @@ static size_t _getWorkspaceSize() {
static size_t workspace_size = _parseChosenWorkspaceSize();
return workspace_size;
}

} // anonymous namespace

namespace at::cuda::blas {
Expand Down Expand Up @@ -876,115 +876,6 @@ template void gemm_and_bias(
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);

void scaled_gemm(
char transa,
char transb,
int64_t m,
int64_t n,
int64_t k,
const void* mat1_ptr,
const void* mat1_scale_ptr,
int64_t mat1_ld,
ScalarType mat1_dtype,
const void* mat2_ptr,
const void* mat2_scale_ptr,
int64_t mat2_ld,
ScalarType mat2_dtype,
const void* bias_ptr,
ScalarType bias_dtype,
void* result_ptr,
const void *result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr) {
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa));
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr);
CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't');
CuBlasLtMatrixLayout Bdesc(ScalarTypeToCudaDataType(mat2_dtype), k, n, mat2_ld, transb == 't');
CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(bias_dtype), m, n, result_ld);
CuBlasLtMatrixLayout Ddesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld);
if (bias_ptr) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_RELU_BIAS);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
}
size_t workspaceSize = _getWorkspaceSize();
auto workspace = at::empty(
{static_cast<int64_t>(workspaceSize)},
at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte));

CuBlasLtMatmulPreference preference;
preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize);
cublasLtMatmulHeuristicResult_t heuristicResult = {};
int returnedResult = 0;
cublasLtHandle_t ltHandle =
reinterpret_cast<cublasLtHandle_t>(at::cuda::getCurrentCUDABlasHandle());
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
Bdesc.descriptor(),
Cdesc.descriptor(),
Ddesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
if (returnedResult == 0) {
TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
}
float alpha_val = 1.0;
float beta_val = 0.0;
cublasStatus_t cublasStatus = cublasLtMatmul(
ltHandle,
computeDesc.descriptor(),
&alpha_val,
mat1_ptr,
Adesc.descriptor(),
mat2_ptr,
Bdesc.descriptor(),
&beta_val,
nullptr,
Cdesc.descriptor(),
result_ptr,
Ddesc.descriptor(),
&heuristicResult.algo,
workspace.data_ptr(),
workspaceSize,
at::cuda::getCurrentCUDAStream());
TORCH_CHECK(
cublasStatus == CUBLAS_STATUS_SUCCESS,
"CUDA error: ",
at::cuda::blas::_cublasGetErrorEnum(cublasStatus),
" when calling cublasLtMatmul with transpose_mat1 ",
transa,
" transpose_mat2 ",
transb,
" m ",
m,
" n ",
n,
" k ",
k,
" mat1_ld ",
mat1_ld,
" mat2_ld ",
mat2_ld,
" result_ld ",
result_ld,
" computeType ",
computeType,
" scaleType ",
scaleType);
}

void int8_gemm(
bool transpose_mat1,
bool transpose_mat2,
22 changes: 0 additions & 22 deletions aten/src/ATen/cuda/CUDABlas.h
@@ -100,28 +100,6 @@ void int8_gemm(
int64_t mat2_ld,
int32_t* result_ptr,
int64_t result_ld);

void scaled_gemm(
char transa,
char transb,
int64_t m,
int64_t n,
int64_t k,
const void* mat1_ptr,
const void* mat1_scale_ptr,
int64_t mat1_ld,
ScalarType mat1_dtype,
const void* mat2_ptr,
const void* mat2_scale_ptr,
int64_t mat2_ld,
ScalarType mat2_dtype,
const void* bias,
ScalarType bias_dtype,
void* result_ptr,
const void* result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr);
#endif

#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
3 changes: 2 additions & 1 deletion aten/src/ATen/miopen/Descriptors.h
@@ -111,9 +111,10 @@ struct ConvolutionDescriptor
&miopenCreateConvolutionDescriptor,
&miopenDestroyConvolutionDescriptor>
{
void set(miopenDataType_t dataType, miopenConvolutionMode_t c_mode, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups) {
void set(miopenDataType_t dataType, miopenConvolutionMode_t c_mode, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups, bool deterministic) {
MIOPEN_CHECK(miopenInitConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale, c_mode));
MIOPEN_CHECK(miopenSetConvolutionGroupCount(mut_desc(), groups));
MIOPEN_CHECK(miopenSetConvolutionAttribute(mut_desc(), MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC, deterministic ? 1 : 0));
}
};

2 changes: 1 addition & 1 deletion aten/src/ATen/native/BinaryOps.cpp
@@ -1393,7 +1393,7 @@ Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& othe

template <typename OutImpl>
Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) {
Tensor result = at::empty({0}, self.options().dtype(kBool));
Tensor result = at::empty(self.sizes(), self.options().dtype(kBool).device(self.device()));
return out_impl(result, self, other);
}

111 changes: 0 additions & 111 deletions aten/src/ATen/native/cuda/Blas.cpp
@@ -15,7 +15,6 @@
#else
#include <ATen/ops/_addmm_activation_native.h>
#include <ATen/ops/_efficientzerotensor.h>
#include <ATen/ops/_scaled_mm_native.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/addmv_native.h>
#include <ATen/ops/baddbmm_native.h>
@@ -714,114 +713,4 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
return _int_mm_out_cuda(self, mat2, result);
}

// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
// Scales are only applicable when matrices are of Float8 type and assumbed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed.
// Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
std::tuple<Tensor&, Tensor&>
_scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
const c10::optional<at::Tensor>& bias,
c10::optional<c10::ScalarType> out_dtype,
const c10::optional<at::Tensor>& scale_a,
const c10::optional<at::Tensor>& scale_b,
const c10::optional<at::Tensor>& scale_result,
Tensor& out, Tensor& amax) {
// Check sizes
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
TORCH_CHECK(
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_CHECK(!scale_a || (scale_a->numel() == 1 && scale_a->scalar_type() == kFloat),
"scale_a must be float scalar");
TORCH_CHECK(!scale_b || (scale_b->numel() == 1 && scale_b->scalar_type() == kFloat),
"scale_b must be a float scalar");
TORCH_CHECK(!scale_result || (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat),
"scale_result must be a float scalar");
TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
" but got ", bias->numel());
TORCH_CHECK(mat1.sizes()[0] % 16 == 0 && mat1.sizes()[1] % 16 == 0, "mat1 shape (", mat1.sizes()[0], "x",
mat1.sizes()[1], " must be divisible by 16");
TORCH_CHECK(mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0, "mat2 shape (", mat2.sizes()[0], "x",
mat2.sizes()[1], " must be divisible by 16");
// Check types
TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
TORCH_CHECK(amax.scalar_type() == kFloat, "amax must be a float scalar");
TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat1.scalar_type());
// Type restrictions imposed by CuBLASLt as of CUDA-12.1
TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2,
"Multiplication of two Float8_e5m2 matrices is not supported");
if (bias) {
TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half,
"Bias must be either Half or BFloat16, but got ", bias->scalar_type());
TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
auto scale_a_ = scale_a.value_or(Tensor());
auto scale_b_ = scale_b.value_or(Tensor());
auto scale_result_ = scale_result.value_or(Tensor());
TensorArg targs[]{{out, "out", 0}, {amax, "amax", 1}, {mat1, "mat1", 2}, {mat2, "mat2", 3},
{bias_, "bias", 4}, {scale_a_, "scale_a", 5}, {scale_b_, "scale_b", 6},
{scale_result_, "scale_result", 7}};
checkAllSameGPU(__func__, targs);
}

IntArrayRef mat1_sizes = mat1.sizes();
IntArrayRef mat2_sizes = mat2.sizes();
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
at::native::resize_output(amax, {});

#if !defined(USE_ROCM) && !defined(_MSC_VER)
cublasCommonArgs args(mat1, mat2, out);
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
at::cuda::blas::scaled_gemm(
args.transa,
args.transb,
args.m,
args.n,
args.k,
args.mata->data_ptr(),
scale_a ? scale_a->data_ptr() : nullptr,
args.lda,
args.mata->scalar_type(),
args.matb->data_ptr(),
scale_b ? scale_b->data_ptr() : nullptr,
args.ldb,
args.matb->scalar_type(),
bias ? bias->data_ptr(): nullptr,
bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
args.result->data_ptr(),
scale_result ? scale_result->data_ptr() : nullptr,
args.result_ld,
out_dtype_,
amax.data_ptr());
#else
TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform.");
#endif

return {out, amax};
}

std::tuple<Tensor, Tensor>
_scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
const c10::optional<at::Tensor>& bias,
c10::optional<c10::ScalarType> out_dtype,
const c10::optional<at::Tensor>& scale_a,
const c10::optional<at::Tensor>& scale_b,
const c10::optional<at::Tensor>& scale_result) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
Tensor amax = at::empty({0}, mat_a.options().dtype(ScalarType::Float));
return _scaled_mm_out_cuda(mat_a, mat_b, bias, out_dtype, scale_a, scale_b, scale_result, out ,amax);
}

} // namespace at::native
14 changes: 7 additions & 7 deletions aten/src/ATen/native/miopen/Conv_miopen.cpp
@@ -719,7 +719,7 @@ void raw_miopen_convolution_forward_out(
args.idesc.set(input);
args.wdesc.set(weight, input.suggest_memory_format(), 0);
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvFwdAlgorithm_t fwdAlg;
@@ -827,7 +827,7 @@ void raw_miopen_depthwise_convolution_forward_out(
args.idesc.set(input);
args.wdesc.set(weight, input.suggest_memory_format(), 0);
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvFwdAlgorithm_t fwdAlg;
@@ -982,7 +982,7 @@ void raw_miopen_convolution_backward_weight_out(
args.idesc.set(input);
args.wdesc.set(grad_weight, input.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvBwdWeightsAlgorithm_t bwdFilterAlg;
@@ -1026,7 +1026,7 @@ void raw_miopen_depthwise_convolution_backward_weight_out(
args.idesc.set(input);
args.wdesc.set(grad_weight, input.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvBwdWeightsAlgorithm_t bwdFilterAlg;
@@ -1231,7 +1231,7 @@ void raw_miopen_convolution_backward_input_out(
args.idesc.set(grad_input);
args.wdesc.set(weight, grad_output.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvBwdDataAlgorithm_t bwdDataAlg;
@@ -1340,7 +1340,7 @@ void raw_miopen_depthwise_convolution_backward_input_out(
args.idesc.set(grad_input);
args.wdesc.set(weight, grad_output.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

if (benchmark) {
miopenConvBwdDataAlgorithm_t bwdDataAlg;
@@ -1502,7 +1502,7 @@ void raw_miopen_convolution_relu_out(
args.idesc.set(input);
args.wdesc.set(weight, input.suggest_memory_format(), 0);
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, deterministic);

TensorDescriptor bdesc;
bdesc.set(bias.expand({1, bias.size(0)}), output.dim());
