Skip to content

Commit

Permalink
Merge branch 'pytorch:main' into batching-rules-matrix_exp
Browse files Browse the repository at this point in the history
  • Loading branch information
Xiao215 committed Jan 19, 2024
2 parents 736249d + f316c35 commit 9444380
Show file tree
Hide file tree
Showing 44 changed files with 1,221 additions and 461 deletions.
2 changes: 0 additions & 2 deletions .ci/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,6 @@ test_dynamo_shard() {
--exclude-inductor-tests \
--exclude-jit-executor \
--exclude-distributed-tests \
--exclude \
test_jit \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
assert_git_not_dirty
Expand Down
3 changes: 2 additions & 1 deletion .github/scripts/generate_ci_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,8 @@ class OperatingSystem:
BinaryBuildWorkflow(
os=OperatingSystem.MACOS_ARM64,
package_type="conda",
cross_compile_arm64=True,
cross_compile_arm64=False,
macos_runner="macos-13-xlarge",
build_configs=generate_binary_build_matrix.generate_conda_matrix(
OperatingSystem.MACOS_ARM64
),
Expand Down
14 changes: 13 additions & 1 deletion .github/workflows/docker-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ on:
branches:
- nightly
tags:
# We want to run this build on the final release tag
# Final Release tags look like: v1.11.0
- v[0-9]+.[0-9]+.[0-9]+
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
- ciflow/nightly/*

concurrency:
Expand Down Expand Up @@ -101,6 +103,16 @@ jobs:
echo "${RUNNER_TEMP}/bin" >> "${GITHUB_PATH}"
# Generate PyTorch version to use
echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py --no-build-suffix)" >> "${GITHUB_ENV}"
- name: Setup test specific variables
if: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
run: |
if [[ ${{ github.event.ref }} =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+-rc[0-9]+$ ]]; then
{
echo "DOCKER_IMAGE=pytorch-test";
echo "INSTALL_CHANNEL=pytorch-test";
echo "TRITON_VERSION=$(cut -f 1 .ci/docker/triton_version.txt)";
} >> "${GITHUB_ENV}"
fi
- name: Setup nightly specific variables
if: ${{ github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/ciflow/nightly/') }}
run: |
Expand Down
12 changes: 5 additions & 7 deletions .github/workflows/generated-macos-arm64-binary-conda-nightly.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 13 additions & 13 deletions aten/src/ATen/native/Loss.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <ATen/ops/kl_div_native.h>
#include <ATen/ops/l1_loss_native.h>
#include <ATen/ops/log.h>
#include <ATen/ops/log_sigmoid.h>
#include <ATen/ops/margin_ranking_loss_native.h>
#include <ATen/ops/mean.h>
#include <ATen/ops/min.h>
Expand Down Expand Up @@ -358,21 +359,20 @@ Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& targe
c10::MaybeOwned<Tensor> pos_weight_maybe_owned = at::borrow_from_optional_tensor(pos_weight_opt);
const Tensor& pos_weight = *pos_weight_maybe_owned;

Tensor loss;
auto max_val = (-input).clamp_min_(0);
if (pos_weight.defined()) {
// pos_weight needs to be broadcast, thus mul(target) is not done in-place.
auto log_weight = (pos_weight - 1).mul(target).add_(1);
loss = (1 - target).mul_(input).add_(log_weight.mul_(((-max_val).exp_().add_((-input - max_val).exp_())).log_().add_(max_val)));
} else {
loss = (1 - target).mul_(input).add_(max_val).add_((-max_val).exp_().add_((-input -max_val).exp_()).log_());
}
Tensor loss;
if (pos_weight.defined()) {
// pos_weight needs to be broadcast, thus mul(target) is not done in-place.
auto log_weight = (pos_weight - 1).mul(target).add_(1);
loss = (1 - target).mul_(input).sub_(log_weight.mul_(at::log_sigmoid(input)));
} else {
loss = (1 - target).mul_(input).sub_(at::log_sigmoid(input));
}

if (weight.defined()) {
loss.mul_(weight);
}
if (weight.defined()) {
loss.mul_(weight);
}

return apply_loss_reduction(loss, reduction);
return apply_loss_reduction(loss, reduction);
}

Tensor poisson_nll_loss(const Tensor& input, const Tensor& target, const bool log_input, const bool full, const double eps, const int64_t reduction)
Expand Down
14 changes: 14 additions & 0 deletions aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,20 @@ static void apply_triangular_solve_batched(const Tensor& A, const Tensor& B, boo
}

void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
// Workaround for the following bug on CUDA < 12.1:
// RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasStrsmBatched`
// See https://github.com/pytorch/pytorch/issues/79191#issuecomment-1154222580
#if defined(CUSOLVER_VERSION) && CUSOLVER_VERSION < 12100
constexpr auto max_batch_size = 524280;
if (B.size(-1) > max_batch_size) {
auto n_chunks = (B.size(-1) + max_batch_size - 1) / max_batch_size; // ceildiv
auto splits = B.split(n_chunks, /*dim=*/-1);
for (const Tensor& b : splits) {
triangular_solve_batched_cublas(A, b, left, upper, transpose, unitriangular);
}
return;
}
#endif
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
apply_triangular_solve_batched<scalar_t>(A, B, left, upper, transpose, unitriangular);
});
Expand Down
6 changes: 4 additions & 2 deletions aten/src/ATen/native/quantized/cpu/qconv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1658,6 +1658,8 @@ static at::Tensor _quantized_convolution_onednn(
auto upper_bound_value =
unary_scalars[1].get().toOptional<at::Scalar>().value().to<float>();
op_attr = ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value);
} else if (has_unary_post_op && unary_attr.value()=="hardswish") {
op_attr = ideep::attr_t::fuse_hardswish();
} else {
op_attr = ideep::attr_t();
}
Expand Down Expand Up @@ -1851,8 +1853,8 @@ class QConvoneDNN final {
} else {
// Conv2D post op check
TORCH_CHECK(
attr == "none" || attr == "relu" || attr == "hardtanh",
"none post_op or post_op relu/hardtanh is supported for quantized pointwise conv2d. Got unary_post_op: ",
attr == "none" || attr == "relu" || attr == "hardtanh" || attr == "hardswish",
"none post_op or post_op relu/hardtanh/hardswish is supported for quantized pointwise conv2d. Got unary_post_op: ",
attr,
".")
}
Expand Down

0 comments on commit 9444380

Please sign in to comment.