
Commit

Update
[ghstack-poisoned]
amjames committed Jun 18, 2024
2 parents 1c81afc + 3bd7420 commit 2fec428
Showing 185 changed files with 9,709 additions and 9,692 deletions.
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchbench.txt
@@ -1 +1 @@
d6015d42d9a1834bc7595c4bd6852562fb80b30b
0dab1dd97709096e8129f8a08115ee83f64f2194
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
@@ -26,3 +26,4 @@ retryable_workflows:
- windows-binary
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
mergebot: True
4 changes: 4 additions & 0 deletions .github/workflows/_win-build.yml
@@ -47,6 +47,9 @@ jobs:
timeout-minutes: 240
outputs:
test-matrix: ${{ steps.filter.outputs.test-matrix }}
defaults:
run:
shell: bash
steps:
# Duplicated in win-test because this MUST go before a checkout
- name: Enable git symlinks on Windows and disable fsmonitor daemon
@@ -89,6 +92,7 @@ jobs:

- name: Parse ref
id: parse-ref
shell: bash
run: python3 .github/scripts/parse_ref.py

- name: Get workflow job id
4 changes: 4 additions & 0 deletions .github/workflows/_win-test.yml
@@ -41,6 +41,9 @@ jobs:
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
defaults:
run:
shell: bash
steps:
# Duplicated in win-build because this MUST go before a checkout
- name: Enable git symlinks on Windows and disable fsmonitor daemon
@@ -224,6 +227,7 @@

- name: Parse ref
id: parse-ref
shell: bash
run: python3 .github/scripts/parse_ref.py

- name: Uninstall PyTorch
43 changes: 0 additions & 43 deletions .lintrunner.toml
@@ -1643,15 +1643,6 @@ exclude_patterns = [
'torch/linalg/__init__.py',
'torch/monitor/__init__.py',
'torch/nested/__init__.py',
'torch/nn/__init__.py',
'torch/nn/_reduction.py',
'torch/nn/backends/__init__.py',
'torch/nn/backends/thnn.py',
'torch/nn/common_types.py',
'torch/nn/cpp.py',
'torch/nn/functional.py',
'torch/nn/grad.py',
'torch/nn/init.py',
'torch/nn/intrinsic/__init__.py',
'torch/nn/intrinsic/modules/__init__.py',
'torch/nn/intrinsic/modules/fused.py',
@@ -1668,40 +1659,6 @@ exclude_patterns = [
'torch/nn/intrinsic/quantized/modules/bn_relu.py',
'torch/nn/intrinsic/quantized/modules/conv_relu.py',
'torch/nn/intrinsic/quantized/modules/linear_relu.py',
'torch/nn/modules/__init__.py',
'torch/nn/modules/_functions.py',
'torch/nn/modules/activation.py',
'torch/nn/modules/adaptive.py',
'torch/nn/modules/batchnorm.py',
'torch/nn/modules/channelshuffle.py',
'torch/nn/modules/container.py',
'torch/nn/modules/conv.py',
'torch/nn/modules/distance.py',
'torch/nn/modules/dropout.py',
'torch/nn/modules/flatten.py',
'torch/nn/modules/fold.py',
'torch/nn/modules/instancenorm.py',
'torch/nn/modules/lazy.py',
'torch/nn/modules/linear.py',
'torch/nn/modules/loss.py',
'torch/nn/modules/module.py',
'torch/nn/modules/normalization.py',
'torch/nn/modules/padding.py',
'torch/nn/modules/pixelshuffle.py',
'torch/nn/modules/pooling.py',
'torch/nn/modules/rnn.py',
'torch/nn/modules/sparse.py',
'torch/nn/modules/transformer.py',
'torch/nn/modules/upsampling.py',
'torch/nn/modules/utils.py',
'torch/nn/parallel/__init__.py',
'torch/nn/parallel/_functions.py',
'torch/nn/parallel/comm.py',
'torch/nn/parallel/data_parallel.py',
'torch/nn/parallel/parallel_apply.py',
'torch/nn/parallel/replicate.py',
'torch/nn/parallel/scatter_gather.py',
'torch/nn/parameter.py',
'torch/nn/qat/__init__.py',
'torch/nn/qat/dynamic/__init__.py',
'torch/nn/qat/dynamic/modules/__init__.py',
5 changes: 5 additions & 0 deletions Dockerfile
@@ -77,6 +77,11 @@ RUN case ${TARGETPLATFORM} in \
esac && \
/opt/conda/bin/conda clean -ya
RUN /opt/conda/bin/pip install torchelastic
RUN IS_CUDA=$(python -c 'import torch ; print(torch.cuda._is_compiled())'); \
echo "Is torch compiled with cuda: ${IS_CUDA}"; \
if test "${IS_CUDA}" != "True" -a ! -z "${CUDA_VERSION}"; then \
exit 1; \
fi

FROM ${BASE_IMAGE} as official
ARG PYTORCH_VERSION
9 changes: 7 additions & 2 deletions aten/src/ATen/FunctionalInverses.cpp
@@ -303,7 +303,7 @@ Tensor FunctionalInverses::_nested_view_from_buffer_inverse(const Tensor& base,
return Tensor();
}

Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx) {
Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const c10::optional<Tensor>& min_seqlen, const c10::optional<Tensor>& max_seqlen) {
auto values = at::_nested_get_values(mutated_view);
if (inverse_return_mode != InverseReturnMode::NeverView) {
return values;
@@ -317,7 +317,12 @@ Tensor FunctionalInverses::_nested_get_values_inverse(const Tensor& base, const
auto lengths = at::_nested_get_lengths(base);
auto ragged_idx = at::_nested_get_ragged_idx(base);
auto dummy = at::_nested_get_jagged_dummy(base);
auto nt = at::_nested_view_from_jagged(mutated_view, offsets, dummy, lengths, ragged_idx);
auto min_seqlen = at::_nested_get_min_seqlen(base);
auto max_seqlen = at::_nested_get_max_seqlen(base);
auto nt = at::_nested_view_from_jagged(
mutated_view, offsets, dummy, lengths, ragged_idx,
(min_seqlen.defined() ? c10::optional<Tensor>(min_seqlen) : c10::nullopt),
(max_seqlen.defined() ? c10::optional<Tensor>(max_seqlen) : c10::nullopt));

if (inverse_return_mode != InverseReturnMode::NeverView) {
return nt;
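The inverse above now also recovers the cached min/max sequence lengths from the base and forwards them to _nested_view_from_jagged, converting each tensor to an optional only when it is defined. A minimal standalone sketch of that defined()-to-optional idiom, using std::optional and a stand-in Tensor type rather than ATen's (all names here are illustrative only):

#include <iostream>
#include <optional>

struct Tensor {
  bool has_storage = false;
  bool defined() const { return has_storage; }
};

// Mirrors the pattern in the diff: an undefined tensor maps to nullopt.
std::optional<Tensor> to_optional(const Tensor& t) {
  return t.defined() ? std::optional<Tensor>(t) : std::nullopt;
}

int main() {
  Tensor undefined_t;       // like a default-constructed tensor from a getter
  Tensor defined_t{true};
  std::cout << to_optional(undefined_t).has_value() << "\n";  // 0
  std::cout << to_optional(defined_t).has_value() << "\n";    // 1
}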
4 changes: 4 additions & 0 deletions aten/src/ATen/MapAllocator.h
@@ -55,6 +55,10 @@ class TORCH_API MapAllocator {
return base_ptr_;
}

int flags() const {
return flags_;
}

static MapAllocator* fromDataPtr(const at::DataPtr&);
static at::DataPtr makeDataPtr(
c10::string_view filename,
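The new flags() accessor exposes the mapping mode of a map-allocated storage alongside the existing fromDataPtr lookup. A hedged usage sketch — fromDataPtr and flags are declared in this header as shown; ALLOCATOR_MAPPED_SHARED is assumed to be the shared-mapping bit defined in the same header:

#include <ATen/MapAllocator.h>
#include <iostream>

// Report whether a storage's memory came from a shared mapping.
void inspect(const at::DataPtr& ptr) {
  // fromDataPtr returns nullptr when the storage was not map-allocated.
  if (at::MapAllocator* alloc = at::MapAllocator::fromDataPtr(ptr)) {
    int f = alloc->flags();  // accessor added in this commit
    std::cout << "shared mapping: "
              << bool(f & at::ALLOCATOR_MAPPED_SHARED) << "\n";
  }
}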
1 change: 1 addition & 0 deletions aten/src/ATen/cpu/vec/vec_mask.h
@@ -259,6 +259,7 @@ VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<, ~a& b)
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator==, ~(a ^ b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>=, (a == b) | (a > b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<=, (a == b) | (a < b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator!=, (a ^ b))

#undef VEC_MASK_DEFINE_UNARY_OP_GLOBAL
#undef VEC_MASK_DEFINE_BINARY_OP_GLOBAL
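The added operator!= simply drops the complement from operator==: for lane masks whose lanes are all-ones or all-zeros, a ^ b is set exactly where the lanes differ. A scalar sanity check of that identity in plain C++ (one byte standing in for one lane, not the vectorized types):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t T = 0xFF, F = 0x00;  // one "lane": true / false
  assert(uint8_t(~(T ^ T)) == T);    // equal lanes   -> == yields true
  assert(uint8_t(T ^ F) == T);       // unequal lanes -> != yields true
  assert(uint8_t(T ^ T) == F);       // equal lanes   -> != yields false
  return 0;
}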
76 changes: 3 additions & 73 deletions aten/src/ATen/cuda/tunable/GemmCommon.h
@@ -81,8 +81,7 @@ struct GemmParams : OpParams {
}

std::string Signature() const override {
static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
return val;
return c10::str(transa, transb, "_", m, "_", n, "_", k);
}

size_t GetSize(bool duplicate_inputs) const {
@@ -144,82 +143,14 @@
bool duplicate_inputs_;
};

template <typename T>
struct GemmAndBiasParams : OpParams {
std::string Signature() const override {
static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
return val;
}

size_t GetSize(bool duplicate_inputs) const {
size_t size = sizeof(T) * ldc * n;
if (duplicate_inputs) {
size += sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
size += sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
}
return size;
}

GemmAndBiasParams* DeepCopy(bool duplicate_inputs) const {
GemmAndBiasParams* copy = new GemmAndBiasParams;
*copy = *this;
c10::DeviceIndex device = 0;
AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
size_t c_size = ldc * n * sizeof(T);
copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
if (duplicate_inputs) {
size_t a_size = sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
size_t b_size = sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
copy->duplicate_inputs_ = true;
}
return copy;
}

// only call on object returned by DeepCopy
void Delete() {
c10::cuda::CUDACachingAllocator::raw_delete(c);
if (duplicate_inputs_) {
c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
}
}

TuningStatus NumericalCheck(GemmAndBiasParams<T> *other) {
auto c_dtype = c10::CppTypeToScalarType<T>::value;
return detail::NumericalCheck(c_dtype, c, other->c, ldc*n) ? OK : FAIL;
}

char transa;
char transb;
int64_t m;
int64_t n;
int64_t k;
at::opmath_type<T> alpha;
const T* a;
int64_t lda;
const T* b;
int64_t ldb;
T* c;
int64_t ldc;
const T* bias;
at::cuda::blas::GEMMAndBiasActivationEpilogue activation;
private:
bool duplicate_inputs_;
};

template <typename T>
struct GemmStridedBatchedParams : OpParams {
GemmStridedBatchedParams() {
duplicate_inputs_ = false;
}

std::string Signature() const override {
static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch);
return val;
return c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch);
}

size_t GetSize(bool duplicate_inputs) const {
@@ -292,8 +223,7 @@ struct ScaledGemmParams : OpParams {
}

std::string Signature() const override {
static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
return val;
return c10::str(transa, transb, "_", m, "_", n, "_", k);
}

size_t GetSize(bool duplicate_inputs) const {
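The Signature() rewrites above fix a caching bug rather than a style nit: a function-local static is initialized once per process, so the old form froze the string built for the first params instance and returned it for every later instance, even with different shapes. A minimal repro of the pitfall with stand-in types (not the tunable-op code itself):

#include <iostream>
#include <string>

struct Params {
  int m;
  // Old pattern: the static is initialized on the first call only,
  // so every instance afterwards sees the first instance's value.
  std::string cached_signature() const {
    static std::string val = std::to_string(m);
    return val;
  }
  // Fixed pattern: recompute per call, as the diff now does.
  std::string signature() const { return std::to_string(m); }
};

int main() {
  Params p1{64}, p2{128};
  std::cout << p1.cached_signature() << "\n";  // "64"
  std::cout << p2.cached_signature() << "\n";  // still "64" -- stale
  std::cout << p2.signature() << "\n";         // "128"
}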