
Commit

Update
[ghstack-poisoned]
XuehaiPan committed Jun 18, 2024
2 parents cd9894a + 32e5ded commit 4573c4e
Showing 113 changed files with 1,467 additions and 1,420 deletions.
2 changes: 1 addition & 1 deletion .ci/pytorch/common_utils.sh
@@ -191,7 +191,7 @@ function clone_pytorch_xla() {
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
-git clone https://github.com/eellison/benchmark torchbench
+git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"

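Taken together, the four .ci script edits in this commit repoint every torchbench clone from the eellison fork back to upstream pytorch/benchmark, while the pin file further down moves the checked-out revision to a fixed commit. A minimal sketch of the pinned-clone pattern these scripts share, assuming get_pinned_commit simply reads .github/ci_commit_pins/<name>.txt (the file updated later in this diff):

    #!/usr/bin/env bash
    # Sketch: clone the benchmark repo and check out the pinned revision.
    set -euo pipefail

    get_pinned_commit() {
      cat .github/ci_commit_pins/"$1".txt
    }

    commit=$(get_pinned_commit torchbench)
    git clone https://github.com/pytorch/benchmark torchbench
    pushd torchbench
    git checkout "$commit"
    popd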
2 changes: 1 addition & 1 deletion .ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
@@ -9,7 +9,7 @@ test_cpu_speed_mini_sequence_labeler () {
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

-git clone https://github.com/eellison/benchmark.git
+git clone https://github.com/pytorch/benchmark.git

cd benchmark/

2 changes: 1 addition & 1 deletion .ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
@@ -9,7 +9,7 @@ test_gpu_speed_cudnn_lstm () {
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

-git clone https://github.com/eellison/benchmark.git
+git clone https://github.com/pytorch/benchmark.git

cd benchmark/

2 changes: 1 addition & 1 deletion .ci/pytorch/perf_test/test_gpu_speed_lstm.sh
@@ -9,7 +9,7 @@ test_gpu_speed_lstm () {
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

-git clone https://github.com/eellison/benchmark.git
+git clone https://github.com/pytorch/benchmark.git

cd benchmark/

2 changes: 1 addition & 1 deletion .ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
@@ -9,7 +9,7 @@ test_gpu_speed_mlstm () {
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

-git clone https://github.com/eellison/benchmark.git
+git clone https://github.com/pytorch/benchmark.git

cd benchmark/

2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchbench.txt
@@ -1 +1 @@
-pin_yolo_dep
+0dab1dd97709096e8129f8a08115ee83f64f2194
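The pin itself moves from a branch name on the fork (pin_yolo_dep) to an immutable SHA in the upstream repository. One way to confirm such a pin resolves upstream, sketched here on the assumption that the server allows fetching a commit by raw SHA (GitHub does):

    git init -q /tmp/pin-check && cd /tmp/pin-check
    git fetch -q --depth 1 https://github.com/pytorch/benchmark \
      0dab1dd97709096e8129f8a08115ee83f64f2194
    git cat-file -t FETCH_HEAD  # prints "commit" when the pin exists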
53 changes: 53 additions & 0 deletions .lintrunner.toml
@@ -1534,6 +1534,59 @@ exclude_patterns = [
'torch/linalg/__init__.py',
'torch/monitor/__init__.py',
'torch/nested/__init__.py',
+'torch/nn/intrinsic/__init__.py',
+'torch/nn/intrinsic/modules/__init__.py',
+'torch/nn/intrinsic/modules/fused.py',
+'torch/nn/intrinsic/qat/__init__.py',
+'torch/nn/intrinsic/qat/modules/__init__.py',
+'torch/nn/intrinsic/qat/modules/conv_fused.py',
+'torch/nn/intrinsic/qat/modules/linear_fused.py',
+'torch/nn/intrinsic/qat/modules/linear_relu.py',
+'torch/nn/intrinsic/quantized/__init__.py',
+'torch/nn/intrinsic/quantized/dynamic/__init__.py',
+'torch/nn/intrinsic/quantized/dynamic/modules/__init__.py',
+'torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py',
+'torch/nn/intrinsic/quantized/modules/__init__.py',
+'torch/nn/intrinsic/quantized/modules/bn_relu.py',
+'torch/nn/intrinsic/quantized/modules/conv_relu.py',
+'torch/nn/intrinsic/quantized/modules/linear_relu.py',
+'torch/nn/qat/__init__.py',
+'torch/nn/qat/dynamic/__init__.py',
+'torch/nn/qat/dynamic/modules/__init__.py',
+'torch/nn/qat/dynamic/modules/linear.py',
+'torch/nn/qat/modules/__init__.py',
+'torch/nn/qat/modules/conv.py',
+'torch/nn/qat/modules/embedding_ops.py',
+'torch/nn/qat/modules/linear.py',
+'torch/nn/quantizable/__init__.py',
+'torch/nn/quantizable/modules/__init__.py',
+'torch/nn/quantizable/modules/activation.py',
+'torch/nn/quantizable/modules/rnn.py',
+'torch/nn/quantized/__init__.py',
+'torch/nn/quantized/_reference/__init__.py',
+'torch/nn/quantized/_reference/modules/__init__.py',
+'torch/nn/quantized/_reference/modules/conv.py',
+'torch/nn/quantized/_reference/modules/linear.py',
+'torch/nn/quantized/_reference/modules/rnn.py',
+'torch/nn/quantized/_reference/modules/sparse.py',
+'torch/nn/quantized/_reference/modules/utils.py',
+'torch/nn/quantized/dynamic/__init__.py',
+'torch/nn/quantized/dynamic/modules/__init__.py',
+'torch/nn/quantized/dynamic/modules/conv.py',
+'torch/nn/quantized/dynamic/modules/linear.py',
+'torch/nn/quantized/dynamic/modules/rnn.py',
+'torch/nn/quantized/functional.py',
+'torch/nn/quantized/modules/__init__.py',
+'torch/nn/quantized/modules/activation.py',
+'torch/nn/quantized/modules/batchnorm.py',
+'torch/nn/quantized/modules/conv.py',
+'torch/nn/quantized/modules/dropout.py',
+'torch/nn/quantized/modules/embedding_ops.py',
+'torch/nn/quantized/modules/functional_modules.py',
+'torch/nn/quantized/modules/linear.py',
+'torch/nn/quantized/modules/normalization.py',
+'torch/nn/quantized/modules/rnn.py',
+'torch/nn/quantized/modules/utils.py',
'torch/signal/__init__.py',
'torch/signal/windows/__init__.py',
'torch/signal/windows/windows.py',
5 changes: 5 additions & 0 deletions Dockerfile
@@ -77,6 +77,11 @@ RUN case ${TARGETPLATFORM} in \
esac && \
/opt/conda/bin/conda clean -ya
RUN /opt/conda/bin/pip install torchelastic
+RUN IS_CUDA=$(python -c 'import torch ; print(torch.cuda._is_compiled())'); \
+echo "Is torch compiled with cuda: ${IS_CUDA}"; \
+if test "${IS_CUDA}" != "True" -a ! -z "${CUDA_VERSION}"; then \
+exit 1; \
+fi

FROM ${BASE_IMAGE} as official
ARG PYTORCH_VERSION
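The new RUN step makes a mismatched image fail at build time: if CUDA_VERSION is set but the installed torch was built without CUDA support, the build aborts. It checks torch.cuda._is_compiled() (a private API, so subject to change) rather than torch.cuda.is_available(), since build machines usually have no GPU attached and availability would be False even for a correct CUDA build. The distinction, as a standalone sketch:

    # Build-time CUDA support vs. runtime availability:
    python -c 'import torch; print(torch.cuda._is_compiled())'  # built with CUDA?
    python -c 'import torch; print(torch.cuda.is_available())'  # and a usable GPU/driver too?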
1 change: 1 addition & 0 deletions aten/src/ATen/cpu/vec/vec_mask.h
@@ -259,6 +259,7 @@ VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<, ~a& b)
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator==, ~(a ^ b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>=, (a == b) | (a > b))
VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<=, (a == b) | (a < b))
+VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator!=, (a ^ b))

#undef VEC_MASK_DEFINE_UNARY_OP_GLOBAL
#undef VEC_MASK_DEFINE_BINARY_OP_GLOBAL
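The added operator!= mirrors the existing operator==, which is defined as ~(a ^ b): for lane masks whose lanes are all-ones or all-zeros, inequality is plain XOR. A standalone sketch of that identity on ordinary integers:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Per-lane mask values are all-ones (0xFF) or all-zeros (0x00).
      uint8_t a = 0xFF, b = 0x00;
      assert(static_cast<uint8_t>(a ^ b) == 0xFF);     // a != b -> true (all-ones)
      assert(static_cast<uint8_t>(a ^ a) == 0x00);     // a != a -> false (all-zeros)
      assert(static_cast<uint8_t>(~(a ^ a)) == 0xFF);  // a == a -> true (all-ones)
      return 0;
    }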
76 changes: 3 additions & 73 deletions aten/src/ATen/cuda/tunable/GemmCommon.h
@@ -81,8 +81,7 @@ struct GemmParams : OpParams {
}

std::string Signature() const override {
-static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
-return val;
+return c10::str(transa, transb, "_", m, "_", n, "_", k);
}

size_t GetSize(bool duplicate_inputs) const {
@@ -144,82 +143,14 @@ struct GemmParams : OpParams {
bool duplicate_inputs_;
};

-template <typename T>
-struct GemmAndBiasParams : OpParams {
-std::string Signature() const override {
-static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
-return val;
-}
-
-size_t GetSize(bool duplicate_inputs) const {
-size_t size = sizeof(T) * ldc * n;
-if (duplicate_inputs) {
-size += sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
-size += sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
-}
-return size;
-}
-
-GemmAndBiasParams* DeepCopy(bool duplicate_inputs) const {
-GemmAndBiasParams* copy = new GemmAndBiasParams;
-*copy = *this;
-c10::DeviceIndex device = 0;
-AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
-size_t c_size = ldc * n * sizeof(T);
-copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
-AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
-copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
-if (duplicate_inputs) {
-size_t a_size = sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
-size_t b_size = sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
-copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
-copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
-copy->duplicate_inputs_ = true;
-}
-return copy;
-}
-
-// only call on object returned by DeepCopy
-void Delete() {
-c10::cuda::CUDACachingAllocator::raw_delete(c);
-if (duplicate_inputs_) {
-c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
-c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
-}
-}
-
-TuningStatus NumericalCheck(GemmAndBiasParams<T> *other) {
-auto c_dtype = c10::CppTypeToScalarType<T>::value;
-return detail::NumericalCheck(c_dtype, c, other->c, ldc*n) ? OK : FAIL;
-}
-
-char transa;
-char transb;
-int64_t m;
-int64_t n;
-int64_t k;
-at::opmath_type<T> alpha;
-const T* a;
-int64_t lda;
-const T* b;
-int64_t ldb;
-T* c;
-int64_t ldc;
-const T* bias;
-at::cuda::blas::GEMMAndBiasActivationEpilogue activation;
-private:
-bool duplicate_inputs_;
-};
-
template <typename T>
struct GemmStridedBatchedParams : OpParams {
GemmStridedBatchedParams() {
duplicate_inputs_ = false;
}

std::string Signature() const override {
-static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch);
-return val;
+return c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch);
}

size_t GetSize(bool duplicate_inputs) const {
@@ -292,8 +223,7 @@ struct ScaledGemmParams : OpParams {
}

std::string Signature() const override {
-static std::string val = c10::str(transa, transb, "_", m, "_", n, "_", k);
-return val;
+return c10::str(transa, transb, "_", m, "_", n, "_", k);
}

size_t GetSize(bool duplicate_inputs) const {
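The three Signature() changes in GemmCommon.h all fix the same bug: a function-local static std::string is initialized exactly once, on the first call, so every params object created afterwards with a different shape would report the first object's signature, collapsing distinct GEMM shapes into one tuning-cache entry. A reduced sketch of the failure mode, with hypothetical names:

    #include <iostream>
    #include <string>

    struct Params {  // hypothetical stand-in for the GEMM params structs
      int m, n, k;
      std::string BuggySignature() const {
        // Initialized once, on the first call ever -- then stale forever.
        static std::string val =
            std::to_string(m) + "_" + std::to_string(n) + "_" + std::to_string(k);
        return val;
      }
      std::string FixedSignature() const {
        // Recomputed per call, as in the patched methods.
        return std::to_string(m) + "_" + std::to_string(n) + "_" + std::to_string(k);
      }
    };

    int main() {
      Params a{1, 2, 3}, b{7, 8, 9};
      std::cout << a.BuggySignature() << "\n";  // 1_2_3
      std::cout << b.BuggySignature() << "\n";  // 1_2_3 (stale)
      std::cout << b.FixedSignature() << "\n";  // 7_8_9
      return 0;
    }

The commit also deletes the GemmAndBiasParams struct wholesale rather than patching its copy of the same method.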