Update on "[inductor] graph replayer"
Recently I've found it a bit painful to run benchmark scripts in my dev environment. E.g., the command below
```
 python benchmarks/dynamo/huggingface.py --backend inductor --amp --performance --only YituTechConvBert --training
```
took about 2 minutes to run. It may take even longer for some other models.

The command is slow since it needs to:
- do the dynamo work
- verify the model on CPU
- run perf tests
- compile all the graphs

However, oftentimes I only need to debug inductor-specific logic like loop ordering and fusion. A lot of the work the script does is useless for me. Also, I only need to test one graph at a time (e.g., check the fwd graph first and, when I'm done, move on to the bwd graph) rather than compiling all the graphs.

The graph replayer adds a `save_args` decorator to the `compile_fx_inner` function. When `config.save_args` is true, it pickles all the arguments to `compile_fx_inner` to the file system. Later on, we can call `load_args_and_run_compile_fx_inner("/tmp/inductor_saved_args/compile_fx_inner_0.pkl")` to replay the graph and compile it with inductor.
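
For illustration, here is a minimal sketch of what the save/replay pair could look like (the counter-based file naming, the module-level flag, and plain `pickle` usage are simplifying assumptions, not the exact implementation):

```python
# Minimal sketch of a save_args-style decorator and replay helper.
# Assumptions: plain pickle suffices for the arguments, and files are
# numbered sequentially under /tmp/inductor_saved_args.
import functools
import os
import pickle

SAVE_DIR = "/tmp/inductor_saved_args"
save_args = False  # stand-in for config.save_args
_counter = 0


def save_args_decorator(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        global _counter
        if save_args:
            os.makedirs(SAVE_DIR, exist_ok=True)
            path = os.path.join(SAVE_DIR, f"{fn.__name__}_{_counter}.pkl")
            _counter += 1
            with open(path, "wb") as f:
                # Persist the exact call so it can be replayed later.
                pickle.dump((args, kwargs), f)
        return fn(*args, **kwargs)

    return wrapper


def load_args_and_run_compile_fx_inner(path):
    # Replay: unpickle the saved call and invoke the inductor entry
    # point directly, skipping dynamo and aot-autograd entirely.
    from torch._inductor.compile_fx import compile_fx_inner

    with open(path, "rb") as f:
        args, kwargs = pickle.load(f)
    return compile_fx_inner(*args, **kwargs)
```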

Replaying the fwd graph took around 60 seconds (maybe this can be reduced further, but this is already a 2x speedup for dev efficiency), and it took only around 20 seconds to reach the `Scheduler.__init__` method.

I also looked at the existing `TORCH_COMPILE_DEBUG` flag. The most similar part of `TORCH_COMPILE_DEBUG` is that it can save a graph and its arguments and rerun it later. The difference here is that, rather than running the model, we want to call the inductor API to compile the model (without even going through dynamo or aot-autograd).
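
Putting it together, the intended workflow is roughly the following (assuming `save_args` is exposed on `torch._inductor.config` and the replay helper is importable from `torch._inductor.compile_fx`; treat both locations as assumptions):

```python
# Step 1: run the slow benchmark command once with argument saving
# enabled, so each call to compile_fx_inner gets pickled to disk.
import torch._inductor.config as inductor_config

inductor_config.save_args = True  # the config knob described above

# ... run the benchmark once ...

# Step 2: in later debugging iterations, replay only the graph of
# interest (e.g. the fwd graph) without dynamo, aot-autograd, CPU
# verification, or perf runs.
from torch._inductor.compile_fx import load_args_and_run_compile_fx_inner

load_args_and_run_compile_fx_inner(
    "/tmp/inductor_saved_args/compile_fx_inner_0.pkl"
)
```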


[ghstack-poisoned]
shunting314 committed Aug 11, 2023
2 parents 13c755f + 904eed6 commit d2a86d8
Showing 417 changed files with 11,827 additions and 8,574 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/triton-rocm.txt
@@ -1 +1 @@
9dc100afb538d39da17621e0f8ad233f2078e6ff
34887ff8ca7a264c2c75972f5421a1ed3b7d8f6c
17 changes: 10 additions & 7 deletions .ci/docker/common/install_onnx.sh
@@ -14,21 +14,24 @@ pip_install \
networkx==2.0 \
numpy==1.22.4

# Using 1.15dev branch for the following not yet released features and fixes.
# - Segfault fix for shape inference.
# - Inliner to workaround ORT segfault.
pip_install onnx-weekly==1.15.0.dev20230717

# ONNXRuntime should be installed before installing
# onnx-weekly. Otherwise, onnx-weekly could be
# overwritten by onnx.
pip_install \
onnxruntime==1.15.0 \
onnxruntime==1.15.1 \
parameterized==0.8.1 \
pytest-cov==4.0.0 \
pytest-subtests==0.10.0 \
tabulate==0.9.0 \
transformers==4.25.1

# Using 1.15dev branch for the following not yet released features and fixes.
# - Segfault fix for shape inference.
# - Inliner to workaround ORT segfault.
pip_install onnx-weekly==1.15.0.dev20230717

# TODO: change this when onnx-script is on testPypi
pip_install onnxscript-preview==0.1.0.dev20230801 --no-deps
pip_install onnxscript-preview==0.1.0.dev20230809 --no-deps

# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
64 changes: 32 additions & 32 deletions .ci/pytorch/test.sh
@@ -505,40 +505,38 @@ test_aten() {
# Test ATen
# The following test(s) of ATen have already been skipped by caffe2 in rocm environment:
# scalar_tensor_test, basic, native_test
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
echo "Running ATen tests with pytorch lib"

if [[ -n "$IN_WHEEL_TEST" ]]; then
echo "Running test with the install folder"
# Rename the build folder when running test to ensure it
# is not depended on the folder
mv "$BUILD_DIR" "$BUILD_RENAMED_DIR"
TEST_BASE_DIR="$TORCH_TEST_DIR"
else
echo "Running test with the build folder"
TEST_BASE_DIR="$BUILD_BIN_DIR"
fi

# NB: the ATen test binaries don't have RPATH set, so it's necessary to
# put the dynamic libraries somewhere were the dynamic linker can find them.
# This is a bit of a hack.
${SUDO} ln -sf "$TORCH_LIB_DIR"/libc10* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libcaffe2* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"

ls "$TEST_BASE_DIR"
aten/tools/run_tests.sh "$TEST_BASE_DIR"

if [[ -n "$IN_WHEEL_TEST" ]]; then
# Restore the build folder to avoid any impact on other tests
mv "$BUILD_RENAMED_DIR" "$BUILD_DIR"
fi
echo "Running ATen tests with pytorch lib"

if [[ -n "$IN_WHEEL_TEST" ]]; then
echo "Running test with the install folder"
# Rename the build folder when running test to ensure it
# is not depended on the folder
mv "$BUILD_DIR" "$BUILD_RENAMED_DIR"
TEST_BASE_DIR="$TORCH_TEST_DIR"
else
echo "Running test with the build folder"
TEST_BASE_DIR="$BUILD_BIN_DIR"
fi

assert_git_not_dirty
# NB: the ATen test binaries don't have RPATH set, so it's necessary to
# put the dynamic libraries somewhere were the dynamic linker can find them.
# This is a bit of a hack.
${SUDO} ln -sf "$TORCH_LIB_DIR"/libc10* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libcaffe2* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"

ls "$TEST_BASE_DIR"
aten/tools/run_tests.sh "$TEST_BASE_DIR"

if [[ -n "$IN_WHEEL_TEST" ]]; then
# Restore the build folder to avoid any impact on other tests
mv "$BUILD_RENAMED_DIR" "$BUILD_DIR"
fi

assert_git_not_dirty
}

test_without_numpy() {
@@ -1023,6 +1021,8 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchtext
install_torchvision
id=$((SHARD_NUMBER-1))
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchbench.txt
@@ -1 +1 @@
770d5cf793c283bdc5e55a313fc068bd2fc8c109
8a0f5e3678bef55148743ab987baa3c89f8dfb5e
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
bf03f4edfe8fd7543e98ed9a3771b01ab6c6f062
f2b6f43a85452fe47eaa042ce684183add17fcac
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/xla.txt
@@ -1 +1 @@
6bb6ab5aa44f238394763ce2e55d96308b3efbf0
56a6a02a706367290ce54a1b2602a74af52fa34f
1 change: 1 addition & 0 deletions .github/merge_rules.yaml
Original file line number Diff line number Diff line change
@@ -11,6 +11,7 @@
- test/onnx/**
- tools/onnx/**
- torch/_C/__init__.pyi.in
- torch/_C/_onnx.pyi
- torch/csrc/jit/passes/onnx.*
- torch/csrc/jit/passes/onnx/**
- torch/csrc/jit/serialization/export.*
2 changes: 1 addition & 1 deletion .github/scripts/generate_binary_build_matrix.py
@@ -49,7 +49,7 @@ def arch_type(arch_version: str) -> str:
},
"cpu": "pytorch/manylinux-builder:cpu",
"cpu-cxx11-abi": "pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi",
"cpu-aarch64": "quay.io/pypa/manylinux2014_aarch64",
"cpu-aarch64": "pytorch/manylinuxaarch64-builder:cpu-aarch64",
}

CONDA_CONTAINER_IMAGES = {

1 change: 1 addition & 0 deletions .gitignore
@@ -19,6 +19,7 @@ coverage.xml
**/.pytorch-disabled-tests.json
**/.pytorch-slow-tests.json
**/.pytorch-test-times.json
**/.pytorch-test-file-ratings.json
*/*.pyc
*/*.so*
*/**/__pycache__
3 changes: 3 additions & 0 deletions .lintrunner.toml
@@ -185,10 +185,13 @@ include_patterns = [
'torch/_dynamo/repro/**/*.py',
'torch/_inductor/autotune_process.py',
'torch/_inductor/graph.py',
'torch/_inductor/codegen/common.py',
'torch/_inductor/codegen/wrapper.py',
'torch/_inductor/cudagraph_trees.py',
'torch/_inductor/compile_fx.py',
'torch/_inductor/lowering.py',
'torch/_inductor/metrics.py',
'torch/_inductor/select_algorithm.py',
'torch/_C/_dynamo/**/*.py',
'test/test_utils.py', # used to by in MYPY but after importing op_db it took 10+ minutes
]
1 change: 1 addition & 0 deletions RELEASE.md
@@ -43,6 +43,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:

| PyTorch version | Python | Stable CUDA | Experimental CUDA |
| --- | --- | --- | --- |
| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 |
| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 |
| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 |
40 changes: 24 additions & 16 deletions aten/src/ATen/core/ivalue.cpp
@@ -872,26 +872,29 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::create(
StrongTypePtr(nullptr, std::move(classType)), numSlots);
}


IValue IValue::deepcopy() const {
IValue IValue::deepcopy(c10::optional<at::Device> device) const {
IValue::HashAliasedIValueMap memo;
return deepcopy(memo);
return deepcopy(memo, device);
}

IValue IValue::deepcopy(
IValue::HashAliasedIValueMap& memo) const {
IValue::HashAliasedIValueMap& memo,
c10::optional<at::Device> device) const {
if (memo.count(*this)) {
return memo.at(*this);
}
IValue copy;
switch(tag) {
case IValue::Tag::Tensor:
copy = IValue(toTensor().clone());
break;
case IValue::Tag::Tensor: {
const at::Tensor& src_tensor = toTensor();
copy = device.has_value() && !src_tensor.device().is_meta()
? IValue(src_tensor.to(*device))
: IValue(src_tensor.clone());
} break;
case IValue::Tag::Tuple: {
std::vector<IValue> copied_tuple;
for (const auto& e : toTupleRef().elements()) {
copied_tuple.emplace_back(e.deepcopy(memo));
copied_tuple.emplace_back(e.deepcopy(memo, device));
}
copy = IValue(ivalue::Tuple::create(std::move(copied_tuple)));
}
@@ -900,7 +903,7 @@ IValue IValue::deepcopy(
auto list = toList();
auto copied_list = c10::impl::GenericList(list.elementType());
for (IValue v : list) {
copied_list.push_back(v.deepcopy(memo));
copied_list.push_back(v.deepcopy(memo, device));
}
copy = IValue(copied_list);
}
@@ -909,7 +912,9 @@ IValue IValue::deepcopy(
auto dict = toGenericDict();
auto copied_dict = c10::impl::GenericDict(dict.keyType(), dict.valueType());
for (const auto& entry : dict) {
copied_dict.insert(entry.key().deepcopy(memo), entry.value().deepcopy(memo));
copied_dict.insert(
entry.key().deepcopy(memo, device),
entry.value().deepcopy(memo, device));
}
copy = IValue(copied_dict);
}
@@ -924,15 +929,15 @@ IValue IValue::deepcopy(
auto state = class_type->getMethod("__getstate__")({*this});
class_type->getMethod("__setstate__")({copy, std::move(state)});
} else {
copy = IValue(toObject()->deepcopy(memo));
copy = IValue(toObject()->deepcopy(memo, device));
}
} break;
case IValue::Tag::Enum: {
auto enum_holder = toEnumHolder();
copy = IValue(c10::make_intrusive<ivalue::EnumHolder>(
enum_holder->type(),
enum_holder->name(),
enum_holder->value().deepcopy(memo)));
enum_holder->value().deepcopy(memo, device)));
} break;
case IValue::Tag::String:
case IValue::Tag::None:
@@ -1005,12 +1010,15 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::copy_to_weak_compilation_ref(
return object;
}

c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy() const {
c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
c10::optional<at::Device> device) const {
IValue::HashAliasedIValueMap memo;
return deepcopy(memo);
return deepcopy(memo, device);
}

c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(IValue::HashAliasedIValueMap& memo) const {
c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
IValue::HashAliasedIValueMap& memo,
c10::optional<at::Device> device) const {
auto cu = type_.cu_;
auto object = ivalue::Object::create(WeakOrStrongTypePtr(type_.cu_, type_.type_), type()->numAttributes());
for (const auto i : c10::irange(slots_.size())) {
Expand All @@ -1028,7 +1036,7 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(IValue::HashAliasedI
"this class.";
AT_ERROR(err.str());
}
object->setSlot(i, slots_[i].deepcopy(memo));
object->setSlot(i, slots_[i].deepcopy(memo, device));
}
return object;
}
6 changes: 4 additions & 2 deletions aten/src/ATen/core/ivalue.h
@@ -1112,8 +1112,10 @@ struct TORCH_API IValue final {
// TODO: There are several places that recurse over IValue. This is fragile.
// This visitor should be used to recurse over ivalues.
void visit(const std::function<bool(const IValue&)>& visitor) const;
IValue deepcopy() const;
IValue deepcopy(HashAliasedIValueMap& memo) const;
IValue deepcopy(c10::optional<at::Device> device = c10::nullopt) const;
IValue deepcopy(
HashAliasedIValueMap& memo,
c10::optional<at::Device> device = c10::nullopt) const;

private:
static c10::intrusive_ptr_target* null_to_undefined_tensor(c10::intrusive_ptr_target* p) {