Update on "[FSDP] Add record_function for explicit prefetching"
Example:
![Screenshot 2023-07-25 at 7 41 43 PM](https://github.com/pytorch/pytorch/assets/31054793/5f3f07b3-97f4-4493-9cab-5619484e2f6d)



[ghstack-poisoned]
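For readers skimming the diff, a minimal sketch of the idea in the title, assuming nothing about FSDP internals: wrap the explicit prefetch call in `torch.profiler.record_function` so it shows up as a labeled range in traces like the screenshot above. The `issue_allgather` callable and the label string are illustrative placeholders, not FSDP's actual names.

```python
# Hypothetical sketch: only torch.profiler.record_function is a real API here;
# issue_allgather stands in for whatever launches the prefetch all-gather.
from torch.profiler import record_function

def explicit_prefetch(issue_allgather, label: str = "FSDP::forward_prefetch") -> None:
    # The labeled range makes the explicit prefetch visible in profiler traces.
    with record_function(label):
        issue_allgather()
```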
awgu committed Jul 26, 2023
2 parents a8b6bc9 + abafbf4 commit 6fd6f82
Showing 465 changed files with 7,418 additions and 3,604 deletions.
55 changes: 0 additions & 55 deletions .ci/docker/build_docker.sh

This file was deleted.

10 changes: 7 additions & 3 deletions .ci/docker/common/install_onnx.sh
@@ -12,8 +12,12 @@ pip_install \
mock==5.0.1 \
ninja==1.10.2 \
networkx==2.0 \
-numpy==1.22.4 \
-onnx==1.14.0
+numpy==1.22.4

+# Using 1.15dev branch for the following not yet released features and fixes.
+# - Segfault fix for shape inference.
+# - Inliner to workaround ORT segfault.
+pip_install onnx-weekly==1.15.0.dev20230717

pip_install \
onnxruntime==1.15.0 \
@@ -24,7 +28,7 @@ pip_install \
transformers==4.25.1

# TODO: change this when onnx-script is on testPypi
-pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@2bb3e9f2d094912f81cb63cecb412efb14c65738"
+pip_install onnxscript-preview==0.1.0.dev20230724 --no-deps

# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
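As a side note, a quick way to confirm that the dev pins above resolved in an image built from this script; this check is illustrative and not part of the CI scripts.

```python
# Illustrative post-install check of the dev pins from install_onnx.sh.
from importlib.metadata import version

print(version("onnx-weekly"))         # expected: 1.15.0.dev20230717
print(version("onnxscript-preview"))  # expected: 0.1.0.dev20230724
```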
2 changes: 1 addition & 1 deletion .ci/docker/requirements-ci.txt
@@ -180,7 +180,7 @@ xdoctest==1.1.0
#Pinned versions: 1.1.0
#test that import:

-pygments==2.12.0
+pygments==2.15.0
#Description: support doctest highlighting
#Pinned versions: 2.12.0
#test that import: the doctests
1 change: 0 additions & 1 deletion .ci/onnx/test.sh
@@ -9,7 +9,6 @@ retry () {
}

if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
-pip -q install --user "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx"
# TODO: This can be removed later once vision is also part of the Docker image
pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
# JIT C++ extensions require ninja, so put it into PATH.
6 changes: 3 additions & 3 deletions .ci/pytorch/perf_test/compare_with_baseline.py
@@ -59,12 +59,12 @@
print("z-value: ", z_value)

if z_value >= 3:
-raise Exception('''\n
+raise Exception(f'''\n
z-value >= 3, there is high chance of perf regression.\n
To reproduce this regression, run
-`cd .ci/pytorch/perf_test/ && bash {}.sh` on your local machine
+`cd .ci/pytorch/perf_test/ && bash {test_name}.sh` on your local machine
and compare the runtime before/after your code change.
-'''.format(test_name))
+''')
else:
print("z-value < 3, no perf regression detected.")
if args.update:
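For context, the gate above reads as a plain z-score threshold against the stored baseline; a minimal sketch under that assumption (variable names and numbers are illustrative, not taken from the script):

```python
# Minimal sketch of the z-value gate, assuming the usual standard score.
def z_value(new_runtime: float, baseline_mean: float, baseline_sigma: float) -> float:
    return (new_runtime - baseline_mean) / baseline_sigma

z = z_value(12.4, 10.0, 0.5)  # 4.8
print("possible perf regression" if z >= 3 else "no perf regression detected")
```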
122 changes: 0 additions & 122 deletions .github/actions/calculate-docker-image/action.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-46a0117baa7ddf841a8d63798c28d1e26ded9b1f
+f5edcb2088195db71bcd36d0f8f1b6a5e663afd8
2 changes: 1 addition & 1 deletion .github/scripts/trymerge.py
@@ -620,7 +620,7 @@ def get_ghstack_prs(repo: GitRepo, pr: "GitHubPR") -> List[Tuple["GitHubPR", str
Get the open PRs in the stack that are below this PR. Throws error if any of the PRs are out of sync.
"""
assert pr.is_ghstack_pr()
-entire_stack: List[Tuple["GitHubPR", str]] = []
+entire_stack: List[Tuple[GitHubPR, str]] = []
# For ghstack, cherry-pick commits based from origin
orig_ref = f"{repo.remote}/{re.sub(r'/head$', '/orig', pr.head_ref())}"
rev_list = repo.revlist(f"{pr.default_branch()}..{orig_ref}")
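For readers unfamiliar with the quoting: string annotations are only needed as forward references, i.e. when the name is not yet defined where the annotation is evaluated; once the class exists, the bare name behaves the same for type checkers. A tiny self-contained illustration (the class here is a stand-in, not the real GitHubPR):

```python
from typing import List, Tuple

class GitHubPR:  # stand-in for trymerge.py's GitHubPR
    ...

def collect(pr: GitHubPR) -> List[Tuple[GitHubPR, str]]:
    stack: List[Tuple[GitHubPR, str]] = []  # no quotes needed: GitHubPR is defined above
    stack.append((pr, "open"))
    return stack
```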
10 changes: 5 additions & 5 deletions .github/workflows/lint.yml
@@ -15,7 +15,7 @@ on:
# When any other step fails, it's job will be retried once by retryBot.
jobs:
lintrunner:
-uses: pytorch/test-infra/.github/workflows/linux_job.yml@use-shareable-gha-calculate-docker-image
+uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.2xlarge
docker-image: pytorch-linux-focal-linter
@@ -62,7 +62,7 @@ jobs:
exit $RC
quick-checks:
-uses: pytorch/test-infra/.github/workflows/linux_job.yml@use-shareable-gha-calculate-docker-image
+uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.2xlarge
docker-image: pytorch-linux-focal-linter
@@ -116,7 +116,7 @@ jobs:
bash .github/scripts/pr-sanity-check.sh
workflow-checks:
-uses: pytorch/test-infra/.github/workflows/linux_job.yml@use-shareable-gha-calculate-docker-image
+uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.2xlarge
docker-image: pytorch-linux-focal-linter
@@ -151,7 +151,7 @@ jobs:
exit $RC
toc:
-uses: pytorch/test-infra/.github/workflows/linux_job.yml@use-shareable-gha-calculate-docker-image
+uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.2xlarge
docker-image: pytorch-linux-focal-linter
@@ -189,7 +189,7 @@ jobs:
test-tools:
name: Test tools
if: ${{ github.repository == 'pytorch/pytorch' }}
-uses: pytorch/test-infra/.github/workflows/linux_job.yml@use-shareable-gha-calculate-docker-image
+uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.2xlarge
docker-image: pytorch-linux-focal-linter
3 changes: 2 additions & 1 deletion .github/workflows/periodic.yml
@@ -47,9 +47,10 @@ jobs:
with:
build-environment: linux-bionic-cuda11.8-py3.9-gcc7
docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
+cuda-arch-list: 8.6
test-matrix: |
{ include: [
-{ config: "multigpu", shard: 1, num_shards: 1, runner: "linux.16xlarge.nvidia.gpu" },
+{ config: "multigpu", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
]}
build-with-debug: false

2 changes: 1 addition & 1 deletion .lintrunner.toml
@@ -3086,6 +3086,6 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
-'ruff==0.0.277',
+'ruff==0.0.280',
]
is_formatter = true
45 changes: 45 additions & 0 deletions aten/src/ATen/SparseCsrTensorUtils.h
@@ -9,6 +9,7 @@
#include <ATen/NativeFunctions.h>
#include <ATen/Operators.h>
#else
#include <ATen/ops/_sparse_compressed_tensor_unsafe.h>
#include <ATen/ops/resize_as_sparse_native.h>
#endif

@@ -366,5 +367,49 @@ inline bool only_sparse_compressed_add_trivial_cases(
});
}

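// Rebuild a sparse compressed tensor with its values converted to `dtype`,
// reusing the existing compressed/plain indices, layout, and device.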
inline Tensor to_type(Tensor input, ScalarType dtype) {
Tensor compressed_indices, plain_indices;
std::tie(compressed_indices, plain_indices) =
at::sparse_csr::getCompressedPlainIndices(input);
return at::_sparse_compressed_tensor_unsafe(
std::move(compressed_indices),
std::move(plain_indices),
std::move(input.values()).to(dtype),
input.sizes(),
dtype,
input.layout(),
input.device(),
input.options().pinned_memory_opt());
}

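// Allocate the output values buffer and, when acc_t differs from scalar_t, a
// separate accumulation buffer in the accumulation dtype (for integral dtypes
// the accumulation buffer is reused as the output). Both are resized to nnz
// when nnz is given.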
template <typename acc_t, typename scalar_t>
inline std::tuple<Tensor, Tensor> create_acc_buffer(
TensorOptions option,
ScalarType type,
int64_t nnz = -1) {
Tensor new_values, new_values_acc;
constexpr bool need_acc = !std::is_same<scalar_t, acc_t>::value;
bool is_integral = at::isIntegralType(type, /*includeBool=*/true);
if constexpr (need_acc) {
auto acc_dtype = CppTypeToScalarType<acc_t>::value;
new_values_acc = at::empty({}, option.dtype(acc_dtype));
new_values = is_integral ? new_values_acc : at::empty({}, option);
} else {
new_values = new_values_acc = at::empty({}, option);
}
if (nnz != -1) {
return std::make_tuple(
new_values.resize_(nnz), new_values_acc.resize_(nnz));
} else {
return std::make_tuple(new_values, new_values_acc);
}
}

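// Copy accumulated results back into the output values buffer when the two
// buffers are distinct (no-op when they alias the same tensor).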
inline void copy_from_acc_buffer(Tensor& new_values, Tensor& new_values_acc) {
if (!new_values_acc.is_same(new_values)) {
new_values.copy_(new_values_acc);
}
}

} // namespace sparse_csr
} // namespace at
