Merge remote-tracking branch 'origin' into ragulpr/requirements-txt-i…

…n-dev-setup * origin: (898 commits) Move dynamo.optimizations.distributed to backends (pytorch#93408) Remove cuda 11.6 from nightly (pytorch#93979) Refactor dynamo register_backend/BACKENDS (pytorch#93389) Remove cuda 11.6 from CI replace with 11.7 (pytorch#93406) [Dynamo] Rename `GuardBuilder.guarded_code` -> `check_fn_manager` (pytorch#93934) Revert "Remove CUDA 11.6 from nightly builds (pytorch#93404)" Revert "[inductor] fix crash issue when input is a view tensor (pytorch#90150)" Basic Validation for FSDP `state_dict` transformations of modules with persistent buffers (pytorch#93396) Merge Inductor perf smoke test with other inductor CI tests (pytorch#93395) [inductor] Don't import torchvision (pytorch#93027) [FSDP][3/N] Refactor `summon_full_params` unit tests (pytorch#92298) [FSDP][2/N] `_summon_full_params` -> `_unshard_params` (pytorch#92297) Remove CUDA 11.6 from nightly builds (pytorch#93404) Mark buffers that reuse other buffers (pytorch#93329) Refactor to allow reuse of SchedulerNode.allocate (pytorch#93328) retire sparse_mask_helper (pytorch#91714) update fbgemm third party (pytorch#93907) [inductor] fix crash issue when input is a view tensor (pytorch#90150) [Inductor] add config for weight prepacking (pytorch#93811) Check for none for NNModuleVariable.__module__ (pytorch#93326) ...
ragulpr · Feb 2, 2023 · e30a5ca · e30a5ca
2 parents 5e9426e + 2b0d7e6
commit e30a5ca
Show file tree

Hide file tree

Showing 2,226 changed files with 88,658 additions and 55,562 deletions.
diff --git a/.jenkins/caffe2/README.md → .ci/caffe2/README.md b/.jenkins/caffe2/README.md → .ci/caffe2/README.md
diff --git a/.jenkins/caffe2/common.sh → .ci/caffe2/common.sh b/.jenkins/caffe2/common.sh → .ci/caffe2/common.sh
@@ -28,7 +28,7 @@ fi
 
 # /usr/local/caffe2 is where the cpp bits are installed to in cmake-only
 # builds. In +python builds the cpp tests are copied to /usr/local/caffe2 so
-# that the test code in .jenkins/test.sh is the same
+# that the test code in .ci/test.sh is the same
 INSTALL_PREFIX="/usr/local/caffe2"
 
 mkdir -p "$gtest_reports_dir" || true

diff --git a/.jenkins/caffe2/test.sh → .ci/caffe2/test.sh b/.jenkins/caffe2/test.sh → .ci/caffe2/test.sh
diff --git a/.jenkins/onnx/README.md → .ci/onnx/README.md b/.jenkins/onnx/README.md → .ci/onnx/README.md
diff --git a/.jenkins/onnx/common.sh → .ci/onnx/common.sh b/.jenkins/onnx/common.sh → .ci/onnx/common.sh
diff --git a/.jenkins/onnx/test.sh → .ci/onnx/test.sh b/.jenkins/onnx/test.sh → .ci/onnx/test.sh
@@ -52,16 +52,16 @@ $MAYBE_SUDO pip -q uninstall -y coverage
 # CircleCI, so we host a copy on S3 instead
 $MAYBE_SUDO pip -q install attrs==18.1.0 -f https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl
 $MAYBE_SUDO pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl
-$MAYBE_SUDO pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl
+$MAYBE_SUDO pip -q install hypothesis==4.57.1
 
 ##############
 # ONNX tests #
 ##############
 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
   pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
-  pip install -q --user ninja flatbuffers==2.0 numpy==1.21.5 onnxruntime==1.12.1 beartype==0.10.4 onnx==1.12.0
+  pip install -q --user ninja flatbuffers==2.0 numpy==1.22.4 onnxruntime==1.12.1 beartype==0.10.4 onnx==1.12.0
   # TODO: change this when onnx-script is on testPypi
-  pip install 'onnx-script @ git+https://github.com/microsoft/onnx-script'
+  pip install 'onnx-script @ git+https://github.com/microsoft/onnx-script@4f3ff0d806d0d0f30cecdfd3e8b094b1e492d44a'
   # numba requires numpy <= 1.20, onnxruntime requires numpy >= 1.21.
   # We don't actually need it for our tests, but it's imported if it's present, so uninstall.
   pip uninstall -q --yes numba

diff --git a/.jenkins/pytorch/.shellcheckrc → .ci/pytorch/.shellcheckrc b/.jenkins/pytorch/.shellcheckrc → .ci/pytorch/.shellcheckrc
diff --git a/.jenkins/pytorch/README.md → .ci/pytorch/README.md b/.jenkins/pytorch/README.md → .ci/pytorch/README.md
diff --git a/.jenkins/pytorch/build-asan.sh → .ci/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh → .ci/pytorch/build-asan.sh
diff --git a/.jenkins/pytorch/build-mobile.sh → .ci/pytorch/build-mobile.sh b/.jenkins/pytorch/build-mobile.sh → .ci/pytorch/build-mobile.sh
diff --git a/.jenkins/pytorch/build-tsan.sh → .ci/pytorch/build-tsan.sh b/.jenkins/pytorch/build-tsan.sh → .ci/pytorch/build-tsan.sh
diff --git a/.jenkins/pytorch/build.sh → .ci/pytorch/build.sh b/.jenkins/pytorch/build.sh → .ci/pytorch/build.sh
@@ -192,9 +192,14 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
 
   get_bazel
 
-  tools/bazel build --config=no-tty //...
+  # Leave 1 CPU free and use only up to 80% of memory to reduce the chance of crashing
+  # the runner
+  BAZEL_MEM_LIMIT="--local_ram_resources=HOST_RAM*.8"
+  BAZEL_CPU_LIMIT="--local_cpu_resources=HOST_CPUS-1"
+
+  tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //...
   # Build torch, the Python module, and tests for CPU-only
-  tools/bazel build --config=no-tty --config=cpu-only :torch :_C.so :all_tests
+  tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" --config=cpu-only :torch :_C.so :all_tests
 
 else
   # check that setup.py would fail with bad arguments

diff --git a/.jenkins/pytorch/codegen-test.sh → .ci/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh → .ci/pytorch/codegen-test.sh
@@ -3,8 +3,8 @@
 # This script can also be used to test whether your diff changes any codegen output.
 #
 # Run it before and after your change:
-#   .jenkins/pytorch/codegen-test.sh <baseline_output_dir>
-#   .jenkins/pytorch/codegen-test.sh <test_output_dir>
+#   .ci/pytorch/codegen-test.sh <baseline_output_dir>
+#   .ci/pytorch/codegen-test.sh <test_output_dir>
 #
 # Then run diff to compare the generated files:
 #   diff -Naur <baseline_output_dir> <test_output_dir>

diff --git a/.ci/pytorch/common-build.sh b/.ci/pytorch/common-build.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Required environment variables:
+#   $BUILD_ENVIRONMENT (should be set by your Docker image)
+
+if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
+    # Save the absolute path in case later we chdir (as occurs in the gpu perf test)
+    script_dir="$( cd "$(dirname "${BASH_SOURCE[0]}")" || exit ; pwd -P )"
+
+    if which sccache > /dev/null; then
+        # Save sccache logs to file
+        sccache --stop-server > /dev/null  2>&1 || true
+        rm -f ~/sccache_error.log || true
+
+        function sccache_epilogue() {
+            echo "::group::Sccache Compilation Log"
+            echo '=================== sccache compilation log ==================='
+            python "$script_dir/print_sccache_log.py" ~/sccache_error.log 2>/dev/null || true
+            echo '=========== If your build fails, please take a look at the log above for possible reasons ==========='
+            sccache --show-stats
+            sccache --stop-server || true
+            echo "::endgroup::"
+        }
+
+        # Register the function here so that the error log can be printed even when
+        # sccache fails to start, i.e. timeout error
+        trap_add sccache_epilogue EXIT
+
+        if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then
+            # sccache --start-server seems to hang forever on self hosted runners for GHA
+            # so let's just go ahead and skip the --start-server altogether since it seems
+            # as though sccache still gets used even when the sccache server isn't started
+            # explicitly
+            echo "Skipping sccache server initialization, setting environment variables"
+            export SCCACHE_IDLE_TIMEOUT=1200
+            export SCCACHE_ERROR_LOG=~/sccache_error.log
+            export RUST_LOG=sccache::server=error
+        elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
+            SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
+        else
+            # increasing SCCACHE_IDLE_TIMEOUT so that extension_backend_test.cpp can build after this PR:
+            # https://github.com/pytorch/pytorch/pull/16645
+            SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=1200 RUST_LOG=sccache::server=error sccache --start-server
+        fi
+
+        # Report sccache stats for easier debugging
+        sccache --zero-stats
+    fi
+
+    if which ccache > /dev/null; then
+        # Report ccache stats for easier debugging
+        ccache --zero-stats
+        ccache --show-stats
+        function ccache_epilogue() {
+            ccache --show-stats
+        }
+        trap_add ccache_epilogue EXIT
+    fi
+fi
diff --git a/.jenkins/pytorch/common.sh → .ci/pytorch/common.sh b/.jenkins/pytorch/common.sh → .ci/pytorch/common.sh
diff --git a/.jenkins/pytorch/common_utils.sh → .ci/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh → .ci/pytorch/common_utils.sh
@@ -145,8 +145,7 @@ function install_triton() {
 }
 
 function setup_torchdeploy_deps(){
-  conda install -y cmake
-  conda install -y -c conda-forge libpython-static=3.10
+  conda install -y -n "py_${ANACONDA_PYTHON_VERSION}" "libpython-static=${ANACONDA_PYTHON_VERSION}"
   local CC
   local CXX
   CC="$(which gcc)"

diff --git a/.jenkins/pytorch/create_test_cert.py → .ci/pytorch/create_test_cert.py b/.jenkins/pytorch/create_test_cert.py → .ci/pytorch/create_test_cert.py
diff --git a/.jenkins/pytorch/docker-build-test.sh → .ci/pytorch/docker-build-test.sh b/.jenkins/pytorch/docker-build-test.sh → .ci/pytorch/docker-build-test.sh
diff --git a/.jenkins/pytorch/docs-test.sh → .ci/pytorch/docs-test.sh b/.jenkins/pytorch/docs-test.sh → .ci/pytorch/docs-test.sh
diff --git a/.jenkins/pytorch/fake_numpy/numpy.py → .ci/pytorch/fake_numpy/numpy.py b/.jenkins/pytorch/fake_numpy/numpy.py → .ci/pytorch/fake_numpy/numpy.py
diff --git a/.jenkins/pytorch/macos-build-test.sh → .ci/pytorch/macos-build-test.sh b/.jenkins/pytorch/macos-build-test.sh → .ci/pytorch/macos-build-test.sh
diff --git a/.jenkins/pytorch/macos-build.sh → .ci/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh → .ci/pytorch/macos-build.sh
diff --git a/.jenkins/pytorch/macos-common.sh → .ci/pytorch/macos-common.sh b/.jenkins/pytorch/macos-common.sh → .ci/pytorch/macos-common.sh
diff --git a/.jenkins/pytorch/macos-test.sh → .ci/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh → .ci/pytorch/macos-test.sh
@@ -89,6 +89,16 @@ print_cmake_info() {
   CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC")
   # Print all libraries under cmake rpath for debugging
   ls -la "$CONDA_INSTALLATION_DIR/../lib"
+
+  export CMAKE_EXEC
+  # Explicitly add conda env lib folder to cmake rpath to address the flaky issue
+  # where cmake dependencies couldn't be found. This seems to point to how conda
+  # links $CMAKE_EXEC to its package cache when cloning a new environment
+  install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
+  # Adding the rpath will invalidate cmake signature, so signing it again here
+  # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
+  # with an exit code 137 otherwise
+  codesign -f -s - "${CMAKE_EXEC}" || true
 }
 
 test_custom_backend() {
@@ -99,7 +109,7 @@ test_custom_backend() {
   rm -rf build && mkdir build
   pushd build
   SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake ..
+  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" "${CMAKE_EXEC}" ..
   make VERBOSE=1
   popd
 
@@ -122,7 +132,7 @@ test_custom_script_ops() {
   rm -rf build && mkdir build
   pushd build
   SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake ..
+  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" "${CMAKE_EXEC}" ..
   make VERBOSE=1
   popd
 
@@ -144,7 +154,7 @@ test_jit_hooks() {
   rm -rf build && mkdir build
   pushd build
   SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake ..
+  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" "${CMAKE_EXEC}" ..
   make VERBOSE=1
   popd
 

diff --git a/.jenkins/pytorch/multigpu-test.sh → .ci/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh → .ci/pytorch/multigpu-test.sh
@@ -45,4 +45,5 @@ time python test/run_test.py --verbose -i distributed/_shard/test_partial_tensor
 time python test/run_test.py --verbose -i distributed/_shard/test_replicated_tensor
 # Other tests
 time python test/run_test.py --verbose -i test_cuda_primary_ctx
+time python test/run_test.py --verbose -i test_optim -- -k optimizers_with_varying_tensors
 assert_git_not_dirty
diff --git a/.jenkins/pytorch/perf_test/common.sh → .ci/pytorch/perf_test/common.sh b/.jenkins/pytorch/perf_test/common.sh → .ci/pytorch/perf_test/common.sh
diff --git a/...ytorch/perf_test/compare_with_baseline.py → ...ytorch/perf_test/compare_with_baseline.py b/...ytorch/perf_test/compare_with_baseline.py → ...ytorch/perf_test/compare_with_baseline.py
@@ -62,7 +62,7 @@
     raise Exception('''\n
 z-value >= 3, there is high chance of perf regression.\n
 To reproduce this regression, run
-`cd .jenkins/pytorch/perf_test/ && bash {}.sh` on your local machine
+`cd .ci/pytorch/perf_test/ && bash {}.sh` on your local machine
 and compare the runtime before/after your code change.
 '''.format(test_name))
 else:

diff --git a/.jenkins/pytorch/perf_test/get_stats.py → .ci/pytorch/perf_test/get_stats.py b/.jenkins/pytorch/perf_test/get_stats.py → .ci/pytorch/perf_test/get_stats.py
diff --git a/...t/test_cpu_speed_mini_sequence_labeler.sh → ...t/test_cpu_speed_mini_sequence_labeler.sh b/...t/test_cpu_speed_mini_sequence_labeler.sh → ...t/test_cpu_speed_mini_sequence_labeler.sh
diff --git a/...pytorch/perf_test/test_cpu_speed_mnist.sh → ...pytorch/perf_test/test_cpu_speed_mnist.sh b/...pytorch/perf_test/test_cpu_speed_mnist.sh → ...pytorch/perf_test/test_cpu_speed_mnist.sh
diff --git a/...pytorch/perf_test/test_cpu_speed_torch.sh → ...pytorch/perf_test/test_cpu_speed_torch.sh b/...pytorch/perf_test/test_cpu_speed_torch.sh → ...pytorch/perf_test/test_cpu_speed_torch.sh
@@ -19,7 +19,7 @@ test_cpu_speed_torch () {
   fi
 
   if ! python perf-tests/modules/test_cpu_torch.py "${ARGS[@]}"; then
-    echo "To reproduce this regression, run \`cd .jenkins/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
+    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
     exit 1
   fi
 }

diff --git a/.../perf_test/test_cpu_speed_torch_tensor.sh → .../perf_test/test_cpu_speed_torch_tensor.sh b/.../perf_test/test_cpu_speed_torch_tensor.sh → .../perf_test/test_cpu_speed_torch_tensor.sh
@@ -19,7 +19,7 @@ test_cpu_speed_torch_tensor () {
   fi
 
   if ! python perf-tests/modules/test_cpu_torch_tensor.py "${ARGS[@]}"; then
-    echo "To reproduce this regression, run \`cd .jenkins/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
+    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
     exit 1
   fi
 }

diff --git a/...ch/perf_test/test_gpu_speed_cudnn_lstm.sh → ...ch/perf_test/test_gpu_speed_cudnn_lstm.sh b/...ch/perf_test/test_gpu_speed_cudnn_lstm.sh → ...ch/perf_test/test_gpu_speed_cudnn_lstm.sh
diff --git a/.../pytorch/perf_test/test_gpu_speed_lstm.sh → .ci/pytorch/perf_test/test_gpu_speed_lstm.sh b/.../pytorch/perf_test/test_gpu_speed_lstm.sh → .ci/pytorch/perf_test/test_gpu_speed_lstm.sh
diff --git a/...pytorch/perf_test/test_gpu_speed_mlstm.sh → ...pytorch/perf_test/test_gpu_speed_mlstm.sh b/...pytorch/perf_test/test_gpu_speed_mlstm.sh → ...pytorch/perf_test/test_gpu_speed_mlstm.sh
diff --git a/...pytorch/perf_test/test_gpu_speed_mnist.sh → ...pytorch/perf_test/test_gpu_speed_mnist.sh b/...pytorch/perf_test/test_gpu_speed_mnist.sh → ...pytorch/perf_test/test_gpu_speed_mnist.sh
diff --git a/...est/test_gpu_speed_word_language_model.sh → ...est/test_gpu_speed_word_language_model.sh b/...est/test_gpu_speed_word_language_model.sh → ...est/test_gpu_speed_word_language_model.sh
diff --git a/...s/pytorch/perf_test/update_commit_hash.py → .ci/pytorch/perf_test/update_commit_hash.py b/...s/pytorch/perf_test/update_commit_hash.py → .ci/pytorch/perf_test/update_commit_hash.py
diff --git a/.jenkins/pytorch/print_sccache_log.py → .ci/pytorch/print_sccache_log.py b/.jenkins/pytorch/print_sccache_log.py → .ci/pytorch/print_sccache_log.py
diff --git a/.jenkins/pytorch/run_glootls_test.sh → .ci/pytorch/run_glootls_test.sh b/.jenkins/pytorch/run_glootls_test.sh → .ci/pytorch/run_glootls_test.sh
diff --git a/.jenkins/pytorch/short-perf-test-cpu.sh → .ci/pytorch/short-perf-test-cpu.sh b/.jenkins/pytorch/short-perf-test-cpu.sh → .ci/pytorch/short-perf-test-cpu.sh
@@ -2,10 +2,10 @@
 
 SCRIPT_PARENT_DIR=$(dirname "${BASH_SOURCE[0]}")
 
-# shellcheck source=.jenkins/pytorch/common.sh
+# shellcheck source=.ci/pytorch/common.sh
 source "$SCRIPT_PARENT_DIR/common.sh"
 
-cd .jenkins/pytorch/perf_test
+cd .ci/pytorch/perf_test
 
 echo "Running CPU perf test for PyTorch..."
 

diff --git a/.jenkins/pytorch/short-perf-test-gpu.sh → .ci/pytorch/short-perf-test-gpu.sh b/.jenkins/pytorch/short-perf-test-gpu.sh → .ci/pytorch/short-perf-test-gpu.sh
@@ -3,7 +3,7 @@
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 
-pushd .jenkins/pytorch/perf_test
+pushd .ci/pytorch/perf_test
 
 echo "Running GPU perf test for PyTorch..."