Skip to content

Commit

Permalink
Update on "Improve complex lerp performance"
Browse files Browse the repository at this point in the history
The complex lerp kernel uses `std::abs(z) < 0.5`, which involves
computing a sqrt. Comparing the squared magnitude against 0.25 instead
has much lower latency and so performs much better overall.

In a simple timeit benchmark I see more than 10x speedup on CPU for a 4096
element complex lerp, from 84 us to 6.7 us.

[ghstack-poisoned]
  • Loading branch information
peterbell10 committed Oct 13, 2022
2 parents aba1a66 + 6a8e13b commit 92b3719
Show file tree
Hide file tree
Showing 313 changed files with 12,061 additions and 6,060 deletions.
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchdynamo.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
92630636ef6c23f8bfa028c88c55259ec222fe3e
d4b62d69a2d5ae1823c1d6695237b44faee05acd
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
6e203b44098c3371689f56abc17b7c02bd51a261
12adc5426ef345ab7999661538a60da99dd85281
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/xla.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
d6b1971d4f40364cd6f2d23d047818d295971f7a
8332ac9072832c67b2680509bb1ef1abc5813c6a
4 changes: 1 addition & 3 deletions .github/scripts/fetch_latest_green_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,6 @@ def isGreen(commit: str, results: Dict[str, Any]) -> Tuple[bool, str]:
return (False, workflowName + " checks were not successful")
else:
regex[required_check] = True
if workflowName in ["periodic", "docker-release-builds"] and conclusion not in ["success", "skipped"]:
return (False, workflowName + " checks were not successful")

missing_workflows = [x for x in regex.keys() if not regex[x]]
if len(missing_workflows) > 0:
Expand All @@ -110,7 +108,7 @@ def main() -> None:
)
qlambda = rs.QueryLambda.retrieve(
'commit_jobs_batch_query',
version='15aba20837ae9d75',
version='8003fdfd18b64696',
workspace='commons')

commits = get_latest_commits()
Expand Down
1 change: 1 addition & 0 deletions .github/scripts/filter_test_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"nogpu_AVX512",
"nogpu_NO_AVX2",
"slow",
"tsan",
"xla",
}}

Expand Down
5 changes: 2 additions & 3 deletions .github/scripts/test_fetch_latest_green_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,12 @@ def test_necessary_failed(self, mock_get_commit_results: Any) -> None:

@mock.patch('fetch_latest_green_commit.get_commit_results', return_value=TestChecks().make_test_checks())
def test_skippable_failed(self, mock_get_commit_results: Any) -> None:
"Test with skippable job (ex: docker-release-builds) failing"
"Test with failing skippable jobs (ex: docker-release-builds) should pass"
workflow_checks = mock_get_commit_results()
workflow_checks = set_workflow_job_status(workflow_checks, "periodic", "skipped")
workflow_checks = set_workflow_job_status(workflow_checks, "docker-release-builds", "failed")
result = isGreen("sha", workflow_checks)
self.assertFalse(result[0])
self.assertEqual(result[1], "docker-release-builds checks were not successful")
self.assertTrue(result[0])

@mock.patch('fetch_latest_green_commit.get_commit_results', return_value={})
def test_no_workflows(self, mock_get_commit_results: Any) -> None:
Expand Down
4 changes: 3 additions & 1 deletion .github/scripts/trymerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1376,7 +1376,9 @@ def merge(pr_num: int, repo: GitRepo,
if (datetime.utcnow() - pr.last_pushed_at()).days > stale_pr_days:
if land_checks and not dry_run:
pr.delete_land_time_check_branch(repo)
raise RuntimeError("This PR is too stale; the last push date was more than 3 days ago. Please rebase and try again.")
raise RuntimeError(f"This PR is too stale; the last push date was more than {stale_pr_days} days ago. "
"Please rebase and try again. You can rebase by leaving the following comment on this PR:\n"
"`@pytorchbot rebase`")

start_time = time.time()
last_exception = ''
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/_mac-test-mps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
run: |
# shellcheck disable=SC1090
set -ex
${CONDA_INSTALL} numpy expecttest pyyaml
${CONDA_INSTALL} expecttest numpy=1.22.3 pyyaml=6.0
# As wheels are cross-compiled they are reported as x86_64 ones
ORIG_WHLNAME=$(ls -1 dist/*.whl); ARM_WHLNAME=${ORIG_WHLNAME/x86_64/arm64}; mv ${ORIG_WHLNAME} ${ARM_WHLNAME}
${CONDA_RUN} python3 -mpip install dist/*.whl
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/_rocm-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ jobs:
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_JIT_ENABLE_NVFUSER: 1
timeout-minutes: 270
run: |
set -x
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/docker-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
paths:
- Dockerfile
- docker.Makefile
- .github/workflows/docker-release.yml
push:
branches:
- nightly
Expand Down Expand Up @@ -47,7 +48,10 @@ jobs:
# [see note: pytorch repo ref]
# deep clone (fetch-depth 0) required for git merge-base
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: 'recursive'
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Setup SSH (Click me for login details)
Expand Down
25 changes: 25 additions & 0 deletions .github/workflows/periodic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,31 @@ jobs:
docker-image: ${{ needs.linux-bionic-cuda11_7-py3_7-gcc7-debug-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_7-gcc7-debug-build.outputs.test-matrix }}

linux-bionic-cuda11_6-py3_10-gcc7-sm86-build:
name: cuda11.6-py3.10-gcc7-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-bionic-cuda11_6-py3_10-gcc7-sm86-test:
name: cuda11.6-py3.10-gcc7-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda11_6-py3_10-gcc7-sm86-build
with:
build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-sm86-build.outputs.test-matrix }}

libtorch-linux-bionic-cuda11_7-py3_7-gcc7-build:
name: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
uses: ./.github/workflows/_linux-build.yml
Expand Down
20 changes: 20 additions & 0 deletions .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,26 @@ jobs:
docker-image: ${{ needs.linux-bionic-py3_7-clang9-slow-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-py3_7-clang9-slow-build.outputs.test-matrix }}

linux-focal-py3_7-clang7-tsan-build:
name: linux-focal-py3.7-clang7-tsan
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-py3.7-clang7-tsan
docker-image-name: pytorch-linux-focal-py3-clang7-asan
test-matrix: |
{ include: [
{ config: "tsan", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
]}
linux-focal-py3_7-clang7-tsan-test:
name: linux-focal-py3.7-clang7-tsan
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-py3_7-clang7-tsan-build
with:
build-environment: linux-focal-py3.7-clang7-tsan
docker-image: ${{ needs.linux-focal-py3_7-clang7-tsan-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_7-clang7-tsan-build.outputs.test-matrix }}

ios-12-5-1-x86-64:
name: ios-12-5-1-x86-64
uses: ./.github/workflows/_ios-build-test.yml
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ torch/test/
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
torch/version.py
minifier_launcher.py
# Root level file used in CI to specify certain env configs.
# E.g., see .circleci/config.yaml
env
Expand Down
29 changes: 29 additions & 0 deletions .jenkins/pytorch/build-tsan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# CI build script: compile PyTorch with ThreadSanitizer (TSan) enabled,
# using clang, and install the resulting wheel into the environment.

# Required environment variable: $BUILD_ENVIRONMENT
# (This is set by default in the Docker images we build, so you don't
# need to set it yourself.)

# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

# Log the compiler version up front so CI failures are easier to diagnose.
echo "Clang version:"
clang --version

python tools/stats/export_test_times.py

# Point CMake at the conda prefix when conda is present on the image.
if [ -n "$(which conda)" ]; then
export CMAKE_PREFIX_PATH=/opt/conda
fi

# Build a wheel with clang and -fsanitize=thread; CUDA and MKLDNN are
# disabled for this TSan build.
CC="clang" CXX="clang++" LDSHARED="clang --shared" \
CFLAGS="-fsanitize=thread" \
USE_TSAN=1 USE_CUDA=0 USE_MKLDNN=0 \
python setup.py bdist_wheel
python -mpip install dist/*.whl

print_sccache_stats

assert_git_not_dirty
4 changes: 4 additions & 0 deletions .jenkins/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang7-asan* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-asan.sh" "$@"
fi

if [[ "$BUILD_ENVIRONMENT" == *-clang7-tsan* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-tsan.sh" "$@"
fi

if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@"
fi
Expand Down
4 changes: 4 additions & 0 deletions .jenkins/pytorch/common_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ function clone_pytorch_xla() {
fi
}

# Install the 'filelock' package via pip (wrapper so test scripts can
# depend on it by name).
function install_filelock() {
pip_install filelock
}

function install_torchdynamo() {
local commit
commit=$(get_pinned_commit torchdynamo)
Expand Down
6 changes: 4 additions & 2 deletions .jenkins/pytorch/macos-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ fi

cross_compile_arm64() {
# Cross compilation for arm64
USE_DISTRIBUTED=1 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}

compile_x86_64() {
USE_DISTRIBUTED=1 WERROR=1 python setup.py bdist_wheel
USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel
}

build_lite_interpreter() {
Expand Down
28 changes: 21 additions & 7 deletions .jenkins/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
fi

if [[ "$BUILD_ENVIRONMENT" == *-tsan* ]]; then
export PYTORCH_TEST_WITH_TSAN=1
fi

if [[ $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then
export ATEN_CPU_CAPABILITY=default
elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
Expand Down Expand Up @@ -333,8 +337,11 @@ test_libtorch() {
TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_libtorch
mkdir -p $TEST_REPORTS_DIR

# Run JIT cpp tests
python test/cpp/jit/tests_setup.py setup
if [[ "$BUILD_ENVIRONMENT" != *-tsan* ]]; then
# Run JIT cpp tests
python test/cpp/jit/tests_setup.py setup
fi

if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
"$TORCH_BIN_DIR"/test_jit --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml
else
Expand All @@ -348,7 +355,10 @@ test_libtorch() {
"$TORCH_BIN_DIR"/test_lazy --gtest_output=xml:$TEST_REPORTS_DIR/test_lazy.xml
fi

python test/cpp/jit/tests_setup.py shutdown
if [[ "$BUILD_ENVIRONMENT" != *-tsan* ]]; then
python test/cpp/jit/tests_setup.py shutdown
fi

# Wait for background download to finish
wait
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy.
Expand Down Expand Up @@ -651,15 +661,15 @@ test_vec256() {

test_dynamo() {
pushd ../torchdynamo
pytest test
pytest test/dynamo
popd
}

test_docs_test() {
.jenkins/pytorch/docs-test.sh
}

if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* || "${BUILD_ENVIRONMENT}" == *-tsan* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
fi
Expand Down Expand Up @@ -693,8 +703,8 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR
install_torchvision
checkout_install_torchdynamo
test_dynamo_shard 2
# Skip running test in the dynamo repo to unblock the dynamo pin update
# test_dynamo
install_filelock
test_dynamo
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
Expand All @@ -720,6 +730,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
test_bazel
elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${BUILD_ENVIRONMENT}" == *-tsan* ]]; then
# TODO: TSAN check is currently failing with 415 data race warnings. This will
# be addressed later, the first PR can be merged first to setup the CI jobs
test_libtorch || true
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${TEST_CONFIG}" == *functorch* ]]; then
Expand Down
7 changes: 7 additions & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ include_patterns = [
exclude_patterns = [
'torch/include/**',
'torch/csrc/**',
'torch/_dynamo/**/*.py',
'torch/_inductor/**/*.py',
'torch/distributed/elastic/agent/server/api.py',
'torch/testing/_internal/**',
'torch/distributed/fsdp/fully_sharded_data_parallel.py',
Expand Down Expand Up @@ -703,6 +705,11 @@ include_patterns = [
'test/onnx/**/*.py',
'test/test_dynamo_cudagraphs.py',
'tools/**/*.py',
'torch/_dynamo/**/*.py',
'test/dynamo/**/*.py',
'benchmarks/dynamo/**/*.py',
'torch/_inductor/**/*.py',
'test/inductor/**/*.py',
'torch/onnx/**/*.py',
'torch/package/**/*.py',
'torch/_decomp/**/*.py',
Expand Down
2 changes: 2 additions & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@ cu_library(
"@cuda//:cublas",
"@cuda//:cufft",
"@cuda//:cusparse",
"@cutlass",
],
alwayslink = True,
)
Expand Down Expand Up @@ -1673,6 +1674,7 @@ cc_library(
] + if_cuda([
":torch_distributed_cuda",
"@cuda//:nvToolsExt",
"@cutlass",
]),
alwayslink = True,
)
Expand Down
10 changes: 0 additions & 10 deletions CITATION

This file was deleted.

Loading

0 comments on commit 92b3719

Please sign in to comment.