Skip to content

Commit

Permalink
Update on "Improve complex lerp performance"
Browse files Browse the repository at this point in the history
The complex lerp kernel uses `std::abs(z) < 0.5`, which involves
computing a sqrt. Comparing the squared magnitude against 0.25 instead
has much lower latency and so performs much better overall.

In a simple timeit benchmark I see more than 10x speedup on CPU for a 4096
element complex lerp, from 84 us to 6.7 us.

[ghstack-poisoned]
  • Loading branch information
peterbell10 committed Oct 13, 2022
2 parents aba1a66 + 6a8e13b commit 92b3719
Show file tree
Hide file tree
Showing 313 changed files with 12,061 additions and 6,060 deletions.
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchdynamo.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
92630636ef6c23f8bfa028c88c55259ec222fe3e
d4b62d69a2d5ae1823c1d6695237b44faee05acd
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
6e203b44098c3371689f56abc17b7c02bd51a261
12adc5426ef345ab7999661538a60da99dd85281
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/xla.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
d6b1971d4f40364cd6f2d23d047818d295971f7a
8332ac9072832c67b2680509bb1ef1abc5813c6a
4 changes: 1 addition & 3 deletions .github/scripts/fetch_latest_green_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,6 @@ def isGreen(commit: str, results: Dict[str, Any]) -> Tuple[bool, str]:
return (False, workflowName + " checks were not successful")
else:
regex[required_check] = True
if workflowName in ["periodic", "docker-release-builds"] and conclusion not in ["success", "skipped"]:
return (False, workflowName + " checks were not successful")

missing_workflows = [x for x in regex.keys() if not regex[x]]
if len(missing_workflows) > 0:
Expand All @@ -110,7 +108,7 @@ def main() -> None:
)
qlambda = rs.QueryLambda.retrieve(
'commit_jobs_batch_query',
version='15aba20837ae9d75',
version='8003fdfd18b64696',
workspace='commons')

commits = get_latest_commits()
Expand Down
1 change: 1 addition & 0 deletions .github/scripts/filter_test_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"nogpu_AVX512",
"nogpu_NO_AVX2",
"slow",
"tsan",
"xla",
}}

Expand Down
5 changes: 2 additions & 3 deletions .github/scripts/test_fetch_latest_green_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,12 @@ def test_necessary_failed(self, mock_get_commit_results: Any) -> None:

@mock.patch('fetch_latest_green_commit.get_commit_results', return_value=TestChecks().make_test_checks())
def test_skippable_failed(self, mock_get_commit_results: Any) -> None:
"Test with skippable job (ex: docker-release-builds) failing"
"Test with failing skippable jobs (ex: docker-release-builds) should pass"
workflow_checks = mock_get_commit_results()
workflow_checks = set_workflow_job_status(workflow_checks, "periodic", "skipped")
workflow_checks = set_workflow_job_status(workflow_checks, "docker-release-builds", "failed")
result = isGreen("sha", workflow_checks)
self.assertFalse(result[0])
self.assertEqual(result[1], "docker-release-builds checks were not successful")
self.assertTrue(result[0])

@mock.patch('fetch_latest_green_commit.get_commit_results', return_value={})
def test_no_workflows(self, mock_get_commit_results: Any) -> None:
Expand Down
4 changes: 3 additions & 1 deletion .github/scripts/trymerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1376,7 +1376,9 @@ def merge(pr_num: int, repo: GitRepo,
if (datetime.utcnow() - pr.last_pushed_at()).days > stale_pr_days:
if land_checks and not dry_run:
pr.delete_land_time_check_branch(repo)
raise RuntimeError("This PR is too stale; the last push date was more than 3 days ago. Please rebase and try again.")
raise RuntimeError(f"This PR is too stale; the last push date was more than {stale_pr_days} days ago. "
"Please rebase and try again. You can rebase by leaving the following comment on this PR:\n"
"`@pytorchbot rebase`")

start_time = time.time()
last_exception = ''
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/_mac-test-mps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
run: |
# shellcheck disable=SC1090
set -ex
${CONDA_INSTALL} numpy expecttest pyyaml
${CONDA_INSTALL} expecttest numpy=1.22.3 pyyaml=6.0
# As wheels are cross-compiled they are reported as x86_64 ones
ORIG_WHLNAME=$(ls -1 dist/*.whl); ARM_WHLNAME=${ORIG_WHLNAME/x86_64/arm64}; mv ${ORIG_WHLNAME} ${ARM_WHLNAME}
${CONDA_RUN} python3 -mpip install dist/*.whl
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/_rocm-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ jobs:
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_JIT_ENABLE_NVFUSER: 1
timeout-minutes: 270
run: |
set -x
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/docker-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
paths:
- Dockerfile
- docker.Makefile
- .github/workflows/docker-release.yml
push:
branches:
- nightly
Expand Down Expand Up @@ -47,7 +48,10 @@ jobs:
# [see note: pytorch repo ref]
# deep clone (fetch-depth 0) required for git merge-base
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: 'recursive'
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Setup SSH (Click me for login details)
Expand Down
25 changes: 25 additions & 0 deletions .github/workflows/periodic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,31 @@ jobs:
docker-image: ${{ needs.linux-bionic-cuda11_7-py3_7-gcc7-debug-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_7-gcc7-debug-build.outputs.test-matrix }}

linux-bionic-cuda11_6-py3_10-gcc7-sm86-build:
name: cuda11.6-py3.10-gcc7-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-bionic-cuda11_6-py3_10-gcc7-sm86-test:
name: cuda11.6-py3.10-gcc7-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda11_6-py3_10-gcc7-sm86-build
with:
build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-sm86-build.outputs.test-matrix }}

libtorch-linux-bionic-cuda11_7-py3_7-gcc7-build:
name: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
uses: ./.github/workflows/_linux-build.yml
Expand Down
20 changes: 20 additions & 0 deletions .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,26 @@ jobs:
docker-image: ${{ needs.linux-bionic-py3_7-clang9-slow-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-py3_7-clang9-slow-build.outputs.test-matrix }}

linux-focal-py3_7-clang7-tsan-build:
name: linux-focal-py3.7-clang7-tsan
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-py3.7-clang7-tsan
docker-image-name: pytorch-linux-focal-py3-clang7-asan
test-matrix: |
{ include: [
{ config: "tsan", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
]}
linux-focal-py3_7-clang7-tsan-test:
name: linux-focal-py3.7-clang7-tsan
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-py3_7-clang7-tsan-build
with:
build-environment: linux-focal-py3.7-clang7-tsan
docker-image: ${{ needs.linux-focal-py3_7-clang7-tsan-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_7-clang7-tsan-build.outputs.test-matrix }}

ios-12-5-1-x86-64:
name: ios-12-5-1-x86-64
uses: ./.github/workflows/_ios-build-test.yml
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ torch/test/
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
torch/version.py
minifier_launcher.py
# Root level file used in CI to specify certain env configs.
# E.g., see .circleci/config.yaml
env
Expand Down
29 changes: 29 additions & 0 deletions .jenkins/pytorch/build-tsan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# CI build script: compile PyTorch with ThreadSanitizer (TSan) enabled,
# using clang, and install the resulting wheel into the environment.

# Required environment variable: $BUILD_ENVIRONMENT
# (This is set by default in the Docker images we build, so you don't
# need to set it yourself.)

# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

# Log the compiler version up front so CI failures are easier to diagnose.
echo "Clang version:"
clang --version

python tools/stats/export_test_times.py

# Point CMake at the conda prefix when conda is present on the image.
if [ -n "$(which conda)" ]; then
export CMAKE_PREFIX_PATH=/opt/conda
fi

# Build a wheel with clang and -fsanitize=thread; CUDA and MKLDNN are
# disabled for this TSan build.
CC="clang" CXX="clang++" LDSHARED="clang --shared" \
CFLAGS="-fsanitize=thread" \
USE_TSAN=1 USE_CUDA=0 USE_MKLDNN=0 \
python setup.py bdist_wheel
python -mpip install dist/*.whl

print_sccache_stats

assert_git_not_dirty
4 changes: 4 additions & 0 deletions .jenkins/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang7-asan* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-asan.sh" "$@"
fi

if [[ "$BUILD_ENVIRONMENT" == *-clang7-tsan* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-tsan.sh" "$@"
fi

if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@"
fi
Expand Down
4 changes: 4 additions & 0 deletions .jenkins/pytorch/common_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ function clone_pytorch_xla() {
fi
}

# Install the 'filelock' package via pip (wrapper so test scripts can
# depend on it by name).
function install_filelock() {
pip_install filelock
}

function install_torchdynamo() {
local commit
commit=$(get_pinned_commit torchdynamo)
Expand Down
6 changes: 4 additions & 2 deletions .jenkins/pytorch/macos-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ fi

cross_compile_arm64() {
# Cross compilation for arm64
USE_DISTRIBUTED=1 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}

compile_x86_64() {
USE_DISTRIBUTED=1 WERROR=1 python setup.py bdist_wheel
USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel
}

build_lite_interpreter() {
Expand Down
28 changes: 21 additions & 7 deletions .jenkins/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
fi

if [[ "$BUILD_ENVIRONMENT" == *-tsan* ]]; then
export PYTORCH_TEST_WITH_TSAN=1
fi

if [[ $TEST_CONFIG == 'nogpu_NO_AVX2' ]]; then
export ATEN_CPU_CAPABILITY=default
elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
Expand Down Expand Up @@ -333,8 +337,11 @@ test_libtorch() {
TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_libtorch
mkdir -p $TEST_REPORTS_DIR

# Run JIT cpp tests
python test/cpp/jit/tests_setup.py setup
if [[ "$BUILD_ENVIRONMENT" != *-tsan* ]]; then
# Run JIT cpp tests
python test/cpp/jit/tests_setup.py setup
fi

if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
"$TORCH_BIN_DIR"/test_jit --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml
else
Expand All @@ -348,7 +355,10 @@ test_libtorch() {
"$TORCH_BIN_DIR"/test_lazy --gtest_output=xml:$TEST_REPORTS_DIR/test_lazy.xml
fi

python test/cpp/jit/tests_setup.py shutdown
if [[ "$BUILD_ENVIRONMENT" != *-tsan* ]]; then
python test/cpp/jit/tests_setup.py shutdown
fi

# Wait for background download to finish
wait
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy.
Expand Down Expand Up @@ -651,15 +661,15 @@ test_vec256() {

test_dynamo() {
pushd ../torchdynamo
pytest test
pytest test/dynamo
popd
}

test_docs_test() {
.jenkins/pytorch/docs-test.sh
}

if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* || "${BUILD_ENVIRONMENT}" == *-tsan* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
fi
Expand Down Expand Up @@ -693,8 +703,8 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR
install_torchvision
checkout_install_torchdynamo
test_dynamo_shard 2
# Skip running test in the dynamo repo to unblock the dynamo pin update
# test_dynamo
install_filelock
test_dynamo
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
Expand All @@ -720,6 +730,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
test_bazel
elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${BUILD_ENVIRONMENT}" == *-tsan* ]]; then
# TODO: TSAN check is currently failing with 415 data race warnings. This will
# be addressed later, the first PR can be merged first to setup the CI jobs
test_libtorch || true
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${TEST_CONFIG}" == *functorch* ]]; then
Expand Down
7 changes: 7 additions & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ include_patterns = [
exclude_patterns = [
'torch/include/**',
'torch/csrc/**',
'torch/_dynamo/**/*.py',
'torch/_inductor/**/*.py',
'torch/distributed/elastic/agent/server/api.py',
'torch/testing/_internal/**',
'torch/distributed/fsdp/fully_sharded_data_parallel.py',
Expand Down Expand Up @@ -703,6 +705,11 @@ include_patterns = [
'test/onnx/**/*.py',
'test/test_dynamo_cudagraphs.py',
'tools/**/*.py',
'torch/_dynamo/**/*.py',
'test/dynamo/**/*.py',
'benchmarks/dynamo/**/*.py',
'torch/_inductor/**/*.py',
'test/inductor/**/*.py',
'torch/onnx/**/*.py',
'torch/package/**/*.py',
'torch/_decomp/**/*.py',
Expand Down
2 changes: 2 additions & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@ cu_library(
"@cuda//:cublas",
"@cuda//:cufft",
"@cuda//:cusparse",
"@cutlass",
],
alwayslink = True,
)
Expand Down Expand Up @@ -1673,6 +1674,7 @@ cc_library(
] + if_cuda([
":torch_distributed_cuda",
"@cuda//:nvToolsExt",
"@cutlass",
]),
alwayslink = True,
)
Expand Down
10 changes: 0 additions & 10 deletions CITATION

This file was deleted.

Loading

0 comments on commit 92b3719

Please sign in to comment.