Update on "add Half support for sigmoid on CPU"
cc jgong5 XiaobingSuper sanchitintel ashokei jingxu10

[ghstack-poisoned]
mingfeima committed Apr 2, 2023
2 parents e67e848 + ac9a474 commit 38e0fab
Showing 445 changed files with 13,642 additions and 9,179 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
e650d3708be4dca12cc3491a2f8ab18ded47c368
46672772b46b103db7341c9e10fbad7f643557d4
6 changes: 3 additions & 3 deletions .ci/docker/common/install_conda.sh
@@ -57,12 +57,12 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
elif [ "$ANACONDA_PYTHON_VERSION" = "3.10" ]; then
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
elif [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then
conda_install numpy=1.19.2 ${CONDA_COMMON_DEPS}
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
elif [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
conda_install numpy=1.18.5 ${CONDA_COMMON_DEPS}
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
else
# Install `typing-extensions` for 3.7
conda_install numpy=1.18.5 ${CONDA_COMMON_DEPS} typing-extensions
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} typing-extensions
fi

# This is only supported in 3.8 upward
9 changes: 7 additions & 2 deletions .ci/docker/common/install_onnx.sh
@@ -12,8 +12,13 @@ pip_install \
mock==5.0.1 \
ninja==1.10.2 \
networkx==2.0 \
numpy==1.22.4 \
onnx==1.13.1 \
numpy==1.22.4

# TODO: use official onnx package once it's released
# for now, use the commit from 1.13.1-protobuf4.21 branch
pip_install "onnx@git+https://github.com/onnx/onnx@389b6bcb05b9479d149d29b2461fbffe8472ed14"

pip_install \
onnxruntime==1.14.0 \
parameterized==0.8.1 \
pytest-cov==4.0.0 \
12 changes: 8 additions & 4 deletions .ci/pytorch/test.sh
@@ -301,17 +301,23 @@ test_perf_for_dashboard() {
python "benchmarks/dynamo/$suite.py" \
--accuracy --"$dtype" --backend "$backend" "$@" \
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_training_cuda_accuracy.csv"
python "benchmarks/dynamo/$suite.py" \
--accuracy --"$dtype" --backend "$backend" --dynamic-shapes --dynamic-batch-only --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_training_cuda_accuracy.csv"

# Run performance test
# Skip dynamo-eager and aot-eager for performance test
# Run performance test for inductor with different configs
# TODO: add more configs here, e.g. dynamic-shapes, max-autotune, etc.
# TODO: add more configs here, e.g. max-autotune, etc.
python "benchmarks/dynamo/$suite.py" \
--performance --cold-start-latency --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_training_cuda_performance.csv"
python "benchmarks/dynamo/$suite.py" \
--performance --cold-start-latency --"$dtype" --backend "$backend" "$@" \
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_training_cuda_performance.csv"
python "benchmarks/dynamo/$suite.py" \
--performance --cold-start-latency --"$dtype" --backend "$backend" --dynamic-shapes --dynamic-batch-only --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_training_cuda_performance.csv"
done
}

@@ -587,9 +593,7 @@ test_distributed() {
"$TORCH_BIN_DIR"/TCPStoreTest --gtest_output=xml:$TEST_REPORTS_DIR/TCPStoreTest.xml

MPIEXEC=$(command -v mpiexec)
# TODO: this is disabled on GitHub Actions until this issue is resolved
# https://github.com/pytorch/pytorch/issues/60756
if [[ -n "$MPIEXEC" ]] && [[ -z "$GITHUB_ACTIONS" ]]; then
if [[ -n "$MPIEXEC" ]]; then
MPICMD="${MPIEXEC} -np 2 $TORCH_BIN_DIR/ProcessGroupMPITest"
eval "$MPICMD"
fi
14 changes: 10 additions & 4 deletions .flake8
@@ -1,20 +1,26 @@
[flake8]
enable-extensions = G
select = B,C,E,F,G,P,T4,W,B9
select = B,C,E,F,G,P,SIM1,T4,W,B9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
# fix these lints in the future
E275,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,
B007,B008,B017,B019,B020,B023,B024,B026,B027,B028,B903,B904,B905,B906,B907
# these ignores are from flake8-comprehensions; please fix!
C407,C417
C407
# these ignores are from flake8-logging-format; please fix!
G001,G002,G003,G004,G100,G101,G200,G201,G202
G004,G100,G101,G200,G201,G202
# these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM109,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# flake8-simplify code styles
SIM102,SIM103,SIM106,SIM112,
per-file-ignores =
__init__.py: F401
torch/utils/cpp_extension.py: B950
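
For context: the SIM1 entry added to `select` enables the flake8-simplify plugin, while the new SIM entries under `ignore` defer its findings for now. A minimal sketch (not part of the commit, assuming the standard flake8-simplify rule numbering) of two of the deferred checks and the rewrites they ask for:

    import contextlib
    import os

    # SIM105: a try/except/pass used only to swallow an error ...
    try:
        os.remove("scratch.txt")
    except FileNotFoundError:
        pass

    # ... is flagged in favor of contextlib.suppress
    with contextlib.suppress(FileNotFoundError):
        os.remove("scratch.txt")

    # SIM108: an if/else block that only assigns ...
    if os.environ.get("CI"):
        mode = "ci"
    else:
        mode = "local"

    # ... is flagged in favor of a conditional expression
    mode = "ci" if os.environ.get("CI") else "local"
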
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchbench.txt
@@ -1 +1 @@
0f02ca657f791d874c390af5eaab489b426336d3
159e58f0b36ee22e2b89d74bd7dc8a79376de01d
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
18a2e8eb5c6e30e2bc22416379b10f5dfaccc4d4
78c271974f94585f45cd696f66d08dae538a9207
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/xla.txt
@@ -1 +1 @@
015ebcba441dbd5dd21dc02ef12af2c29791a7f0
5444e06e5b851211af8a83e024c6703acfc095eb
2 changes: 1 addition & 1 deletion .github/requirements/conda-env-macOS-X64
@@ -1,6 +1,6 @@
mkl=2021.2.0
mkl-include=2021.2.0
numpy=1.18.5
numpy=1.21.2
pyyaml=5.3
setuptools=46.0.0
cmake=3.22.*
3 changes: 0 additions & 3 deletions .github/scripts/generate_binary_build_matrix.py
@@ -209,9 +209,6 @@ def generate_wheels_matrix(
if arch_version == "cpu" or arch_version == "cpu-cxx11-abi"
else arch_version
)
# Skip rocm 3.11 binaries for now as the docker image are not correct
if python_version == "3.11" and gpu_arch_type == "rocm":
continue

# special 11.7 wheels package without dependencies
# dependency downloaded via pip install
12 changes: 6 additions & 6 deletions .github/scripts/run_torchbench.py
@@ -119,7 +119,7 @@ def is_valid_ub_dir(ub_path: str) -> bool:
[os.path.join(ub_path, ubdir) for ubdir in os.listdir(ub_path)],
)
)
valid_ubs = list(map(lambda x: os.path.basename(x), ubs))
valid_ubs = [os.path.basename(x) for x in ubs]
return valid_ubs


@@ -130,13 +130,13 @@ def extract_models_from_pr(
userbenchmark_list = []
pr_list = []
with open(prbody_file, "r") as pf:
lines = map(lambda x: x.strip(), pf.read().splitlines())
lines = (x.strip() for x in pf.read().splitlines())
magic_lines = list(filter(lambda x: x.startswith(MAGIC_PREFIX), lines))
if magic_lines:
# Only the first magic line will be recognized.
pr_list = list(
map(lambda x: x.strip(), magic_lines[0][len(MAGIC_PREFIX) :].split(","))
)
pr_list = [
x.strip() for x in magic_lines[0][len(MAGIC_PREFIX) :].split(",")
]
valid_models = get_valid_models(torchbench_path)
valid_ubs = get_valid_userbenchmarks(torchbench_path)
for pr_bm in pr_list:
@@ -158,7 +158,7 @@ def extract_models_from_pr(
def find_torchbench_branch(prbody_file: str) -> str:
branch_name: str = ""
with open(prbody_file, "r") as pf:
lines = map(lambda x: x.strip(), pf.read().splitlines())
lines = (x.strip() for x in pf.read().splitlines())
magic_lines = list(
filter(lambda x: x.startswith(MAGIC_TORCHBENCH_PREFIX), lines)
)
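
The rewrites in this file replace map/lambda calls with list and generator comprehensions, which lines up with C417 (unnecessary use of map) being dropped from the flake8 ignore list earlier in this commit. A minimal sketch of the equivalence, using hypothetical data that is not part of the commit:

    names = ["  alpha ", "beta", " gamma"]  # hypothetical input, for illustration only

    stripped_old = list(map(lambda x: x.strip(), names))  # old style, flagged by C417
    stripped_new = [x.strip() for x in names]             # style used by this commit

    assert stripped_old == stripped_new == ["alpha", "beta", "gamma"]
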
24 changes: 24 additions & 0 deletions .github/scripts/stop_runner_service.sh
@@ -0,0 +1,24 @@
#!/bin/bash

set +e
set -x

# Get the service name
RUNNER_SERVICE=$(cat "${RUNNER_WORKSPACE}/../../.service")
echo "GitHub self-hosted runner service: ${RUNNER_SERVICE}"

if [[ -n "${RUNNER_SERVICE}" ]]; then
echo "The self-hosted runner has encountered an unrecoverable error and will be shutdown"

pushd "${RUNNER_WORKSPACE}/../../"
# Stop it to prevent the runner from receiving new jobs
sudo ./svc.sh stop
# then uninstall the service
sudo ./svc.sh uninstall
# Finally, shutting down the runner completely
sudo shutdown -P now
# NB: In my test, cleaning up and shutting down the runner this way would already
# remove the runner from the list of registered runners. Calling config.sh remove
# seems redundant as it would require an org token to use, which I don't want to
# add as yet another secret to the CI if there is no need
fi
5 changes: 3 additions & 2 deletions .github/scripts/trymerge.py
@@ -1802,8 +1802,9 @@ def merge(
elif (datetime.utcnow() - cast(datetime, pr.last_pushed_at())).days > stale_pr_days:
raise RuntimeError(
f"This PR is too stale; the last push date was more than {stale_pr_days} days ago. "
"Please rebase and try again. You can rebase by leaving the following comment on this PR:\n"
"`@pytorchbot rebase`"
"Please rebase and try again. You can rebase and merge by leaving the following comment on this PR:\n"
"`@pytorchbot merge -r`\n"
"Or just rebase by leaving `@pytorchbot rebase` comment"
)

start_time = time.time()
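
For reference, the staleness gate shown in the context above compares the time since the last push against `stale_pr_days`; a minimal sketch of that check with hypothetical values (not part of the commit):

    from datetime import datetime, timedelta

    stale_pr_days = 3
    last_pushed_at = datetime.utcnow() - timedelta(days=5)  # hypothetical last push time

    if (datetime.utcnow() - last_pushed_at).days > stale_pr_days:
        # The updated message now suggests `@pytorchbot merge -r` (rebase and merge)
        # or a plain `@pytorchbot rebase`.
        print("This PR is too stale; please rebase and try again.")
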
2 changes: 1 addition & 1 deletion .github/workflows/_bazel-build-test.yml
@@ -210,7 +210,7 @@ jobs:
- name: Print remaining test logs
shell: bash
if: always()
if: always() && steps.test.conclusion
run: |
cat test/**/*.log || true
35 changes: 35 additions & 0 deletions .github/workflows/_docs.yml
@@ -178,6 +178,41 @@ jobs:
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/${{ github.event.pull_request.number }}/functorchdocs

# The three upload steps below duplicate the upload from above, but to a different path. This is needed since we
# are in the process of changing the path, but want to keep the disruption to a minimum.
# See https://github.com/pytorch/test-infra/issues/3894
# After a grace period the s3-prefix should start with pytorch/pytorch/
- name: Upload Python Docs Preview (forward compatibility)
uses: seemethere/upload-artifact-s3@v5
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: pytorch.github.io/docs/master/
s3-prefix: pytorch/pytorch/pytorch/${{ github.event.pull_request.number }}

- name: Upload C++ Docs Preview (forward compatibility)
uses: seemethere/upload-artifact-s3@v5
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
if-no-files-found: error
s3-bucket: doc-previews
path: cppdocs/
s3-prefix: pytorch/pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs

- name: Upload functorch Docs Preview (forward compatibility)
uses: seemethere/upload-artifact-s3@v5
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs

- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
36 changes: 33 additions & 3 deletions .github/workflows/_linux-test.yml
@@ -90,10 +90,9 @@ jobs:
docker-image: ${{ inputs.docker-image }}

- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
with:
driver-version: "525.85.05"

- name: Lock NVIDIA A100 40GB Frequency
run: |
@@ -227,7 +226,7 @@
- name: Print remaining test logs
shell: bash
if: always()
if: always() && steps.test.conclusion
run: |
cat test/**/*.log || true
@@ -272,3 +271,34 @@ jobs:
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()

# NB: We are currently having an intermittent GPU-related issue on G5 runners with
# A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
# not seem to help. Here are some symptoms:
# * Calling nvidia-smi times out after 60 seconds
# * Running nvidia-smi fails with an "unable to determine the device handle for GPU:
#   unknown error" message
# * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
# * Running docker --gpus all fails with an error response from the daemon
#
# As both the root cause and recovery path are unclear, let's take the runner out of
# service so that it doesn't get any more jobs
- name: Check NVIDIA driver installation step
if:
failure() &&
((steps.install-nvidia-driver.conclusion && steps.install-nvidia-driver.conclusion == 'failure') || (contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')))
shell: bash
env:
RUNNER_WORKSPACE: ${{ runner.workspace }}
run: |
set +e
set -x
nvidia-smi
NVIDIA_SMI_STATUS=$?
# These are acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "NVIDIA driver installation has failed, shutting down the runner..."
.github/scripts/stop_runner_service.sh
fi
2 changes: 1 addition & 1 deletion .github/workflows/_mac-test-mps.yml
@@ -126,7 +126,7 @@ jobs:
- name: Print remaining test logs
shell: bash
if: always()
if: always() && steps.test.conclusion
run: |
cat test/**/*.log || true
2 changes: 1 addition & 1 deletion .github/workflows/_mac-test.yml
@@ -185,7 +185,7 @@ jobs:
- name: Print remaining test logs
shell: bash
if: always()
if: always() && steps.test.conclusion
run: |
cat test/**/*.log || true
2 changes: 1 addition & 1 deletion .github/workflows/_rocm-test.yml
@@ -214,7 +214,7 @@ jobs:
- name: Print remaining test logs
shell: bash
if: always()
if: always() && steps.test.conclusion
run: |
cat test/**/*.log || true
2 changes: 1 addition & 1 deletion .github/workflows/_win-test.yml
@@ -201,7 +201,7 @@ jobs:
- name: Print remaining test logs
shell: bash
if: always()
if: always() && steps.test.conclusion
run: |
cat test/**/*.log || true
