Skip to content

Commit

Permalink
Update on "[dtensor][7/n] remove reduction rule"
Browse files Browse the repository at this point in the history
[ghstack-poisoned]
  • Loading branch information
wanchaol committed Sep 15, 2023
2 parents cf2a5cf + b875b80 commit c51b2ff
Show file tree
Hide file tree
Showing 480 changed files with 18,816 additions and 16,272 deletions.
43 changes: 0 additions & 43 deletions .ci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -129,35 +129,6 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7-inductor-benchmarks)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
Expand Down Expand Up @@ -244,17 +215,6 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3.8-gcc7)
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
TRITON=yes
DOCS=yes
;;
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=11
Expand Down Expand Up @@ -354,14 +314,11 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi

# Build image
# TODO: build-arg THRIFT is not turned on for any image, remove it once we confirm
# it's no longer needed.
docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
--build-arg "PROTOBUF=${PROTOBUF:-}" \
--build-arg "THRIFT=${THRIFT:-}" \
--build-arg "LLVMDEV=${LLVMDEV:-}" \
--build-arg "DB=${DB:-}" \
--build-arg "VISION=${VISION:-}" \
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/common/install_onnx.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pip_install \
transformers==4.32.1

pip_install coloredlogs packaging
retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230908001
retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230912006

pip_install onnx==1.14.1
pip_install onnxscript-preview==0.1.0.dev20230828 --no-deps
Expand Down
14 changes: 0 additions & 14 deletions .ci/docker/common/install_thrift.sh

This file was deleted.

5 changes: 5 additions & 0 deletions .ci/docker/requirements-ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,3 +275,8 @@ z3-solver==4.12.2.0
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:

tensorboard==2.13.0
#Description: Also included in .ci/docker/requirements-docs.txt
#Pinned versions:
#test that import: test_tensorboard
7 changes: 0 additions & 7 deletions .ci/docker/ubuntu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,6 @@ ARG LLVMDEV
COPY ./common/install_clang.sh install_clang.sh
RUN bash ./install_clang.sh && rm install_clang.sh

# (optional) Install thrift.
ARG THRIFT
COPY ./common/install_thrift.sh install_thrift.sh
RUN if [ -n "${THRIFT}" ]; then bash ./install_thrift.sh; fi
RUN rm install_thrift.sh
ENV INSTALLED_THRIFT ${THRIFT}

# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
Expand Down
8 changes: 8 additions & 0 deletions .ci/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* && -z "$TORCH_CUDA_ARCH_LIST" ]]; then
exit 1
fi

# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ "$TORCH_CUDA_ARCH_LIST" == *"8.6"* || "$TORCH_CUDA_ARCH_LIST" == *"8.0"* ]]; then
echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
fi

if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
export CC=clang
export CXX=clang++
Expand Down
2 changes: 1 addition & 1 deletion .ci/pytorch/win-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi

# TODO: Move both of them to Windows AMI
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0

# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver
Expand Down
4 changes: 2 additions & 2 deletions .circleci/scripts/binary_populate_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,8 @@ EOL

# nproc doesn't exist on darwin
if [[ "$(uname)" != Darwin ]]; then
# Because most Circle executors only have 20 CPUs, using more causes OOMs w/ Ninja and nvcc parallelization
MEMORY_LIMIT_MAX_JOBS=18
# This was lowered from 18 to 12 to avoid OOMs when compiling FlashAttentionV2
MEMORY_LIMIT_MAX_JOBS=12
NUM_CPUS=$(( $(nproc) - 2 ))

# Defaults here for **binary** linux builds so they can be changed in one place
Expand Down
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchbench.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
9371b9e13c826f3930e54346b4d619cb59182f68
0a64c552d35dfb9252215a7e3c794a16fff6cb0b
1 change: 1 addition & 0 deletions .github/merge_rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- docs/source/_static/img/onnx/**
- scripts/onnx/**
- test/onnx/**
- test/onnx_caffe2/**
- tools/onnx/**
- torch/_dynamo/backends/onnxrt.py
- torch/_C/__init__.pyi.in
Expand Down
1 change: 1 addition & 0 deletions .github/requirements/pip-requirements-macOS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ sympy==1.11.1
pytest-cpp==2.3.0
rockset==1.0.3
z3-solver==4.12.2.0
tensorboard==2.13.0
13 changes: 7 additions & 6 deletions .github/scripts/filter_test_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,16 +410,17 @@ def process_jobs(
if target_job in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME):
target_cfg = m.group("cfg")

return _filter_jobs(
# NB: There can be multiple unstable configurations, i.e. inductor, inductor_huggingface
test_matrix = _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
target_cfg=target_cfg,
)

warnings.warn(
f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
+ f"but the name {target_job_cfg} is invalid"
)
else:
warnings.warn(
f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
+ f"but the name {target_job_cfg} is invalid"
)

# Found no matching target, return the same input test matrix
return test_matrix
Expand Down
57 changes: 56 additions & 1 deletion .github/scripts/test_filter_test_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,30 @@
"manywheel-py3_8-cuda11_8-build",
"",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor)": [
"pytorchbot",
"107079",
"https://github.com/pytorch/pytorch/issues/107079",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor)",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface)": [
"pytorchbot",
"109153",
"https://github.com/pytorch/pytorch/issues/109153",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor_huggingface)",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface_dynamic)": [
"pytorchbot",
"109154",
"https://github.com/pytorch/pytorch/issues/109154",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor_huggingface_dynamic)",
],
}

MOCKED_PR_INFO = {
Expand Down Expand Up @@ -569,6 +593,37 @@ def test_mark_unstable_jobs(self, mock_download_json: Any) -> None:
"expected": '{"include": [{"config": "default", "unstable": "unstable"}]}',
"description": "Both binary build and test jobs are unstable",
},
{
"workflow": "inductor",
"job_name": "cuda12.1-py3.10-gcc9-sm86 / build",
"test_matrix": """
{ include: [
{ config: "inductor" },
{ config: "inductor_huggingface", shard: 1 },
{ config: "inductor_huggingface", shard: 2 },
{ config: "inductor_timm", shard: 1 },
{ config: "inductor_timm", shard: 2 },
{ config: "inductor_torchbench" },
{ config: "inductor_huggingface_dynamic" },
{ config: "inductor_torchbench_dynamic" },
{ config: "inductor_distributed" },
]}
""",
"expected": """
{ "include": [
{ "config": "inductor", "unstable": "unstable" },
{ "config": "inductor_huggingface", "shard": 1, "unstable": "unstable" },
{ "config": "inductor_huggingface", "shard": 2, "unstable": "unstable" },
{ "config": "inductor_timm", "shard": 1 },
{ "config": "inductor_timm", "shard": 2 },
{ "config": "inductor_torchbench" },
{ "config": "inductor_huggingface_dynamic", "unstable": "unstable" },
{ "config": "inductor_torchbench_dynamic" },
{ "config": "inductor_distributed" }
]}
""",
"description": "Marking multiple unstable configurations",
},
]

for case in testcases:
Expand All @@ -577,7 +632,7 @@ def test_mark_unstable_jobs(self, mock_download_json: Any) -> None:
test_matrix = yaml.safe_load(case["test_matrix"])

filtered_test_matrix = mark_unstable_jobs(workflow, job_name, test_matrix)
self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
self.assertEqual(json.loads(case["expected"]), filtered_test_matrix)

@mock.patch("subprocess.check_output")
def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/build-triton-wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ jobs:
needs: build-wheel
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
steps:
- uses: actions/checkout@v3

Expand Down Expand Up @@ -244,7 +244,7 @@ jobs:
needs: build-conda
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
steps:
- uses: actions/checkout@v3

Expand Down Expand Up @@ -283,7 +283,7 @@ jobs:
run: |
set -ex
if [[ "${UPLOAD_CHANNEL}" = "nightly" ]]; then
if [[ "${UPLOAD_CHANNEL:-nightly}" == "nightly" ]]; then
export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}"
else
export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN_TEST}"
Expand Down
24 changes: 0 additions & 24 deletions .github/workflows/update_s3_htmls.yml

This file was deleted.

16 changes: 0 additions & 16 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -195,37 +195,21 @@ include_patterns = [
exclude_patterns = [
'**/fb/**',
'torch/_inductor/index_propagation.py',
'torch/_inductor/coordinate_descent_tuner.py',
'torch/_inductor/debug.py',
'torch/_inductor/hooks.py',
'torch/_inductor/bounds.py',
'torch/_inductor/config.py',
'torch/_inductor/ir.py',
'torch/_inductor/codecache.py',
'torch/_inductor/test_operators.py',
'torch/_inductor/inductor_prims.py',
'torch/_inductor/scheduler.py',
'torch/_inductor/exc.py',
'torch/_inductor/sizevars.py',
'torch/_inductor/triton_helpers.py',
'torch/_inductor/freezing.py',
'torch/_inductor/pattern_matcher.py',
'torch/_inductor/fx_utils.py',
'torch/_inductor/virtualized.py',
'torch/_inductor/cuda_properties.py',
'torch/_inductor/codegen/triton_foreach.py',
'torch/_inductor/codegen/__init__.py',
'torch/_inductor/codegen/cpp.py',
'torch/_inductor/codegen/triton.py',
'torch/_inductor/fx_passes/split_cat.py',
'torch/_inductor/fx_passes/binary_folding.py',
'torch/_inductor/fx_passes/replace_random.py',
'torch/_inductor/fx_passes/joint_graph.py',
'torch/_inductor/fx_passes/pad_mm.py',
'torch/_inductor/fx_passes/__init__.py',
'torch/_inductor/fx_passes/group_batch_fusion.py',
'torch/_inductor/fx_passes/pre_grad.py',
'torch/_inductor/fx_passes/freezing_patterns.py',
]
command = [
'python3',
Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LINK_WHAT_YOU_USE TRUE)

# One variable that determines whether the current cmake process is being run
# with the main Caffe2 library. This is useful for building modules - if
Expand Down Expand Up @@ -730,7 +731,7 @@ include(cmake/Dependencies.cmake)
cmake_dependent_option(
USE_FLASH_ATTENTION
"Whether to build the flash_attention kernel for scaled dot product attention" ON
"USE_CUDA AND NOT ROCM AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)
"USE_CUDA AND NOT ROCM AND NOT MSVC AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)

# Flash Attention2 will error while building for sm52 while Mem Eff Attention won't
cmake_dependent_option(
Expand Down
Loading

0 comments on commit c51b2ff

Please sign in to comment.