Skip to content

Commit 8c45856

Browse files
committed
Update on "Support of AllPermute for redistribution"
***Not Ready For Review!*** ***Not Ready For Review!*** ***Not Ready For Review!*** ### Summary Introduce the `AllPermute` collective operation as mentioned in https://arxiv.org/pdf/2112.01075 "section 2.6 Collective operations". ### What is AllPermute? AllPermute can transform any 𝜏1 to 𝜏2 if their local and global shapes match. For example: Given a mesh with sizes {X:4, Y:4, Z:16}, we have - example 1: [32{X,Y}512, 128] -> [32{Y,X}512, 128] - example 2: [128{Y}512, 32{X}128] -> [128{X}512, 32{Y}128] - example 3: [32{X,Y}512, 128] -> [32{Z}512, 128] Note: annotation borrowed from https://arxiv.org/pdf/2112.01075 "section 2.1 Distributed array types" ### Why do we need AllPermute? With AllPermute, we can eliminate some AllGather ops during redistribution. This plays an important role in reducing the memory overhead. In theory, at most one AllPermute is needed to redistribute from any 𝜏1 to 𝜏2. The `AllPermute` can be performed as the final step, or moved before the last `AllGather` to minimize the amount of data relocated between shards in the `AllPermute`. Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * __->__ #163772 * #162294 * #160903 * #160266 cc H-Huang awgu wanchaol fegin fduwjj wz337 wconstab d4l3k pragupta ezyang msaroufim dcci [ghstack-poisoned]
2 parents 098fd1a + 6b5e9b1 commit 8c45856

File tree

284 files changed

+5738
-3584
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

284 files changed

+5738
-3584
lines changed

.ci/docker/almalinux/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ RUN bash ./install_cuda.sh 13.0
6969
ENV DESIRED_CUDA=13.0
7070

7171
FROM ${ROCM_IMAGE} as rocm
72-
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
72+
ARG PYTORCH_ROCM_ARCH
73+
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
7374
ADD ./common/install_mkl.sh install_mkl.sh
7475
RUN bash ./install_mkl.sh && rm install_mkl.sh
7576
ENV MKLROOT /opt/intel

.ci/docker/almalinux/build.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ case ${DOCKER_TAG_PREFIX} in
3636
;;
3737
rocm*)
3838
BASE_TARGET=rocm
39+
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
40+
# add gfx950 conditionally starting in ROCm 7.0
41+
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
42+
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
43+
fi
44+
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
3945
;;
4046
*)
4147
echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"

.ci/docker/libtorch/build.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,16 @@ case ${DOCKER_TAG_PREFIX} in
4040
;;
4141
rocm*)
4242
# we want the patch version of 6.4 instead
43-
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
43+
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
4444
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
4545
fi
4646
BASE_TARGET=rocm
4747
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
4848
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
49+
# add gfx950 conditionally starting in ROCm 7.0
50+
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
51+
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
52+
fi
4953
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
5054
;;
5155
*)

.ci/docker/manywheel/build.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,14 +82,18 @@ case ${image} in
8282
;;
8383
manylinux2_28-builder:rocm*)
8484
# we want the patch version of 6.4 instead
85-
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
85+
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
8686
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
8787
fi
8888
TARGET=rocm_final
8989
MANY_LINUX_VERSION="2_28"
9090
DEVTOOLSET_VERSION="11"
9191
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
9292
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
93+
# add gfx950 conditionally starting in ROCm 7.0
94+
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
95+
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
96+
fi
9397
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
9498
;;
9599
manylinux2_28-builder:xpu)

.ci/magma-rocm/Makefile

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
SHELL=/usr/bin/env bash
22

33
DOCKER_CMD ?= docker
4-
DESIRED_ROCM ?= 6.4
4+
DESIRED_ROCM ?= 7.0
55
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
66
PACKAGE_NAME = magma-rocm
77
# inherit this from underlying docker image, do not pass this env var to docker
8-
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
8+
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
99

1010
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
1111
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
1616
magma-rocm/build_magma.sh
1717

1818
.PHONY: all
19+
all: magma-rocm70
1920
all: magma-rocm64
2021
all: magma-rocm63
2122

@@ -24,6 +25,11 @@ clean:
2425
$(RM) -r magma-*
2526
$(RM) -r output
2627

28+
.PHONY: magma-rocm70
29+
magma-rocm70: DESIRED_ROCM := 7.0
30+
magma-rocm70:
31+
$(DOCKER_RUN)
32+
2733
.PHONY: magma-rocm64
2834
magma-rocm64: DESIRED_ROCM := 6.4
2935
magma-rocm64:

.ci/magma-rocm/build_magma.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ set -eou pipefail
66
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
77
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
88

9-
# Version 2.7.2 + ROCm related updates
10-
MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
9+
# https://github.com/icl-utk-edu/magma/pull/65
10+
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
1111

1212
# Folders for the build
1313
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
2020

2121
# Fetch magma sources and verify checksum
2222
pushd ${PACKAGE_DIR}
23-
git clone https://bitbucket.org/icl/magma.git
23+
git clone https://github.com/jeffdaily/magma
2424
pushd magma
2525
git checkout ${MAGMA_VERSION}
2626
popd

.ci/pytorch/cpp_doc_push_script.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \
5858

5959
# Build the docs
6060
pushd docs/cpp
61-
time make VERBOSE=1 html -j
61+
time make VERBOSE=1 html
6262

6363
popd
6464
popd

.ci/pytorch/macos-build.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,11 @@ fi
3535

3636
print_cmake_info
3737
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
38-
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
38+
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
39+
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
3940
else
40-
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
41-
# backends (specifically the gloo backend), so test that this case works too
41+
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
42+
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
4243
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
4344
fi
4445
if which sccache > /dev/null; then

.ci/pytorch/macos-test.sh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
1313
fi
1414
popd
1515

16-
python -mpip install -r requirements.txt
17-
1816
# enable debug asserts in serialization
1917
export TORCH_SERIALIZATION_DEBUG=1
2018

21-
python -mpip install --no-input -r requirements.txt
22-
2319
setup_test_python() {
2420
# The CircleCI worker hostname doesn't resolve to an address.
2521
# This environment variable makes ProcessGroupGloo default to

.ci/wheel/build_wheel.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,7 @@ source ~/${desired_python}-build/bin/activate
177177
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
178178
retry brew install libomp
179179

180-
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
181-
# is build as part of tensorpipe submodule
180+
# USE_DISTRIBUTED=1 on macOS needs libuv, which is built as part of the tensorpipe submodule
182181
export USE_DISTRIBUTED=1
183182

184183
export USE_MKLDNN=OFF

0 commit comments

Comments
 (0)