Skip to content

Commit

Permalink
Update on "[Inductor] [Quant] Enable lowering of quant per tensor and…
Browse files Browse the repository at this point in the history
… refactor quant pattern"


**Summary**
Per the discussion in #123444, the `decomposed quant/dequant` patterns changed after #123445, we can move the optimization of `decomposed quant/dequant` from inductor decomposition into lowering phase to avoid the changes. In this way, we can:

- Avoid the pattern matcher failure introduced in #123445
- Make the quantization pattern clearer in the pattern matcher phase, since the `quant/dequant` nodes have not been decomposed.

**Changes in this PR**

- Move optimization of `decomposed quant/dequant` from inductor decomposition into lowering phase.
- Corresponding changes in the quantization pattern matcher to ensure no bc-breaking.

**TestPlan**
```
python -u -m pytest -s -v test/inductor/test_mkldnn_pattern_matcher.py -k test_q
```


cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 voznesenskym penguinwu EikanWang Guobing-Chen zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
  • Loading branch information
leslie-fang-intel committed May 6, 2024
2 parents 88a1243 + 5b08a4f commit 8b0ea3a
Show file tree
Hide file tree
Showing 803 changed files with 23,576 additions and 55,756 deletions.
12 changes: 10 additions & 2 deletions .ci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=5.7
ROCM_VERSION=6.0
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
Expand All @@ -215,7 +215,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.0
ROCM_VERSION=6.1
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
Expand Down Expand Up @@ -306,6 +306,12 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping sccache due to the following issue
# https://github.com/pytorch/pytorch/issues/121559
SKIP_SCCACHE_INSTALL=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
*)
# Catch-all for builds that are not hardcoded.
Expand Down Expand Up @@ -399,6 +405,8 @@ DOCKER_BUILDKIT=1 docker build \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
Expand Down
1 change: 0 additions & 1 deletion .ci/docker/common/install_base.sh
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ install_centos() {
glibc-devel \
glibc-headers \
glog-devel \
hiredis-devel \
libstdc++-devel \
libsndfile-devel \
make \
Expand Down
11 changes: 0 additions & 11 deletions .ci/docker/common/install_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@ set -ex

install_ubuntu() {
apt-get update
apt-get install -y --no-install-recommends \
libhiredis-dev \
libleveldb-dev \
liblmdb-dev \
libsnappy-dev

# Cleanup
apt-get autoclean && apt-get clean
Expand All @@ -20,12 +15,6 @@ install_centos() {
# See http://fedoraproject.org/wiki/EPEL
yum --enablerepo=extras install -y epel-release

yum install -y \
hiredis-devel \
leveldb-devel \
lmdb-devel \
snappy-devel

# Cleanup
yum clean all
rm -rf /var/cache/yum
Expand Down
4 changes: 4 additions & 0 deletions .ci/docker/common/install_rocm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ install_ubuntu() {
rocprofiler-dev \
roctracer-dev

if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
fi

# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
# search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
Expand Down
6 changes: 3 additions & 3 deletions .ci/docker/requirements-ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -263,10 +263,10 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
#Pinned versions:
#test that import:

#wheel not found on aarch64, and source build requires rust
lintrunner==0.10.7 ; platform_machine == "x86_64"
#lintrunner is supported on aarch64-linux only from 0.12.4 version
lintrunner==0.12.5
#Description: all about linters!
#Pinned versions: 0.10.7
#Pinned versions: 0.12.5
#test that import:

rockset==1.0.3
Expand Down
6 changes: 5 additions & 1 deletion .ci/docker/ubuntu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,11 @@ RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL}

# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi
RUN rm install_cache.sh

# Add jni.h for java host build
COPY ./common/install_jni.sh install_jni.sh
Expand All @@ -188,7 +190,9 @@ ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

# Install LLVM dev version (Defined in the pytorch/builder github repository)
ARG SKIP_LLVM_SRC_BUILD_INSTALL
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi

# AWS specific CUDA build guidance
ENV TORCH_CUDA_ARCH_LIST Maxwell
Expand Down
23 changes: 21 additions & 2 deletions .ci/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,22 @@ if ! which conda; then
export USE_MKLDNN=0
fi
else
export CMAKE_PREFIX_PATH=/opt/conda
# CMAKE_PREFIX_PATH precedences
# 1. $CONDA_PREFIX, if defined. This follows the pytorch official build instructions.
# 2. /opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}, if ANACONDA_PYTHON_VERSION defined.
# This is for CI, which defines ANACONDA_PYTHON_VERSION but not CONDA_PREFIX.
# 3. $(conda info --base). The fallback value of pytorch official build
# instructions actually refers to this.
# Commonly this is /opt/conda/
if [[ -v CONDA_PREFIX ]]; then
export CMAKE_PREFIX_PATH=${CONDA_PREFIX}
elif [[ -v ANACONDA_PYTHON_VERSION ]]; then
export CMAKE_PREFIX_PATH="/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}"
else
# already checked by `! which conda`
CMAKE_PREFIX_PATH="$(conda info --base)"
export CMAKE_PREFIX_PATH
fi

# Workaround required for MKL library linkage
# https://github.com/pytorch/pytorch/issues/119557
Expand Down Expand Up @@ -376,4 +391,8 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
python tools/stats/export_test_times.py
fi

print_sccache_stats
# snadampal: skipping it till sccache support added for aarch64
# https://github.com/pytorch/pytorch/issues/121559
if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
print_sccache_stats
fi
30 changes: 29 additions & 1 deletion .ci/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,11 @@ if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
export PATH="$HOME/.local/bin:$PATH"
fi

if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
# TODO: revisit this once the CI is stabilized on aarch64 linux
export VALGRIND=OFF
fi

install_tlparse

# DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems
Expand Down Expand Up @@ -317,6 +322,7 @@ test_inductor_distributed() {
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation
pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
Expand Down Expand Up @@ -1152,11 +1158,33 @@ test_executorch() {
assert_git_not_dirty
}

test_linux_aarch64(){
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop --verbose

# Dynamo tests
python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose

# Inductor tests
python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \
inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \
inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose
}

if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
fi
if [[ "${TEST_CONFIG}" == *backward* ]]; then
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
test_linux_aarch64
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
test_forward_backward_compatibility
# Do NOT add tests after bc check tests, see its comment.
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
Expand Down
37 changes: 21 additions & 16 deletions .ci/pytorch/win-test-helpers/build_pytorch.bat
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,22 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol
set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers

call %INSTALLER_DIR%\install_magma.bat
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
if errorlevel 1 goto fail
if not errorlevel 0 goto fail

call %INSTALLER_DIR%\install_sccache.bat
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
if errorlevel 1 goto fail
if not errorlevel 0 goto fail

:: Miniconda has been installed as part of the Windows AMI with all the dependencies.
:: We just need to activate it here
call %INSTALLER_DIR%\activate_miniconda3.bat
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
if errorlevel 1 goto fail
if not errorlevel 0 goto fail

call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
if errorlevel 1 goto fail
if not errorlevel 0 goto fail

:: Override VS env here
pushd .
Expand All @@ -41,8 +41,8 @@ if "%VC_VERSION%" == "" (
) else (
call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION%
)
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
@echo on
popd

Expand All @@ -52,12 +52,12 @@ set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%

if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
exit /b 1
goto fail
)
rem version transformer, for example 10.1 to 10_1.
if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
exit /b 1
goto fail
)
set VERSION_SUFFIX=%CUDA_VERSION:.=_%
set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
Expand Down Expand Up @@ -101,8 +101,8 @@ if "%USE_CUDA%"=="1" (
:: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers
:: randomtemp.exe and sccache.exe into a batch file which CMake invokes.
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat"
cat %TMP_DIR%/bin/nvcc.bat
set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat
Expand All @@ -114,8 +114,8 @@ if "%USE_CUDA%"=="1" (
set

python setup.py bdist_wheel
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
sccache --show-stats
python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
(
Expand All @@ -135,3 +135,8 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps

sccache --show-stats --stats-format json | jq .stats > sccache-stats-%BUILD_ENVIRONMENT%-%OUR_GITHUB_JOB_ID%.json
sccache --stop-server

exit /b 0

:fail
exit /b 1
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2c4665ffbb64f03f5d18016d3398af4ac4da5f03
06ad737628abc3a1e617571dc03cbdd5b36ea96a
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/xla.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
58a412cb271a3f98ae2e01fd1d24bdbb66645d4e
73b915b55d96553a0e370b2bab01f47b8c2a9e7c
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ ciflow_push_tags:
- ciflow/binaries_wheel
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
- ciflow/periodic
Expand Down
25 changes: 20 additions & 5 deletions .github/scripts/generate_binary_build_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@
import os
from typing import Dict, List, Optional, Tuple

CUDA_ARCHES = ["11.8", "12.1"]
CUDA_ARCHES = ["11.8", "12.1", "12.4"]


CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1"}
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}


CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8", "12.4": "8"}


ROCM_ARCHES = ["5.7", "6.0"]
ROCM_ARCHES = ["6.0", "6.1"]


CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
Expand Down Expand Up @@ -58,6 +58,20 @@
"nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.4": (
"nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
}


Expand Down Expand Up @@ -324,7 +338,7 @@ def generate_wheels_matrix(
)

# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if arch_version in ["12.1", "11.8"] and os == "linux":
if arch_version in ["12.4", "12.1", "11.8"] and os == "linux":
ret.append(
{
"python_version": python_version,
Expand Down Expand Up @@ -367,5 +381,6 @@ def generate_wheels_matrix(
return ret


validate_nccl_dep_consistency("12.4")
validate_nccl_dep_consistency("12.1")
validate_nccl_dep_consistency("11.8")
2 changes: 1 addition & 1 deletion .github/templates/linux_binary_build_workflow.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ env:
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
PYTORCH_ROOT: /pytorch
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
SKIP_ALL_TESTS: 0
!{{ common.concurrency(build_environment) }}

jobs:
Expand Down
2 changes: 1 addition & 1 deletion .github/templates/macos_binary_build_workflow.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ env:
BUILD_ENVIRONMENT: !{{ build_environment }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SKIP_ALL_TESTS: 1
SKIP_ALL_TESTS: 0
{%- if cross_compile_arm64 %}
CROSS_COMPILE_ARM64: 1
{% endif %}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build-triton-wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
device: ["cuda", "rocm"]
include:
- device: "rocm"
rocm_version: "6.0"
rocm_version: "6.1"
- device: "cuda"
rocm_version: ""
timeout-minutes: 40
Expand Down
Loading

0 comments on commit 8b0ea3a

Please sign in to comment.