Skip to content

Commit

Permalink
Update on "[Inductor][CPP] Enable Quantized Linear GEMM Template with…
Browse files Browse the repository at this point in the history
… FP32 output"


**Summary**
Support the int8 GEMM template, using the reference MicroInt8GEMM kernel, for the following case:

- Activation dtype: uint8
- Weight dtype: int8
- Output dtype: float32/bfloat16
- Post Op Fusion: without unary post operator fusion

**Test Plan**
```
clear && python -u -m pytest -s -v test/inductor/test_cpu_select_algorithm.py -k test_quantized_linear_with_pointwise
```

**Next Step**
- [ ] Unary post op fusion
- [ ] Int8 output
- [ ] Binary Fusion
- [ ] AMX int8 MicroGEMM Kernel

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
  • Loading branch information
leslie-fang-intel committed Jun 21, 2024
2 parents 26c0111 + 5d22715 commit 8a9a5ba
Show file tree
Hide file tree
Showing 332 changed files with 9,530 additions and 9,002 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/executorch.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
d4b3e5cc607e97afdba79dc90f8ef968142f347c
172574a6be5910a4609e4ed1bef2b6b8475ddb3d
14 changes: 9 additions & 5 deletions .ci/docker/common/install_executorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,24 @@ install_conda_dependencies() {

install_pip_dependencies() {
pushd executorch/.ci/docker
# Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
# binaries later, ExecuTorch only needs CPU
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# Install all Python dependencies
pip_install -r requirements-ci.txt
popd
}

setup_executorch() {
pushd executorch
source .ci/scripts/utils.sh
# Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh

install_flatc_from_source
pip_install .
export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

# Make sure that all the newly generated files are owned by Jenkins
chown -R jenkins .
as_jenkins .ci/scripts/setup-linux.sh cmake
popd
}

Expand Down
25 changes: 20 additions & 5 deletions .ci/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -284,12 +284,26 @@ else
# Which should be backward compatible with Numpy-1.X
python -mpip install --pre numpy==2.0.0rc1
fi
WERROR=1 python setup.py bdist_wheel

WERROR=1 python setup.py clean

if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
else
WERROR=1 python setup.py bdist_wheel
fi
else
python setup.py clean
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
python setup.py bdist_wheel
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
exit 1
else
python setup.py bdist_wheel
fi
fi
pip_install_whl "$(echo dist/*.whl)"

Expand Down Expand Up @@ -328,9 +342,10 @@ else
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"

mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
Expand All @@ -343,7 +358,7 @@ else
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
Expand All @@ -355,7 +370,7 @@ else
python --version
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
Expand Down
22 changes: 21 additions & 1 deletion .ci/pytorch/common_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,29 @@ function assert_git_not_dirty() {
function pip_install_whl() {
# This is used to install PyTorch and other build artifacts wheel locally
# without using any network connection
python3 -mpip install --no-index --no-deps "$@"

# Convert the input arguments into an array
local args=("$@")

# Check if the first argument contains multiple paths separated by spaces
if [[ "${args[0]}" == *" "* ]]; then
# Split the string by spaces into an array
IFS=' ' read -r -a paths <<< "${args[0]}"
# Loop through each path and install individually
for path in "${paths[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
else
# Loop through each argument and install individually
for path in "${args[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
fi
}


function pip_install() {
# retry 3 times
# old versions of pip don't have the "--progress-bar" flag
Expand Down
53 changes: 40 additions & 13 deletions .ci/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,9 @@ test_python_shard() {

# Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
# shellcheck disable=SC2086

# modify LD_LIBRARY_PATH to ensure it has the conda env.
# This set of tests has been shown to be buggy without it for the split-build
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION

assert_git_not_dirty
Expand Down Expand Up @@ -347,17 +350,31 @@ test_inductor_distributed() {
assert_git_not_dirty
}

test_inductor() {
test_inductor_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
exit 1
fi

python tools/dynamo/verify_dynamo.py
python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
python test/run_test.py --inductor \
--include test_modules test_ops test_ops_gradients test_torch \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose

# Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor --verbose
python test/run_test.py \
--include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
}

test_inductor_aoti() {
# docker build uses bdist_wheel which does not work with test_aot_inductor
# TODO: need a faster way to build
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
fi
}

Expand Down Expand Up @@ -1174,15 +1191,21 @@ test_executorch() {

pushd /executorch

# NB: We need to build ExecuTorch runner here and not inside the Docker image
# because it depends on PyTorch
export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
# shellcheck disable=SC1091
source .ci/scripts/utils.sh
build_executorch_runner "cmake"
source .ci/scripts/setup-linux.sh cmake

echo "Run ExecuTorch unit tests"
pytest -v -n auto
# shellcheck disable=SC1091
LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh

echo "Run ExecuTorch regression tests for some models"
# NB: This is a sample model, more can be added here
export PYTHON_EXECUTABLE=python
# TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
# shellcheck disable=SC1091
source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
Expand Down Expand Up @@ -1290,10 +1313,14 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
install_torchvision
test_inductor_cpp_wrapper_abi_compatible
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_inductor
test_inductor_shard 1
test_inductor_aoti
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard 1
Expand Down
12 changes: 6 additions & 6 deletions .circleci/scripts/binary_populate_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ if [[ -z "$DOCKER_IMAGE" ]]; then
if [[ "$PACKAGE_TYPE" == conda ]]; then
export DOCKER_IMAGE="pytorch/conda-cuda"
elif [[ "$DESIRED_CUDA" == cpu ]]; then
export DOCKER_IMAGE="pytorch/manylinux-cpu"
export DOCKER_IMAGE="pytorch/manylinux:cpu"
else
export DOCKER_IMAGE="pytorch/manylinux-cuda${DESIRED_CUDA:2}"
export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
fi
fi

Expand Down Expand Up @@ -75,9 +75,9 @@ export PYTORCH_BUILD_NUMBER=1
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)

# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append the TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
# Only linux Python < 3.13 are supported wheels for triton
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
Expand All @@ -87,11 +87,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:
fi

# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}"
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
Expand Down
21 changes: 20 additions & 1 deletion .github/actions/linux-build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ inputs:
description: Hugging Face Hub token
required: false
default: ""
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
outputs:
docker-image:
value: ${{ steps.calculate-docker-image.outputs.docker-image }}
Expand Down Expand Up @@ -144,6 +151,7 @@ runs:
DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
shell: bash
run: |
# detached container should get cleaned up by teardown_ec2_linux
Expand All @@ -163,6 +171,7 @@ runs:
-e PR_LABELS \
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e USE_SPLIT_BUILD \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
Expand All @@ -183,14 +192,24 @@ runs:
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped'
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build != 'true'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}

- name: Store PyTorch Build Artifacts on S3 for split build
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build == 'true'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}

- name: Upload sccache stats
if: steps.build.outcome != 'skipped'
uses: seemethere/upload-artifact-s3@v5
Expand Down
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/torchbench.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0dab1dd97709096e8129f8a08115ee83f64f2194
23512dbebd44a11eb84afbf53c3c071dd105297e
4 changes: 0 additions & 4 deletions .github/scripts/generate_binary_build_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,10 +347,6 @@ def generate_wheels_matrix(
for python_version in python_versions:
for arch_version in arches:
gpu_arch_type = arch_type(arch_version)
# Disable py3.12 builds for ROCm because of triton dependency
# on llnl-hatchet, which doesn't have py3.12 wheels available
if gpu_arch_type == "rocm" and python_version == "3.12":
continue
gpu_arch_version = (
""
if arch_version == "cpu"
Expand Down
Loading

0 comments on commit 8a9a5ba

Please sign in to comment.