Update on "[inductor][cpp] epilogue support for gemm template"
As part of #125683, this PR adds epilogue support for the C++ GEMM template by reusing the C++ vector codegen on sub-slices of tensors. This is implemented by retracing the epilogue IR nodes with new ranges and offsets. New `codegen_loop_bodies` and `codegen_functions` methods are added to the C++ vector codegen for this purpose; the template kernel's `store_output` method leverages them to generate the epilogue code and store the result into the final output.
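For intuition only, here is a minimal NumPy sketch of the fusion pattern described above, assuming a hypothetical `epilogue` function and plain Python tiling rather than the actual inductor IR retracing or the real `store_output` API: each output tile of the GEMM has the epilogue re-applied at that tile's ranges and offsets, and the fused result is written straight into the final buffer.

```python
# Hypothetical illustration, not torch/_inductor code: fuse an epilogue into a
# tiled GEMM by re-applying it to each output sub-slice before the final store.
import numpy as np

def epilogue(acc, bias):
    # Example epilogue (bias add + ReLU) expressed over whatever slice it receives.
    return np.maximum(acc + bias, 0.0)

def gemm_with_fused_epilogue(A, B, bias, tile_m=4, tile_n=4):
    M, K = A.shape
    K2, N = B.shape
    assert K == K2
    Y = np.empty((M, N), dtype=A.dtype)
    for m0 in range(0, M, tile_m):
        for n0 in range(0, N, tile_n):
            m1, n1 = min(m0 + tile_m, M), min(n0 + tile_n, N)
            acc = A[m0:m1, :] @ B[:, n0:n1]          # template computes one tile
            # "store_output" step: rerun the epilogue with this tile's ranges and
            # offsets, then store the fused result into the final output slice.
            Y[m0:m1, n0:n1] = epilogue(acc, bias[n0:n1])
    return Y

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    A, B = rng.standard_normal((6, 5)), rng.standard_normal((5, 7))
    bias = rng.standard_normal(7)
    assert np.allclose(gemm_with_fused_epilogue(A, B, bias),
                       np.maximum(A @ B + bias, 0.0))
    print("tiled GEMM with fused epilogue matches the unfused reference")
```

In the actual change, the epilogue is an IR loop body retraced by the C++ vector codegen via `codegen_loop_bodies`/`codegen_functions`, not a Python callable; the sketch only mirrors the dataflow.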

cc voznesenskym penguinwu EikanWang Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
jgong5 committed May 20, 2024
2 parents 49a5171 + 45be4bf commit df28970
Showing 2,097 changed files with 25,783 additions and 135,293 deletions.
62 changes: 60 additions & 2 deletions .ci/docker/build.sh
@@ -84,13 +84,27 @@ fi
# CMake 3.18 is needed to support CUDA17 language variant
CMAKE_VERSION=3.18.5

_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b

# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
@@ -105,6 +119,21 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
@@ -120,6 +149,21 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
@@ -134,6 +178,20 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
2 changes: 1 addition & 1 deletion .ci/docker/common/install_acl.sh
@@ -1,6 +1,6 @@
set -euo pipefail

readonly version=v23.08
readonly version=v24.04
readonly src_host=https://review.mlplatform.org/ml
readonly src_repo=ComputeLibrary

5 changes: 4 additions & 1 deletion .ci/docker/common/install_cudnn.sh
@@ -4,7 +4,10 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
if [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.7.29_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
11 changes: 8 additions & 3 deletions .ci/docker/common/install_cusparselt.sh
@@ -5,9 +5,14 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt

if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
1 change: 1 addition & 0 deletions .ci/docker/ubuntu-cuda/Dockerfile
@@ -152,6 +152,7 @@ RUN rm install_cusparselt.sh
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.4 ]; then rm /usr/local/cuda-12.1/cuda-12.4; fi

USER jenkins
CMD ["bash"]
2 changes: 1 addition & 1 deletion .ci/pytorch/docs-test.sh
@@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Testing pytorch docs"

cd docs
make doctest
TERM=vt100 make doctest
17 changes: 13 additions & 4 deletions .ci/pytorch/test.sh
@@ -588,6 +588,15 @@ test_inductor_torchbench_smoketest_perf() {
"$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
--expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
done

# Perform some "warm-start" runs for a few huggingface models.
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv"
done
}

test_inductor_torchbench_cpu_smoketest_perf(){
@@ -1269,6 +1278,10 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHAR
elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard "${SHARD_NUMBER}"
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python_shard "$SHARD_NUMBER"
test_aten
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
@@ -1298,10 +1311,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
install_torchvision
test_python
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/audio.txt
@@ -1 +1 @@
ea437b31ce316ea3d66fe73768c0dcb94edb79ad
1980f8af5bcd0bb2ce51965cf79d8d4c25dad8a0
1 change: 0 additions & 1 deletion .github/pytorch-probot.yml
@@ -1,6 +1,5 @@
tracking_issue: 24422
ciflow_tracking_issue: 64124
TD_rollout_issue: 123120
ciflow_push_tags:
- ciflow/binaries
- ciflow/binaries_conda
3 changes: 3 additions & 0 deletions .github/workflows/docker-builds.yml
@@ -38,8 +38,11 @@ jobs:
matrix:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9,
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9,
pytorch-linux-focal-py3.8-clang10,
pytorch-linux-focal-py3.11-clang10,
23 changes: 22 additions & 1 deletion .github/workflows/inductor.yml
@@ -107,6 +107,27 @@ jobs:
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
name: cuda12.1-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
name: cuda12.1-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_1-py3_12-gcc9-inductor-build
with:
build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}

linux-jammy-cpu-py3_8-gcc11-inductor-build:
name: linux-jammy-cpu-py3.8-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
@@ -125,7 +146,7 @@ jobs:
{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
6 changes: 4 additions & 2 deletions .github/workflows/trunk.yml
@@ -192,7 +192,9 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm6_1-py3_8-test:
@@ -208,4 +210,4 @@
build-environment: linux-focal-rocm6.1-py3.8
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
43 changes: 43 additions & 0 deletions .github/workflows/upload_test_stats_intermediate.yml
@@ -0,0 +1,43 @@
name: Upload test stats intermediate

on:
workflow_dispatch:
inputs:
workflow_id:
description: workflow_id of the run
required: true
workflow_run_attempt:
description: workflow_run_attempt of the run
required: true

jobs:
intermediate_upload_test_stats:
name: Intermediate upload test stats for ${{ inputs.workflow_id }}
runs-on: ubuntu-22.04
environment: upload-stats
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
fetch-depth: 1
submodules: false

- uses: actions/setup-python@v4
with:
python-version: '3.11'
cache: pip

- run: |
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
- name: Upload test stats
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
WORKFLOW_RUN_ID: ${{ inputs.workflow_id }}
WORKFLOW_RUN_ATTEMPT: ${{ inputs.workflow_run_attempt }}
run: |
python3 -m tools.stats.upload_test_stats_intermediate \
--workflow-run-id "${WORKFLOW_RUN_ID}" \
--workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" \
16 changes: 4 additions & 12 deletions .gitmodules
@@ -2,10 +2,6 @@
ignore = dirty
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/cub"]
ignore = dirty
path = third_party/cub
url = https://github.com/NVlabs/cub.git
[submodule "third_party/eigen"]
ignore = dirty
path = third_party/eigen
@@ -50,10 +46,6 @@
ignore = dirty
path = third_party/psimd
url = https://github.com/Maratyszcza/psimd.git
[submodule "third_party/zstd"]
ignore = dirty
path = third_party/zstd
url = https://github.com/facebook/zstd.git
[submodule "third_party/cpuinfo"]
ignore = dirty
path = third_party/cpuinfo
@@ -66,10 +58,6 @@
ignore = dirty
path = third_party/onnx
url = https://github.com/onnx/onnx.git
[submodule "third_party/onnx-tensorrt"]
ignore = dirty
path = third_party/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt
[submodule "third_party/sleef"]
ignore = dirty
path = third_party/sleef
@@ -152,3 +140,7 @@
[submodule "third_party/opentelemetry-cpp"]
path = third_party/opentelemetry-cpp
url = https://github.com/open-telemetry/opentelemetry-cpp.git
[submodule "third_party/cpp-httplib"]
path = third_party/cpp-httplib
url = https://github.com/yhirose/cpp-httplib.git
branch = v0.15.3
5 changes: 1 addition & 4 deletions .lintrunner.toml
@@ -1052,13 +1052,13 @@ exclude_patterns = [
'test/quantization/fx/test_numeric_suite_fx.py',
'test/quantization/fx/test_quantize_fx.py',
'test/quantization/fx/test_subgraph_rewriter.py',
'test/test_datapipe.py',
'test/test_fake_tensor.py',
'test/test_flop_counter.py',
'test/test_function_schema.py',
'test/test_functional_autograd_benchmark.py',
'test/test_functional_optim.py',
'test/test_functionalization_of_rng_ops.py',
'test/test_datapipe.py',
'test/test_futures.py',
'test/test_fx.py',
'test/test_fx_experimental.py',
@@ -1143,7 +1143,6 @@ exclude_patterns = [
'test/test_transformers.py',
'test/test_type_promotion.py',
'test/test_unary_ufuncs.py',
'test/test_utils.py',
'test/test_vulkan.py',
'test/test_xnnpack_integration.py',
'test/torch_np/numpy_test/**/*.py',
@@ -1930,8 +1929,6 @@ exclude_patterns = [
'torch/utils/_mode_utils.py',
'torch/utils/_python_dispatch.py',
'torch/utils/_stats.py',
'torch/utils/_sympy/__init__.py',
'torch/utils/_sympy/functions.py',
'torch/utils/_traceback.py',
'torch/utils/_zip.py',
'torch/utils/backcompat/__init__.py',
2 changes: 1 addition & 1 deletion BUILD.bazel
@@ -772,7 +772,7 @@ cc_library(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/csrc/distributed/c10d/*.hpp",
"torch/csrc/distributed/c10d/**/*.hpp",
"torch/lib/libshm/*.h",
],
exclude = [