Update on "[Inductor][Quant] Change the QConv output scale name"
**Summary**
Change the name of the QConv output scale argument from `inv_output_scale` to `output_scale`, now that the quant/dequant optimization has moved from the decomposition phase to the lowering phase.
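
To illustrate what the rename means for callers, here is a minimal, hypothetical shim; only the two keyword names come from this PR, and the assumption that the old keyword carried the reciprocal of the scale is inferred from its name rather than confirmed by the diff.

```python
# Hypothetical compatibility shim for the QConv keyword rename. Only the names
# `inv_output_scale` and `output_scale` are from this PR; the reciprocal
# relationship below is an assumption based on the old name.
def migrate_qconv_kwargs(kwargs: dict) -> dict:
    """Translate old-style QConv keyword arguments to the new naming."""
    out = dict(kwargs)
    if "inv_output_scale" in out and "output_scale" not in out:
        out["output_scale"] = 1.0 / out.pop("inv_output_scale")
    return out

if __name__ == "__main__":
    old = {"inv_output_scale": 20.0, "output_zero_point": 3}
    print(migrate_qconv_kwargs(old))  # {'output_zero_point': 3, 'output_scale': 0.05}
```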


cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 voznesenskym penguinwu EikanWang Guobing-Chen zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
leslie-fang-intel committed May 9, 2024
2 parents 530c2c4 + 468e423 commit b291167
Showing 268 changed files with 9,997 additions and 4,249 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/build.sh
@@ -366,7 +366,7 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi

# Build image
DOCKER_BUILDKIT=1 docker build \
docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
4 changes: 2 additions & 2 deletions .ci/docker/requirements-ci.txt
@@ -279,9 +279,9 @@ ghstack==0.8.0
#Pinned versions: 0.8.0
#test that import:

jinja2==3.1.3
jinja2==3.1.4
#Description: jinja2 template engine
#Pinned versions: 3.1.3
#Pinned versions: 3.1.4
#test that import:

pytest-cpp==2.3.0
41 changes: 24 additions & 17 deletions .ci/pytorch/test.sh
@@ -310,23 +310,23 @@ test_dynamo_shard() {
test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node
echo "Testing multi_gpu tests in test_torchinductor"
pytest test/inductor/test_torchinductor.py -k test_multi_gpu
pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
pytest test/distributed/test_c10d_functional_native.py
pytest test/distributed/_tensor/test_dtensor_compile.py
pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
pytest test/distributed/_composable/fsdp/test_fully_shard_comm.py
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation
pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
pytest test/distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration
python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

# This runs on both single-gpu and multi-gpu instances. It should be smart about skipping
# tests that aren't supported when the required number of GPUs isn't available
@@ -522,6 +522,11 @@ test_single_dynamo_benchmark() {
fi
}

test_inductor_micro_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-micro-reports
python benchmarks/gpt_fast/benchmark.py
}

test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@@ -1209,6 +1214,8 @@ elif [[ "$TEST_CONFIG" == deploy ]]; then
test_torch_deploy
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))
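
For local reproduction of the new `test_inductor_micro_benchmark` hook above, a rough stand-in could look like the sketch below; it assumes a CUDA-enabled checkout and that `benchmarks/gpt_fast/benchmark.py` needs no required arguments, as in the hunk.

```python
# Rough local stand-in for test_inductor_micro_benchmark in .ci/pytorch/test.sh.
# The two paths come from the hunk above; the working directory (repo root) and
# CUDA availability are assumed.
import pathlib
import subprocess
import sys

pathlib.Path("test/test-micro-reports").mkdir(parents=True, exist_ok=True)  # TEST_REPORTS_DIR
subprocess.run([sys.executable, "benchmarks/gpt_fast/benchmark.py"], check=True)
```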
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
06ad737628abc3a1e617571dc03cbdd5b36ea96a
d23a6e1664d20707c11781299611436e1f0c104f
2 changes: 2 additions & 0 deletions .github/merge_rules.yaml
@@ -29,10 +29,12 @@
approved_by:
- BowenBao
- justinchuby
- liqunfu
- shubhambhokare1
- thiagocrepaldi
- titaiwangms
- wschin
- xadupre
mandatory_checks_name:
- EasyCLA
- Lint
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
@@ -8,6 +8,7 @@ ciflow_push_tags:
- ciflow/binaries_wheel
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
2 changes: 1 addition & 1 deletion .github/requirements-gha-cache.txt
@@ -5,7 +5,7 @@
# functorch/docs/requirements.txt
# .ci/docker/requirements-ci.txt
boto3==1.19.12
jinja2==3.1.3
jinja2==3.1.4
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
6 changes: 5 additions & 1 deletion .github/scripts/amd/patch_triton_wheel.sh
@@ -1,7 +1,11 @@
#!/bin/bash
set -x

WHEELHOUSE_DIR=/artifacts
if [ -z "$1" ]; then
echo "Need wheel location argument" && exit 1
fi

WHEELHOUSE_DIR=$1
PATCHELF_BIN=patchelf
ROCM_LIB=backends/amd/lib
ROCM_LD=backends/amd/llvm/bin
4 changes: 2 additions & 2 deletions .github/scripts/build_triton_wheel.py
@@ -157,10 +157,10 @@ def build_triton(

if build_rocm:
check_call(
[f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh"],
[f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()],
cwd=triton_basedir,
shell=True,
)

return Path.cwd() / whl_path.name


14 changes: 13 additions & 1 deletion .github/scripts/generate_docker_release_matrix.py
@@ -21,6 +21,8 @@

def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
ret: List[Dict[str, str]] = []
# CUDA amd64 Docker images are available as both runtime and devel, while the
# CPU arm64 image is only available as runtime.
for cuda, version in generate_binary_build_matrix.CUDA_ARCHES_FULL_VERSION.items():
for image in DOCKER_IMAGE_TYPES:
ret.append(
@@ -31,9 +33,19 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
cuda
],
"image_type": image,
"platform": "linux/arm64,linux/amd64",
"platform": "linux/amd64",
}
)
ret.append(
{
"cuda": "cpu",
"cuda_full_version": "",
"cudnn_version": "",
"image_type": "runtime",
"platform": "linux/arm64",
}
)

return {"include": ret}


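To make the resulting matrix shape easier to see, here is a self-contained sketch of the logic after this hunk; the CUDA, cuDNN, and version values are illustrative stand-ins for the real tables in `generate_binary_build_matrix`.

```python
# Sketch of generate_docker_matrix() after this change. The data tables are
# illustrative; only the structure (amd64 runtime+devel per CUDA version, plus a
# single arm64 CPU runtime entry) mirrors the hunk above.
from typing import Dict, List

CUDA_ARCHES_FULL_VERSION = {"12.1": "12.1.1"}  # assumed subset
CUDNN_VERSION = {"12.1": "8"}                  # assumed subset
DOCKER_IMAGE_TYPES = ["runtime", "devel"]

def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
    ret: List[Dict[str, str]] = []
    # CUDA amd64 images come in both runtime and devel flavors...
    for cuda, version in CUDA_ARCHES_FULL_VERSION.items():
        for image in DOCKER_IMAGE_TYPES:
            ret.append(
                {
                    "cuda": cuda,
                    "cuda_full_version": version,
                    "cudnn_version": CUDNN_VERSION[cuda],
                    "image_type": image,
                    "platform": "linux/amd64",
                }
            )
    # ...while the CPU arm64 image is published as runtime only.
    ret.append(
        {
            "cuda": "cpu",
            "cuda_full_version": "",
            "cudnn_version": "",
            "image_type": "runtime",
            "platform": "linux/arm64",
        }
    )
    return {"include": ret}

if __name__ == "__main__":
    print(generate_docker_matrix())
```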
21 changes: 16 additions & 5 deletions .github/workflows/docker-release.yml
@@ -7,6 +7,7 @@ on:
- Dockerfile
- docker.Makefile
- .github/workflows/docker-release.yml
- .github/scripts/generate_docker_release_matrix.py
push:
branches:
- nightly
@@ -129,17 +130,27 @@ jobs:
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }}
run: |
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
CUDA_SUFFIX="-cu${CUDA_VERSION}"
if [[ ${CUDA_VERSION_SHORT} == "cpu" ]]; then
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime"
CUDA_SUFFIX=""
fi
PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
python -c 'import torch; print(torch.version.git_version[:7],end="")')
docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}"
docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}"
ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}"
docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}"
# Please note, here we need to pin a specific version of CUDA for the latest label
if [[ ${CUDA_VERSION_SHORT} == "12.1" ]]; then
docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \
ghcr.io/pytorch/pytorch-nightly:latest
docker push ghcr.io/pytorch/pytorch-nightly:latest
fi
docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" \
ghcr.io/pytorch/pytorch-nightly:latest
docker push ghcr.io/pytorch/pytorch-nightly:latest
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
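
The tagging branches above are easier to follow outside of shell; below is a hypothetical restatement in Python, where the parameters mirror the workflow's environment variables and everything else is illustrative.

```python
# Hypothetical restatement of the nightly tagging logic in docker-release.yml.
# Parameters mirror PYTORCH_VERSION, CUDA_VERSION_SHORT, CUDA_VERSION,
# CUDNN_VERSION, and the nightly commit; the function itself is not from the diff.
from typing import List, Tuple

def nightly_tags(
    pytorch_version: str,
    cuda_version_short: str,
    cuda_version: str,
    cudnn_version: str,
    commit: str,
) -> Tuple[str, List[str]]:
    if cuda_version_short == "cpu":
        docker_tag = f"{pytorch_version}-runtime"
        cuda_suffix = ""  # CPU images drop the -cu... suffix entirely
    else:
        docker_tag = f"{pytorch_version}-cuda{cuda_version_short}-cudnn{cudnn_version}-runtime"
        cuda_suffix = f"-cu{cuda_version}"
    pushed = [f"ghcr.io/pytorch/pytorch-nightly:{commit[:7]}{cuda_suffix}"]
    if cuda_version_short == "12.1":
        # only the pinned CUDA build is also published as :latest
        pushed.append("ghcr.io/pytorch/pytorch-nightly:latest")
    return docker_tag, pushed

if __name__ == "__main__":
    print(nightly_tags("2.4.0.dev20240509", "12.1", "12.1.1", "8", "b291167deadbeef"))
    print(nightly_tags("2.4.0.dev20240509", "cpu", "", "", "b291167deadbeef"))
```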
40 changes: 40 additions & 0 deletions .github/workflows/inductor-micro-benchmark.yml
@@ -0,0 +1,40 @@
name: inductor-micro-benchmark

on:
schedule:
- cron: 0 7 * * *
push:
tags:
- ciflow/inductor-micro-benchmark/*
workflow_dispatch:


concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all

jobs:
linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
]}
linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-test:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
timeout-minutes: 720
6 changes: 0 additions & 6 deletions .lintrunner.toml
@@ -1052,12 +1052,6 @@ exclude_patterns = [
'test/quantization/fx/test_quantize_fx.py',
'test/quantization/fx/test_subgraph_rewriter.py',
'test/test_datapipe.py',
'test/test_decomp.py',
'test/test_deploy.py',
'test/test_determination.py',
'test/test_dlpack.py',
'test/test_dynamic_shapes.py',
'test/test_expanded_weights.py',
'test/test_fake_tensor.py',
'test/test_flop_counter.py',
'test/test_function_schema.py',
14 changes: 6 additions & 8 deletions Dockerfile
@@ -1,12 +1,10 @@
# syntax = docker/dockerfile:experimental
#
# NOTE: To build this you will need a docker version > 18.06 with
# experimental enabled and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
# syntax=docker/dockerfile:1

# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel

ARG BASE_IMAGE=ubuntu:22.04
ARG PYTHON_VERSION=3.11

1 change: 1 addition & 0 deletions aten/src/ATen/core/interned_strings.h
@@ -227,6 +227,7 @@ namespace c10 {
_(aten, is_autocast_enabled) \
_(aten, is_autocast_cpu_enabled) \
_(aten, is_autocast_xla_enabled) \
_(aten, get_autocast_dtype) \
FORALL_ATEN_BASE_SYMBOLS(_) \
_(onnx, Add) \
_(onnx, Concat) \
8 changes: 4 additions & 4 deletions aten/src/ATen/cuda/CUDABlas.cpp
@@ -236,7 +236,7 @@ namespace at::cuda::blas {
CUDABLAS_NONNEGINT_CHECK(bgemm<Dtype>, num_batches); \
} while (0)

#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)

#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000
// only for rocm 5.7 where we first supported hipblaslt, it was difficult
@@ -375,7 +375,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<

template <typename Dtype>
inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
cudaDataType_t abcType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
cudaDataType_t scaleType = CUDA_R_32F;
@@ -1235,7 +1235,7 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
}

#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)

template <typename Dtype>
void gemm_and_bias(
@@ -1745,7 +1745,7 @@ void int8_gemm(
TORCH_CHECK(false, "int8_gemm is only supported for ROCm 6.0 and above");
#endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
}
#endif // (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)

// ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not.
#if defined(USE_ROCM) && ROCM_VERSION < 50600
2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CUDABlas.h
@@ -82,7 +82,7 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <>
void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));

#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
enum GEMMAndBiasActivationEpilogue {
None,
RELU,
4 changes: 2 additions & 2 deletions aten/src/ATen/cuda/CUDAContextLight.h
@@ -9,7 +9,7 @@

// cublasLt was introduced in CUDA 10.1, but we enable it only for 11.1, which also
// added bf16 support
#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#include <cublasLt.h>
#endif

@@ -82,7 +82,7 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
/* Handles */
TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
#endif

2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -191,7 +191,7 @@ cublasHandle_t getCurrentCUDABlasHandle() {
return handle;
}

#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
cublasLtHandle_t getCurrentCUDABlasLtHandle() {
#ifdef USE_ROCM
c10::DeviceIndex device = 0;
