Skip to content

Commit

Permalink
Update on "[vulkan] Pad channels when using texture storage instead o…
Browse files Browse the repository at this point in the history
…f "tight packing""

Currently, in Vulkan, 4D tensors are represented in GPU textures by simply combining the batch and channel dimensions into the depth axis. However, if the number of channels is not a multiple of 4, then data belonging to the same batch can cross texel boundaries, and a single texel can end up holding data from two different batches.

For instance, consider a tensor with `N=2`, `C=3`. The depth axis of the texture would contain the data

```
|tex1|tex2|
-----------
|AAAB|BB00|
```
Where A represents data from `n=1` and B represents data from `n=2`.

This packing structure ("tight packing") makes some ops that care about batch boundaries more complex and inefficient to implement. Therefore, this diff introduces channel padding when storing tensors as image textures.

The same tensor with `N=2`, `C=3` would now have the depth axis contain

```
|tex1|tex2|
-----------
|AAA0|BBB0|
```

Differential Revision: [D43068669](https://our.internmc.facebook.com/intern/diff/D43068669/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D43068669/)!

[ghstack-poisoned]
  • Loading branch information
SS-JIA committed Feb 22, 2023
2 parents 813f0a1 + 146acb6 commit 2790599
Show file tree
Hide file tree
Showing 50 changed files with 1,217 additions and 243 deletions.
66 changes: 48 additions & 18 deletions .ci/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ test_single_dynamo_benchmark() {
# Feel free to remove --device cuda if you ever decide to need to
# test CPU as well in CI
python "benchmarks/dynamo/$suite.py" \
--ci --accuracy --timing --explain --device cuda \
--ci --accuracy --timing --explain \
"$@" "${partition_flags[@]}" \
--output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
python benchmarks/dynamo/check_csv.py \
Expand All @@ -297,10 +297,10 @@ test_aot_eager_benchmark() {
local exit_status=0

# Check inference with --float32
test_single_dynamo_benchmark "aot_eager_inference" "$@" --backend aot_eager || exit_status=$?
test_single_dynamo_benchmark "aot_eager_inference" "$@" --backend aot_eager --device cuda || exit_status=$?

# Check training with --amp
test_single_dynamo_benchmark "aot_eager_training" "$@" --backend aot_eager --training --amp || exit_status=$?
test_single_dynamo_benchmark "aot_eager_training" "$@" --backend aot_eager --device cuda --training --amp || exit_status=$?

if [[ $exit_status -ne 0 ]]; then
echo "Some benchmarks failed; scroll up for details"
Expand All @@ -311,14 +311,22 @@ test_aot_eager_benchmark() {
test_inductor_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0

# Check inference with --float32
test_single_dynamo_benchmark "inductor_inference" "$@" --inductor
local device="$1"
shift

# Check training with --amp
test_single_dynamo_benchmark "inductor_training" "$@" --inductor --training --amp
if [[ $device == "cpu" ]]; then
# TODO: Add training and dynamic shape test
test_single_dynamo_benchmark "inductor_inference" "$@" --inductor --float32 --device cpu
else
# Check inference with --float32
test_single_dynamo_benchmark "inductor_inference" "$@" --inductor --device cuda

# Check inference with --dynamic-shapes
test_single_dynamo_benchmark "dynamic_inductor-inference" "$@" --inductor --dynamic-shapes
# Check training with --amp
test_single_dynamo_benchmark "inductor_training" "$@" --inductor --training --amp --device cuda

# Check inference with --dynamic-shapes
test_single_dynamo_benchmark "dynamic_inductor-inference" "$@" --inductor --dynamic-shapes --device cuda
fi
}

test_inductor_benchmark_perf() {
Expand Down Expand Up @@ -371,7 +379,9 @@ test_aot_eager_all() {
}

test_inductor_huggingface() {
test_inductor_benchmark huggingface ""
local device=$1
shift
test_inductor_benchmark "$device" huggingface ""
}

test_inductor_huggingface_perf() {
Expand All @@ -383,7 +393,9 @@ test_inductor_timm_shard() {
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
exit 1
fi
test_inductor_benchmark timm_models "$1"
local device=$1
shift
test_inductor_benchmark "$device" timm_models "$1"
}

test_inductor_timm_perf_shard() {
Expand All @@ -395,7 +407,9 @@ test_inductor_timm_perf_shard() {
}

test_inductor_torchbench() {
PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark torchbench ""
local device=$1
shift
PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark "$device" torchbench ""
}

test_inductor_torchbench_perf() {
Expand Down Expand Up @@ -917,38 +931,54 @@ elif [[ "${TEST_CONFIG}" == *aot_eager_torchbench* ]]; then
elif [[ "${TEST_CONFIG}" == *inductor_huggingface* ]]; then
install_torchvision
install_filelock
install_triton
if [[ "${TEST_CONFIG}" != *inductor_huggingface_cpu_accuracy* ]]; then
# Cpp backend does not depend on triton
install_triton
fi
install_huggingface
if [[ "${TEST_CONFIG}" == *inductor_huggingface_perf* ]]; then
test_inductor_huggingface_perf
elif [[ "${TEST_CONFIG}" == *inductor_huggingface_cpu_accuracy* ]]; then
test_inductor_huggingface cpu
else
test_inductor_huggingface
test_inductor_huggingface cuda
fi
elif [[ "${TEST_CONFIG}" == *inductor_timm* && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
install_filelock
install_triton
if [[ "${TEST_CONFIG}" != *inductor_timm_cpu_accuracy* ]]; then
# Cpp backend does not depend on triton
install_triton
fi
install_timm
id=$((SHARD_NUMBER-1))
if [[ "${TEST_CONFIG}" == *inductor_timm_perf* && $NUM_TEST_SHARDS -gt 1 ]]; then
test_inductor_timm_perf_shard $id
elif [[ "${TEST_CONFIG}" == *inductor_timm_cpu_accuracy* && $NUM_TEST_SHARDS -gt 1 ]]; then
test_inductor_timm_shard cpu $id
else
test_inductor_timm_shard $id
test_inductor_timm_shard cuda $id
fi
elif [[ "${TEST_CONFIG}" == *inductor_torchbench* ]]; then
install_torchtext
install_torchvision
install_filelock
install_triton
if [[ "${TEST_CONFIG}" != *inductor_torchbench_cpu_accuracy* ]]; then
# Cpp backend does not depend on triton
install_triton
fi
if [[ "${TEST_CONFIG}" == *inductor_torchbench_perf* ]]; then
checkout_install_torchbench
test_inductor_torchbench_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_accuracy* ]]; then
checkout_install_torchbench
test_inductor_torchbench cpu
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
checkout_install_torchbench hf_Bert hf_Albert timm_efficientdet timm_vision_transformer
test_inductor_torchbench_smoketest_perf
else
checkout_install_torchbench
test_inductor_torchbench
test_inductor_torchbench cuda
fi
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
install_torchvision
Expand Down
23 changes: 23 additions & 0 deletions .github/workflows/inductor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,26 @@ jobs:
docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build-gcp.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build-gcp.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha

linux-focal-cpu-py3_8-gcc7-inductor-build:
name: linux-focal-cpu-py3.8-gcc7-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-py3_8-gcc7-build
docker-image-name: pytorch-linux-focal-py3.8-gcc7
test-matrix: |
{ include: [
{ config: "inductor_huggingface_cpu_accuracy", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
{ config: "inductor_timm_cpu_accuracy", shard: 1, num_shards: 2, runner: "linux.4xlarge" },
{ config: "inductor_timm_cpu_accuracy", shard: 2, num_shards: 2, runner: "linux.4xlarge" },
{ config: "inductor_torchbench_cpu_accuracy", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
]}
linux-focal-cpu-py3_8-gcc7-inductor-test:
name: linux-focal-cpu-py3.8-gcc7-inductor
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cpu-py3_8-gcc7-inductor-build
with:
build-environment: linux-focal-py3_8-gcc7-build
docker-image: ${{ needs.linux-focal-cpu-py3_8-gcc7-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cpu-py3_8-gcc7-inductor-build.outputs.test-matrix }}
8 changes: 0 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -809,13 +809,11 @@ if(NOT MSVC)
append_cxx_flag_if_supported("-Werror=braced-scalar-init" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Werror=range-loop-construct" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Werror=bool-operation" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Winconsistent-missing-override" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-missing-field-initializers" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-array-bounds" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wunused-local-typedefs" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-unused-parameter" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-unused-function" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-unused-result" CMAKE_CXX_FLAGS)
Expand Down Expand Up @@ -866,21 +864,16 @@ if(NOT MSVC)
endif()

append_cxx_flag_if_supported("-Wno-error=pedantic" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-error=redundant-decls" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-error=old-style-cast" CMAKE_CXX_FLAGS)
# These flags are not available in GCC-4.8.5. Set only when using clang.
# Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
append_cxx_flag_if_supported("-Wconstant-conversion" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-invalid-partial-specialization" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-typedef-redefinition" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-inconsistent-missing-override" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-aligned-allocation-unavailable" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-constexpr-not-const" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wunused-lambda-capture" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wunused-local-typedef" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Qunused-arguments" CMAKE_CXX_FLAGS)
if(${USE_COLORIZE_OUTPUT})
endif()
Expand Down Expand Up @@ -981,7 +974,6 @@ if(APPLE)
endif()
append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-constexpr-not-const" CMAKE_CXX_FLAGS)
endif()

if(EMSCRIPTEN)
Expand Down
5 changes: 4 additions & 1 deletion aten/src/ATen/TensorIndexing.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,10 @@ static inline Tensor scalarToTensor(
static inline SymIntArrayRef slicePrefix1sSize(const SymIntArrayRef& sizes) {
size_t first_non1_src = sizes.size();
for (const auto i : c10::irange(sizes.size())) {
if (sizes[i] != 1) {
// Unbacked SymInt has different behavior, but this is sound because
// failing to slice will only ever cause an error, not divergent
// behavior
if (!sizes[i].has_hint() || sizes[i] != 1) {
first_non1_src = i;
break;
}
Expand Down
7 changes: 7 additions & 0 deletions aten/src/ATen/functorch/BatchRulesViews.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,14 @@ std::tuple<Tensor, optional<int64_t>> roll_batch_rule(const Tensor& self, option
// We will do something like: t.reshape(a, -1).roll(1, dims=[1, ]).reshape(old_shape)
auto old_shape = self_.sizes();
new_dims.push_back(1);
auto logical_rank = rankWithoutBatchDim(self, bdim);
if (logical_rank == 0) {
self_ = self_.unsqueeze(0);
}

auto output = at::roll(self_.flatten(1), shifts, new_dims);
// NOTE: For scalar tensor, we don't need to unsqueeze as reshape
// with `old_shape` takes care of it.
output = output.reshape(old_shape);
return std::make_tuple(output, 0);
}
Expand Down
7 changes: 7 additions & 0 deletions aten/src/ATen/native/BucketizationUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@ inline void searchsorted_pre_check(

TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ",
"dtype but got dtype ", sorter.scalar_type());

if (sorter.numel() > 0) {
auto minmax = sorter.aminmax();
int64_t vmin = std::get<0>(minmax).item().toLong();
int64_t vmax = std::get<1>(minmax).item().toLong();
TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range");
}
}

TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1),
Expand Down
4 changes: 4 additions & 0 deletions aten/src/ATen/native/ReduceOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,10 @@ static inline void diff_check(const Tensor& self, int64_t n, int64_t dim, const
self.dim() >= 1,
"diff expects input to be at least one-dimensional");

TORCH_CHECK(
n >= 0,
"order must be non-negative but got ", n);

diff_check_compatible_shape(self, prepend, dim);
diff_check_compatible_shape(self, append, dim);
}
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ void foreach_tensor_list_op_(TensorList tensors1, TensorList tensors2, const Sca
/* res_arg_index */ 0>(),
Op<opmath_t>(),
alpha.to<opmath_t>());
increment_version(tensors1);
}

template<template<class> class Op>
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ void foreach_binary_op_(TensorList tensors, const Scalar& scalar) {
/* res_arg_index */ 0>(),
Op<opmath_t>(),
scalar.to<opmath_t>());
increment_version(tensors);
}

template<template<class> class Op>
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ void foreach_binary_op_(TensorList tensors, at::ArrayRef<Scalar> scalars) {
/* r_args_depth */ 1,
/* res_arg_index */ 0>(),
Op<opmath_t>());
increment_version(tensors);
}

template<template<class> class Op>
Expand Down
2 changes: 2 additions & 0 deletions aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList ten
Op<opmath_t>(),
scalar.to<opmath_t>());
});
increment_version(input);
}

template<template<class> class Op>
Expand All @@ -86,6 +87,7 @@ void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList ten
/* res_arg_index */ 0>(),
Op<opmath_t>());
});
increment_version(input);
}

template<template<class> class Op>
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/native/cuda/ForeachTernaryOp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ void foreach_tensor_lerp_ternary_cuda_(TensorList tensors1, TensorList tensors2,
LerpFunctor<opmath_t>());
}
);
increment_version(tensors1);
}

std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(TensorList tensors1, TensorList tensors2, const Scalar& weight) {
Expand Down

0 comments on commit 2790599

Please sign in to comment.