Update on "[inductor] switch assume_aligned_inputs to False"
In #123319, we guard some behavior behind the `assume_aligned_inputs` config option. With this set to `False`, the behavior added in #123319 becomes the default. See the referenced PR for more details about what is affected.

Side effects:
* It's possible that this will hurt performance in some scenarios. For example, if an unaligned input is used in a matmul, it might be better to clone it first so the matmul runs on an aligned buffer.
* This will occasionally cause recompiles. Specifically, the check we perform (`(storage_offset * get_dtype_size(dtype)) % ALIGNMENT == 0`; see the sketch below) can be guarded on if the storage_offset becomes dynamic, which happens under automatic_dynamic_shapes after a shape or stride changes. Previously this increased graph breaks in cpu inductor torchbench tests, but that is fixed by guarding the alignment checks more carefully so that we don't run them and generate guards unless actually needed.
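
To make the recompile condition concrete, here is a minimal sketch of the alignment check (illustration only: `ALIGNMENT` is an assumed value, the helper below is hypothetical, and `tensor.element_size()` stands in for `get_dtype_size(dtype)`; the real constant and logic live in inductor):

```python
import torch

# Hypothetical illustration of the check quoted above; not the actual
# inductor code. ALIGNMENT is assumed to be 16 bytes for this sketch.
ALIGNMENT = 16

def is_aligned(t: torch.Tensor) -> bool:
    # storage_offset() is measured in elements; element_size() converts
    # it to bytes before checking divisibility by the alignment.
    return (t.storage_offset() * t.element_size()) % ALIGNMENT == 0

base = torch.randn(17)          # float32, storage_offset 0
print(is_aligned(base))         # True: offset is 0 bytes
print(is_aligned(base[1:]))     # False: offset is 4 bytes, not a multiple of 16
```

A view such as `base[1:]` is the kind of input that can take the clone path or, once its storage_offset becomes dynamic, produce the guard described above.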

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
davidberard98 committed Apr 28, 2024
2 parents 49070d1 + 4313c7d commit 8fd7120
Showing 326 changed files with 6,784 additions and 2,506 deletions.
5 changes: 2 additions & 3 deletions .ci/docker/requirements-ci.txt
@@ -228,12 +228,11 @@ scikit-image==0.20.0 ; python_version >= "3.10"
#Pinned versions: 0.20.3
#test that import:

scipy==1.8.1 ; python_version <= "3.10"
scipy==1.10.1 ; python_version == "3.11"
scipy==1.10.1 ; python_version <= "3.11"
scipy==1.12.0 ; python_version == "3.12"
# Pin SciPy because of failing distribution tests (see #60347)
#Description: scientific python
#Pinned versions: 1.6.3
#Pinned versions: 1.10.1
#test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py
#test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py
#test_linalg.py, test_binary_ufuncs.py
1 change: 1 addition & 0 deletions .flake8
@@ -54,6 +54,7 @@ per-file-ignores =
torch/ao/quantization/fx/_decomposed.py: TOR901
torch/distributed/_functional_collectives.py: TOR901
torch/distributed/_spmd/data_parallel.py: TOR901
torch/distributed/_tensor/_collective_utils.py: TOR901
optional-ascii-coding = True
exclude =
./.git,
26 changes: 26 additions & 0 deletions .github/actions/setup-linux/action.yml
@@ -66,6 +66,7 @@ runs:
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Kill any existing containers, clean up images
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
shell: bash
run: |
# ignore expansion of "docker ps -q" since it could be empty
@@ -104,3 +105,28 @@ runs:
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
cat /etc/hosts
- name: Check that the docker daemon is running
shell: bash
continue-on-error: true
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
run: |
set +x
max_attempts=30
delay=10
attempt=1
for attempt in $(seq 1 $max_attempts); do
echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..."
if docker info > /dev/null 2>&1; then
echo "Docker is running. Proceeding with the next steps"
exit 0
else
echo "Docker is not running yet."
echo "Retrying in $delay seconds..."
sleep $delay
fi
done
echo "Reached maximum attempts to connect to Docker. Exiting."
exit 1
13 changes: 13 additions & 0 deletions .github/label_to_label.yml
@@ -0,0 +1,13 @@
# Use this to auto apply labels based on other labels. Applies to both PRs and
# issues. Currently only supports any and all
- any:
- "module: custom operators"
- "module: aotdispatch"
then:
- "module: pt2-dispatcher"
- any:
- "module: dynamo"
- "module: pt2-dispatcher"
- "module: inductor"
then:
- "oncall: pt2"
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
@@ -24,3 +24,4 @@ retryable_workflows:
- linux-binary
- windows-binary
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
2 changes: 1 addition & 1 deletion .github/scripts/cherry_pick.py
@@ -29,7 +29,7 @@ def parse_args() -> Any:
"--onto-branch", type=str, required=True, help="the target release branch"
)
parser.add_argument(
"--github-actor", type=str, required=True, help="all the worlds a stage"
"--github-actor", type=str, required=True, help="all the world's a stage"
)
parser.add_argument(
"--classification",
3 changes: 2 additions & 1 deletion .github/scripts/get_workflow_job_id.py
@@ -4,6 +4,7 @@

import argparse
import json
import operator
import os
import re
import sys
@@ -126,7 +127,7 @@ def find_job_id_name(args: Any) -> Tuple[str, str]:

# Sort the jobs list by start time, in descending order. We want to get the most
# recently scheduled job on the runner.
jobs.sort(key=lambda job: job["started_at"], reverse=True)
jobs.sort(key=operator.itemgetter("started_at"), reverse=True)

for job in jobs:
if job["runner_name"] == args.runner_name:
2 changes: 1 addition & 1 deletion .github/workflows/_win-test.yml
@@ -92,7 +92,7 @@ jobs:
retry_wait_seconds: 30
command: |
set -eu
python3 -m pip install rockset==1.0.3
python3 -m pip install rockset==1.0.3 'xdoctest>=1.1.0'
- name: Start monitoring script
id: monitor-script
20 changes: 10 additions & 10 deletions .github/workflows/inductor-perf-test-nightly.yml
@@ -8,11 +8,16 @@ on:
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training_and_inference:
description: Run training and inference?
training:
description: Run training (on by default)?
required: false
type: string
default: training-true-inference-false
type: boolean
default: true
inference:
description: Run inference (off by default)?
required: false
type: boolean
default: false
default:
description: Run inductor_default?
required: false
@@ -28,11 +33,6 @@
required: false
type: boolean
default: true
cppwrapper:
description: Run inductor_cpp_wrapper for inference?
required: false
type: boolean
default: false
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
@@ -129,7 +129,7 @@ jobs:
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
dashboard-tag: ${{ inputs.training_and_inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-false-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
1 change: 1 addition & 0 deletions .github/workflows/target_determination.yml
@@ -53,6 +53,7 @@ jobs:
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
GITHUB_REF: ${{ github.ref }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
PR_NUMBER: ${{ github.event.pull_request.number }}
6 changes: 6 additions & 0 deletions aten/src/ATen/Context.h
@@ -69,6 +69,8 @@ class TORCH_API Context {
return at::detail::getMPSHooks();
} else if (device_type == at::kPrivateUse1) {
return at::detail::getPrivateUse1Hooks();
} else if (device_type == at::kMTIA) {
return at::detail::getMTIAHooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
@@ -156,6 +158,9 @@
void lazyInitXPU() {
c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); });
}
void lazyInitMTIA() {
c10::call_once(th_mtia_init, [&] { detail::getMTIAHooks().initMTIA(); });
}
void lazyInitPrivateUse1() {
c10::call_once(thp_init, [&] {
if (isPrivateUse1HooksRegistered()) {
@@ -349,6 +354,7 @@ class TORCH_API Context {
c10::once_flag thc_init;
c10::once_flag thh_init;
c10::once_flag thx_init;
c10::once_flag th_mtia_init;
c10::once_flag thp_init;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
8 changes: 8 additions & 0 deletions aten/src/ATen/DeviceAccelerator.cpp
@@ -10,14 +10,22 @@ C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#define CHECK_NO_PU1 \
TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1");

#define CHECK_NO_MTIA \
TORCH_CHECK(!at::hasMTIA(), "Cannot have MTIA with other devices");

if (is_privateuse1_backend_registered()) {
// We explicitly allow PrivateUse1 and another device at the same time
// as we use this for testing.
// Whenever a PrivateUse1 device is registered, use it first.
return kPrivateUse1;
} else if (at::hasCUDA()) {
CHECK_NO_PU1
CHECK_NO_MTIA
return kCUDA;
} else if (at::hasMTIA()) {
CHECK_NO_CUDA
CHECK_NO_PU1
return kMTIA;
} else {
TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.")
return std::nullopt;
8 changes: 4 additions & 4 deletions aten/src/ATen/NestedTensorImpl.cpp
@@ -81,7 +81,7 @@ inline std::vector<int64_t> construct_opt_sizes(const at::Tensor& sizes) {
std::vector<int64_t> result(1, sizes.sizes()[0]);
if (sizes.dim() > 0) {
size_t nested_dim = result.size();
int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
result.resize(nested_dim + sizes.sizes()[1]);
int64_t sizes_size_0 = sizes.sizes()[0];
int64_t sizes_size_1 = sizes.sizes()[1];
@@ -114,7 +114,7 @@ at::Tensor construct_nested_strides(const at::Tensor& sizes) {
return sizes;
}
at::Tensor strides = sizes.new_empty(sizes.sizes());
const int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
int64_t* strides_ptr = strides.data_ptr<int64_t>();
for (int64_t i = 0; i < sizes.size(0); i++) {
strides_ptr[orig_dim - 1] = 1;
@@ -152,7 +152,7 @@ at::Tensor construct_offsets(const at::Tensor& sizes) {
std::iota(offsets_ptr, offsets_ptr + ntensors, 0);
return offsets;
}
const int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
offsets_ptr[0] = 0;
for (const auto i : c10::irange(ntensors - 1)) {
const int64_t row_product = std::accumulate(sizes_ptr, sizes_ptr + orig_dim, 1, std::multiplies());
@@ -344,7 +344,7 @@ int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor) {
static_cast<uint64_t>(std::numeric_limits<int64_t>::max()),
static_cast<uint64_t>(std::numeric_limits<size_t>::max()));

const int64_t* sizes_ptr = tensor.data_ptr<int64_t>();
const int64_t* sizes_ptr = tensor.const_data_ptr<int64_t>();
const auto nt_dim = tensor.size(1);
uint64_t num_elements{0};

11 changes: 6 additions & 5 deletions aten/src/ATen/NestedTensorImpl.h
@@ -228,7 +228,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
}
const Tensor &sizemat = nt->get_nested_sizes(),
&stridemat = nt->get_nested_strides();
int64_t* offsets_ptr = nt->get_storage_offsets().data_ptr<int64_t>();
const int64_t* offsets_ptr =
nt->get_storage_offsets().const_data_ptr<int64_t>();
int64_t orig_dim = sizemat.size(1);
// nesting scalars
if (orig_dim == 0) {
@@ -243,8 +244,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
// nesting tensors
else {
// if any underlying tensor is non-contiguous
const int64_t *sizemat_ptr = sizemat.data_ptr<int64_t>(),
*stridemat_ptr = stridemat.data_ptr<int64_t>();
const int64_t *sizemat_ptr = sizemat.const_data_ptr<int64_t>(),
*stridemat_ptr = stridemat.const_data_ptr<int64_t>();
for (int64_t i = 0; i < ntensors; i++) {
if (stridemat_ptr[orig_dim - 1] != 1) {
return false;
@@ -263,8 +264,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
if (offsets_ptr[0] != 0) {
return false;
}
sizemat_ptr = sizemat.data_ptr<int64_t>();
stridemat_ptr = stridemat.data_ptr<int64_t>();
sizemat_ptr = sizemat.const_data_ptr<int64_t>();
stridemat_ptr = stridemat.const_data_ptr<int64_t>();
for (int64_t i = 1; i < ntensors; i++) {
if (offsets_ptr[i] !=
offsets_ptr[i - 1] + *sizemat_ptr * *stridemat_ptr) {
2 changes: 1 addition & 1 deletion aten/src/ATen/autocast_mode.h
@@ -728,7 +728,7 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.

// KERNEL_PRIVATEUSEONE/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE
// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastPrivateUse1
#define KERNEL_PRIVATEUSEONE(OP, ...) \
#define KERNEL_PRIVATEUSEONE(...) \
KERNEL(c10::DeviceType::PrivateUse1, __VA_ARGS__)

#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE( \
8 changes: 4 additions & 4 deletions aten/src/ATen/core/Formatting.cpp
@@ -72,7 +72,7 @@ static std::tuple<double, int> __printFormat(std::ostream& stream, const Tensor&
return std::make_tuple(1., 0);
}
bool intMode = true;
auto self_p = self.data_ptr<double>();
auto self_p = self.const_data_ptr<double>();
for (const auto i : c10::irange(size)) {
auto z = self_p[i];
if(std::isfinite(z)) {
@@ -189,7 +189,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line
}
for (const auto l : c10::irange(self.size(0))) {
Tensor row = self.select(0,l);
double *row_ptr = row.data_ptr<double>();
const double *row_ptr = row.const_data_ptr<double>();
for (const auto c : c10::irange(firstColumn, lastColumn+1)) {
stream << std::setw(sz) << row_ptr[c]/scale;
if(c == lastColumn) {
@@ -279,15 +279,15 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
tensor = tensor_.to(kCPU, kDouble).contiguous();
}
if(tensor.ndimension() == 0) {
stream << defaultfloat << tensor.data_ptr<double>()[0] << '\n';
stream << defaultfloat << tensor.const_data_ptr<double>()[0] << '\n';
stream << "[ " << tensor_.toString() << "{}";
} else if(tensor.ndimension() == 1) {
if (tensor.numel() > 0) {
auto [scale, sz] = __printFormat(stream, tensor);
if(scale != 1) {
printScale(stream, scale);
}
double* tensor_p = tensor.data_ptr<double>();
const double* tensor_p = tensor.const_data_ptr<double>();
for (const auto i : c10::irange(tensor.size(0))) {
stream << std::setw(sz) << tensor_p[i]/scale << '\n';
}
48 changes: 30 additions & 18 deletions aten/src/ATen/cpu/vec/vec256/vec256_convert.h
@@ -126,32 +126,44 @@ struct VecConvert<int32_t, 1, uint8_t, 1> {
}
};

template <typename dst_t, typename src_t>
struct VecConvert<
dst_t,
1,
src_t,
1,
typename std::enable_if_t<
(is_reduced_floating_point_v<dst_t> && is_8bit_integer_v<src_t>) ||
(is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
VectorizedN<float, 1> tmp_fp32 = VecConvert<float, 1, src_t, 1>::apply(src);
return VecConvert<dst_t, 1, float, 1>::apply(tmp_fp32);
}
};

template <typename dst_t>
struct VecConvert<
dst_t,
1,
float,
1,
typename std::enable_if_t<
std::is_same_v<dst_t, unsigned char> || std::is_same_v<dst_t, signed char>,
void>> {
static inline VectorizedN<dst_t, 1> apply(
const VectorizedN<float, 1>& src) {
dst_t,
1,
float,
1,
typename std::enable_if_t<is_8bit_integer_v<dst_t>,
void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 1>& src) {
return convert_float_to_int8<dst_t>(src[0]);
}
};

template <typename src_t>
struct VecConvert<
float,
1,
src_t,
1,
typename std::enable_if_t<
std::is_same_v<src_t, unsigned char> || std::is_same_v<src_t, signed char>,
void>> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<src_t, 1>& src) {
float,
1,
src_t,
1,
typename std::enable_if_t<is_8bit_integer_v<src_t>,
void>> {
static inline VectorizedN<float, 1> apply(const VectorizedN<src_t, 1>& src) {
return convert_int8_to_float<src_t>(src[0]);
}
};
