Skip to content

Commit

Permalink
Update on "[Inductor][CPP] Add Min/Max with VecMask"
Browse files Browse the repository at this point in the history
**Summary**
Fixes issue #126824: adds the missing support for `min/max` with `VecMask`.

**TestPlan**
```
python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_clamp_max_cpu_bool
python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_clamp_min_cpu_bool
```

cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 voznesenskym penguinwu EikanWang Guobing-Chen zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
  • Loading branch information
leslie-fang-intel committed May 23, 2024
2 parents 90a84cc + 3b73445 commit b5df05d
Show file tree
Hide file tree
Showing 152 changed files with 3,660 additions and 1,126 deletions.
1 change: 1 addition & 0 deletions .ci/pytorch/multigpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
time python test/run_test.py --verbose -i distributed/test_cuda_p2p
time python test/run_test.py --verbose -i distributed/test_store
time python test/run_test.py --verbose -i distributed/test_pg_wrapper
time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
Expand Down
5 changes: 4 additions & 1 deletion .ci/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ test_inductor_distributed() {
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
Expand Down Expand Up @@ -355,7 +356,7 @@ test_inductor_cpp_wrapper_abi_compatible() {

echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
# cpu stack allocation causes segfault and needs more investigation
TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper

TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
Expand Down Expand Up @@ -569,6 +570,8 @@ test_inductor_torchbench_smoketest_perf() {
# Test some models in the cpp wrapper mode
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
Expand Down
3 changes: 2 additions & 1 deletion .github/actions/filter-test-configs/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ runs:
command: |
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
python3 -m pip install requests==2.26.0 pyyaml==6.0.1
# This must run on Python-3.7 (AmazonLinux2), so it can't use requests==2.32.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
- name: Parse ref
id: parse-ref
Expand Down
2 changes: 1 addition & 1 deletion .github/requirements-gha-cache.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0
requests==2.31.0
requests==2.32.2
rich==10.9.0
rockset==1.0.3
2 changes: 1 addition & 1 deletion .github/requirements/conda-env-Linux-X64.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.31.0
requests=2.32.2
setuptools=68.2.2
typing-extensions=4.3.0
2 changes: 1 addition & 1 deletion .github/requirements/conda-env-iOS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ cmake=3.22.1
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.31.0
requests=2.32.2
setuptools=68.2.2
typing-extensions=4.3.0
2 changes: 1 addition & 1 deletion .github/scripts/lintrunner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
conda activate "${CONDA_ENV}"

# Use uv to speed up lintrunner init
python3 -m pip install uv
python3 -m pip install uv==0.1.45

CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/close-nonexistent-disable-issues.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ jobs:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
pip3 install requests==2.26
pip3 install requests==2.32.2
pip3 install rockset==1.0.3
python3 .github/scripts/close_nonexistent_disable_issues.py
2 changes: 1 addition & 1 deletion .github/workflows/nightly-rockset-uploads.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
cache: pip

- run: |
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
- name: Upload external contribution stats
uses: nick-fields/retry@v2.8.2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/upload-alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:

- name: Install Python Packages
run: |
pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.27.1
pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.32.2
- name: Create alerts
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/upload-test-stats.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
cache: pip

- run: |
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
- name: Upload test artifacts
id: upload-s3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/upload-torch-dynamo-perf-stats.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
cache: pip

- run: |
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
- name: Upload torch dynamo performance stats to S3
id: upload-s3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/upload_test_stats_intermediate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
cache: pip

- run: |
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
- name: Upload test stats
env:
Expand Down
5 changes: 3 additions & 2 deletions aten/src/ATen/cudnn/Descriptors.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
void set(cudnnDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc);

void set(cudnnDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
fixSizeOneDimStride<int>(dim, size, stride, nhwc);
AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, stride));
std::vector<int> strides_copy(stride, stride + dim);
fixSizeOneDimStride<int>(dim, size, strides_copy.data(), nhwc);
AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, strides_copy.data()));
}
};

Expand Down
5 changes: 3 additions & 2 deletions aten/src/ATen/native/ForeachOpsKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -438,11 +438,12 @@ void foreach_tensor_zero_slow_(TensorList tensors) {

std::vector<Tensor> foreach_tensor_norm_slow(
TensorList tensors,
const Scalar& ord) {
const Scalar& ord,
c10::optional<ScalarType> dtype) {
check_foreach_api_restrictions(tensors);
std::vector<Tensor> result;
for (const auto& t : tensors) {
result.emplace_back(at::linalg_vector_norm(t, ord));
result.emplace_back(at::linalg_vector_norm(t, ord, {}, false, dtype));
}
return result;
}
Expand Down
Loading

0 comments on commit b5df05d

Please sign in to comment.