Update on "[Inductor][CPP] Add Min/Max with VecMask"

**Summary**
Fix issue #126824: support for `min/max` with `VecMask` was missing.

**Test Plan**
```
python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_clamp_max_cpu_bool
python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_clamp_min_cpu_bool
```

cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 voznesenskym penguinwu EikanWang Guobing-Chen zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 ColinPeppler amjames desertfire chauhang [ghstack-poisoned]
pytorch · May 23, 2024 · b5df05d · b5df05d
2 parents 90a84cc + 3b73445
commit b5df05d
Show file tree

Hide file tree

Showing 152 changed files with 3,660 additions and 1,126 deletions.
diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh
@@ -18,6 +18,7 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_nccl
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
+time python test/run_test.py --verbose -i distributed/test_cuda_p2p
 time python test/run_test.py --verbose -i distributed/test_store
 time python test/run_test.py --verbose -i distributed/test_pg_wrapper
 time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent

diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
@@ -326,6 +326,7 @@ test_inductor_distributed() {
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
   python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose
 
   # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
@@ -355,7 +356,7 @@ test_inductor_cpp_wrapper_abi_compatible() {
 
   echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
   # cpu stack allocation causes segfault and needs more investigation
-  TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
+  python test/run_test.py --include inductor/test_cpu_cpp_wrapper
   python test/run_test.py --include inductor/test_cuda_cpp_wrapper
 
   TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
@@ -569,6 +570,8 @@ test_inductor_torchbench_smoketest_perf() {
   # Test some models in the cpp wrapper mode
   TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
     --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
   python benchmarks/dynamo/check_accuracy.py \
     --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
     --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"

diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml
@@ -66,7 +66,8 @@ runs:
         command: |
           set -eux
           # PyYAML 6.0 doesn't work with MacOS x86 anymore
-          python3 -m pip install requests==2.26.0 pyyaml==6.0.1
+          # This must run on Python 3.7 (AmazonLinux2), so it can't use requests==2.32.2
+          python3 -m pip install requests==2.27.1 pyyaml==6.0.1
 
     - name: Parse ref
       id: parse-ref

diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt
@@ -10,6 +10,6 @@ lintrunner==0.10.7
 ninja==1.10.0.post1
 nvidia-ml-py==11.525.84
 pyyaml==6.0
-requests==2.31.0
+requests==2.32.2
 rich==10.9.0
 rockset==1.0.3
diff --git a/.github/requirements/conda-env-Linux-X64.txt b/.github/requirements/conda-env-Linux-X64.txt
@@ -4,6 +4,6 @@ mkl-include=2022.1.0
 ninja=1.10.2
 numpy=1.23.3
 pyyaml=6.0
-requests=2.31.0
+requests=2.32.2
 setuptools=68.2.2
 typing-extensions=4.3.0
diff --git a/.github/requirements/conda-env-iOS.txt b/.github/requirements/conda-env-iOS.txt
@@ -3,6 +3,6 @@ cmake=3.22.1
 ninja=1.10.2
 numpy=1.23.3
 pyyaml=6.0
-requests=2.31.0
+requests=2.32.2
 setuptools=68.2.2
 typing-extensions=4.3.0
diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh
@@ -7,7 +7,7 @@ eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
 conda activate "${CONDA_ENV}"
 
 # Use uv to speed up lintrunner init
-python3 -m pip install uv
+python3 -m pip install uv==0.1.45
 
 CACHE_DIRECTORY="/tmp/.lintbin"
 # Try to recover the cached binaries

diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml
@@ -18,6 +18,6 @@ jobs:
           ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          pip3 install requests==2.26
+          pip3 install requests==2.32.2
           pip3 install rockset==1.0.3
           python3 .github/scripts/close_nonexistent_disable_issues.py
diff --git a/.github/workflows/nightly-rockset-uploads.yml b/.github/workflows/nightly-rockset-uploads.yml
@@ -32,7 +32,7 @@ jobs:
           cache: pip
 
       - run: |
-          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
 
       - name: Upload external contribution stats
         uses: nick-fields/retry@v2.8.2

diff --git a/.github/workflows/upload-alerts.yml b/.github/workflows/upload-alerts.yml
@@ -28,7 +28,7 @@ jobs:
 
       - name: Install Python Packages
         run: |
-          pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.27.1
+          pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.32.2
 
       - name: Create alerts
         run: |

diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml
@@ -47,7 +47,7 @@ jobs:
           cache: pip
 
       - run: |
-          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
 
       - name: Upload test artifacts
         id: upload-s3

diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml
@@ -40,7 +40,7 @@ jobs:
           cache: pip
 
       - run: |
-          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
 
       - name: Upload torch dynamo performance stats to S3
         id: upload-s3

diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml
@@ -28,7 +28,7 @@ jobs:
           cache: pip
 
       - run: |
-          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
 
       - name: Upload test stats
         env:

diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h
@@ -167,8 +167,9 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
   void set(cudnnDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc);
 
   void set(cudnnDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
-    fixSizeOneDimStride<int>(dim, size, stride, nhwc);
-    AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, stride));
+    std::vector<int> strides_copy(stride, stride + dim);
+    fixSizeOneDimStride<int>(dim, size, strides_copy.data(), nhwc);
+    AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, strides_copy.data()));
   }
 };
 

diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp
@@ -438,11 +438,12 @@ void foreach_tensor_zero_slow_(TensorList tensors) {
 
 std::vector<Tensor> foreach_tensor_norm_slow(
     TensorList tensors,
-    const Scalar& ord) {
+    const Scalar& ord,
+    c10::optional<ScalarType> dtype) {
   check_foreach_api_restrictions(tensors);
   std::vector<Tensor> result;
   for (const auto& t : tensors) {
-    result.emplace_back(at::linalg_vector_norm(t, ord));
+    result.emplace_back(at::linalg_vector_norm(t, ord, {}, false, dtype));
   }
   return result;
 }