Update on "Fix hybrid sparse COO tensor conversion to meta tensor"
As in the title.

Addresses a bug reported in #117907 (comment)
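
For context, a minimal sketch of the conversion being fixed — a hybrid sparse COO tensor (one with both sparse and dense dimensions) moved to the meta device. The shapes and values here are illustrative, assuming post-fix behavior; the exact reproducer is in the linked issue.

import torch

# A "hybrid" sparse COO tensor: 2 sparse dims plus 1 trailing dense dim.
indices = torch.tensor([[0, 1], [1, 0]])          # (sparse_dim, nnz)
values = torch.tensor([[1.0, 2.0], [3.0, 4.0]])   # (nnz, dense dims...)
x = torch.sparse_coo_tensor(indices, values, size=(2, 2, 2))
assert x.sparse_dim() == 2 and x.dense_dim() == 1

# The conversion this commit repairs: with the fix, the hybrid layout
# (including the dense dims) should survive the move to meta.
m = x.to(device="meta")
assert m.is_sparse and m.device.type == "meta"
assert m.dense_dim() == 1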




cc alexsamardzic nikitaved cpuhrsch amjames bhosmer jcaip

[ghstack-poisoned]
pearu committed Apr 29, 2024
2 parents 9510ac7 + db204db commit 3bb888e
Showing 256 changed files with 3,161 additions and 38,971 deletions.
8 changes: 8 additions & 0 deletions .ci/docker/build.sh
@@ -306,6 +306,12 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping sccache due to the following issue
# https://github.com/pytorch/pytorch/issues/121559
SKIP_SCCACHE_INSTALL=yes
# snadampal: skipping the llvm source build install because the current
# version from pytorch/llvm:9.0.1 is x86-specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
*)
# Catch-all for builds that are not hardcoded.
@@ -399,6 +405,8 @@ DOCKER_BUILDKIT=1 docker build \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
5 changes: 3 additions & 2 deletions .ci/docker/requirements-ci.txt
@@ -263,10 +263,11 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
#Pinned versions:
#test that import:

#wheel not found on aarch64, and the source build requires rust
lintrunner==0.10.7 ; platform_machine == "x86_64"
#lintrunner is supported on aarch64-linux only from version 0.12.4
lintrunner==0.12.5 ; platform_machine == "aarch64"
#Description: all about linters!
#Pinned versions: 0.10.7
#Pinned versions: 0.10.7 on x86 and 0.12.5 on aarch64
#test that import:

rockset==1.0.3
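
As an aside, the platform guards on the lintrunner pins above are standard PEP 508 environment markers. A small illustrative sketch of how they evaluate, using the packaging library that pip builds on:

from packaging.markers import Marker

# The two guards from requirements-ci.txt; on any given host exactly one
# evaluates to True, so only one lintrunner version gets installed.
for spec in ('platform_machine == "x86_64"', 'platform_machine == "aarch64"'):
    print(spec, "->", Marker(spec).evaluate())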
6 changes: 5 additions & 1 deletion .ci/docker/ubuntu/Dockerfile
@@ -169,9 +169,11 @@ RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL}

# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi
RUN rm install_cache.sh

# Add jni.h for java host build
COPY ./common/install_jni.sh install_jni.sh
@@ -188,7 +190,9 @@ ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

# Install LLVM dev version (Defined in the pytorch/builder github repository)
ARG SKIP_LLVM_SRC_BUILD_INSTALL
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi

# AWS specific CUDA build guidance
ENV TORCH_CUDA_ARCH_LIST Maxwell
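
A quick, purely illustrative way to check the effect of the two skip flags inside a built image (not part of the commit):

import shutil

# /opt/cache/bin is prepended to PATH by the Dockerfile above; when
# SKIP_SCCACHE_INSTALL was unset, sccache resolves from there, and when
# it was set, the lookup returns None.
print(shutil.which("sccache"))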
6 changes: 5 additions & 1 deletion .ci/pytorch/build.sh
@@ -376,4 +376,8 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
python tools/stats/export_test_times.py
fi

print_sccache_stats
# snadampal: skipping this until sccache support is added for aarch64
# https://github.com/pytorch/pytorch/issues/121559
if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
  print_sccache_stats
fi
5 changes: 5 additions & 0 deletions .ci/pytorch/test.sh
@@ -181,6 +181,11 @@ if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
export PATH="$HOME/.local/bin:$PATH"
fi

if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
# TODO: revisit this once the CI is stabilized on aarch64 linux
export VALGRIND=OFF
fi

install_tlparse

# DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems
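
Returning to the VALGRIND toggle added above: the consuming side lives elsewhere in the CI scripts, but a hypothetical sketch of the contract looks like this (the variable name is from the diff; the consumer logic is assumed):

import os

# Hypothetical consumer: treat anything other than "ON" as disabled.
run_valgrind_tests = os.environ.get("VALGRIND", "ON") == "ON"
print("valgrind-backed tests enabled:", run_valgrind_tests)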
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
@@ -8,6 +8,7 @@ ciflow_push_tags:
- ciflow/binaries_wheel
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
- ciflow/periodic
38 changes: 38 additions & 0 deletions .github/workflows/linux-aarch64.yml
@@ -0,0 +1,38 @@
name: linux-aarch64

on:
  # For testing purposes; remove me later
  pull_request:
  push:
    tags:
      - ciflow/linux-aarch64/*
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  linux-jammy-aarch64-py3_10-build:
    name: linux-jammy-aarch64-py3.10
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-jammy-aarch64-py3.10
      docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11
      runner: linux.arm64.2xlarge
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 1, runner: "linux.arm64.2xlarge" },
        ]}

  linux-jammy-aarch64-py3_10-test:
    name: linux-jammy-aarch64-py3.10
    uses: ./.github/workflows/_linux-test.yml
    needs: linux-jammy-aarch64-py3_10-build
    permissions:
      id-token: write
      contents: read
    with:
      build-environment: linux-jammy-aarch64-py3.10
      docker-image: ${{ needs.linux-jammy-aarch64-py3_10-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-aarch64-py3_10-build.outputs.test-matrix }}
3 changes: 0 additions & 3 deletions .lintrunner.toml
@@ -1051,8 +1051,6 @@ exclude_patterns = [
'test/quantization/fx/test_numeric_suite_fx.py',
'test/quantization/fx/test_quantize_fx.py',
'test/quantization/fx/test_subgraph_rewriter.py',
'test/test_custom_op_testing.py',
'test/test_dataloader.py',
'test/test_datapipe.py',
'test/test_decomp.py',
'test/test_deploy.py',
@@ -1065,7 +1063,6 @@ exclude_patterns = [
'test/test_function_schema.py',
'test/test_functional_autograd_benchmark.py',
'test/test_functional_optim.py',
'test/test_functionalization.py',
'test/test_functionalization_of_rng_ops.py',
'test/test_futures.py',
'test/test_fx.py',
52 changes: 1 addition & 51 deletions BUILD.bazel
@@ -446,30 +446,13 @@ cu_library(
# caffe2
CAFFE2_COPTS = COMMON_COPTS + [
"-Dcaffe2_EXPORTS",
"-DCAFFE2_USE_GLOO",
"-DCAFFE2_USE_CUDNN",
"-DCAFFE2_BUILD_MAIN_LIB",
"-fvisibility-inlines-hidden",
"-fno-math-errno",
"-fno-trapping-math",
]

filegroup(
name = "caffe2_contrib_srcs",
srcs = [
"caffe2/contrib/aten/aten_op.cc",
"caffe2/contrib/gloo/allgather_ops.cc",
"caffe2/contrib/gloo/allreduce_ops.cc",
"caffe2/contrib/gloo/barrier_ops.cc",
"caffe2/contrib/gloo/broadcast_ops.cc",
"caffe2/contrib/gloo/common.cc",
"caffe2/contrib/gloo/common_world_ops.cc",
"caffe2/contrib/gloo/context.cc",
"caffe2/contrib/gloo/reduce_scatter_ops.cc",
"caffe2/contrib/gloo/store_handler.cc",
],
)

filegroup(
name = "caffe2_core_srcs",
srcs = [
@@ -1024,10 +1007,6 @@ filegroup(
filegroup(
name = "caffe2_cuda_cpp_srcs",
srcs = [
"caffe2/contrib/aten/aten_op_gpu.cc",
"caffe2/contrib/gloo/allreduce_ops_gpu.cc",
"caffe2/contrib/gloo/broadcast_ops_gpu.cc",
"caffe2/contrib/gloo/common_world_ops_gpu.cc",
"caffe2/core/blob_serialization_gpu.cc",
"caffe2/core/common_cudnn.cc",
"caffe2/core/common_gpu.cc",
@@ -1271,35 +1250,10 @@ cc_library(
],
)

py_binary(
name = "gen_op",
srcs = ["caffe2/contrib/aten/gen_op.py"],
deps = ["//torchgen"],
)

genrule(
name = "generated_caffe2_aten_op_headers",
srcs = [
"caffe2/contrib/aten/aten_op_template.h",
"aten/src/ATen/Declarations.yaml",
],
outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"],
cmd = """
$(location :gen_op) \
--output_prefix gen_ \
--install_dir $(@D) \
--aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. \
--template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \
--yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""",
tools = [":gen_op"],
)

cc_library(
name = "caffe2_headers",
hdrs = glob(
[
"caffe2/contrib/aten/*.h",
"caffe2/contrib/gloo/*.h",
"caffe2/core/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h",
@@ -1338,10 +1292,9 @@ cc_library(
) + if_cuda(glob([
"caffe2/**/*.cuh",
"caffe2/image/*.h",
])) + [":generated_caffe2_aten_op_headers"],
])),
copts = CAFFE2_COPTS,
includes = [
"caffe2/contrib/aten",
"caffe2/core/nomnigraph/include",
],
visibility = ["//visibility:public"],
@@ -1385,7 +1338,6 @@ cc_library(
"caffe2/db/create_db_op.cc",
"caffe2/db/protodb.cc",
"caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc",
":caffe2_contrib_srcs",
":caffe2_core_srcs",
":caffe2_distributed_srcs",
":caffe2_ideep_srcs",
@@ -1419,7 +1371,6 @@ cc_library(
"@fbgemm//:fbgemm_src_headers",
"@fmt",
"@foxi",
"@gloo",
"@onnx",
] + if_cuda(
[
@@ -1467,7 +1418,6 @@ cu_library(
"@cuda//:curand",
"@cudnn",
"@eigen",
"@gloo",
"@tensorpipe//:tensorpipe_cuda",
],
alwayslink = True,
8 changes: 4 additions & 4 deletions benchmarks/dynamo/common.py
@@ -2478,7 +2478,7 @@ def record_status(accuracy_status, dynamo_start_stats):
if isinstance(e, torch.cuda.OutOfMemoryError)
else "eager_1st_run_fail"
)
log.exception(e)
log.exception("")
return record_status(accuracy_status, dynamo_start_stats=start_stats)
finally:
del model_copy
@@ -2499,7 +2499,7 @@ def record_status(accuracy_status, dynamo_start_stats):
if isinstance(e, torch.cuda.OutOfMemoryError)
else "eager_2nd_run_fail"
)
log.exception(e)
log.exception("")
return record_status(accuracy_status, dynamo_start_stats=start_stats)
finally:
del model_copy
@@ -2551,7 +2551,7 @@ def record_status(accuracy_status, dynamo_start_stats):
with maybe_enable_compiled_autograd(self.args.compiled_autograd):
new_result = optimized_model_iter_fn(model_copy, example_inputs)
except Exception as e:
log.exception(e)
log.exception("")
print(
"TorchDynamo optimized model failed to run because of following error"
)
@@ -2653,7 +2653,7 @@ def check_tolerance(
optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
new_result = optimized_model_iter_fn(model, example_inputs)
except Exception as e:
log.exception(e)
log.exception("")
print(
"TorchDynamo optimized model failed to run because of following error"
)
2 changes: 1 addition & 1 deletion benchmarks/dynamo/runner.py
@@ -1452,7 +1452,7 @@ def update(self):
try:
RegressionTracker(self.args).diff()
except Exception as e:
logging.exception(e)
logging.exception("")
with open(f"{self.args.output_dir}/gh_regression.txt", "w") as gh_fh:
gh_fh.write("")

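The pattern behind the log.exception changes above, for reference: logging.exception already logs at ERROR level and appends the active traceback on its own, so passing the exception object as the message only duplicates information. A minimal sketch:

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

try:
    1 / 0
except Exception:
    # The traceback is attached automatically; no need to pass `e`.
    log.exception("")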
18 changes: 0 additions & 18 deletions caffe2/CMakeLists.txt
@@ -59,23 +59,7 @@ if(INTERN_BUILD_ATEN_OPS)

# Generate the headers wrapped by our operator
file(GLOB_RECURSE torchgen_python "${PROJECT_SOURCE_DIR}/torchgen/*.py")
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h
COMMAND
"${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py
--aten_root=${CMAKE_CURRENT_SOURCE_DIR}/../aten
--template_dir=${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten
--yaml_dir=${CMAKE_BINARY_DIR}/aten/src/ATen
--install_dir=${CMAKE_CURRENT_BINARY_DIR}/contrib/aten
DEPENDS
${torchgen_python}
${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml
${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py
${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/aten_op_template.h)

add_custom_target(__aten_op_header_gen
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h)
add_library(aten_op_header_gen INTERFACE)
add_dependencies(aten_op_header_gen __aten_op_header_gen)

# Add source, includes, and libs to lists
list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS})
@@ -132,7 +116,6 @@ endif()

# Skip modules that are not used by libtorch mobile yet.
if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
add_subdirectory(contrib)
add_subdirectory(predictor)
add_subdirectory(predictor/emulator)
add_subdirectory(core/nomnigraph)
@@ -141,7 +124,6 @@ if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
endif()
add_subdirectory(db)
add_subdirectory(distributed)
# add_subdirectory(experiments) # note, we may remove this folder at some point
add_subdirectory(ideep)
add_subdirectory(image)
add_subdirectory(video)
37 changes: 0 additions & 37 deletions caffe2/contrib/CMakeLists.txt

This file was deleted.

Empty file removed: caffe2/contrib/__init__.py
12 changes: 0 additions & 12 deletions caffe2/contrib/aten/CMakeLists.txt

This file was deleted.
