[T155306323] [WIP] Add ARM Support in OSS CI #1813

Closed · wants to merge 1 commit
7 changes: 5 additions & 2 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -294,14 +294,17 @@ build_fbgemm_gpu_package () {
echo "################################################################################"
echo ""

# manylinux1_x86_64 is specified for PyPI upload
# manylinux2014 is specified because manylinux1 does not support aarch64
# See https://github.com/pypa/manylinux
local plat_name="manylinux2014_${MACHINE_NAME}"

# Distribute Python extensions as wheels on Linux
echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..."
print_exec conda run -n "${env_name}" \
python setup.py bdist_wheel \
--package_name="${package_name}" \
--python-tag="${python_tag}" \
--plat-name="manylinux1_${MACHINE_NAME}" \
--plat-name="${plat_name}" \
"${build_args[@]}"

# Run checks on the built libraries
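For context: MACHINE_NAME is assumed here to carry the output of uname -m (its assignment is outside this diff), so the single plat_name variable now yields a valid platform tag on both architectures. A minimal sketch of the resulting behavior, not part of this PR:

    # Sketch only; assumes MACHINE_NAME is derived from `uname -m`
    - name: Resolve Wheel Platform Tag
      shell: bash
      run: |
        MACHINE_NAME="$(uname -m)"
        plat_name="manylinux2014_${MACHINE_NAME}"
        # manylinux2014_x86_64 on x86 hosts; manylinux2014_aarch64 on ARM hosts
        echo "${plat_name}"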
1 change: 1 addition & 0 deletions .github/scripts/utils_system.bash
@@ -140,6 +140,7 @@ __print_system_info_linux () {
echo "################################################################################"
echo "[INFO] Print Linux distribution info ..."
print_exec uname -a
print_exec uname -m
print_exec cat /proc/version
print_exec cat /etc/os-release
}
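The added uname -m call makes the host architecture explicit in CI logs, which matters once the same workflow fans out across x86 and ARM runners. A sketch of what the extra line would report on the runner types used in this PR (step name hypothetical):

    - name: Print Machine Architecture
      run: uname -m
      # linux.4xlarge / linux.12xlarge (x86): x86_64
      # linux.t4g.2xlarge (ARM Graviton):     aarch64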
5 changes: 4 additions & 1 deletion .github/workflows/fbgemm_ci.yml
@@ -20,7 +20,7 @@ concurrency:

jobs:
build-linux:
runs-on: linux.12xlarge
runs-on: ${{ matrix.host-machine }}
container:
image: ${{ matrix.container-image }}
options: --user root
@@ -34,6 +34,9 @@
strategy:
fail-fast: false
matrix:
host-machine: [
"linux.12xlarge", # x86 machine
]
container-image: [ "ubuntu:20.04" ]
library-type: [ static, shared ]

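fbgemm_ci.yml gains only the x86 entry for now, but moving runs-on behind a matrix reference means an ARM runner can later be added by appending one list element. A hypothetical extension, not part of this PR, following the same pattern the GPU workflows below use:

    host-machine: [
      "linux.12xlarge",     # x86 machine
      "linux.t4g.2xlarge",  # ARM machine (hypothetical addition)
    ]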
20 changes: 15 additions & 5 deletions .github/workflows/fbgemm_gpu_ci.yml
@@ -29,7 +29,7 @@ concurrency:

jobs:
build_and_test_amd:
runs-on: linux.12xlarge
runs-on: ${{ matrix.host-machine.instance }}
container:
image: ${{ matrix.container-image }}
options: --user root
@@ -42,6 +42,9 @@
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.12xlarge" },
]
container-image: [ "ubuntu:20.04" ]
python-version: [ "3.8", "3.9", "3.10" ]
rocm-version: [ "5.3", "5.4.2" ]
@@ -94,7 +97,7 @@


test_amd_gpu:
runs-on: rocm
runs-on: ${{ matrix.host-machine.instance }}
container:
image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
@@ -108,6 +111,9 @@
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "rocm" },
]
# ROCm machines are limited, so we only test against Python 3.10
python-version: [ "3.10" ]
rocm-version: [ "5.3", "5.4.2" ]
@@ -157,7 +163,7 @@


build_and_test_cpu:
runs-on: linux.12xlarge
runs-on: ${{ matrix.host-machine.instance }}
container:
image: ${{ matrix.container-image }}
options: --user root
@@ -170,8 +176,12 @@
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.t4g.2xlarge" },
]
container-image: [ "ubuntu:20.04", "ubuntu:22.04" ]
python-version: [ "3.8", "3.9", "3.10" ]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]

steps:
- name: Setup Build Container
@@ -210,5 +220,5 @@
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu

- name: Test FBGEMM_GPU-CPU Nightly Installation
timeout-minutes: 10
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
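Because each host-machine entry is an object carrying both arch and instance, later steps can key off matrix.host-machine.arch instead of parsing runner labels, and standard matrix rules can branch on architecture. For example, if a Python version were ever unavailable on ARM, it could be dropped with an exclude rule (hypothetical sketch, not in this PR):

    strategy:
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.4xlarge" },
          { arch: arm, instance: "linux.t4g.2xlarge" },
        ]
        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
        # Hypothetical: skip Python 3.8 on the ARM runner only
        exclude:
          - host-machine: { arch: arm, instance: "linux.t4g.2xlarge" }
            python-version: "3.8"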
22 changes: 15 additions & 7 deletions .github/workflows/fbgemm_gpu_cpu_nightly.yml
@@ -39,7 +39,7 @@ concurrency:
jobs:
# Build on CPU hosts, run tests, and upload to GHA
build_artifact:
runs-on: linux.4xlarge
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root
@@ -54,6 +54,10 @@
# Don't fast-fail all the other builds if one of them fails
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.t4g.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]

steps:
@@ -84,7 +88,7 @@
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install PyTorch-CPU Nightly
run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
@@ -95,13 +99,13 @@
- name: Upload Built Wheel as GHA Artifact
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl
name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu_nightly_cpu-*.whl


# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
runs-on: linux.4xlarge
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root
@@ -114,6 +118,10 @@
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.t4g.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
needs: build_artifact

@@ -129,7 +137,7 @@
- name: Download Wheel Artifact from GHA
uses: actions/download-artifact@v3
with:
name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl
name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}.whl

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info
@@ -143,8 +151,8 @@
- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu
- name: Install PyTorch-CPU Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
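Note the artifact renaming: upload-artifact and download-artifact are paired by exact name, and both jobs run the same two-architecture matrix, so embedding matrix.host-machine.arch keeps the x86 and ARM legs from overwriting each other's wheels under a single artifact name. The pairing, condensed (a sketch of the pattern already shown above):

    # build_artifact job
    - uses: actions/upload-artifact@v3
      with:
        name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}.whl
        path: fbgemm_gpu/dist/fbgemm_gpu_nightly_cpu-*.whl

    # test_and_publish_artifact job: same matrix, so the name resolves identically
    - uses: actions/download-artifact@v3
      with:
        name: fbgemm_gpu_nightly_cpu_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}.whl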
20 changes: 14 additions & 6 deletions .github/workflows/fbgemm_gpu_cpu_release.yml
@@ -30,7 +30,7 @@ concurrency:
jobs:
# Build on CPU hosts, run tests, and upload to GHA
build_artifact:
runs-on: linux.4xlarge
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root
@@ -45,6 +45,10 @@
# Don't fast-fail all the other builds if one of them fails
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.t4g.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]

steps:
@@ -75,7 +79,7 @@
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install PyTorch-CPU Test
run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cpu

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
@@ -86,13 +90,13 @@
- name: Upload Built Wheel as GHA Artifact
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl
name: fbgemm_gpu_release_cpu_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu_cpu-*.whl


# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
runs-on: linux.4xlarge
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root
@@ -105,6 +109,10 @@
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.4xlarge" },
{ arch: arm, instance: "linux.t4g.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
needs: build_artifact

@@ -120,7 +128,7 @@
- name: Download Wheel Artifact from GHA
uses: actions/download-artifact@v3
with:
name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl
name: fbgemm_gpu_release_cpu_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}.whl

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info
@@ -135,7 +143,7 @@
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install PyTorch Test
run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cpu

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
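Both CPU workflows also swap install_pytorch_conda for install_pytorch_pip; the helper's internals are outside this diff, and a plausible motivation is that PyTorch's conda packages have historically lagged its pip wheels for aarch64. A hypothetical stand-in for what the helper runs in the nightly/cpu case:

    - name: Install PyTorch-CPU Nightly
      run: |
        # Hypothetical equivalent of `install_pytorch_pip $BUILD_ENV nightly cpu`;
        # the release workflow's `test cpu` variant would point at .../whl/test/cpu
        conda run -n "$BUILD_ENV" pip install --pre torch \
          --index-url https://download.pytorch.org/whl/nightly/cpu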
14 changes: 10 additions & 4 deletions .github/workflows/fbgemm_gpu_cuda_nightly.yml
@@ -38,7 +38,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
runs-on: linux.24xlarge
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root
@@ -53,6 +53,9 @@
# Don't fast-fail all the other builds if one of them fails
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.24xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
cuda-version: [ "11.7.1", "11.8.0" ]

@@ -102,15 +105,15 @@
- name: Upload Built Wheel as GHA Artifact
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl
name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu_nightly-*.whl


# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
# runs-on: linux.4xlarge.nvidia.gpu
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: linux.g5.4xlarge.nvidia.gpu
runs-on: ${{ matrix.host-machine.instance }}
defaults:
run:
shell: bash
@@ -121,6 +124,9 @@
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
cuda-version: [ "11.7.1", "11.8.0" ]
# Specify exactly ONE CUDA version for artifact publish
@@ -136,7 +142,7 @@
- name: Download Wheel Artifact from GHA
uses: actions/download-artifact@v3
with:
name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl
name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

# Use PyTorch test infrastructure action - https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml
- name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
14 changes: 10 additions & 4 deletions .github/workflows/fbgemm_gpu_cuda_release.yml
@@ -30,7 +30,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
runs-on: linux.24xlarge
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root
@@ -45,6 +45,9 @@
# Don't fast-fail all the other builds if one of them fails
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.24xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
cuda-version: [ "11.7.1", "11.8.0" ]

@@ -93,13 +96,13 @@
- name: Upload Built Wheel as GHA Artifact
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl
name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/fbgemm_gpu-*.whl


# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
runs-on: linux.g5.4xlarge.nvidia.gpu
runs-on: ${{ matrix.host-machine.instance }}
defaults:
run:
shell: bash
@@ -110,6 +113,9 @@
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
cuda-version: [ "11.7.1", "11.8.0" ]
# Specify exactly ONE CUDA version for artifact publish
@@ -125,7 +131,7 @@
- name: Download Wheel Artifact from GHA
uses: actions/download-artifact@v3
with:
name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl
name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

- name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -108,7 +108,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# Add address sanitizer
set(USE_SANITIZER "" CACHE STRING "options include address, leak, ...")

# Check if compiler supports avx512
# Check if compiler supports AVX512
include(CheckCXXCompilerFlag)
if(MSVC)
CHECK_CXX_COMPILER_FLAG(/arch:AVX512 COMPILER_SUPPORTS_AVX512)
3 changes: 2 additions & 1 deletion fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp
@@ -8,12 +8,13 @@

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <c10/core/SymIntArrayRef.h>
#include <torch/csrc/autograd/custom_function.h>
#include <torch/library.h>
#include <torch/torch.h>

#include "ATen/TensorUtils.h"
#include "fbgemm_gpu/sparse_ops.h"
#include "fbgemm_gpu/sparse_ops_utils.h"
