Enable NCCL code (pytorch#2631)
Summary:
- Enable NCCL code and tests for multi-GPU CAR

Differential Revision: D58147817

Pulled By: q10
q10 committed Jun 4, 2024
1 parent 7d4b51e commit 7b13182
Showing 17 changed files with 93 additions and 58 deletions.
6 changes: 4 additions & 2 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -123,8 +123,10 @@ __configure_fbgemm_gpu_build_nvcc () {
build_args+=(
# Override CMake configuration
-DCMAKE_CXX_STANDARD="${cppstd_ver}"
-    -DNCCL_INCLUDE_DIR=${nccl_path}/include
-    -DNCCL_LIB_DIR=${nccl_path}/lib
+    -DNCCL_INCLUDE_DIRS=${nccl_path}/include
+    -DNCCL_LIBRARIES=${nccl_lib}
+    -DCMAKE_C_FLAGS="'-L${nccl_path}/lib'"
+    -DCMAKE_CXX_FLAGS="'-L${nccl_path}/lib'"
)
}

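For context, a minimal sketch of the CMake configure step these flags produce, assuming `nccl_path` points at an NCCL installation and `nccl_lib` at its shared library (both values below are hypothetical placeholders, not taken from this diff):

    # Hypothetical paths; substitute your actual NCCL installation
    nccl_path="${CONDA_PREFIX}"
    nccl_lib="${nccl_path}/lib/libnccl.so.2"

    cmake -B build \
        -DCMAKE_CXX_STANDARD=17 \
        -DNCCL_INCLUDE_DIRS="${nccl_path}/include" \
        -DNCCL_LIBRARIES="${nccl_lib}" \
        -DCMAKE_C_FLAGS="-L${nccl_path}/lib" \
        -DCMAKE_CXX_FLAGS="-L${nccl_path}/lib"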
2 changes: 1 addition & 1 deletion .github/scripts/fbgemm_gpu_install.bash
@@ -99,7 +99,7 @@ install_fbgemm_gpu_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env 0.5.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env release cuda 12.4.1 # Install the CUDA variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env test/0.6.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env test/0.7.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env nightly rocm 5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
10 changes: 2 additions & 8 deletions .github/scripts/fbgemm_gpu_test.bash
@@ -70,16 +70,12 @@ __configure_fbgemm_gpu_test_cpu () {
# These tests have non-CPU operators referenced in @given
./uvm/copy_test.py
./uvm/uvm_test.py
- # require multiple GPUs
- ./comm/multi_gpu_car_test.py
)
}

__configure_fbgemm_gpu_test_cuda () {
ignored_tests=(
./tbe/ssd/ssd_split_table_batched_embeddings_test.py
- # require multiple GPUs
- ./comm/multi_gpu_car_test.py
)
}

@@ -105,8 +101,6 @@ __configure_fbgemm_gpu_test_rocm () {
./tbe/ssd/ssd_split_table_batched_embeddings_test.py
# https://github.com/pytorch/FBGEMM/issues/1559
./batched_unary_embeddings_test.py
- # require multiple GPUs
- ./comm/multi_gpu_car_test.py
)
}

@@ -250,7 +244,7 @@ test_setup_conda_environment () {
if [ "$pytorch_variant_type" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME COMPILER PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_CHANNEL[/VERSION] PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.6.0 cuda 12.1.0 # Setup environment with pytorch-test 0.6.0 for Clang + Python 3.12 + CUDA 12.1.0"
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.7.0 cuda 12.1.0 # Setup environment with pytorch-test 0.7.0 for Clang + Python 3.12 + CUDA 12.1.0"
return 1
else
echo "################################################################################"
@@ -332,7 +326,7 @@ test_fbgemm_gpu_setup_and_pip_install () {
if [ "$fbgemm_gpu_channel_version" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_CHANNEL[/VERSION] FBGEMM_GPU_CHANNEL[/VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} test_env cpu test/2.2.0 test/0.6.0 # Run tests against all Python versions with PyTorch test/2.2.0 and FBGEMM_GPU test/0.6.0 (CPU-only)"
echo " ${FUNCNAME[0]} test_env cpu test/2.2.0 test/0.7.0 # Run tests against all Python versions with PyTorch test/2.2.0 and FBGEMM_GPU test/0.7.0 (CPU-only)"
echo " ${FUNCNAME[0]} test_env cuda test/2.3.0 test/0.7.0 # Run tests against all Python versions with PyTorch test/2.3.0 and FBGEMM_GPU test/0.7.0 (all CUDA versions)"
return 1
else
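With ./comm/multi_gpu_car_test.py removed from the ignored-tests lists above, the multi-GPU CAR test now runs as part of the standard test sweep. A sketch of invoking it directly, assuming a repository checkout and at least two visible GPUs:

    # Requires >= 2 GPUs; run from the fbgemm_gpu/test directory
    cd fbgemm_gpu/test
    python -m pytest -v ./comm/multi_gpu_car_test.py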
6 changes: 3 additions & 3 deletions .github/scripts/utils_pip.bash
@@ -176,11 +176,11 @@ install_from_pytorch_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env torch release cpu # Install the CPU variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.6.0rc0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.7.0rc0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton 1.11.0 # Install specific version from release channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton release # Install latest version from release channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.6.0rc0 # Install specific version from test channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.7.0rc0 # Install specific version from test channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton_rocm nightly # Install latest version from nightly channel"
return 1
else
@@ -233,7 +233,7 @@ download_from_pytorch_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env torch release cpu # Download the CPU variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.6.0rc0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.7.0rc0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/5.3 # Download the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
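The updated usage strings track the 0.7.0 release cycle. For reference, a hedged sketch of the pip invocation such a helper would construct (the index URL follows PyTorch's channel layout and is an assumption, not taken from this diff):

    # Install the CUDA 12.1 variant of fbgemm-gpu 0.7.0rc0 from the test channel
    pip install --pre fbgemm-gpu==0.7.0rc0 \
        --index-url https://download.pytorch.org/whl/test/cu121/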
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
@@ -31,7 +31,7 @@ on:
required: true
default: "nightly"
fbgemm_gpu_channel_version:
- description: FBGEMM-GPU Channel + Version (e.g. '0.5.0', 'nightly', 'test/0.6.0r0')
+ description: FBGEMM-GPU Channel + Version (e.g. '0.5.0', 'nightly', 'test/0.7.0r0')
type: string
required: true
default: "nightly"
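Assuming `fbgemm_gpu_channel_version` sits under a workflow_dispatch trigger, as the on: block above suggests, the workflow can be run with the new channel string via the GitHub CLI (a sketch; only the workflow file name and input name are taken from this diff):

    gh workflow run fbgemm_gpu_pip.yml \
        -f fbgemm_gpu_channel_version=test/0.7.0rc0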
6 changes: 3 additions & 3 deletions cmake/modules/CudaSetup.cmake
@@ -12,10 +12,10 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
################################################################################

BLOCK_PRINT(
"NCCL flags"
"NCCL Flags"
""
"NCCL_INCLUDE_DIR=${NCCL_INCLUDE_DIR}"
"NCCL_LIB_DIR=${NCCL_LIB_DIR}"
"NCCL_INCLUDE_DIRS=${NCCL_INCLUDE_DIRS}"
"NCCL_LIBRARIES=${NCCL_LIBRARIES}"
)

# Set NVML_LIB_PATH if provided, or detect the default lib path
8 changes: 8 additions & 0 deletions cmake/modules/PyTorchSetup.cmake
@@ -13,6 +13,14 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)

find_package(Torch REQUIRED)

+ BLOCK_PRINT(
+   "PyTorch Flags"
+   ""
+   "TORCH_INCLUDE_DIRS=${TORCH_INCLUDE_DIRS}"
+   ""
+   "TORCH_LIBRARIES=${TORCH_LIBRARIES}"
+ )

#
# PyTorch CUDA Extensions are normally compiled with the flags below. However we
# disabled -D__CUDA_NO_HALF_CONVERSIONS__ here as it caused "error: no suitable
6 changes: 3 additions & 3 deletions fbgemm_gpu/FbgemmGpu.cmake
@@ -31,7 +31,7 @@ set(fbgemm_sources_include_directories
${THIRDPARTY}/cpuinfo/include
${THIRDPARTY}/cutlass/include
${THIRDPARTY}/cutlass/tools/util/include
-   ${NCCL_INCLUDE_DIR})
+   ${NCCL_INCLUDE_DIRS})


################################################################################
@@ -673,15 +673,15 @@ endif()
# Add PyTorch include/
target_include_directories(fbgemm_gpu_py PRIVATE
${TORCH_INCLUDE_DIRS}
-   ${NCCL_INCLUDE_DIR})
+   ${NCCL_INCLUDE_DIRS})

# Remove `lib` from the output artifact name `libfbgemm_gpu_py.so`
set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "")

# Link to PyTorch
target_link_libraries(fbgemm_gpu_py
${TORCH_LIBRARIES}
-   ${NCCL_LIB_DIR})
+   ${NCCL_LIBRARIES})

# Link to NVML
if(NVML_LIB_PATH)
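Once the library is built, its NCCL linkage can be sanity-checked on the output artifact (a sketch; the artifact name `fbgemm_gpu_py.so` follows from the PREFIX setting above):

    # Confirm the built extension resolves its NCCL dependency
    ldd fbgemm_gpu_py.so | grep -i nccl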
4 changes: 2 additions & 2 deletions fbgemm_gpu/docs/src/conf.py
@@ -28,10 +28,10 @@
author = "FBGEMM Team"

# The short X.Y version.
version = "0.6"
version = "0.7"

# The full version, including alpha/beta/rc tags
release = "0.6.0"
release = "0.7.0"


# -- Path setup --------------------------------------------------------------
23 changes: 21 additions & 2 deletions fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst
@@ -115,14 +115,16 @@ Install the full CUDA package through Conda, which includes
# Install the full CUDA package
conda install -n ${env_name} -y cuda -c "nvidia/label/cuda-${cuda_version}"
- Verify that ``cuda_runtime.h`` and ``libnvidia-ml.so`` are found:
+ Verify that ``cuda_runtime.h``, ``libnvidia-ml.so``, and ``libnccl.so*`` are
+ found:

.. code:: sh
conda_prefix=$(conda run -n ${env_name} printenv CONDA_PREFIX)
find "${conda_prefix}" -name cuda_runtime.h
find "${conda_prefix}" -name libnvidia-ml.so
find "${conda_prefix}" -name libnccl.so*
Install cuDNN
~~~~~~~~~~~~~
@@ -141,6 +143,14 @@ cuDNN package for the given CUDA version:
wget -q "${cudnn_url}" -O cudnn.tar.xz
tar -xvf cudnn.tar.xz
+ Install CUTLASS
+ ~~~~~~~~~~~~~~~
+
+ This section is only applicable to building the experimental FBGEMM_GPU GenAI
+ module. CUTLASS should already be available in the repository as a git
+ submodule (see :ref:`fbgemm-gpu.build.prepare`). The following include paths
+ are already added to the CMake configuration:
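For reference, the include paths in question, as they appear in the experimental CMakeLists.txt hunks later in this commit:

    ${THIRDPARTY}/cutlass/include
    ${THIRDPARTY}/cutlass/tools/util/include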


Set Up for ROCm Build
---------------------
@@ -407,6 +417,8 @@ For the CUDA variant of PyTorch, verify that at the minimum ``cuda_cmake_macros.
Build the FBGEMM_GPU Package
----------------------------

+ .. _fbgemm-gpu.build.prepare:

Preparing the Build
~~~~~~~~~~~~~~~~~~~

@@ -418,7 +430,7 @@ Clone the repo along with its submodules, and install the
# !! Run inside the Conda environment !!
# Select a version tag
- FBGEMM_VERSION=v0.6.0
+ FBGEMM_VERSION=v0.7.0
# Clone the repo along with its submodules
git clone --recursive -b ${FBGEMM_VERSION} https://github.com/pytorch/FBGEMM.git fbgemm_${FBGEMM_VERSION}
@@ -547,6 +559,9 @@ toolchains have been properly installed.
# Specify NVML path
export NVML_LIB_PATH=/path/to/libnvidia-ml.so
+ # Specify NCCL path
+ export nccl_path=/path/to/nccl
# Build for SM70/80 (V100/A100 GPU); update as needed
# If not specified, only the CUDA architecture supported by current system will be targeted
# If not specified and no CUDA device is present either, all CUDA architectures will be targeted
@@ -563,12 +578,16 @@
--python-tag="${python_tag}" \
--plat-name="${python_plat_name}" \
--nvml_lib_path=${NVML_LIB_PATH} \
+ -DNCCL_INCLUDE_DIRS="${nccl_path}/include" \
+ -DNCCL_LIBRARIES="${nccl_path}/lib/libnccl.so.2" \
-DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
# Build and install the library into the Conda environment
python setup.py install \
--package_variant=cuda \
--nvml_lib_path=${NVML_LIB_PATH} \
+ -DNCCL_INCLUDE_DIRS="${nccl_path}/include" \
+ -DNCCL_LIBRARIES="${nccl_path}/lib/libnccl.so.2" \
-DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
.. _fbgemm-gpu.build.process.rocm:
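The `export nccl_path=...` step above needs a real NCCL installation to point at. A minimal sketch for a Conda-managed setup, reusing the verification pattern from earlier in this document:

    # Locate NCCL inside the Conda environment
    nccl_path=$(conda run -n ${env_name} printenv CONDA_PREFIX)
    find "${nccl_path}" -name "libnccl.so*"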
@@ -163,6 +163,32 @@ Follow the instructions in :ref:`fbgemm-gpu.build.setup.pytorch.install`
for installing PyTorch inside a Conda environment.


+ Install Triton
+ --------------
+
+ This section is only applicable to working with the experimental FBGEMM_GPU
+ GenAI module. Triton should already come packaged with the PyTorch
+ installation. This can be verified with:
+
+ .. code:: sh
+
+   conda run -n ${env_name} python -c "import triton"
+
+ If Triton is not available, it can be installed through PyTorch PIP:
+
+ .. code:: sh
+
+   # Most recent version used can be found in the build scripts
+   TRITON_VERSION=3.0.0+45fff310c8
+   conda run -n ${env_name} pip install \
+     --pre pytorch-triton==${TRITON_VERSION} \
+     --index-url https://download.pytorch.org/whl/nightly/
+
+ Information about PyTorch-Triton releases can be found
+ `here <https://github.com/pytorch/pytorch/blob/main/RELEASE.md>`__.
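As a follow-up sanity check, the installed Triton version can be printed (a sketch; `triton.__version__` is a standard attribute but is not referenced in this document):

    conda run -n ${env_name} python -c "import triton; print(triton.__version__)"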


Install the FBGEMM_GPU Package
------------------------------

2 changes: 1 addition & 1 deletion fbgemm_gpu/docs/src/general/documentation/Cpp.rst
@@ -4,7 +4,7 @@ Adding Documentation to C++ Code
--------------------------------

Documentation for C++ is provided through
- `Javadoc-style comments <https://www.oracle.com/technical-resources/articles/java/javadoc-tool.html>`__
+ `Javadoc-style comments <https://www.oracle.com/java/technologies/javase/javadoc-tool.html>`__
and generated using Sphinx, `Doxygen <https://www.doxygen.nl/>`__, and
`Breathe <https://www.breathe-doc.org/>`__.

11 changes: 3 additions & 8 deletions fbgemm_gpu/experimental/example/CMakeLists.txt
@@ -25,12 +25,7 @@ if(FBGEMM_GENAI_ONLY)
${THIRDPARTY}/cpuinfo/include
${THIRDPARTY}/cutlass/include
${THIRDPARTY}/cutlass/tools/util/include
-     ${NCCL_INCLUDE_DIR})
-
-   set(third_party_include_directories
-     ${THIRDPARTY}/asmjit/src
-     ${THIRDPARTY}/cpuinfo/include
-     ${THIRDPARTY}/cutlass/include)
+     ${NCCL_INCLUDE_DIRS})
endif()

set(experimental_example_cpp_source_files
@@ -56,11 +51,11 @@ add_library(fbgemm_gpu_experimental_example_py MODULE

target_include_directories(fbgemm_gpu_experimental_example_py PRIVATE
${TORCH_INCLUDE_DIRS}
-   ${NCCL_INCLUDE_DIR})
+   ${NCCL_INCLUDE_DIRS})

target_link_libraries(fbgemm_gpu_experimental_example_py
${TORCH_LIBRARIES}
-   ${NCCL_LIB_DIR})
+   ${NCCL_LIBRARIES})

# Remove `lib` from the output artifact name
set_target_properties(fbgemm_gpu_experimental_example_py PROPERTIES PREFIX "")
3 changes: 2 additions & 1 deletion fbgemm_gpu/experimental/example/test/triton_example_test.py
@@ -14,7 +14,8 @@
@triton.jit
# fmt: off
def triton_add_kernel(x_ptr, y_ptr, z_ptr, n_elements, BLOCK_SIZE: tl.constexpr) -> None:
-     # fmt: on
+     # fmt: on # noqa E115
+
# We use a 1D launch grid so axis is 0.
pid = tl.program_id(axis=0)

16 changes: 8 additions & 8 deletions fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
@@ -23,7 +23,7 @@ if(FBGEMM_GENAI_ONLY)
${THIRDPARTY}/cpuinfo/include
${THIRDPARTY}/cutlass/include
${THIRDPARTY}/cutlass/tools/util/include
-     ${NCCL_INCLUDE_DIR})
+     ${NCCL_INCLUDE_DIRS})
endif()

set(attention_ops_sources
@@ -35,14 +35,14 @@ set(quantize_ops_sources
src/quantize/quantize.cu
src/quantize/quantize.cpp)

- # set(comm_ops_sources
- #   src/comm/car.cu
- #   src/comm/car.cpp)
+ set(comm_ops_sources
+   src/comm/car.cu
+   src/comm/car.cpp)

set(experimental_gen_ai_cpp_source_files
${attention_ops_sources}
-   ${quantize_ops_sources})
-   # ${comm_ops_sources})
+   ${quantize_ops_sources}
+   ${comm_ops_sources})

set_source_files_properties(${experimental_gen_ai_cpp_source_files}
PROPERTIES INCLUDE_DIRECTORIES
@@ -104,11 +104,11 @@ endif()

target_include_directories(fbgemm_gpu_experimental_gen_ai_py PRIVATE
${TORCH_INCLUDE_DIRS}
-   ${NCCL_INCLUDE_DIR})
+   ${NCCL_INCLUDE_DIRS})

target_link_libraries(fbgemm_gpu_experimental_gen_ai_py
${TORCH_LIBRARIES}
-   ${NCCL_LIB_DIR})
+   ${NCCL_LIBRARIES})

# Remove `lib` from the output artifact name
set_target_properties(fbgemm_gpu_experimental_gen_ai_py PROPERTIES PREFIX "")
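With the comm ops compiled in rather than commented out, a quick smoke test that the experimental module loads (a sketch; the module path is inferred from the directory layout in this diff):

    python -c "import fbgemm_gpu.experimental.gen_ai"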
8 changes: 3 additions & 5 deletions fbgemm_gpu/experimental/gen_ai/src/comm/car.cpp
@@ -16,13 +16,11 @@
#include "c10/cuda/CUDAFunctions.h"
#include "c10/cuda/CUDAStream.h"
#include "c10/util/Optional.h"
#include "folly/futures/Future.h"

#include <ATen/cuda/CUDAEvent.h>
- #include <folly/experimental/symbolizer/SignalHandler.h>

#include <sys/stat.h>
+ #include <torch/csrc/cuda/nccl.h>
#include <unistd.h>
#include <algorithm>
#include <atomic>
#include <cassert>
Expand All @@ -42,8 +40,8 @@ constexpr size_t kMaxNumNcclComms = 3;
static ncclComm_t* get_nccl_comm(int64_t comm_idx) {
static ncclComm_t comms[kMaxNumNcclComms];

-   CHECK_GE(comm_idx, 0);
-   CHECK_LT(comm_idx, kMaxNumNcclComms);
+   TORCH_CHECK_GE(comm_idx, 0);
+   TORCH_CHECK_LT(comm_idx, kMaxNumNcclComms);
return &comms[comm_idx];
}
