Enable NCCL code (pytorch#2631)
Summary:
- Enable NCCL code and tests for multi-GPU CAR

Differential Revision: D58147817

Pulled By: q10
q10 committed Jun 4, 2024
1 parent 7d4b51e commit 7b13182
Showing 17 changed files with 93 additions and 58 deletions.
6 changes: 4 additions & 2 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -123,8 +123,10 @@ __configure_fbgemm_gpu_build_nvcc () {
build_args+=(
# Override CMake configuration
-DCMAKE_CXX_STANDARD="${cppstd_ver}"
-    -DNCCL_INCLUDE_DIR=${nccl_path}/include
-    -DNCCL_LIB_DIR=${nccl_path}/lib
+    -DNCCL_INCLUDE_DIRS=${nccl_path}/include
+    -DNCCL_LIBRARIES=${nccl_lib}
+    -DCMAKE_C_FLAGS="'-L${nccl_path}/lib'"
+    -DCMAKE_CXX_FLAGS="'-L${nccl_path}/lib'"
)
}

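For context, a minimal sketch of the CMake configure step these flags produce, assuming `nccl_path` points at an NCCL installation and `nccl_lib` at its shared library (both values below are hypothetical placeholders, not taken from this diff):

    # Hypothetical paths; substitute your actual NCCL installation
    nccl_path="${CONDA_PREFIX}"
    nccl_lib="${nccl_path}/lib/libnccl.so.2"

    cmake -B build \
        -DCMAKE_CXX_STANDARD=17 \
        -DNCCL_INCLUDE_DIRS="${nccl_path}/include" \
        -DNCCL_LIBRARIES="${nccl_lib}" \
        -DCMAKE_C_FLAGS="-L${nccl_path}/lib" \
        -DCMAKE_CXX_FLAGS="-L${nccl_path}/lib"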
2 changes: 1 addition & 1 deletion .github/scripts/fbgemm_gpu_install.bash
@@ -99,7 +99,7 @@ install_fbgemm_gpu_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env 0.5.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env release cuda 12.4.1 # Install the CUDA variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env test/0.6.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env test/0.7.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env nightly rocm 5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
10 changes: 2 additions & 8 deletions .github/scripts/fbgemm_gpu_test.bash
@@ -70,16 +70,12 @@ __configure_fbgemm_gpu_test_cpu () {
# These tests have non-CPU operators referenced in @given
./uvm/copy_test.py
./uvm/uvm_test.py
- # require multiple GPUs
- ./comm/multi_gpu_car_test.py
)
}

__configure_fbgemm_gpu_test_cuda () {
ignored_tests=(
./tbe/ssd/ssd_split_table_batched_embeddings_test.py
- # require multiple GPUs
- ./comm/multi_gpu_car_test.py
)
}

@@ -105,8 +101,6 @@ __configure_fbgemm_gpu_test_rocm () {
./tbe/ssd/ssd_split_table_batched_embeddings_test.py
# https://github.com/pytorch/FBGEMM/issues/1559
./batched_unary_embeddings_test.py
- # require multiple GPUs
- ./comm/multi_gpu_car_test.py
)
}

@@ -250,7 +244,7 @@ test_setup_conda_environment () {
if [ "$pytorch_variant_type" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME COMPILER PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_CHANNEL[/VERSION] PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.6.0 cuda 12.1.0 # Setup environment with pytorch-test 0.6.0 for Clang + Python 3.12 + CUDA 12.1.0"
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.7.0 cuda 12.1.0 # Setup environment with pytorch-test 0.7.0 for Clang + Python 3.12 + CUDA 12.1.0"
return 1
else
echo "################################################################################"
@@ -332,7 +326,7 @@ test_fbgemm_gpu_setup_and_pip_install () {
if [ "$fbgemm_gpu_channel_version" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_CHANNEL[/VERSION] FBGEMM_GPU_CHANNEL[/VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} test_env cpu test/2.2.0 test/0.6.0 # Run tests against all Python versions with PyTorch test/2.2.0 and FBGEMM_GPU test/0.6.0 (CPU-only)"
echo " ${FUNCNAME[0]} test_env cpu test/2.2.0 test/0.7.0 # Run tests against all Python versions with PyTorch test/2.2.0 and FBGEMM_GPU test/0.7.0 (CPU-only)"
echo " ${FUNCNAME[0]} test_env cuda test/2.3.0 test/0.7.0 # Run tests against all Python versions with PyTorch test/2.3.0 and FBGEMM_GPU test/0.7.0 (all CUDA versions)"
return 1
else
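With ./comm/multi_gpu_car_test.py removed from the ignored-tests lists above, the multi-GPU CAR test now runs as part of the standard test sweep. A sketch of invoking it directly, assuming a repository checkout and at least two visible GPUs:

    # Requires >= 2 GPUs; run from the fbgemm_gpu/test directory
    cd fbgemm_gpu/test
    python -m pytest -v ./comm/multi_gpu_car_test.py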
6 changes: 3 additions & 3 deletions .github/scripts/utils_pip.bash
@@ -176,11 +176,11 @@ install_from_pytorch_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env torch release cpu # Install the CPU variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.6.0rc0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.7.0rc0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton 1.11.0 # Install specific version from release channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton release # Install latest version from release channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.6.0rc0 # Install specific version from test channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.7.0rc0 # Install specific version from test channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton_rocm nightly # Install latest version from nightly channel"
return 1
else
@@ -233,7 +233,7 @@ download_from_pytorch_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env torch release cpu # Download the CPU variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.6.0rc0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.7.0rc0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/5.3 # Download the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
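The updated usage strings track the 0.7.0 release cycle. For reference, a hedged sketch of the pip invocation such a helper would construct (the index URL follows PyTorch's channel layout and is an assumption, not taken from this diff):

    # Install the CUDA 12.1 variant of fbgemm-gpu 0.7.0rc0 from the test channel
    pip install --pre fbgemm-gpu==0.7.0rc0 \
        --index-url https://download.pytorch.org/whl/test/cu121/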
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
@@ -31,7 +31,7 @@ on:
required: true
default: "nightly"
fbgemm_gpu_channel_version:
- description: FBGEMM-GPU Channel + Version (e.g. '0.5.0', 'nightly', 'test/0.6.0r0')
+ description: FBGEMM-GPU Channel + Version (e.g. '0.5.0', 'nightly', 'test/0.7.0r0')
type: string
required: true
default: "nightly"
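Assuming `fbgemm_gpu_channel_version` sits under a workflow_dispatch trigger, as the on: block above suggests, the workflow can be run with the new channel string via the GitHub CLI (a sketch; only the workflow file name and input name are taken from this diff):

    gh workflow run fbgemm_gpu_pip.yml \
        -f fbgemm_gpu_channel_version=test/0.7.0rc0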
6 changes: 3 additions & 3 deletions cmake/modules/CudaSetup.cmake
@@ -12,10 +12,10 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
################################################################################

BLOCK_PRINT(
"NCCL flags"
"NCCL Flags"
""
"NCCL_INCLUDE_DIR=${NCCL_INCLUDE_DIR}"
"NCCL_LIB_DIR=${NCCL_LIB_DIR}"
"NCCL_INCLUDE_DIRS=${NCCL_INCLUDE_DIRS}"
"NCCL_LIBRARIES=${NCCL_LIBRARIES}"
)

# Set NVML_LIB_PATH if provided, or detect the default lib path
8 changes: 8 additions & 0 deletions cmake/modules/PyTorchSetup.cmake
@@ -13,6 +13,14 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)

find_package(Torch REQUIRED)

+ BLOCK_PRINT(
+   "PyTorch Flags"
+   ""
+   "TORCH_INCLUDE_DIRS=${TORCH_INCLUDE_DIRS}"
+   ""
+   "TORCH_LIBRARIES=${TORCH_LIBRARIES}"
+ )

#
# PyTorch CUDA Extensions are normally compiled with the flags below. However we
# disabled -D__CUDA_NO_HALF_CONVERSIONS__ here as it caused "error: no suitable
6 changes: 3 additions & 3 deletions fbgemm_gpu/FbgemmGpu.cmake
@@ -31,7 +31,7 @@ set(fbgemm_sources_include_directories
${THIRDPARTY}/cpuinfo/include
${THIRDPARTY}/cutlass/include
${THIRDPARTY}/cutlass/tools/util/include
-   ${NCCL_INCLUDE_DIR})
+   ${NCCL_INCLUDE_DIRS})


################################################################################
@@ -673,15 +673,15 @@ endif()
# Add PyTorch include/
target_include_directories(fbgemm_gpu_py PRIVATE
${TORCH_INCLUDE_DIRS}
-   ${NCCL_INCLUDE_DIR})
+   ${NCCL_INCLUDE_DIRS})

# Remove `lib` from the output artifact name `libfbgemm_gpu_py.so`
set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "")

# Link to PyTorch
target_link_libraries(fbgemm_gpu_py
${TORCH_LIBRARIES}
-   ${NCCL_LIB_DIR})
+   ${NCCL_LIBRARIES})

# Link to NVML
if(NVML_LIB_PATH)
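Once the library is built, its NCCL linkage can be sanity-checked on the output artifact (a sketch; the artifact name `fbgemm_gpu_py.so` follows from the PREFIX setting above):

    # Confirm the built extension resolves its NCCL dependency
    ldd fbgemm_gpu_py.so | grep -i nccl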
4 changes: 2 additions & 2 deletions fbgemm_gpu/docs/src/conf.py
@@ -28,10 +28,10 @@
author = "FBGEMM Team"

# The short X.Y version.
version = "0.6"
version = "0.7"

# The full version, including alpha/beta/rc tags
release = "0.6.0"
release = "0.7.0"


# -- Path setup --------------------------------------------------------------
23 changes: 21 additions & 2 deletions fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst
@@ -115,14 +115,16 @@ Install the full CUDA package through Conda, which includes
# Install the full CUDA package
conda install -n ${env_name} -y cuda -c "nvidia/label/cuda-${cuda_version}"
- Verify that ``cuda_runtime.h`` and ``libnvidia-ml.so`` are found:
+ Verify that ``cuda_runtime.h``, ``libnvidia-ml.so``, and ``libnccl.so*`` are
+ found:

.. code:: sh
conda_prefix=$(conda run -n ${env_name} printenv CONDA_PREFIX)
find "${conda_prefix}" -name cuda_runtime.h
find "${conda_prefix}" -name libnvidia-ml.so
find "${conda_prefix}" -name libnccl.so*
Install cuDNN
~~~~~~~~~~~~~
@@ -141,6 +143,14 @@ cuDNN package for the given CUDA version:
wget -q "${cudnn_url}" -O cudnn.tar.xz
tar -xvf cudnn.tar.xz
+ Install CUTLASS
+ ~~~~~~~~~~~~~~~
+
+ This section is only applicable to building the experimental FBGEMM_GPU GenAI
+ module. CUTLASS should already be available in the repository as a git
+ submodule (see :ref:`fbgemm-gpu.build.prepare`). The following include paths
+ are already added to the CMake configuration:
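For reference, the include paths in question, as they appear in the experimental CMakeLists.txt hunks later in this commit:

    ${THIRDPARTY}/cutlass/include
    ${THIRDPARTY}/cutlass/tools/util/include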


Set Up for ROCm Build
---------------------
@@ -407,6 +417,8 @@ For the CUDA variant of PyTorch, verify that at the minimum ``cuda_cmake_macros.
Build the FBGEMM_GPU Package
----------------------------

+ .. _fbgemm-gpu.build.prepare:

Preparing the Build
~~~~~~~~~~~~~~~~~~~

@@ -418,7 +430,7 @@ Clone the repo along with its submodules, and install the
# !! Run inside the Conda environment !!
# Select a version tag
- FBGEMM_VERSION=v0.6.0
+ FBGEMM_VERSION=v0.7.0
# Clone the repo along with its submodules
git clone --recursive -b ${FBGEMM_VERSION} https://github.com/pytorch/FBGEMM.git fbgemm_${FBGEMM_VERSION}
@@ -547,6 +559,9 @@ toolchains have been properly installed.
# Specify NVML path
export NVML_LIB_PATH=/path/to/libnvidia-ml.so
+ # Specify NCCL path
+ export nccl_path=/path/to/nccl
# Build for SM70/80 (V100/A100 GPU); update as needed
# If not specified, only the CUDA architecture supported by current system will be targeted
# If not specified and no CUDA device is present either, all CUDA architectures will be targeted
@@ -563,12 +578,16 @@
--python-tag="${python_tag}" \
--plat-name="${python_plat_name}" \
--nvml_lib_path=${NVML_LIB_PATH} \
+ -DNCCL_INCLUDE_DIRS="${nccl_path}/include" \
+ -DNCCL_LIBRARIES="${nccl_path}/lib/libnccl.so.2" \
-DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
# Build and install the library into the Conda environment
python setup.py install \
--package_variant=cuda \
--nvml_lib_path=${NVML_LIB_PATH} \
+ -DNCCL_INCLUDE_DIRS="${nccl_path}/include" \
+ -DNCCL_LIBRARIES="${nccl_path}/lib/libnccl.so.2" \
-DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
.. _fbgemm-gpu.build.process.rocm:
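The `export nccl_path=...` step above needs a real NCCL installation to point at. A minimal sketch for a Conda-managed setup, reusing the verification pattern from earlier in this document:

    # Locate NCCL inside the Conda environment
    nccl_path=$(conda run -n ${env_name} printenv CONDA_PREFIX)
    find "${nccl_path}" -name "libnccl.so*"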
@@ -163,6 +163,32 @@ Follow the instructions in :ref:`fbgemm-gpu.build.setup.pytorch.install`
for installing PyTorch inside a Conda environment.


+ Install Triton
+ --------------
+
+ This section is only applicable to working with the experimental FBGEMM_GPU
+ GenAI module. Triton should already come packaged with the PyTorch
+ installation. This can be verified with:
+
+ .. code:: sh
+
+   conda run -n ${env_name} python -c "import triton"
+
+ If Triton is not available, it can be installed through PyTorch PIP:
+
+ .. code:: sh
+
+   # Most recent version used can be found in the build scripts
+   TRITON_VERSION=3.0.0+45fff310c8
+   conda run -n ${env_name} pip install \
+     --pre pytorch-triton==${TRITON_VERSION} \
+     --index-url https://download.pytorch.org/whl/nightly/
+
+ Information about PyTorch-Triton releases can be found
+ `here <https://github.com/pytorch/pytorch/blob/main/RELEASE.md>`__.
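As a follow-up sanity check, the installed Triton version can be printed (a sketch; `triton.__version__` is a standard attribute but is not referenced in this document):

    conda run -n ${env_name} python -c "import triton; print(triton.__version__)"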


Install the FBGEMM_GPU Package
------------------------------

2 changes: 1 addition & 1 deletion fbgemm_gpu/docs/src/general/documentation/Cpp.rst
@@ -4,7 +4,7 @@ Adding Documentation to C++ Code
--------------------------------

Documentation for C++ is provided through
- `Javadoc-style comments <https://www.oracle.com/technical-resources/articles/java/javadoc-tool.html>`__
+ `Javadoc-style comments <https://www.oracle.com/java/technologies/javase/javadoc-tool.html>`__
and generated using Sphinx, `Doxygen <https://www.doxygen.nl/>`__, and
`Breathe <https://www.breathe-doc.org/>`__.

11 changes: 3 additions & 8 deletions fbgemm_gpu/experimental/example/CMakeLists.txt
@@ -25,12 +25,7 @@ if(FBGEMM_GENAI_ONLY)
${THIRDPARTY}/cpuinfo/include
${THIRDPARTY}/cutlass/include
${THIRDPARTY}/cutlass/tools/util/include
-     ${NCCL_INCLUDE_DIR})
-
-   set(third_party_include_directories
-     ${THIRDPARTY}/asmjit/src
-     ${THIRDPARTY}/cpuinfo/include
-     ${THIRDPARTY}/cutlass/include)
+     ${NCCL_INCLUDE_DIRS})
endif()

set(experimental_example_cpp_source_files
@@ -56,11 +51,11 @@ add_library(fbgemm_gpu_experimental_example_py MODULE

target_include_directories(fbgemm_gpu_experimental_example_py PRIVATE
${TORCH_INCLUDE_DIRS}
-   ${NCCL_INCLUDE_DIR})
+   ${NCCL_INCLUDE_DIRS})

target_link_libraries(fbgemm_gpu_experimental_example_py
${TORCH_LIBRARIES}
-   ${NCCL_LIB_DIR})
+   ${NCCL_LIBRARIES})

# Remove `lib` from the output artifact name
set_target_properties(fbgemm_gpu_experimental_example_py PROPERTIES PREFIX "")
3 changes: 2 additions & 1 deletion fbgemm_gpu/experimental/example/test/triton_example_test.py
@@ -14,7 +14,8 @@
@triton.jit
# fmt: off
def triton_add_kernel(x_ptr, y_ptr, z_ptr, n_elements, BLOCK_SIZE: tl.constexpr) -> None:
-     # fmt: on
+     # fmt: on # noqa E115
+
# We use a 1D launch grid so axis is 0.
pid = tl.program_id(axis=0)

16 changes: 8 additions & 8 deletions fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
@@ -23,7 +23,7 @@ if(FBGEMM_GENAI_ONLY)
${THIRDPARTY}/cpuinfo/include
${THIRDPARTY}/cutlass/include
${THIRDPARTY}/cutlass/tools/util/include
-     ${NCCL_INCLUDE_DIR})
+     ${NCCL_INCLUDE_DIRS})
endif()

set(attention_ops_sources
@@ -35,14 +35,14 @@ set(quantize_ops_sources
src/quantize/quantize.cu
src/quantize/quantize.cpp)

- # set(comm_ops_sources
- #   src/comm/car.cu
- #   src/comm/car.cpp)
+ set(comm_ops_sources
+   src/comm/car.cu
+   src/comm/car.cpp)

set(experimental_gen_ai_cpp_source_files
${attention_ops_sources}
-   ${quantize_ops_sources})
-   # ${comm_ops_sources})
+   ${quantize_ops_sources}
+   ${comm_ops_sources})

set_source_files_properties(${experimental_gen_ai_cpp_source_files}
PROPERTIES INCLUDE_DIRECTORIES
@@ -104,11 +104,11 @@ endif()

target_include_directories(fbgemm_gpu_experimental_gen_ai_py PRIVATE
${TORCH_INCLUDE_DIRS}
-   ${NCCL_INCLUDE_DIR})
+   ${NCCL_INCLUDE_DIRS})

target_link_libraries(fbgemm_gpu_experimental_gen_ai_py
${TORCH_LIBRARIES}
-   ${NCCL_LIB_DIR})
+   ${NCCL_LIBRARIES})

# Remove `lib` from the output artifact name
set_target_properties(fbgemm_gpu_experimental_gen_ai_py PROPERTIES PREFIX "")
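With the comm ops compiled in rather than commented out, a quick smoke test that the experimental module loads (a sketch; the module path is inferred from the directory layout in this diff):

    python -c "import fbgemm_gpu.experimental.gen_ai"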
8 changes: 3 additions & 5 deletions fbgemm_gpu/experimental/gen_ai/src/comm/car.cpp
@@ -16,13 +16,11 @@
#include "c10/cuda/CUDAFunctions.h"
#include "c10/cuda/CUDAStream.h"
#include "c10/util/Optional.h"
#include "folly/futures/Future.h"

#include <ATen/cuda/CUDAEvent.h>
- #include <folly/experimental/symbolizer/SignalHandler.h>

#include <sys/stat.h>
+ #include <torch/csrc/cuda/nccl.h>
#include <unistd.h>
#include <algorithm>
#include <atomic>
#include <cassert>
Expand All @@ -42,8 +40,8 @@ constexpr size_t kMaxNumNcclComms = 3;
static ncclComm_t* get_nccl_comm(int64_t comm_idx) {
static ncclComm_t comms[kMaxNumNcclComms];

-   CHECK_GE(comm_idx, 0);
-   CHECK_LT(comm_idx, kMaxNumNcclComms);
+   TORCH_CHECK_GE(comm_idx, 0);
+   TORCH_CHECK_LT(comm_idx, kMaxNumNcclComms);
return &comms[comm_idx];
}
