Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[BE] Delete BUILD_SPLIT_CUDA option #87502

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/actions/test-pytorch-binary/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ runs:
-e BINARY_ENV_FILE \
-e BUILDER_ROOT \
-e BUILD_ENVIRONMENT \
-e BUILD_SPLIT_CUDA \
-e DESIRED_CUDA \
-e DESIRED_DEVTOOLSET \
-e DESIRED_PYTHON \
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/_binary-build-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,6 @@ jobs:
git clean -fxd
working-directory: builder

- name: Set BUILD_SPLIT_CUDA
if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && startsWith(inputs.GPU_ARCH_VERSION, '11') }}
shell: bash
run: |
echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
Expand All @@ -184,7 +179,6 @@ jobs:
-e BINARY_ENV_FILE \
-e BUILDER_ROOT \
-e BUILD_ENVIRONMENT \
-e BUILD_SPLIT_CUDA \
-e DESIRED_CUDA \
-e DESIRED_DEVTOOLSET \
-e DESIRED_PYTHON \
Expand Down
2 changes: 0 additions & 2 deletions .jenkins/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi

if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
# enable split torch_cuda build option in CMake
export BUILD_SPLIT_CUDA=ON
if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then
# TODO: there is a linking issue when building with UCC using clang;
# disable it for now, to be fixed later.
Expand Down
4 changes: 0 additions & 4 deletions .jenkins/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
fi

if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
export BUILD_SPLIT_CUDA=ON
fi

if [[ "$TEST_CONFIG" == *crossref* ]]; then
export PYTORCH_TEST_WITH_CROSSREF=1
fi
Expand Down
4 changes: 0 additions & 4 deletions .jenkins/pytorch/win-test-helpers/build_pytorch.bat
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,6 @@ if "%REBUILD%" == "" (
if not errorlevel 0 exit /b
)
)
:: tests if BUILD_ENVIRONMENT contains cuda11 as a substring
if not x%BUILD_ENVIRONMENT:cuda11=%==x%BUILD_ENVIRONMENT% (
set BUILD_SPLIT_CUDA=ON
)

python setup.py bdist_wheel && sccache --show-stats && python -c "import os, glob; os.system('python -mpip install ' + glob.glob('dist/*.whl')[0] + '[opt-einsum]')" (
if "%BUILD_ENVIRONMENT%"=="" (
Expand Down
4 changes: 0 additions & 4 deletions .jenkins/pytorch/win-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,6 @@ fi

export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers

if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then
export BUILD_SPLIT_CUDA=ON
fi

if [[ "$TEST_CONFIG" = "force_on_cpu" ]]; then
# run the full test suite for force_on_cpu test
export USE_CUDA=0
Expand Down
7 changes: 0 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -187,13 +187,6 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address Sanitizer" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
# BUILD_SPLIT_CUDA must also be exported as an environment variable before building, with
# `export BUILD_SPLIT_CUDA=1` because cpp_extension.py can only work properly if this variable
# also exists in the environment.
# This option is incompatible with CUDA_SEPARABLE_COMPILATION.
cmake_dependent_option(
BUILD_SPLIT_CUDA "Split torch_cuda library into torch_cuda_cu and torch_cuda_cpp" OFF
"USE_CUDA AND NOT CUDA_SEPARABLE_COMPILATION" OFF)
cmake_dependent_option(
BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
Expand Down
6 changes: 0 additions & 6 deletions aten/src/ATen/native/cuda/Bucketization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_torch_cuda_cu_linker_symbol_op_native.h>
#include <ATen/ops/bucketize_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/searchsorted_native.h>
Expand Down Expand Up @@ -191,11 +190,6 @@ Tensor searchsorted_cuda(
return result;
}

// See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml
Tensor _torch_cuda_cu_linker_symbol_op_cuda(const Tensor& self) {
return self;
}

Tensor searchsorted_cuda(
const Tensor& sorted_sequence,
const Scalar& self,
Expand Down
11 changes: 0 additions & 11 deletions aten/src/ATen/native/native_functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9930,17 +9930,6 @@
CPU: searchsorted_cpu
CUDA: searchsorted_cuda

# [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu]
# This is a DUMMY function to force the linking against torch_cuda_cu on Windows.
# Otherwise, the Windows linker will optimize and not include torch_cuda_cu even when we
# want it to be included. This is similar to what we do with warp_size for torch_cuda_cpp,
# described as the solution to this issue: https://github.com/pytorch/pytorch/issues/31611
# This op should NOT be used or exposed or edited or else Windows builds (with BUILD_SPLIT_CUDA) will break.
- func: _torch_cuda_cu_linker_symbol_op(Tensor self) -> Tensor
dispatch:
CUDA: _torch_cuda_cu_linker_symbol_op_cuda
autogen: _torch_cuda_cu_linker_symbol_op.out

- func: searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU: searchsorted_out_cpu
Expand Down
128 changes: 15 additions & 113 deletions caffe2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -883,10 +883,6 @@ file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT})
# Wrapper library for people who link against torch and expect both CPU and CUDA support
# Contains "torch_cpu" and "torch_cuda"
add_library(torch ${DUMMY_EMPTY_FILE})
if(BUILD_SPLIT_CUDA)
# When we split torch_cuda, we want a dummy torch_cuda library that contains both parts
add_library(torch_cuda ${DUMMY_EMPTY_FILE})
endif()
if(HAVE_SOVERSION)
set_target_properties(torch PROPERTIES
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
Expand Down Expand Up @@ -926,37 +922,19 @@ elseif(USE_CUDA)
${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
set_property(TARGET torch_cuda_w_sort_by_key PROPERTY CUDA_SEPARABLE_COMPILATION OFF)
target_link_libraries(torch_cuda PRIVATE torch_cuda_w_sort_by_key)
elseif(BUILD_SPLIT_CUDA)
add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
add_library(torch_cuda_cu ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
else()
add_library(torch_cuda
${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}
${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
endif()
set(CUDA_LINK_LIBRARIES_KEYWORD)
if(BUILD_SPLIT_CUDA)
torch_compile_options(torch_cuda_cpp) # see cmake/public/utils.cmake
torch_compile_options(torch_cuda_cu) # see cmake/public/utils.cmake
target_compile_definitions(torch_cuda_cpp PRIVATE BUILD_SPLIT_CUDA)
target_compile_definitions(torch_cuda_cpp PRIVATE USE_CUDA)
target_compile_definitions(torch_cuda_cu PRIVATE BUILD_SPLIT_CUDA)
target_compile_definitions(torch_cuda_cu PRIVATE USE_CUDA)
else()
torch_compile_options(torch_cuda) # see cmake/public/utils.cmake
target_compile_definitions(torch_cuda PRIVATE USE_CUDA)
endif()
if(USE_NCCL AND BUILD_SPLIT_CUDA)
target_link_libraries(torch_cuda_cpp PRIVATE __caffe2_nccl)
target_compile_definitions(torch_cuda_cpp PRIVATE USE_NCCL)
elseif(USE_NCCL)
torch_compile_options(torch_cuda) # see cmake/public/utils.cmake
target_compile_definitions(torch_cuda PRIVATE USE_CUDA)
if(USE_NCCL)
target_link_libraries(torch_cuda PRIVATE __caffe2_nccl)
target_compile_definitions(torch_cuda PRIVATE USE_NCCL)
endif()
if(USE_UCC AND BUILD_SPLIT_CUDA)
target_link_libraries(torch_cuda_cpp PRIVATE __caffe2_ucc)
target_compile_definitions(torch_cuda_cpp PRIVATE USE_UCC)
elseif(USE_UCC)
if(USE_UCC)
target_link_libraries(torch_cuda PRIVATE __caffe2_ucc)
target_compile_definitions(torch_cuda PRIVATE USE_UCC)
endif()
Expand Down Expand Up @@ -998,13 +976,8 @@ elseif(USE_CUDA)
endif()

if(USE_PRECOMPILED_HEADERS)
if(BUILD_SPLIT_CUDA)
target_precompile_headers(torch_cuda_cpp PRIVATE
"$<$<COMPILE_LANGUAGE:CXX>:ATen/core/ATen_pch.h>")
else()
target_precompile_headers(torch_cuda PRIVATE
"$<$<COMPILE_LANGUAGE:CXX>:ATen/core/ATen_pch.h>")
endif()
target_precompile_headers(torch_cuda PRIVATE
"$<$<COMPILE_LANGUAGE:CXX>:ATen/core/ATen_pch.h>")
endif()
endif()

Expand Down Expand Up @@ -1085,12 +1058,7 @@ if(NOT NO_API)
${TORCH_SRC_DIR}/csrc/api/include)
endif()

if(BUILD_SPLIT_CUDA AND MSVC)
# -INCLUDE is used to ensure torch_cuda_cpp/cu are linked against in a project that relies on them.
target_link_libraries(torch_cuda_cpp INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ")
# See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml
target_link_libraries(torch_cuda_cu INTERFACE "-INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z")
elseif(USE_CUDA AND MSVC)
if(USE_CUDA AND MSVC)
# -INCLUDE is used to ensure torch_cuda is linked against in a project that relies on them.
# Related issue: https://github.com/pytorch/pytorch/issues/31611
target_link_libraries(torch_cuda INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ")
Expand Down Expand Up @@ -1320,27 +1288,16 @@ if(USE_DISTRIBUTED)
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
if(BUILD_SPLIT_CUDA)
target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_UCC)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
if(BUILD_SPLIT_CUDA)
target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_NCCL)
if(USE_NCCL_WITH_UCC)
target_compile_definitions(torch_cuda_cpp PUBLIC USE_NCCL_WITH_UCC)
endif()
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
if(USE_NCCL_WITH_UCC)
target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC)
endif()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
if(USE_NCCL_WITH_UCC)
target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC)
endif()
endif()
endif()
Expand Down Expand Up @@ -1423,14 +1380,7 @@ torch_set_target_props(torch_cpu)


target_compile_options(torch_cpu PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
if(BUILD_SPLIT_CUDA)
target_compile_options(torch_cuda_cu PRIVATE "-DTORCH_CUDA_CU_BUILD_MAIN_LIB")
target_compile_options(torch_cuda_cpp PRIVATE "-DTORCH_CUDA_CPP_BUILD_MAIN_LIB")
# NB: This must be target_compile_definitions, not target_compile_options,
# as the latter is not respected by nvcc
target_compile_definitions(torch_cuda_cu PRIVATE "-DTORCH_CUDA_CU_BUILD_MAIN_LIB")
target_compile_definitions(torch_cuda_cpp PRIVATE "-DTORCH_CUDA_CPP_BUILD_MAIN_LIB")
elseif(USE_CUDA)
if(USE_CUDA)
target_compile_options(torch_cuda PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
# NB: This must be target_compile_definitions, not target_compile_options,
# as the latter is not respected by nvcc
Expand All @@ -1441,10 +1391,7 @@ elseif(USE_ROCM)
endif()

if(USE_EXPERIMENTAL_CUDNN_V8_API)
if(BUILD_SPLIT_CUDA)
target_compile_definitions(torch_cuda_cu PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API")
target_compile_definitions(torch_cuda_cpp PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API")
elseif(USE_CUDA)
if(USE_CUDA)
target_compile_definitions(torch_cuda PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API")
endif()
endif()
Expand Down Expand Up @@ -1534,10 +1481,6 @@ caffe2_interface_library(torch_cpu torch_cpu_library)

if(USE_CUDA)
caffe2_interface_library(torch_cuda torch_cuda_library)
if(BUILD_SPLIT_CUDA)
caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library)
caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library)
endif()
elseif(USE_ROCM)
caffe2_interface_library(torch_hip torch_hip_library)
endif()
Expand All @@ -1548,10 +1491,6 @@ install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets DESTINATION "${

if(USE_CUDA)
install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
if(BUILD_SPLIT_CUDA)
install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endif()
elseif(USE_ROCM)
install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endif()
Expand All @@ -1561,11 +1500,6 @@ target_link_libraries(torch PUBLIC torch_cpu_library)

if(USE_CUDA)
target_link_libraries(torch PUBLIC torch_cuda_library)
if(BUILD_SPLIT_CUDA)
# NS: Library order is important here to prevent cudnn double linking
target_link_libraries(torch_cuda PUBLIC torch_cuda_cpp_library)
target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library)
endif()
elseif(USE_ROCM)
target_link_libraries(torch PUBLIC torch_hip_library)
endif()
Expand All @@ -1578,47 +1512,15 @@ endif()
# Install PDB files for MSVC builds
if(MSVC AND BUILD_SHARED_LIBS)
install(FILES $<TARGET_PDB_FILE:torch_cpu> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
if(BUILD_SPLIT_CUDA)
install(FILES $<TARGET_PDB_FILE:torch_cuda_cu> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
install(FILES $<TARGET_PDB_FILE:torch_cuda_cpp> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
elseif(USE_CUDA)
if(USE_CUDA)
install(FILES $<TARGET_PDB_FILE:torch_cuda> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
elseif(USE_ROCM)
install(FILES $<TARGET_PDB_FILE:torch_hip> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
endif()
endif()

# ---[ CUDA library.
if(BUILD_SPLIT_CUDA)
target_link_libraries(torch_cuda_cu INTERFACE torch::cudart)
target_link_libraries(torch_cuda_cpp INTERFACE torch::cudart)
target_link_libraries(torch_cuda_cu PUBLIC c10_cuda torch::nvtoolsext)
target_link_libraries(torch_cuda_cpp PUBLIC c10_cuda torch::nvtoolsext)

target_include_directories(
torch_cuda_cu INTERFACE $<INSTALL_INTERFACE:include>)
target_include_directories(
torch_cuda_cpp INTERFACE $<INSTALL_INTERFACE:include>)
target_include_directories(
torch_cuda_cu PRIVATE ${Caffe2_GPU_INCLUDE})
target_include_directories(
torch_cuda_cpp PRIVATE ${Caffe2_GPU_INCLUDE})
target_link_libraries(
torch_cuda_cu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
target_link_libraries(
torch_cuda_cpp PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
target_link_libraries(torch_cuda_cu PRIVATE torch_cuda_cpp)
if(USE_CUDNN)
target_link_libraries(
torch_cuda_cpp PRIVATE caffe2::cudnn-private)
endif()

# These public dependencies must go after the previous dependencies, as the
# order of the libraries in the linker call matters here when statically
# linking; libculibos and cublas must be last.
target_link_libraries(torch_cuda_cpp PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
target_link_libraries(torch_cuda_cu PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
elseif(USE_CUDA)
if(USE_CUDA)
target_link_libraries(torch_cuda INTERFACE torch::cudart)
target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@
("aten::nested_to_padded_tensor", datetime.date(2022, 10, 1)),
("aten::nested_tensor", datetime.date(2022, 10, 15)),
("aten::_nested_tensor_layer_norm", datetime.date(2022, 10, 15)),
("aten::_torch_cuda_cu_linker_symbol_op", datetime.date(2022, 11, 1)),

]

Expand Down
4 changes: 1 addition & 3 deletions torch/csrc/jit/codegen/cuda/nvfuser.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
if(BUILD_SPLIT_CUDA)
set(TORCHLIB_FLAVOR torch_cuda_cu) # chose torch_cuda_cu here since JIT is in torch_cuda_cpp
elseif(USE_CUDA)
if(USE_CUDA)
set(TORCHLIB_FLAVOR torch_cuda)
elseif(USE_ROCM)
set(TORCHLIB_FLAVOR torch_hip)
Expand Down
Loading