[core][pruning][feature] cuSPARSELt build integration #103700

Closed
wants to merge 33 commits
Changes from 24 commits

Commits (33 total)
b551a60
[core][pruning][feature] cuSPARSELt integration
jcaip Jun 15, 2023
2491add
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jun 15, 2023
c6e553c
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jun 22, 2023
a0c22e9
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jun 22, 2023
9dbef1a
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jun 27, 2023
32efc97
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 20, 2023
190c8ae
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 20, 2023
7ec37b6
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 20, 2023
151d70b
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 20, 2023
821076f
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 20, 2023
761e851
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 22, 2023
bcce2d4
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 22, 2023
05241a1
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 24, 2023
a6f5f1c
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 24, 2023
5ccc09a
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 24, 2023
c327a17
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 26, 2023
8a8830b
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 27, 2023
590ea84
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 27, 2023
7affa24
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
51df7ed
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
f824700
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
0f80f29
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
bce9a44
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
4301278
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
efd8867
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
fc4778f
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
b2fc7d4
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Jul 31, 2023
c067e6d
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Aug 1, 2023
4b281da
Update on "[core][pruning][feature] cuSPARSELt integration"
jcaip Aug 1, 2023
50d4648
Update on "[core][pruning][feature] cuSPARSELt build integration"
jcaip Aug 1, 2023
0bfcd8c
Update on "[core][pruning][feature] cuSPARSELt build integration"
jcaip Aug 1, 2023
2a2126d
Update on "[core][pruning][feature] cuSPARSELt build integration"
jcaip Aug 1, 2023
cf4c4d2
Update on "[core][pruning][feature] cuSPARSELt build integration"
jcaip Aug 1, 2023
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -1152,6 +1152,10 @@ if(BUILD_SHARED_LIBS)
${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUDAToolkit.cmake
DESTINATION share/cmake/Caffe2/
COMPONENT dev)
install(FILES
${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake
DESTINATION share/cmake/Caffe2/
COMPONENT dev)

install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
FILE Caffe2Targets.cmake
6 changes: 6 additions & 0 deletions aten/CMakeLists.txt
@@ -93,6 +93,12 @@ else()
set(AT_NNPACK_ENABLED 1)
endif()

if(NOT USE_CUSPARSELT)
set(AT_CUSPARSELT_ENABLED 0)
else()
set(AT_CUSPARSELT_ENABLED 1)
endif()

list(APPEND ATen_CPU_INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/src)
add_subdirectory(src/ATen)
1 change: 1 addition & 0 deletions aten/src/ATen/CMakeLists.txt
@@ -33,6 +33,7 @@ set_bool(AT_BLAS_USE_CBLAS_DOT BLAS_USE_CBLAS_DOT)
set_bool(AT_MAGMA_ENABLED USE_MAGMA)
set_bool(CAFFE2_STATIC_LINK_CUDA_INT CAFFE2_STATIC_LINK_CUDA)
set_bool(AT_CUDNN_ENABLED CAFFE2_USE_CUDNN)
set_bool(AT_CUSPARSELT_ENABLED CAFFE2_USE_CUSPARSELT)

configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
# TODO: Do not generate CUDAConfig.h for ROCm BUILDS
2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CUDAConfig.h.in
@@ -6,8 +6,8 @@
//
// NB: This header MUST NOT be included from other headers; it should
// only be included from C++ files.

#define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@
#define AT_CUSPARSELT_ENABLED() @AT_CUSPARSELT_ENABLED@
#define AT_ROCM_ENABLED() @AT_ROCM_ENABLED@
#define AT_MAGMA_ENABLED() @AT_MAGMA_ENABLED@

8 changes: 8 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -3240,6 +3240,14 @@
dispatch:
CUDA: _sparse_semi_structured_linear

- func: _cslt_compress(Tensor input) -> Tensor
dispatch:
CUDA: _cslt_compress

- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, bool transpose_result=False) -> Tensor
dispatch:
CUDA: _cslt_sparse_mm

- func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor

- func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
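
After codegen these two entries surface as at::_cslt_compress / at::_cslt_sparse_mm in C++ (and torch._cslt_* from Python). A minimal smoke-test sketch, not part of this PR; the helper name, shapes, and 2:4 masking are illustrative, and a build with USE_CUSPARSELT=1 on a GPU cuSPARSELt supports is assumed:

#include <ATen/ATen.h>

// Sketch: compress a 2:4-sparse FP16 matrix, then multiply by a dense matrix.
at::Tensor cslt_demo() {
  auto opts = at::device(at::kCUDA).dtype(at::kHalf);

  // Make a 64 x 128 matrix satisfy 2:4 structured sparsity by zeroing two of
  // every four consecutive elements along each row.
  at::Tensor A = at::randn({64, 128}, opts);
  A.view({64, 32, 4}).narrow(2, 2, 2).zero_();

  at::Tensor A_compressed = at::_cslt_compress(A);  // packed values + metadata
  at::Tensor B = at::randn({128, 256}, opts);

  // bias = nullopt, transpose_result = false; result is approximately A.mm(B).
  return at::_cslt_sparse_mm(A_compressed, B, c10::nullopt, false);
}

Note that _cslt_compress only packs values and metadata; pruning the input to the 2:4 pattern is assumed to have happened beforehand.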
269 changes: 269 additions & 0 deletions aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
@@ -0,0 +1,269 @@
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDADataType.h>
#include <ATen/cuda/CUDASparse.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <c10/core/ScalarType.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/Half.h>
#include <cusparse.h>
#include <cstdint>

#if !AT_CUSPARSELT_ENABLED()

namespace at{
namespace native{
Review comment (Contributor): Space after namespace and use nested namespaces.

Suggested change:
-namespace at{
-namespace native{
+namespace at::native {


at::Tensor _cslt_compress(const Tensor& sparse_input){
TORCH_CHECK(false, "cuSPARSELt not supported on your machine.");
}

at::Tensor _cslt_sparse_mm(
const Tensor& compressed_A,
const Tensor& dense_B,
const c10::optional<Tensor>& bias_opt,
bool transpose_result)
{
TORCH_CHECK(false, "cuSPARSELt not supported on your machine.");
}

} // namespace native
} // namespace at

#else // cuSPARSELt support is available; the real implementations follow.

#include <cusparseLt.h>

namespace at {
namespace native {

cusparseLtHandle_t handle;
bool handle_initialized = false;
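// The cuSPARSELt handle is created lazily on first use and shared by both
// ops below; it is never torn down for the lifetime of the process.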

at::Tensor _cslt_compress(const Tensor& sparse_input)
{
if (!handle_initialized){
TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
handle_initialized = true;
}
// create the sparse-input descriptor; pick dtype and compression factor
cusparseLtMatDescriptor_t sparse_input_descriptor;
cudaDataType type;
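// compression_factor / 16 is the fraction of the dense size occupied by the
// compressed representation: 2:4 pruning keeps 8 of every 16 elements, plus
// roughly 1 element's worth of metadata (2 for int8, adjusted below).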
auto compression_factor = 9;

switch(
sparse_input.scalar_type()
)
{
case at::ScalarType::Char:
type = CUDA_R_8I;
compression_factor = 10;
break;
case at::ScalarType::Half:
type = CUDA_R_16F;
break;
case at::ScalarType::BFloat16:
type = CUDA_R_16BF;
break;
case at::ScalarType::Float:
type = CUDA_R_32F;
break;
default:
TORCH_CHECK(false, "Unsupported dtype for cuSPARSELt compressed matrix");
break;
}

// create a new compressed tensor with the same dtype as the input
auto compressed_tensor = sparse_input.new_empty(sparse_input.numel() * compression_factor / 16);

TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
&handle,
&sparse_input_descriptor,
sparse_input.size(0),
sparse_input.size(1),
sparse_input.size(1),
16,
type,
CUSPARSE_ORDER_ROW,
CUSPARSELT_SPARSITY_50_PERCENT));

// compress input
//--------------------------------------------------------------------------
size_t compressed_size, compressed_buffer_size;
TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2(
&handle,
&sparse_input_descriptor,
&compressed_size,
&compressed_buffer_size));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto compressedBufferPtr = allocator.allocate(compressed_buffer_size);

TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompress2(
&handle,
&sparse_input_descriptor,
true,
CUSPARSE_OPERATION_NON_TRANSPOSE,
sparse_input.data_ptr(),
compressed_tensor.data_ptr(),
compressedBufferPtr.get(),
nullptr));

return compressed_tensor;
}


at::Tensor _cslt_sparse_mm(
const Tensor& compressed_A,
const Tensor& dense_B,
const c10::optional<Tensor>& bias_opt,
bool transpose_result
)
{
if (!handle_initialized){
TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
handle_initialized = true;
}
// cuSPARSELt constructs
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulPlan_t plan;
cusparseLtMatmulAlgSelection_t alg_sel;

float alpha = 1.0;
float beta = 0.0;
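// cuSPARSELt computes D = alpha * op(A) * op(B) + beta * C; with beta == 0
// the C operand (aliased to the output tensor below) does not contribute.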
cudaDataType type;
cusparseComputeType compute_type;
auto compression_factor = 9;

switch(compressed_A.scalar_type())
{
case at::ScalarType::Char:
type = CUDA_R_8I;
compute_type = CUSPARSE_COMPUTE_32I;
compression_factor = 10;
break;
case at::ScalarType::Half:
type = CUDA_R_16F;
compute_type = CUSPARSE_COMPUTE_16F;
break;
case at::ScalarType::BFloat16:
type = CUDA_R_16BF;
compute_type = CUSPARSE_COMPUTE_16F;
break;
case at::ScalarType::Float:
type = CUDA_R_32F;
compute_type = CUSPARSE_COMPUTE_TF32;
break;
default:
TORCH_CHECK(false, "Unsupported dtype for cuSPARSE compressed matrix multiplication.");
break;
}

int64_t k = dense_B.size(0);
int64_t n = dense_B.size(1);
int64_t m = (compressed_A.numel() * 16 / compression_factor ) / k;
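// m is recovered from the compressed payload: _cslt_compress wrote
// m * k * compression_factor / 16 elements, and dense_B supplies k and n.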

// initialize sparse descriptor
cusparseLtMatDescriptor_t sparse_input_descriptor;
TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
&handle,
&sparse_input_descriptor,
m,
k,
k,
16,
type,
CUSPARSE_ORDER_ROW,
CUSPARSELT_SPARSITY_50_PERCENT));

// initialize dense input descriptor
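// If B is non-contiguous (e.g. a transposed view), describe its underlying
// row-major n x k storage here and mark the B operand as transposed in the
// matmul descriptor below.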
cusparseLtMatDescriptor_t dense_input_descriptor;
TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit(
&handle,
&dense_input_descriptor,
(dense_B.is_contiguous()) ? k : n,
(dense_B.is_contiguous()) ? n : k,
(dense_B.is_contiguous()) ? n : k,
16,
type,
CUSPARSE_ORDER_ROW));

// create result tensor
auto res = (transpose_result) ? dense_B.new_empty({n, m})
: dense_B.new_empty({m, n});
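// For a transposed result, allocate n x m storage but describe it to
// cuSPARSELt as an m x n column-major matrix below, avoiding an extra copy.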


cusparseLtMatDescriptor_t res_descriptor;
TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit(
&handle,
&res_descriptor,
m,
n,
(transpose_result) ? m: n,
16,
type,
(transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW));

// initialize matmul
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit(
&handle,
&matmul,
CUSPARSE_OPERATION_NON_TRANSPOSE,
(dense_B.is_contiguous()) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE,
&sparse_input_descriptor,
&dense_input_descriptor,
&res_descriptor,
&res_descriptor,
compute_type));

// set bias pointer for matmul; assign to a local so we can pass its address
if (bias_opt.has_value()) {
auto& bias = bias_opt.value();
void* dBias = bias.data_ptr();
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute(
&handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, sizeof(dBias)));
}

TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit(
&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT));

TORCH_CUDASPARSE_CHECK(
cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel));

size_t workspace_size;
TORCH_CUDASPARSE_CHECK(
cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size));

auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
auto workspacePtr = allocator.allocate(workspace_size);

TORCH_CUDASPARSE_CHECK(cusparseLtMatmul(
&handle,
&plan,
&alpha,
compressed_A.data_ptr(),
dense_B.data_ptr(),
&beta,
res.data_ptr(),
res.data_ptr(),
workspacePtr.get(),
nullptr,
0));


// destroy descriptors
TORCH_CUDASPARSE_CHECK(
cusparseLtMatDescriptorDestroy(&sparse_input_descriptor));
TORCH_CUDASPARSE_CHECK(
cusparseLtMatDescriptorDestroy(&dense_input_descriptor));
TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&res_descriptor));
// destroy plan
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan));
return res;
}

} // namespace native
} // namespace at

#endif
5 changes: 5 additions & 0 deletions caffe2/CMakeLists.txt
@@ -956,6 +956,11 @@ elseif(USE_CUDA)
set(CUDA_LINK_LIBRARIES_KEYWORD)
torch_compile_options(torch_cuda) # see cmake/public/utils.cmake
target_compile_definitions(torch_cuda PRIVATE USE_CUDA)

if(USE_CUSPARSELT)
target_link_libraries(torch_cuda PRIVATE torch::cusparselt)
target_compile_definitions(torch_cuda PRIVATE USE_CUSPARSELT)
endif()
if(USE_NCCL)
target_link_libraries(torch_cuda PRIVATE __caffe2_nccl)
target_compile_definitions(torch_cuda PRIVATE USE_NCCL)
6 changes: 6 additions & 0 deletions cmake/Dependencies.cmake
@@ -38,6 +38,7 @@ if(USE_CUDA)
# public/*.cmake uses CAFFE2_USE_*
set(CAFFE2_USE_CUDA ${USE_CUDA})
set(CAFFE2_USE_CUDNN ${USE_CUDNN})
set(CAFFE2_USE_CUSPARSELT ${USE_CUSPARSELT})
Review comment (Collaborator): If this is a new env variable, please document it at the top of setup.py with the others.

set(CAFFE2_USE_NVRTC ${USE_NVRTC})
set(CAFFE2_USE_TENSORRT ${USE_TENSORRT})
include(${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake)
@@ -57,6 +58,11 @@ if(USE_CUDA)
else()
caffe2_update_option(USE_CUDNN OFF)
endif()
if(CAFFE2_USE_CUSPARSELT)
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS torch::cusparselt)
else()
caffe2_update_option(USE_CUSPARSELT OFF)
endif()
if(CAFFE2_USE_TENSORRT)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::tensorrt)
else()