Skip to content

Commit

Permalink
[core][pruning][feature] cuSPARSELt integration
Browse files Browse the repository at this point in the history
Summary:

This PR integrates cuSPARSELt v0.4.0.7 into PyTorch.

It is comprised of three parts:
1. Modifications to the build process to add a USE_CUSPARSELT flag,
   which currently defaults to 0.

CUSPARSELT_ROOT also needs to be set to the location of the cusparselt
library.

Compiling pytorch with cusparselt support can be done as follows

```
USE_CUSPARSELT=1
CUSPARSELT_ROOT=/path/to/cusparselt

python setup.py develop
```

2. It adds in two new private ops:
```
_cslt_compress()
_cslt_sparse_mm()
```

_cslt_compress is an op that returns the compressed matrix given a
sparse matrix that is passed in.

_cslt_sparse_mm is an op that expects a compressed matrix (the result of
_cslt_compress) and a dense matrix and performs sparse-dense matmul

These ops will throw runtime errors if cusparselt is not present.

3. Updates to the semi-structured tensor subclass to use the new
   cuSPARSELt ops.

Test Plan:
```
python test/test_sparse_semi_structured.py
```

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 20c7a3ab256d59bd77c9ae4394c6b80d5089d670
Pull Request resolved: #103700
  • Loading branch information
jcaip committed Jul 31, 2023
1 parent 0a0abd0 commit ce13e18
Show file tree
Hide file tree
Showing 14 changed files with 538 additions and 82 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,10 @@ if(BUILD_SHARED_LIBS)
${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUDAToolkit.cmake
DESTINATION share/cmake/Caffe2/
COMPONENT dev)
# Ship the FindCUSPARSELT.cmake module into share/cmake/Caffe2/ as part of the
# "dev" install component, mirroring the FindCUDAToolkit.cmake install above.
install(FILES
${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake
DESTINATION share/cmake/Caffe2/
COMPONENT dev)

install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
FILE Caffe2Targets.cmake
Expand Down
6 changes: 6 additions & 0 deletions aten/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ else()
set(AT_NNPACK_ENABLED 1)
endif()

# Translate the USE_CUSPARSELT build option into a 0/1 AT_CUSPARSELT_ENABLED
# variable, following the same pattern as the AT_NNPACK_ENABLED block above.
if(NOT USE_CUSPARSELT)
set(AT_CUSPARSELT_ENABLED 0)
else()
set(AT_CUSPARSELT_ENABLED 1)
endif()

list(APPEND ATen_CPU_INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/src)
add_subdirectory(src/ATen)
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ set_bool(AT_BLAS_USE_CBLAS_DOT BLAS_USE_CBLAS_DOT)
set_bool(AT_MAGMA_ENABLED USE_MAGMA)
set_bool(CAFFE2_STATIC_LINK_CUDA_INT CAFFE2_STATIC_LINK_CUDA)
set_bool(AT_CUDNN_ENABLED CAFFE2_USE_CUDNN)
set_bool(AT_CUSPARSELT_ENABLED CAFFE2_USE_CUSPARSELT)

configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
# TODO: Do not generate CUDAConfig.h for ROCm BUILDS
Expand Down
2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CUDAConfig.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
//
// NB: This header MUST NOT be included from other headers; it should
// only be included from C++ files.

#define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@
#define AT_CUSPARSELT_ENABLED() @AT_CUSPARSELT_ENABLED@
#define AT_ROCM_ENABLED() @AT_ROCM_ENABLED@
#define AT_MAGMA_ENABLED() @AT_MAGMA_ENABLED@

Expand Down
8 changes: 8 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3240,6 +3240,14 @@
dispatch:
CUDA: _sparse_semi_structured_linear

# Private op: takes one tensor and returns one tensor. Only a CUDA kernel is
# registered, so there is no implementation for other backends.
- func: _cslt_compress(Tensor input) -> Tensor
dispatch:
CUDA: _cslt_compress

# Private op: multiplies a compressed matrix by a dense matrix, with an
# optional bias and an optional transposed result. Only a CUDA kernel is
# registered.
- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, bool transpose_result=False) -> Tensor
dispatch:
CUDA: _cslt_sparse_mm

- func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor

- func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
Expand Down
269 changes: 269 additions & 0 deletions aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDADataType.h>
#include <ATen/cuda/CUDASparse.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <c10/core/ScalarType.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/Half.h>
#include <cusparse.h>
#include <cstdint>

#if !AT_CUSPARSELT_ENABLED()

namespace at{
namespace native{

// Fallback for builds where AT_CUSPARSELT_ENABLED() is 0.
// The op stays registered but unconditionally raises when called.
at::Tensor _cslt_compress(const Tensor& sparse_input)
{
  TORCH_CHECK(false, "cuSPARSELT not supported on your machine.");
}

// Fallback for builds where AT_CUSPARSELT_ENABLED() is 0.
// All arguments are ignored; the call unconditionally raises.
at::Tensor _cslt_sparse_mm(
    const Tensor& compressed_A,
    const Tensor& dense_B,
    const c10::optional<Tensor>& bias_opt,
    bool transpose_result)
{
  TORCH_CHECK(false, "cuSPARSELT not supported on your machine.");
}

} // namespace native
} //namespace at

#else // cuSPARSELt support is enabled: real implementations follow.

#include <cusparseLt.h>

namespace at {
namespace native {

// Library handle shared by _cslt_compress and _cslt_sparse_mm. It is created
// lazily: whichever op runs first calls cusparseLtInit and sets the flag below.
// NOTE(review): both variables have external linkage and the lazy-init check
// is not guarded by any lock in this file -- confirm whether these ops can be
// reached concurrently from multiple threads.
cusparseLtHandle_t handle;
// True once `handle` has been initialized with cusparseLtInit.
bool handle_initialized = false;

// Compresses a 50%-structured-sparse matrix into cuSPARSELt's packed format.
//
// sparse_input: matrix whose dtype is int8, half, bfloat16 or float; its
//               first two sizes are used as rows/cols of a row-major
//               descriptor with leading dimension equal to the column count.
// Returns a 1-D tensor of the same dtype holding
// numel * 9 / 16 elements (numel * 10 / 16 for int8), filled by
// cusparseLtSpMMACompress2.
// Raises if the dtype is unsupported or any cuSPARSELt call fails.
at::Tensor _cslt_compress(const Tensor& sparse_input)
{
  // Lazily create the shared library handle on first use.
  if (!handle_initialized){
    TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
    handle_initialized = true;
  }

  // Map the tensor dtype to the CUDA data type and pick the ratio of
  // compressed elements to dense elements (in sixteenths).
  cusparseLtMatDescriptor_t sparse_input_descriptor;
  cudaDataType type;
  auto compression_factor = 9;

  switch(sparse_input.scalar_type())
  {
    case at::ScalarType::Char:
      type = CUDA_R_8I;
      compression_factor = 10;
      break;
    case at::ScalarType::Half:
      type = CUDA_R_16F;
      break;
    case at::ScalarType::BFloat16:
      type = CUDA_R_16BF;
      break;
    case at::ScalarType::Float:
      type = CUDA_R_32F;
      break;
    default:
      TORCH_CHECK(false, "Unsupported dtype for cuSPARSELt compressed matrix");
      break;
  }

  // Output buffer: same dtype/device as the input, sized by the
  // compression factor selected above.
  auto compressed_tensor = sparse_input.new_empty(sparse_input.numel() * compression_factor / 16);

  TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
      &handle,
      &sparse_input_descriptor,
      sparse_input.size(0),
      sparse_input.size(1),
      sparse_input.size(1),
      16,
      type,
      CUSPARSE_ORDER_ROW,
      CUSPARSELT_SPARSITY_50_PERCENT));

  // Query how much scratch space the compression routine needs.
  // compressed_size is a required output of the query; only the scratch
  // buffer size is consumed here.
  size_t compressed_size, compressed_buffer_size;
  TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompressedSize2(
      &handle,
      &sparse_input_descriptor,
      &compressed_size,
      &compressed_buffer_size));

  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
  auto compressedBufferPtr = allocator.allocate(compressed_buffer_size);

  // Enqueue on PyTorch's current stream rather than the legacy default
  // stream, so the work is ordered with the producers/consumers of these
  // tensors and with the caching allocator's reuse of the scratch buffer.
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  TORCH_CUDASPARSE_CHECK(cusparseLtSpMMACompress2(
      &handle,
      &sparse_input_descriptor,
      true,
      CUSPARSE_OPERATION_NON_TRANSPOSE,
      sparse_input.data_ptr(),
      compressed_tensor.data_ptr(),
      compressedBufferPtr.get(),
      stream));

  // Release the descriptor, matching the cleanup done in _cslt_sparse_mm.
  TORCH_CUDASPARSE_CHECK(
      cusparseLtMatDescriptorDestroy(&sparse_input_descriptor));

  return compressed_tensor;
}


// Sparse-dense matrix multiply using cuSPARSELt.
//
// compressed_A:     output of _cslt_compress (int8, half, bfloat16 or float).
// dense_B:          dense matrix; size(0) is taken as k and size(1) as n.
//                   If it is not contiguous it is described to cuSPARSELt as
//                   an (n x k) row-major matrix with a TRANSPOSE operation.
// bias_opt:         optional bias; when present its data pointer is attached
//                   to the matmul descriptor.
// transpose_result: when true the result tensor is allocated as (n, m) and
//                   described as column-major; otherwise (m, n) row-major.
// Returns the result tensor, allocated with dense_B's dtype/device.
// Raises if the dtype is unsupported, k is zero, or a cuSPARSELt call fails.
at::Tensor _cslt_sparse_mm(
    const Tensor& compressed_A,
    const Tensor& dense_B,
    const c10::optional<Tensor>& bias_opt,
    bool transpose_result
)
{
  // Lazily create the shared library handle on first use.
  if (!handle_initialized){
    TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
    handle_initialized = true;
  }
  // cuSPARSELt constructs
  cusparseLtMatmulDescriptor_t matmul;
  cusparseLtMatmulPlan_t plan;
  cusparseLtMatmulAlgSelection_t alg_sel;

  // res = alpha * (A @ B) + beta * res; beta = 0 means res is write-only.
  float alpha = 1.0;
  float beta = 0.0;
  cudaDataType type;
  cusparseComputeType compute_type;
  auto compression_factor = 9;

  switch(compressed_A.scalar_type())
  {
    case at::ScalarType::Char:
      type = CUDA_R_8I;
      compute_type = CUSPARSE_COMPUTE_32I;
      compression_factor = 10;
      break;
    case at::ScalarType::Half:
      type = CUDA_R_16F;
      compute_type = CUSPARSE_COMPUTE_16F;
      break;
    case at::ScalarType::BFloat16:
      type = CUDA_R_16BF;
      compute_type = CUSPARSE_COMPUTE_16F;
      break;
    case at::ScalarType::Float:
      type = CUDA_R_32F;
      compute_type = CUSPARSE_COMPUTE_TF32;
      break;
    default:
      TORCH_CHECK(false, "Unsupported dtype for cuSPARSE compressed matrix multiplication.");
      break;
  }

  int64_t k = dense_B.size(0);
  int64_t n = dense_B.size(1);
  // m is recovered by inverting the sizing formula used in _cslt_compress,
  // which divides by k; reject k == 0 instead of dividing by zero.
  TORCH_CHECK(k > 0, "_cslt_sparse_mm: dense_B must have a non-zero first dimension.");
  int64_t m = (compressed_A.numel() * 16 / compression_factor ) / k;

  // initialize sparse descriptor
  cusparseLtMatDescriptor_t sparse_input_descriptor;
  TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
      &handle,
      &sparse_input_descriptor,
      m,
      k,
      k,
      16,
      type,
      CUSPARSE_ORDER_ROW,
      CUSPARSELT_SPARSITY_50_PERCENT));

  // initialize dense input descriptor; a non-contiguous B is described with
  // swapped dimensions and consumed through a TRANSPOSE op below.
  const bool b_is_contiguous = dense_B.is_contiguous();
  cusparseLtMatDescriptor_t dense_input_descriptor;
  TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit(
      &handle,
      &dense_input_descriptor,
      b_is_contiguous ? k : n,
      b_is_contiguous ? n : k,
      b_is_contiguous ? n : k,
      16,
      type,
      CUSPARSE_ORDER_ROW));

  // create result tensor
  auto res = (transpose_result) ? dense_B.new_empty({n, m})
                                : dense_B.new_empty({m, n});

  // The result is always logically (m x n); a transposed result is obtained
  // by asking for column-major storage with leading dimension m.
  cusparseLtMatDescriptor_t res_descriptor;
  TORCH_CUDASPARSE_CHECK(cusparseLtDenseDescriptorInit(
      &handle,
      &res_descriptor,
      m,
      n,
      (transpose_result) ? m: n,
      16,
      type,
      (transpose_result) ? CUSPARSE_ORDER_COL : CUSPARSE_ORDER_ROW));

  // initialize matmul
  TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescriptorInit(
      &handle,
      &matmul,
      CUSPARSE_OPERATION_NON_TRANSPOSE,
      b_is_contiguous ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE,
      &sparse_input_descriptor,
      &dense_input_descriptor,
      &res_descriptor,
      &res_descriptor,
      compute_type));

  // set bias pointer for matmul; the attribute takes the address of a
  // variable holding the device pointer, hence the local.
  if (bias_opt.has_value()) {
    auto& bias = bias_opt.value();
    void* dBias = bias.data_ptr();
    TORCH_CUDASPARSE_CHECK(cusparseLtMatmulDescSetAttribute(
        &handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &dBias, sizeof(dBias)));
  }

  TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit(
      &handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT));

  TORCH_CUDASPARSE_CHECK(
      cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel));

  size_t workspace_size;
  TORCH_CUDASPARSE_CHECK(
      cusparseLtMatmulGetWorkspace(&handle, &plan, &workspace_size));

  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
  auto workspacePtr = allocator.allocate(workspace_size);

  // Launch on PyTorch's current stream rather than the legacy default
  // stream, so the matmul is ordered with the ops that produced the inputs
  // and with the caching allocator's reuse of the workspace.
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  TORCH_CUDASPARSE_CHECK(cusparseLtMatmul(
      &handle,
      &plan,
      &alpha,
      compressed_A.data_ptr(),
      dense_B.data_ptr(),
      &beta,
      res.data_ptr(),
      res.data_ptr(),
      workspacePtr.get(),
      // the API takes an array of streams; pass the single current stream
      &stream,
      1));

  // destroy descriptors
  TORCH_CUDASPARSE_CHECK(
      cusparseLtMatDescriptorDestroy(&sparse_input_descriptor));
  TORCH_CUDASPARSE_CHECK(
      cusparseLtMatDescriptorDestroy(&dense_input_descriptor));
  TORCH_CUDASPARSE_CHECK(cusparseLtMatDescriptorDestroy(&res_descriptor));
  // destroy plan
  TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan));
  return res;
}

} // namespace native
} // namespace at

#endif
5 changes: 5 additions & 0 deletions caffe2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,11 @@ elseif(USE_CUDA)
set(CUDA_LINK_LIBRARIES_KEYWORD)
torch_compile_options(torch_cuda) # see cmake/public/utils.cmake
target_compile_definitions(torch_cuda PRIVATE USE_CUDA)

# When cuSPARSELt is enabled, link torch_cuda against the torch::cusparselt
# target and define USE_CUSPARSELT for torch_cuda's own sources (PRIVATE).
if(USE_CUSPARSELT)
target_link_libraries(torch_cuda PRIVATE torch::cusparselt)
target_compile_definitions(torch_cuda PRIVATE USE_CUSPARSELT)
endif()
if(USE_NCCL)
target_link_libraries(torch_cuda PRIVATE __caffe2_nccl)
target_compile_definitions(torch_cuda PRIVATE USE_NCCL)
Expand Down
6 changes: 6 additions & 0 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ if(USE_CUDA)
# public/*.cmake uses CAFFE2_USE_*
set(CAFFE2_USE_CUDA ${USE_CUDA})
set(CAFFE2_USE_CUDNN ${USE_CUDNN})
set(CAFFE2_USE_CUSPARSELT ${USE_CUSPARSELT})
set(CAFFE2_USE_NVRTC ${USE_NVRTC})
set(CAFFE2_USE_TENSORRT ${USE_TENSORRT})
include(${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake)
Expand All @@ -57,6 +58,11 @@ if(USE_CUDA)
else()
caffe2_update_option(USE_CUDNN OFF)
endif()
# Add torch::cusparselt to the CUDA dependency list when it is enabled;
# otherwise turn the USE_CUSPARSELT option off via caffe2_update_option,
# following the same pattern as the cuDNN block above.
if(CAFFE2_USE_CUSPARSELT)
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS torch::cusparselt)
else()
caffe2_update_option(USE_CUSPARSELT OFF)
endif()
if(CAFFE2_USE_TENSORRT)
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::tensorrt)
else()
Expand Down

0 comments on commit ce13e18

Please sign in to comment.