[Lang] Add element access for sparse matrix on CUDA (taichi-dev#6250)
Issues: taichi-dev#2906, taichi-dev#6082

+ Support element access for sparse matrices on the GPU.
+ Add tests for GPU sparse matrix operators.

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Yi Xu <xy_xuyi@foxmail.com>
3 people authored and quadpixels committed May 13, 2023
1 parent 8c0f9b4 commit f54c2fc
Showing 5 changed files with 145 additions and 96 deletions.
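
In user-facing terms, the commit makes individual entries of a CUDA sparse matrix readable from Python. A minimal sketch (illustrative, not part of the diff; it mirrors the construction used in the tests below and assumes a CUDA-capable build):

import numpy as np
import taichi as ti

ti.init(arch=ti.cuda)

# Build a 4x4 GPU sparse matrix from COO triplets, as in the existing GPU test.
row = ti.ndarray(shape=3, dtype=ti.int32)
col = ti.ndarray(shape=3, dtype=ti.int32)
val = ti.ndarray(shape=3, dtype=ti.f32)
row.from_numpy(np.asarray([0, 1, 2], dtype=np.int32))
col.from_numpy(np.asarray([1, 2, 0], dtype=np.int32))
val.from_numpy(np.asarray([1.0, 2.0, 3.0], dtype=np.float32))

A = ti.linalg.SparseMatrix(n=4, m=4, dtype=ti.f32)
A.build_coo(row, col, val)

# New in this commit: reading individual elements of a CUDA sparse matrix.
print(A[1, 2])  # stored entry -> 2.0
print(A[0, 0])  # not stored   -> 0.0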
84 changes: 0 additions & 84 deletions misc/test_gpu_sm_op.py

This file was deleted.

75 changes: 64 additions & 11 deletions taichi/program/sparse_matrix.cpp
@@ -35,12 +35,12 @@ struct key_hash {
 };
 
 template <typename T, typename T1, typename T2>
-void print_triplet_from_csr(int64_t n_rows,
-                            int n_cols,
-                            T *row,
-                            T1 *col,
-                            T2 *value,
-                            std::ostringstream &ostr) {
+void print_triplets_from_csr(int64_t n_rows,
+                             int n_cols,
+                             T *row,
+                             T1 *col,
+                             T2 *value,
+                             std::ostringstream &ostr) {
   using Triplets = Eigen::Triplet<T2>;
   std::vector<Triplets> trips;
   for (int64_t i = 1; i <= n_rows; ++i) {
@@ -57,6 +57,20 @@ void print_triplet_from_csr(int64_t n_rows,
   ostr << Eigen::MatrixXf(m.cast<float>()).format(clean_fmt);
 }
 
+template <typename T, typename T1, typename T2>
+T2 get_element_from_csr(int row,
+                        int col,
+                        T *row_data,
+                        T1 *col_data,
+                        T2 *value) {
+  for (T i = row_data[row]; i < row_data[row + 1]; ++i) {
+    if (col == col_data[i])
+      return value[i];
+  }
+  // zero entry
+  return 0;
+}
+
 }  // namespace
 
 namespace taichi::lang {
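
The helper added above is a textbook CSR lookup: row_data[row] and row_data[row + 1] delimit the slice of col_data and value that stores this row's nonzeros, and any column not found in that slice is an implicit zero. The same logic in NumPy terms (an illustration, not part of the commit):

import numpy as np

def get_element_from_csr(row, col, row_ptr, col_idx, values):
    # Row i's nonzeros live in values[row_ptr[i]:row_ptr[i + 1]],
    # with their column indices in the matching slice of col_idx.
    for k in range(row_ptr[row], row_ptr[row + 1]):
        if col_idx[k] == col:
            return values[k]
    return 0.0  # entry not stored -> implicit zero

# CSR form of [[0, 1, 0], [2, 0, 3], [0, 0, 4]]
row_ptr = np.array([0, 1, 3, 4])
col_idx = np.array([1, 0, 2, 2])
values = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
assert get_element_from_csr(1, 2, row_ptr, col_idx, values) == 3.0
assert get_element_from_csr(0, 0, row_ptr, col_idx, values) == 0.0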
@@ -433,7 +447,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::addition(
 std::unique_ptr<SparseMatrix> CuSparseMatrix::matmul(
     const CuSparseMatrix &other) const {
 #if defined(TI_WITH_CUDA)
-  return gemm(other, 1.0f, 1.0f);
+  return gemm(other, 1.0f, 0.0f);
 #else
   TI_NOT_IMPLEMENTED;
   return std::unique_ptr<SparseMatrix>();
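
The one-character fix above is a semantics fix: cuSPARSE's SpGEMM computes C = alpha * op(A) * op(B) + beta * C, so a plain matrix product needs beta = 0; the old beta = 1 would fold the output buffer's prior contents into the result. A dense NumPy analogue (illustration only):

import numpy as np

A = np.array([[1.0, 0.0], [0.0, 2.0]])
B = np.array([[3.0, 4.0], [5.0, 6.0]])
C_prev = np.ones((2, 2))  # whatever the output buffer held before

alpha, beta = 1.0, 0.0
C = alpha * (A @ B) + beta * C_prev  # beta = 0 -> the pure product A @ B
assert np.allclose(C, A @ B)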
@@ -446,7 +460,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
                                                    const float alpha,
                                                    const float beta) const {
 #if defined(TI_WITH_CUDA)
-  cusparseHandle_t handle;
+  cusparseHandle_t handle = nullptr;
   CUSPARSEDriver::get_instance().cpCreate(&handle);
   cusparseOperation_t op_A = CUSPARSE_OPERATION_NON_TRANSPOSE;
   cusparseOperation_t op_B = CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -468,7 +482,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
   CUSPARSEDriver::get_instance().cpCreateSpGEMM(&spgemm_desc);
 
   // 3. ask buffer_size1 bytes for external memory
-  void *d_buffer1;
+  void *d_buffer1 = nullptr;
   size_t buffer_size1 = 0;
   CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation(
       handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C,
@@ -486,7 +500,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
   CUSPARSEDriver::get_instance().cpSpGEMM_compute(
       handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
       CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, nullptr);
-  void *d_buffer2;
+  void *d_buffer2 = nullptr;
   CUDADriver::get_instance().malloc((void **)&d_buffer2, buffer_size2);
 
   // 6. compute the intermediate product of A * B
@@ -647,9 +661,48 @@ const std::string CuSparseMatrix::to_string() const {
   CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
                                                    (nnz) * sizeof(float));
 
-  print_triplet_from_csr<int, int, float>(rows, cols, hR, hC, hV, ostr);
+  print_triplets_from_csr<int, int, float>(rows, cols, hR, hC, hV, ostr);
   delete[] hR;
   delete[] hC;
   delete[] hV;
 #endif
   return ostr.str();
 }
 
+float CuSparseMatrix::get_element(int row, int col) const {
+  float res = 0.0f;
+#ifdef TI_WITH_CUDA
+  size_t rows, cols, nnz;
+  // dR holds the int row pointers, dC the int column indices,
+  // and dV the float values of the CSR matrix on the device.
+  int *dR, *dC;
+  float *dV;
+  cusparseIndexType_t row_type, column_type;
+  cusparseIndexBase_t idx_base;
+  cudaDataType value_type;
+  CUSPARSEDriver::get_instance().cpCsrGet(
+      matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV,
+      &row_type, &column_type, &idx_base, &value_type);
+
+  TI_ASSERT(row < rows);
+  TI_ASSERT(col < cols);
+
+  auto *hR = new int[rows + 1];
+  auto *hC = new int[nnz];
+  auto *hV = new float[nnz];
+
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR,
+                                                   (rows + 1) * sizeof(int));
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC,
+                                                   (nnz) * sizeof(int));
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
+                                                   (nnz) * sizeof(float));
+
+  res = get_element_from_csr<int, int, float>(row, col, hR, hC, hV);
+
+  delete[] hR;
+  delete[] hC;
+  delete[] hV;
+#endif  // TI_WITH_CUDA
+  return res;
+}
+
 }  // namespace taichi::lang
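
Note that get_element copies the full CSR arrays (row pointers, column indices, values) from device to host on every call, an O(nnz) transfer per queried entry. That is fine for tests and debugging, but for many lookups it is cheaper to pull the matrix to the host once and query there, e.g. via SciPy (a sketch under that assumption, not part of the commit):

import numpy as np
from scipy.sparse import csr_matrix

# Hypothetical host-side arrays, e.g. obtained with a single batch of
# device-to-host copies like the ones in get_element above.
row_ptr = np.array([0, 1, 3, 4], dtype=np.int32)
col_idx = np.array([1, 0, 2, 2], dtype=np.int32)
values = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)

M = csr_matrix((values, col_idx, row_ptr), shape=(3, 3))
print(M[1, 2])  # 3.0, and repeated lookups now stay on the host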
2 changes: 2 additions & 0 deletions taichi/program/sparse_matrix.h
@@ -268,6 +268,8 @@ class CuSparseMatrix : public SparseMatrix {
     return &matrix_;
   };
 
+  float get_element(int row, int col) const;
+
   const std::string to_string() const override;
 
   void *get_row_ptr() const {
1 change: 1 addition & 0 deletions taichi/python/export_lang.cpp
@@ -1262,6 +1262,7 @@ void export_lang(py::module &m) {
       .def(float32() * py::self)
       .def("matmul", &CuSparseMatrix::matmul)
       .def("transpose", &CuSparseMatrix::transpose)
+      .def("get_element", &CuSparseMatrix::get_element)
       .def("to_string", &CuSparseMatrix::to_string);
 
   py::class_<SparseSolver>(m, "SparseSolver")
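
With the binding in place, element reads are reachable from Python; the new test below exercises them through the wrapper's subscript, taichi_spm[i, j]. Roughly, and with hypothetical attribute names since the wrapper plumbing is not shown in this diff:

# Hypothetical sketch: if `mat` is the pybind11-wrapped CuSparseMatrix that
# backs a ti.linalg.SparseMatrix A, then A[i, j] boils down to:
v = mat.get_element(i, j)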
79 changes: 78 additions & 1 deletion tests/python/test_sparse_matrix.py
@@ -1,4 +1,3 @@
-import numpy as np
 import pytest
 
 import taichi as ti
@@ -379,6 +378,7 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
 
 @test_utils.test(arch=ti.cuda)
 def test_gpu_sparse_matrix():
+    import numpy as np
     h_coo_row = np.asarray([1, 0, 0, 0, 2, 2, 2, 3, 3], dtype=np.int32)
     h_coo_col = np.asarray([1, 0, 2, 3, 0, 2, 3, 1, 3], dtype=np.int32)
     h_coo_val = np.asarray([4.0, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0],
@@ -414,3 +414,80 @@ def test_gpu_sparse_matrix():
     A.spmv(X, Y)
     for i in range(4):
         assert Y[i] == h_Y[i]
+
+
+@pytest.mark.parametrize('N', [5])
+@test_utils.test(arch=ti.cuda)
+def test_gpu_sparse_matrix_ops(N):
+    import numpy as np
+    from numpy.random import default_rng
+    from scipy import stats
+    from scipy.sparse import random
+
+    seed = 2
+    np.random.seed(seed)
+    rng = default_rng(seed)
+    rvs = stats.poisson(3, loc=1).rvs
+    np_dtype = np.float32
+    idx_dt = ti.int32
+    val_dt = ti.float32
+
+    n_rows = N
+    n_cols = N - 1
+
+    S1 = random(n_rows, n_cols, density=0.5, random_state=rng,
+                data_rvs=rvs).astype(np_dtype).tocoo()
+    S2 = random(n_rows, n_cols, density=0.5, random_state=rng,
+                data_rvs=rvs).astype(np_dtype).tocoo()
+
+    nnz_A = len(S1.data)
+    nnz_B = len(S2.data)
+
+    row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
+    col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
+    value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt)
+    row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
+    col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
+    value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt)
+
+    row_coo_A.from_numpy(S1.row)
+    col_coo_A.from_numpy(S1.col)
+    value_coo_A.from_numpy(S1.data)
+
+    row_coo_B.from_numpy(S2.row)
+    col_coo_B.from_numpy(S2.col)
+    value_coo_B.from_numpy(S2.data)
+
+    A = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
+    B = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
+    A.build_coo(row_coo_A, col_coo_A, value_coo_A)
+    B.build_coo(row_coo_B, col_coo_B, value_coo_B)
+
+    def verify(scipy_spm, taichi_spm):
+        scipy_spm = scipy_spm.tocoo()
+        for i, j, v in zip(scipy_spm.row, scipy_spm.col, scipy_spm.data):
+            assert v == test_utils.approx(taichi_spm[i, j], rel=1e-5)
+
+    C = A + B
+    S3 = S1 + S2
+    verify(S3, C)
+
+    D = C - A
+    S4 = S3 - S1
+    verify(S4, D)
+
+    E = A * 2.5
+    S5 = S1 * 2.5
+    verify(S5, E)
+
+    F = 2.5 * A
+    S6 = 2.5 * S1
+    verify(S6, F)
+
+    G = A.transpose()
+    S7 = S1.T
+    verify(S7, G)
+
+    H = A @ B.transpose()
+    S8 = S1 @ S2.T
+    verify(S8, H)
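
To run the new coverage locally (assuming a CUDA-enabled Taichi build with SciPy installed):

python -m pytest tests/python/test_sparse_matrix.py -k gpu_sparse_matrix -v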
