[Lang] Add element access for sparse matrix on CUDA (taichi-dev#6250)
Issues: taichi-dev#2906, taichi-dev#6082

+ Support element access for sparse matrices on the GPU.
+ Add tests for GPU sparse matrix operators.

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Yi Xu <xy_xuyi@foxmail.com>
3 people authored and quadpixels committed May 13, 2023
1 parent 8c0f9b4 commit f54c2fc
Showing 5 changed files with 145 additions and 96 deletions.
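
In user-facing terms, the commit makes individual entries of a CUDA sparse matrix readable from Python. A minimal sketch (illustrative, not part of the diff; it mirrors the construction used in the tests below and assumes a CUDA-capable build):

import numpy as np
import taichi as ti

ti.init(arch=ti.cuda)

# Build a 4x4 GPU sparse matrix from COO triplets, as in the existing GPU test.
row = ti.ndarray(shape=3, dtype=ti.int32)
col = ti.ndarray(shape=3, dtype=ti.int32)
val = ti.ndarray(shape=3, dtype=ti.f32)
row.from_numpy(np.asarray([0, 1, 2], dtype=np.int32))
col.from_numpy(np.asarray([1, 2, 0], dtype=np.int32))
val.from_numpy(np.asarray([1.0, 2.0, 3.0], dtype=np.float32))

A = ti.linalg.SparseMatrix(n=4, m=4, dtype=ti.f32)
A.build_coo(row, col, val)

# New in this commit: reading individual elements of a CUDA sparse matrix.
print(A[1, 2])  # stored entry -> 2.0
print(A[0, 0])  # not stored   -> 0.0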
84 changes: 0 additions & 84 deletions misc/test_gpu_sm_op.py

This file was deleted.

75 changes: 64 additions & 11 deletions taichi/program/sparse_matrix.cpp
@@ -35,12 +35,12 @@ struct key_hash {
 };
 
 template <typename T, typename T1, typename T2>
-void print_triplet_from_csr(int64_t n_rows,
-                            int n_cols,
-                            T *row,
-                            T1 *col,
-                            T2 *value,
-                            std::ostringstream &ostr) {
+void print_triplets_from_csr(int64_t n_rows,
+                             int n_cols,
+                             T *row,
+                             T1 *col,
+                             T2 *value,
+                             std::ostringstream &ostr) {
   using Triplets = Eigen::Triplet<T2>;
   std::vector<Triplets> trips;
   for (int64_t i = 1; i <= n_rows; ++i) {
@@ -57,6 +57,20 @@ void print_triplet_from_csr(int64_t n_rows,
   ostr << Eigen::MatrixXf(m.cast<float>()).format(clean_fmt);
 }
 
+template <typename T, typename T1, typename T2>
+T2 get_element_from_csr(int row,
+                        int col,
+                        T *row_data,
+                        T1 *col_data,
+                        T2 *value) {
+  for (T i = row_data[row]; i < row_data[row + 1]; ++i) {
+    if (col == col_data[i])
+      return value[i];
+  }
+  // zero entry
+  return 0;
+}
+
 }  // namespace
 
 namespace taichi::lang {
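
The helper added above is a textbook CSR lookup: row_data[row] and row_data[row + 1] delimit the slice of col_data and value that stores this row's nonzeros, and any column not found in that slice is an implicit zero. The same logic in NumPy terms (an illustration, not part of the commit):

import numpy as np

def get_element_from_csr(row, col, row_ptr, col_idx, values):
    # Row i's nonzeros live in values[row_ptr[i]:row_ptr[i + 1]],
    # with their column indices in the matching slice of col_idx.
    for k in range(row_ptr[row], row_ptr[row + 1]):
        if col_idx[k] == col:
            return values[k]
    return 0.0  # entry not stored -> implicit zero

# CSR form of [[0, 1, 0], [2, 0, 3], [0, 0, 4]]
row_ptr = np.array([0, 1, 3, 4])
col_idx = np.array([1, 0, 2, 2])
values = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
assert get_element_from_csr(1, 2, row_ptr, col_idx, values) == 3.0
assert get_element_from_csr(0, 0, row_ptr, col_idx, values) == 0.0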
@@ -433,7 +447,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::addition(
 std::unique_ptr<SparseMatrix> CuSparseMatrix::matmul(
     const CuSparseMatrix &other) const {
 #if defined(TI_WITH_CUDA)
-  return gemm(other, 1.0f, 1.0f);
+  return gemm(other, 1.0f, 0.0f);
 #else
   TI_NOT_IMPLEMENTED;
   return std::unique_ptr<SparseMatrix>();
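
The one-character fix above is a semantics fix: cuSPARSE's SpGEMM computes C = alpha * op(A) * op(B) + beta * C, so a plain matrix product needs beta = 0; the old beta = 1 would fold the output buffer's prior contents into the result. A dense NumPy analogue (illustration only):

import numpy as np

A = np.array([[1.0, 0.0], [0.0, 2.0]])
B = np.array([[3.0, 4.0], [5.0, 6.0]])
C_prev = np.ones((2, 2))  # whatever the output buffer held before

alpha, beta = 1.0, 0.0
C = alpha * (A @ B) + beta * C_prev  # beta = 0 -> the pure product A @ B
assert np.allclose(C, A @ B)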
@@ -446,7 +460,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
                                                    const float alpha,
                                                    const float beta) const {
 #if defined(TI_WITH_CUDA)
-  cusparseHandle_t handle;
+  cusparseHandle_t handle = nullptr;
   CUSPARSEDriver::get_instance().cpCreate(&handle);
   cusparseOperation_t op_A = CUSPARSE_OPERATION_NON_TRANSPOSE;
   cusparseOperation_t op_B = CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -468,7 +482,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
   CUSPARSEDriver::get_instance().cpCreateSpGEMM(&spgemm_desc);
 
   // 3. ask buffer_size1 bytes for external memory
-  void *d_buffer1;
+  void *d_buffer1 = nullptr;
   size_t buffer_size1 = 0;
   CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation(
       handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C,
@@ -486,7 +500,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
   CUSPARSEDriver::get_instance().cpSpGEMM_compute(
       handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
       CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, nullptr);
-  void *d_buffer2;
+  void *d_buffer2 = nullptr;
   CUDADriver::get_instance().malloc((void **)&d_buffer2, buffer_size2);
 
   // 6. compute the intermediate product of A * B
@@ -647,9 +661,48 @@ const std::string CuSparseMatrix::to_string() const {
   CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
                                                    (nnz) * sizeof(float));
 
-  print_triplet_from_csr<int, int, float>(rows, cols, hR, hC, hV, ostr);
+  print_triplets_from_csr<int, int, float>(rows, cols, hR, hC, hV, ostr);
   delete[] hR;
   delete[] hC;
   delete[] hV;
 #endif
   return ostr.str();
 }
 
+float CuSparseMatrix::get_element(int row, int col) const {
+  float res = 0.0f;
+#ifdef TI_WITH_CUDA
+  size_t rows, cols, nnz;
+  // dR holds the int row pointers, dC the int column indices,
+  // and dV the float values of the CSR matrix on the device.
+  int *dR, *dC;
+  float *dV;
+  cusparseIndexType_t row_type, column_type;
+  cusparseIndexBase_t idx_base;
+  cudaDataType value_type;
+  CUSPARSEDriver::get_instance().cpCsrGet(
+      matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV,
+      &row_type, &column_type, &idx_base, &value_type);
+
+  TI_ASSERT(row < rows);
+  TI_ASSERT(col < cols);
+
+  auto *hR = new int[rows + 1];
+  auto *hC = new int[nnz];
+  auto *hV = new float[nnz];
+
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR,
+                                                   (rows + 1) * sizeof(int));
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC,
+                                                   (nnz) * sizeof(int));
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
+                                                   (nnz) * sizeof(float));
+
+  res = get_element_from_csr<int, int, float>(row, col, hR, hC, hV);
+
+  delete[] hR;
+  delete[] hC;
+  delete[] hV;
+#endif  // TI_WITH_CUDA
+  return res;
+}
+
 }  // namespace taichi::lang
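
Note that get_element copies the full CSR arrays (row pointers, column indices, values) from device to host on every call, an O(nnz) transfer per queried entry. That is fine for tests and debugging, but for many lookups it is cheaper to pull the matrix to the host once and query there, e.g. via SciPy (a sketch under that assumption, not part of the commit):

import numpy as np
from scipy.sparse import csr_matrix

# Hypothetical host-side arrays, e.g. obtained with a single batch of
# device-to-host copies like the ones in get_element above.
row_ptr = np.array([0, 1, 3, 4], dtype=np.int32)
col_idx = np.array([1, 0, 2, 2], dtype=np.int32)
values = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)

M = csr_matrix((values, col_idx, row_ptr), shape=(3, 3))
print(M[1, 2])  # 3.0, and repeated lookups now stay on the host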
2 changes: 2 additions & 0 deletions taichi/program/sparse_matrix.h
@@ -268,6 +268,8 @@ class CuSparseMatrix : public SparseMatrix {
     return &matrix_;
   };
 
+  float get_element(int row, int col) const;
+
   const std::string to_string() const override;
 
   void *get_row_ptr() const {
1 change: 1 addition & 0 deletions taichi/python/export_lang.cpp
@@ -1262,6 +1262,7 @@ void export_lang(py::module &m) {
       .def(float32() * py::self)
       .def("matmul", &CuSparseMatrix::matmul)
       .def("transpose", &CuSparseMatrix::transpose)
+      .def("get_element", &CuSparseMatrix::get_element)
       .def("to_string", &CuSparseMatrix::to_string);
 
   py::class_<SparseSolver>(m, "SparseSolver")
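
With the binding in place, element reads are reachable from Python; the new test below exercises them through the wrapper's subscript, taichi_spm[i, j]. Roughly, and with hypothetical attribute names since the wrapper plumbing is not shown in this diff:

# Hypothetical sketch: if `mat` is the pybind11-wrapped CuSparseMatrix that
# backs a ti.linalg.SparseMatrix A, then A[i, j] boils down to:
v = mat.get_element(i, j)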
79 changes: 78 additions & 1 deletion tests/python/test_sparse_matrix.py
@@ -1,4 +1,3 @@
-import numpy as np
 import pytest
 
 import taichi as ti
@@ -379,6 +378,7 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
 
 @test_utils.test(arch=ti.cuda)
 def test_gpu_sparse_matrix():
+    import numpy as np
     h_coo_row = np.asarray([1, 0, 0, 0, 2, 2, 2, 3, 3], dtype=np.int32)
     h_coo_col = np.asarray([1, 0, 2, 3, 0, 2, 3, 1, 3], dtype=np.int32)
     h_coo_val = np.asarray([4.0, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0],
@@ -414,3 +414,80 @@ def test_gpu_sparse_matrix():
     A.spmv(X, Y)
     for i in range(4):
         assert Y[i] == h_Y[i]
+
+
+@pytest.mark.parametrize('N', [5])
+@test_utils.test(arch=ti.cuda)
+def test_gpu_sparse_matrix_ops(N):
+    import numpy as np
+    from numpy.random import default_rng
+    from scipy import stats
+    from scipy.sparse import random
+
+    seed = 2
+    np.random.seed(seed)
+    rng = default_rng(seed)
+    rvs = stats.poisson(3, loc=1).rvs
+    np_dtype = np.float32
+    idx_dt = ti.int32
+    val_dt = ti.float32
+
+    n_rows = N
+    n_cols = N - 1
+
+    S1 = random(n_rows, n_cols, density=0.5, random_state=rng,
+                data_rvs=rvs).astype(np_dtype).tocoo()
+    S2 = random(n_rows, n_cols, density=0.5, random_state=rng,
+                data_rvs=rvs).astype(np_dtype).tocoo()
+
+    nnz_A = len(S1.data)
+    nnz_B = len(S2.data)
+
+    row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
+    col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
+    value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt)
+    row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
+    col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
+    value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt)
+
+    row_coo_A.from_numpy(S1.row)
+    col_coo_A.from_numpy(S1.col)
+    value_coo_A.from_numpy(S1.data)
+
+    row_coo_B.from_numpy(S2.row)
+    col_coo_B.from_numpy(S2.col)
+    value_coo_B.from_numpy(S2.data)
+
+    A = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
+    B = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
+    A.build_coo(row_coo_A, col_coo_A, value_coo_A)
+    B.build_coo(row_coo_B, col_coo_B, value_coo_B)
+
+    def verify(scipy_spm, taichi_spm):
+        scipy_spm = scipy_spm.tocoo()
+        for i, j, v in zip(scipy_spm.row, scipy_spm.col, scipy_spm.data):
+            assert v == test_utils.approx(taichi_spm[i, j], rel=1e-5)
+
+    C = A + B
+    S3 = S1 + S2
+    verify(S3, C)
+
+    D = C - A
+    S4 = S3 - S1
+    verify(S4, D)
+
+    E = A * 2.5
+    S5 = S1 * 2.5
+    verify(S5, E)
+
+    F = 2.5 * A
+    S6 = 2.5 * S1
+    verify(S6, F)
+
+    G = A.transpose()
+    S7 = S1.T
+    verify(S7, G)
+
+    H = A @ B.transpose()
+    S8 = S1 @ S2.T
+    verify(S8, H)
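
To run the new coverage locally (assuming a CUDA-enabled Taichi build with SciPy installed):

python -m pytest tests/python/test_sparse_matrix.py -k gpu_sparse_matrix -v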
