Gather one-hop neighbors (#2117)
Add utilities to enable multi-GPU gathering of adjacency lists, to be used for MNMG (multi-node, multi-GPU) sampling.
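
The key new helper is partially_decompress_matrix_partition_to_fill_edgelist (see the diff below), which expands a gathered set of majors of one matrix partition into a (major, minor) edge list on each GPU. The following is a minimal usage sketch, not part of this commit: the wrapper name gather_one_hop_edgelist, the buffer names, and the assumption that per-major output offsets and the total edge count have already been computed elsewhere are all illustrative.

// Hypothetical usage sketch (not part of this commit). Assumes the caller has already
// gathered the majors of interest to this rank, computed an exclusive scan of their local
// degrees into d_output_offsets, and grouped them by degree via segment_offsets.
#include <cugraph/detail/decompress_matrix_partition.cuh>

#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/optional.h>

#include <vector>

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
void gather_one_hop_edgelist(
  raft::handle_t const& handle,
  cugraph::matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> partition,
  rmm::device_uvector<vertex_t> const& d_input_majors,  // majors gathered to this rank
  rmm::device_uvector<edge_t> const& d_output_offsets,  // exclusive scan of local degrees
  std::vector<vertex_t> const& segment_offsets,         // degree-based segments of d_input_majors
  edge_t total_edge_count,                              // sum of the local degrees
  rmm::device_uvector<vertex_t>& out_majors,
  rmm::device_uvector<vertex_t>& out_minors)
{
  out_majors.resize(total_edge_count, handle.get_stream());
  out_minors.resize(total_edge_count, handle.get_stream());

  // prop_t (int32_t here) is passed explicitly because it cannot be deduced from
  // thrust::nullopt; neither a per-major property nor a global edge index is requested.
  cugraph::detail::partially_decompress_matrix_partition_to_fill_edgelist<vertex_t,
                                                                          edge_t,
                                                                          weight_t,
                                                                          int32_t,
                                                                          multi_gpu>(
    handle,
    partition,
    d_input_majors.data(),
    d_output_offsets.data(),
    segment_offsets,
    out_majors.data(),
    out_minors.data(),
    thrust::nullopt,
    thrust::nullopt);
}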

Authors:
  - Kumar Aatish (https://github.com/kaatish)

Approvers:
  - Andrei Schaffer (https://github.com/aschaffer)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)

URL: #2117
kaatish committed Mar 21, 2022
1 parent a024581 commit 6c3a469
Showing 9 changed files with 1,156 additions and 151 deletions.
260 changes: 250 additions & 10 deletions cpp/include/cugraph/detail/decompress_matrix_partition.cuh
@@ -38,7 +38,7 @@ namespace detail {
int32_t constexpr decompress_matrix_partition_block_size = 1024;

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
__global__ void for_all_major_for_all_nbr_mid_degree(
__global__ void decompress_to_edgelist_mid_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t major_first,
vertex_t major_last,
Expand Down Expand Up @@ -67,7 +67,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree(
}

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
__global__ void for_all_major_for_all_nbr_high_degree(
__global__ void decompress_to_edgelist_high_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t major_first,
vertex_t major_last,
@@ -111,10 +111,10 @@ void decompress_matrix_partition_to_fill_edgelist_majors(
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::for_all_major_for_all_nbr_high_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
detail::decompress_to_edgelist_high_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
matrix_partition.get_major_first(),
matrix_partition.get_major_first() + (*segment_offsets)[1],
@@ -125,10 +125,10 @@
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::for_all_major_for_all_nbr_mid_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
detail::decompress_to_edgelist_mid_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
matrix_partition.get_major_first() + (*segment_offsets)[1],
matrix_partition.get_major_first() + (*segment_offsets)[2],
@@ -180,6 +180,246 @@ void decompress_matrix_partition_to_fill_edgelist_majors(
}
}

// Block-per-major kernel for the high-degree segment: all threads of a block cooperatively
// copy one input major's adjacency list into the output edge list, optionally replicating a
// per-major property and writing a global edge index (minor map) for each emitted edge.
template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
__global__ void partially_decompress_to_edgelist_high_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
size_t idx = static_cast<size_t>(blockIdx.x);
while (idx < static_cast<size_t>(input_major_count)) {
auto major = input_majors[idx];
auto major_partition_offset = static_cast<size_t>(major - matrix_partition.get_major_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(static_cast<vertex_t>(major_partition_offset));
auto major_offset = input_major_start_offsets[idx];
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];
}
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
prop_t* output_property = thrust::get<1>(*property);
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_property[major_offset + i] = input_property;
}
}
if (global_edge_index) {
auto adjacency_list_offset = thrust::get<0>(*global_edge_index)[major_partition_offset];
auto minor_map = thrust::get<1>(*global_edge_index);
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
minor_map[major_offset + i] = adjacency_list_offset + i;
}
}
idx += gridDim.x;
}
}

// Warp-per-major kernel for the mid-degree segment: each warp expands one input major, with
// the warp's lanes striding over that major's adjacency list.
template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
__global__ void partially_decompress_to_edgelist_mid_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
static_assert(decompress_matrix_partition_block_size % raft::warp_size() == 0);
auto const lane_id = tid % raft::warp_size();
size_t idx = static_cast<size_t>(tid / raft::warp_size());
while (idx < static_cast<size_t>(input_major_count)) {
auto major = input_majors[idx];
auto major_partition_offset = static_cast<size_t>(major - matrix_partition.get_major_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(static_cast<vertex_t>(major_partition_offset));
auto major_offset = input_major_start_offsets[idx];
// Each warp handles one input major; its lanes stride over the neighbor list.
for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];
}
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
prop_t* output_property = thrust::get<1>(*property);
for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
output_property[major_offset + i] = input_property;
}
}
if (global_edge_index) {
auto adjacency_list_offset = thrust::get<0>(*global_edge_index)[major_partition_offset];
auto minor_map = thrust::get<1>(*global_edge_index);
for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
minor_map[major_offset + i] = adjacency_list_offset + i;
}
}
idx += gridDim.x * (blockDim.x / raft::warp_size());
}
}

// Host-side dispatcher: expands the gathered majors of one matrix partition into an edge list,
// launching a block-per-major kernel for the high-degree segment, a warp-per-major kernel for
// the mid-degree segment, and thrust::for_each (one thread per major) for the low-degree and
// hypersparse segments. segment_offsets partitions the input majors by degree class.
template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
void partially_decompress_matrix_partition_to_fill_edgelist(
raft::handle_t const& handle,
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
std::vector<vertex_t> const& segment_offsets,
vertex_t* majors,
vertex_t* minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
auto execution_policy = handle.get_thrust_policy();
static_assert(detail::num_sparse_segments_per_vertex_partition == 3);
auto& comm = handle.get_comms();
auto const comm_rank = comm.get_rank();
if (segment_offsets[1] - segment_offsets[0] > 0) {
raft::grid_1d_block_t update_grid(segment_offsets[1] - segment_offsets[0],
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::partially_decompress_to_edgelist_high_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
input_majors + segment_offsets[0],
input_major_start_offsets,
segment_offsets[1],
majors,
minors,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[0], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index);
}
if (segment_offsets[2] - segment_offsets[1] > 0) {
raft::grid_1d_warp_t update_grid(segment_offsets[2] - segment_offsets[1],
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::partially_decompress_to_edgelist_mid_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
input_majors + segment_offsets[1],
input_major_start_offsets + segment_offsets[1] - segment_offsets[0],
segment_offsets[2] - segment_offsets[1],
majors,
minors,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[1], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index);
}
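// Low-degree segment: one thread per input major expands its (short) adjacency list sequentially.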
if (segment_offsets[3] - segment_offsets[2] > 0) {
thrust::for_each(
execution_policy,
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(segment_offsets[3] - segment_offsets[2]),
[matrix_partition,
input_majors = input_majors + segment_offsets[2],
input_major_start_offsets =
input_major_start_offsets + segment_offsets[2] - segment_offsets[0],
majors,
minors,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[2], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
auto major_partition_offset =
static_cast<size_t>(major - matrix_partition.get_major_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(major_partition_offset);
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
thrust::fill(thrust::seq,
minor_output_property + major_offset,
minor_output_property + major_offset + local_degree,
major_input_property);
}
if (global_edge_index) {
auto adjacency_list_offset = thrust::get<0>(*global_edge_index)[major_partition_offset];
auto minor_map = thrust::get<1>(*global_edge_index);
thrust::sequence(thrust::seq,
minor_map + major_offset,
minor_map + major_offset + local_degree,
adjacency_list_offset);
}
});
}
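// Hypersparse (DCS) segment: each major is first mapped to its hypersparse index; majors not
// present in this partition are skipped.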
if (matrix_partition.get_dcs_nzd_vertex_count() &&
(*(matrix_partition.get_dcs_nzd_vertex_count()) > 0)) {
thrust::for_each(
execution_policy,
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(segment_offsets[4] - segment_offsets[3]),
[matrix_partition,
input_majors = input_majors + segment_offsets[3],
input_major_start_offsets =
input_major_start_offsets + segment_offsets[3] - segment_offsets[0],
majors,
minors,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[3], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
auto major_idx = matrix_partition.get_major_hypersparse_idx_from_major_nocheck(major);
if (major_idx) {
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(*major_idx);
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
thrust::fill(thrust::seq,
minor_output_property + major_offset,
minor_output_property + major_offset + local_degree,
major_input_property);
}
if (global_edge_index) {
auto major_partition_offset =
static_cast<size_t>(*major_idx - matrix_partition.get_major_first());
auto adjacency_list_offset = thrust::get<0>(*global_edge_index)[major_partition_offset];
auto minor_map = thrust::get<1>(*global_edge_index);
thrust::sequence(thrust::seq,
minor_map + major_offset,
minor_map + major_offset + local_degree,
adjacency_list_offset);
}
}
});
}
}

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
void decompress_matrix_partition_to_edgelist(
raft::handle_t const& handle,
