Gather one hop neighbors #2117

Merged: 12 commits, Mar 21, 2022
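The new helpers in this diff expand the adjacency lists of a caller-supplied subset of "major" (source) vertices into a flat (major, minor) edge list, writing each major's edges at a precomputed output offset. As a minimal, self-contained sketch of that expansion, here is a host-side C++ analogue over plain CSR arrays; all names here are illustrative, not cugraph's actual types or API:

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> offsets = {0, 2, 3, 5, 6};     // CSR row offsets (4 vertices)
  std::vector<int> indices = {1, 2, 3, 0, 3, 1};  // CSR column indices

  std::vector<int> input_majors = {0, 2};  // gather one-hop neighbors of 0 and 2
  // exclusive scan of the selected majors' degrees: degree(0)=2, degree(2)=2
  std::vector<int> input_major_start_offsets = {0, 2};

  std::vector<int> output_majors(4), output_minors(4);
  for (std::size_t idx = 0; idx < input_majors.size(); ++idx) {
    int major        = input_majors[idx];
    int local_degree = offsets[major + 1] - offsets[major];
    int major_offset = input_major_start_offsets[idx];
    for (int i = 0; i < local_degree; ++i) {  // the kernels below stride this loop by block/warp
      output_majors[major_offset + i] = major;
      output_minors[major_offset + i] = indices[offsets[major] + i];
    }
  }
  for (std::size_t e = 0; e < output_majors.size(); ++e) {
    std::printf("%d -> %d\n", output_majors[e], output_minors[e]);
  }
  return 0;
}

Running this prints the one-hop edge list 0 -> 1, 0 -> 2, 2 -> 0, 2 -> 3. The kernels in the diff parallelize the inner copy loop per major, chosen by degree segment.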
cpp/include/cugraph/detail/decompress_matrix_partition.cuh (260 changes: 250 additions & 10 deletions)
@@ -38,7 +38,7 @@ namespace detail {
int32_t constexpr decompress_matrix_partition_block_size = 1024;

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
__global__ void for_all_major_for_all_nbr_mid_degree(
__global__ void decompress_to_edgelist_mid_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t major_first,
vertex_t major_last,
@@ -67,7 +67,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree(
}

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
__global__ void for_all_major_for_all_nbr_high_degree(
__global__ void decompress_to_edgelist_high_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t major_first,
vertex_t major_last,
@@ -111,10 +111,10 @@ void decompress_matrix_partition_to_fill_edgelist_majors(
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::for_all_major_for_all_nbr_high_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
detail::decompress_to_edgelist_high_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
matrix_partition.get_major_first(),
matrix_partition.get_major_first() + (*segment_offsets)[1],
@@ -125,10 +125,10 @@
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::for_all_major_for_all_nbr_mid_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
detail::decompress_to_edgelist_mid_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
matrix_partition.get_major_first() + (*segment_offsets)[1],
matrix_partition.get_major_first() + (*segment_offsets)[2],
@@ -180,6 +180,246 @@ void decompress_matrix_partition_to_fill_edgelist_majors(
}
}

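// Kernel for the high-degree segment: one thread block per input major
// (grid-stride over majors). Each block expands its major's adjacency list
// into the (output_majors, output_minors) edge list and, when requested,
// replicates the major's property value and writes global edge indices.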
template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
__global__ void partially_decompress_to_edgelist_high_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
size_t idx = static_cast<size_t>(blockIdx.x);
while (idx < static_cast<size_t>(input_major_count)) {
auto major = input_majors[idx];
auto major_partition_offset = static_cast<size_t>(major - matrix_partition.get_major_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(static_cast<vertex_t>(major_partition_offset));
auto major_offset = input_major_start_offsets[idx];
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];
}
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
prop_t* output_property = thrust::get<1>(*property);
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_property[major_offset + i] = input_property;
}
}
if (global_edge_index) {
auto adjacency_list_offset = thrust::get<0>(*global_edge_index)[major_partition_offset];
auto minor_map = thrust::get<1>(*global_edge_index);
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
minor_map[major_offset + i] = adjacency_list_offset + i;
}
}
idx += gridDim.x;
}
}

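// Kernel for the mid-degree segment: one warp per input major, with the
// warp's lanes striding over that major's adjacency list.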
template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
__global__ void partially_decompress_to_edgelist_mid_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
static_assert(decompress_matrix_partition_block_size % raft::warp_size() == 0);
auto const lane_id = tid % raft::warp_size();
size_t idx = static_cast<size_t>(tid / raft::warp_size());
while (idx < static_cast<size_t>(input_major_count)) {
auto major = input_majors[idx];
auto major_partition_offset = static_cast<size_t>(major - matrix_partition.get_major_first());
    vertex_t const* indices{nullptr};
    thrust::optional<weight_t const*> weights{thrust::nullopt};
    edge_t local_degree{};
    thrust::tie(indices, weights, local_degree) =
      matrix_partition.get_local_edges(static_cast<vertex_t>(major_partition_offset));
    auto major_offset = input_major_start_offsets[idx];
    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
      output_majors[major_offset + i] = major;
      output_minors[major_offset + i] = indices[i];
    }
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
prop_t* output_property = thrust::get<1>(*property);
for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
output_property[major_offset + i] = input_property;
}
}
if (global_edge_index) {
auto adjacency_list_offset = thrust::get<0>(*global_edge_index)[major_partition_offset];
auto minor_map = thrust::get<1>(*global_edge_index);
      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
minor_map[major_offset + i] = adjacency_list_offset + i;
}
}
idx += gridDim.x * (blockDim.x / raft::warp_size());
}
}

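// Host-side dispatcher: walks the degree-based vertex segments (high, mid,
// low, and the optional hypersparse tail) and runs the matching expansion
// path for each non-empty segment: block-per-major and warp-per-major
// kernels for the high- and mid-degree segments, and thrust::for_each with
// one thread per major for the rest.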
template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
void partially_decompress_matrix_partition_to_fill_edgelist(
raft::handle_t const& handle,
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
std::vector<vertex_t> const& segment_offsets,
vertex_t* majors,
vertex_t* minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property,
thrust::optional<thrust::tuple<edge_t const*, edge_t*>> global_edge_index)
{
auto execution_policy = handle.get_thrust_policy();
static_assert(detail::num_sparse_segments_per_vertex_partition == 3);
if (segment_offsets[1] - segment_offsets[0] > 0) {
raft::grid_1d_block_t update_grid(segment_offsets[1] - segment_offsets[0],
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::partially_decompress_to_edgelist_high_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
input_majors + segment_offsets[0],
input_major_start_offsets,
    segment_offsets[1] - segment_offsets[0],
majors,
minors,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[0], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index);
}
if (segment_offsets[2] - segment_offsets[1] > 0) {
raft::grid_1d_warp_t update_grid(segment_offsets[2] - segment_offsets[1],
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::partially_decompress_to_edgelist_mid_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
input_majors + segment_offsets[1],
input_major_start_offsets + segment_offsets[1] - segment_offsets[0],
segment_offsets[2] - segment_offsets[1],
majors,
minors,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[1], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index);
}
if (segment_offsets[3] - segment_offsets[2] > 0) {
thrust::for_each(
execution_policy,
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(segment_offsets[3] - segment_offsets[2]),
[matrix_partition,
input_majors = input_majors + segment_offsets[2],
input_major_start_offsets =
input_major_start_offsets + segment_offsets[2] - segment_offsets[0],
majors,
minors,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[2], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
auto major_partition_offset =
static_cast<size_t>(major - matrix_partition.get_major_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(major_partition_offset);
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
thrust::fill(thrust::seq,
minor_output_property + major_offset,
minor_output_property + major_offset + local_degree,
major_input_property);
}
if (global_edge_index) {
auto adjacency_list_offset = thrust::get<0>(*global_edge_index)[major_partition_offset];
auto minor_map = thrust::get<1>(*global_edge_index);
thrust::sequence(thrust::seq,
minor_map + major_offset,
minor_map + major_offset + local_degree,
adjacency_list_offset);
}
});
}
if (matrix_partition.get_dcs_nzd_vertex_count() &&
(*(matrix_partition.get_dcs_nzd_vertex_count()) > 0)) {
thrust::for_each(
execution_policy,
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(segment_offsets[4] - segment_offsets[3]),
[matrix_partition,
input_majors = input_majors + segment_offsets[3],
input_major_start_offsets =
input_major_start_offsets + segment_offsets[3] - segment_offsets[0],
majors,
minors,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[3], thrust::get<1>(*property)))
: thrust::nullopt,
global_edge_index] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
auto major_idx = matrix_partition.get_major_hypersparse_idx_from_major_nocheck(major);
if (major_idx) {
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(*major_idx);
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
thrust::fill(thrust::seq,
minor_output_property + major_offset,
minor_output_property + major_offset + local_degree,
major_input_property);
}
if (global_edge_index) {
auto major_partition_offset =
static_cast<size_t>(*major_idx - matrix_partition.get_major_first());
auto adjacency_list_offset = thrust::get<0>(*global_edge_index)[major_partition_offset];
auto minor_map = thrust::get<1>(*global_edge_index);
thrust::sequence(thrust::seq,
minor_map + major_offset,
minor_map + major_offset + local_degree,
adjacency_list_offset);
}
}
});
}
}
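// Note: the output arrays (majors, minors, and the optional property /
// edge-index outputs) must be sized to the sum of the input majors' local
// degrees, with input_major_start_offsets holding the exclusive scan of
// those degrees; each major's edges are written at its scanned offset.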

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
void decompress_matrix_partition_to_edgelist(
raft::handle_t const& handle,