
Gather one hop neighbors #2117

Merged · 12 commits · Mar 21, 2022
7 changes: 6 additions & 1 deletion ci/gpu/build.sh
@@ -62,8 +62,13 @@ conda activate rapids
export PATH=$(conda info --base)/envs/rapids/bin:$PATH

gpuci_logger "Install dependencies"
# Assume libcudf will be installed via cudf. This is done to prevent the
# following:
# libcudf = 22.04.00a220315, cudf = 22.04.00a220308
# where cudf 220308 was chosen possibly because it has fewer/different
# dependencies and the corresponding recipes think they're compatible when they
# may not be.
gpuci_mamba_retry install -y \
"libcudf=${MINOR_VERSION}" \
"cudf=${MINOR_VERSION}" \
"librmm=${MINOR_VERSION}" \
"rmm=${MINOR_VERSION}" \
4 changes: 2 additions & 2 deletions conda/recipes/libcugraph/meta.yaml
@@ -44,8 +44,8 @@ requirements:
- boost-cpp>=1.66
- nccl>=2.9.9
- ucx-proc=*=gpu
- gtest
- gmock
- gtest=1.10.0 # FIXME: pinned to version in https://github.com/rapidsai/integration/blob/branch-22.04/conda/recipes/versions.yaml
- gmock=1.10.0 # FIXME: pinned to version in https://github.com/rapidsai/integration/blob/branch-22.04/conda/recipes/versions.yaml
run:
- {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
- libraft-headers {{ minor_version }}
237 changes: 237 additions & 0 deletions cpp/include/cugraph/detail/decompress_matrix_partition.cuh
@@ -180,10 +180,247 @@ void decompress_matrix_partition_to_fill_edgelist_majors(
}
}

template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
__global__ void decompress_edgelist_high_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property)
{
size_t idx = static_cast<size_t>(blockIdx.x);
while (idx < static_cast<size_t>(input_major_count)) {
auto major = input_majors[idx];
auto major_partition_offset = static_cast<size_t>(major - matrix_partition.get_major_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(static_cast<vertex_t>(major_partition_offset));
auto major_offset = input_major_start_offsets[idx];
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
prop_t* output_property = thrust::get<1>(*property);
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];
output_property[major_offset + i] = input_property;
}
} else {
for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];
}
}
idx += gridDim.x;
}
}

template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
__global__ void decompress_edgelist_mid_degree(
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
vertex_t input_major_count,
vertex_t* output_majors,
vertex_t* output_minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property)
{
auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
static_assert(decompress_matrix_partition_block_size % raft::warp_size() == 0);
auto const lane_id = tid % raft::warp_size();
size_t idx = static_cast<size_t>(tid / raft::warp_size());
while (idx < static_cast<size_t>(input_major_count)) {
auto major = input_majors[idx];
auto major_partition_offset = static_cast<size_t>(major - matrix_partition.get_major_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(static_cast<vertex_t>(major_partition_offset));
auto major_offset = input_major_start_offsets[idx];
if (property) {
auto input_property = thrust::get<0>(*property)[idx];
prop_t* output_property = thrust::get<1>(*property);
for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];
output_property[major_offset + i] = input_property;
}
} else {
for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
output_majors[major_offset + i] = major;
output_minors[major_offset + i] = indices[i];
}
}
idx += gridDim.x * (blockDim.x / raft::warp_size());
}
}
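Both kernels above assume that input_major_start_offsets[i] holds the first output slot for the edges of input_majors[i], i.e. an exclusive prefix sum of the local degrees of the gathered majors. A minimal host-side sketch of how such offsets could be derived; the local_degrees vector, the helper name, and its placement are illustrative assumptions rather than part of this header:

#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/scan.h>

// Sketch: offsets[i] = sum of local_degrees[0..i-1]; offsets[0] == 0, so each
// major's edges land in a contiguous, non-overlapping slice of the output arrays.
template <typename edge_t>
rmm::device_uvector<edge_t> compute_major_start_offsets(
  raft::handle_t const& handle, rmm::device_uvector<edge_t> const& local_degrees)
{
  rmm::device_uvector<edge_t> offsets(local_degrees.size(), handle.get_stream());
  thrust::exclusive_scan(handle.get_thrust_policy(),
                         local_degrees.begin(),
                         local_degrees.end(),
                         offsets.begin());
  return offsets;
}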

template <typename vertex_t, typename edge_t, typename weight_t, typename prop_t, bool multi_gpu>
void decompress_matrix_partition_to_fill_edgelist(
raft::handle_t const& handle,
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
vertex_t const* input_majors,
edge_t const* input_major_start_offsets,
std::vector<vertex_t> const& segment_offsets,
vertex_t* majors,
vertex_t* minors,
thrust::optional<thrust::tuple<prop_t const*, prop_t*>> property)
{
auto execution_policy = handle.get_thrust_policy();
static_assert(detail::num_sparse_segments_per_vertex_partition == 3);
auto& comm = handle.get_comms();
auto const comm_rank = comm.get_rank();
if (segment_offsets[1] - segment_offsets[0] > 0) {
raft::grid_1d_block_t update_grid(segment_offsets[1] - segment_offsets[0],
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::decompress_edgelist_high_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
input_majors + segment_offsets[0],
input_major_start_offsets,
segment_offsets[1],
majors,
minors,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[0], thrust::get<1>(*property)))
: thrust::nullopt);
}
if (segment_offsets[2] - segment_offsets[1] > 0) {
raft::grid_1d_warp_t update_grid(segment_offsets[2] - segment_offsets[1],
detail::decompress_matrix_partition_block_size,
handle.get_device_properties().maxGridSize[0]);

detail::decompress_edgelist_mid_degree<<<update_grid.num_blocks,
update_grid.block_size,
0,
handle.get_stream()>>>(
matrix_partition,
input_majors + segment_offsets[1],
input_major_start_offsets + segment_offsets[1] - segment_offsets[0],
segment_offsets[2] - segment_offsets[1],
majors,
minors,
property ? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[1], thrust::get<1>(*property)))
: thrust::nullopt);
}
if (segment_offsets[3] - segment_offsets[2] > 0) {

thrust::for_each(
execution_policy,
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(segment_offsets[3] - segment_offsets[2]),
[matrix_partition,
input_majors = input_majors + segment_offsets[2],
input_major_start_offsets =
input_major_start_offsets + segment_offsets[2] - segment_offsets[0],
majors,
minors,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[2], thrust::get<1>(*property)))
: thrust::nullopt] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
auto major_partition_offset =
static_cast<size_t>(major - matrix_partition.get_major_first());
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(major_partition_offset);
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
thrust::fill(thrust::seq,
minor_output_property + major_offset,
minor_output_property + major_offset + local_degree,
major_input_property);
}
});
}
if (matrix_partition.get_dcs_nzd_vertex_count() &&
(*(matrix_partition.get_dcs_nzd_vertex_count()) > 0)) {
thrust::for_each(
execution_policy,
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(segment_offsets[4] - segment_offsets[3]),
[matrix_partition,
input_majors = input_majors + segment_offsets[3],
input_major_start_offsets =
input_major_start_offsets + segment_offsets[3] - segment_offsets[0],
majors,
minors,
property = property
? thrust::make_optional(thrust::make_tuple(
thrust::get<0>(*property) + segment_offsets[3], thrust::get<1>(*property)))
: thrust::nullopt] __device__(auto idx) {
auto major = input_majors[idx];
auto major_offset = input_major_start_offsets[idx];
auto major_idx = matrix_partition.get_major_hypersparse_idx_from_major_nocheck(major);
if (major_idx) {
vertex_t const* indices{nullptr};
thrust::optional<weight_t const*> weights{thrust::nullopt};
edge_t local_degree{};
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(*major_idx);
thrust::fill(
thrust::seq, majors + major_offset, majors + major_offset + local_degree, major);
thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset);
if (property) {
auto major_input_property = thrust::get<0>(*property)[idx];
auto minor_output_property = thrust::get<1>(*property);
thrust::fill(thrust::seq,
minor_output_property + major_offset,
minor_output_property + major_offset + local_degree,
major_input_property);
}
}
});
}
}
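A minimal invocation sketch for the function above, assuming the gathered majors are already grouped by degree segment (matching segment_offsets) and that major_start_offsets was built as an exclusive scan of their local degrees, as in the earlier sketch. The names gathered_majors, major_start_offsets, total_edge_count, and the explicit template arguments are illustrative assumptions, not part of this header:

// Sketch: fill an edge list for the gathered majors without a per-major property.
rmm::device_uvector<vertex_t> out_majors(total_edge_count, handle.get_stream());
rmm::device_uvector<vertex_t> out_minors(total_edge_count, handle.get_stream());
decompress_matrix_partition_to_fill_edgelist<vertex_t, edge_t, weight_t, int32_t, multi_gpu>(
  handle,
  matrix_partition,
  gathered_majors.data(),
  major_start_offsets.data(),
  segment_offsets,
  out_majors.data(),
  out_minors.data(),
  thrust::nullopt);  // no (input, output) property pair; prop_t still supplied as a template argument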

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
void decompress_matrix_partition_to_edgelist(
raft::handle_t const& handle,
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> const matrix_partition,
vertex_t* edgelist_majors /* [OUT] */,
vertex_t* edgelist_minors /* [OUT] */,
std::optional<weight_t*> edgelist_weights /* [OUT] */,
std::optional<std::vector<vertex_t>> const& segment_offsets)
{
auto number_of_edges = matrix_partition.get_number_of_edges();

decompress_matrix_partition_to_fill_edgelist_majors(
handle, matrix_partition, edgelist_majors, segment_offsets);
thrust::copy(handle.get_thrust_policy(),
matrix_partition.get_indices(),
matrix_partition.get_indices() + number_of_edges,
edgelist_minors);
if (edgelist_weights) {
thrust::copy(handle.get_thrust_policy(),
*(matrix_partition.get_weights()),
*(matrix_partition.get_weights()) + number_of_edges,
(*edgelist_weights));
}
}

template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
void decompress_matrix_partition_to_edgelist(
raft::handle_t const& handle,
matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> const matrix_partition,
vertex_t const* majors,
vertex_t* edgelist_majors /* [OUT] */,
vertex_t* edgelist_minors /* [OUT] */,
std::optional<weight_t*> edgelist_weights /* [OUT] */,
33 changes: 29 additions & 4 deletions cpp/include/cugraph/detail/graph_functions.cuh
@@ -158,7 +158,6 @@ partition_information(raft::handle_t const& handle, GraphViewType const& graph_v
* Collect all the edges that are present in the adjacency lists on the current gpu
*
* @tparam GraphViewType Type of the passed non-owning graph object.
* @tparam EdgeIndexIterator Type of the iterator for edge indices.
* @tparam GPUIdIterator Type of the iterator for gpu id identifiers.
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
@@ -167,14 +166,14 @@ partition_information(raft::handle_t const& handle, GraphViewType const& graph_v
* gpus in the column communicator
* @param active_major_gpu_ids Device vector containing the gpu id associated with every vertex
* present in active_majors_in_row
* @param edge_index_first Iterator pointing to the first destination index
* @param minor_indices Device vector containing the indices of the edges to be gathered
* @param indices_per_major Number of indices supplied for every major in
* active_majors_in_row
* @param global_degree_offsets Global degree offset to local adjacency list for every major
* represented by the current gpu
* @return A tuple of device vectors containing the majors, minors and gpu_ids gathered locally
*/
template <typename GraphViewType, typename EdgeIndexIterator, typename gpu_t>
template <typename GraphViewType, typename gpu_t>
std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
rmm::device_uvector<typename GraphViewType::vertex_type>,
rmm::device_uvector<gpu_t>>
@@ -183,10 +182,36 @@ gather_local_edges(
GraphViewType const& graph_view,
const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors_in_row,
const rmm::device_uvector<gpu_t>& active_major_gpu_ids,
EdgeIndexIterator edge_index_first,
rmm::device_uvector<typename GraphViewType::edge_type>& minor_indices,
typename GraphViewType::edge_type indices_per_major,
const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets);
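A hedged usage sketch for the updated gather_local_edges signature: callers now pass the sampled minor indices in a device vector rather than through an EdgeIndexIterator. The wrapper below, the fanout value, and how minor_indices gets filled are illustrative assumptions:

// Sketch: gather up to `fanout` sampled neighbors per active major on this GPU.
template <typename GraphViewType, typename gpu_t>
void example_gather(raft::handle_t const& handle,
                    GraphViewType const& graph_view,
                    rmm::device_uvector<typename GraphViewType::vertex_type> const& active_majors_in_row,
                    rmm::device_uvector<gpu_t> const& active_major_gpu_ids,
                    rmm::device_uvector<typename GraphViewType::edge_type> const& global_degree_offsets)
{
  using edge_t = typename GraphViewType::edge_type;
  edge_t const fanout = 10;  // indices_per_major: sampled neighbors per active major (assumed value)

  // One sampled index per (active major, fanout slot); how these are drawn
  // (e.g. uniformly in [0, degree)) is left to the caller.
  rmm::device_uvector<edge_t> minor_indices(active_majors_in_row.size() * fanout,
                                            handle.get_stream());
  // ... fill minor_indices ...

  auto [majors, minors, gpu_ids] = detail::gather_local_edges(handle,
                                                              graph_view,
                                                              active_majors_in_row,
                                                              active_major_gpu_ids,
                                                              minor_indices,
                                                              fanout,
                                                              global_degree_offsets);
}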

/**
* @brief Gather edge list for specified vertices
*
* Collect all the edges that are present in the adjacency lists on the current gpu
*
* @tparam GraphViewType Type of the passed non-owning graph object.
* @tparam prop_t Type of the property associated with the majors.
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
* @param graph_view Non-owning graph object.
* @param active_majors_in_row Device vector containing all the vertex ids that are processed by
* gpus in the column communicator
* @param active_major_property Device vector containing the property values associated with every
* vertex present in active_majors_in_row
* @return A tuple of device vectors containing the majors, minors and properties gathered locally
*/
template <typename GraphViewType, typename prop_t>
std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
rmm::device_uvector<typename GraphViewType::vertex_type>,
rmm::device_uvector<prop_t>>
gather_one_hop_edgelist(
raft::handle_t const& handle,
GraphViewType const& graph_view,
const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors_in_row,
kaatish marked this conversation as resolved.
Show resolved Hide resolved
const rmm::device_uvector<prop_t>& active_major_property);

} // namespace detail

} // namespace cugraph
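A hedged usage sketch for gather_one_hop_edgelist: every local edge whose major is in the active set is returned, tagged with that major's property so the results can be regrouped afterwards. The wrapper, its argument names, and the int32_t label type are assumptions for illustration:

// Sketch: collect the one-hop edge list for the active majors present on this GPU.
template <typename GraphViewType>
void example_one_hop(raft::handle_t const& handle,
                     GraphViewType const& graph_view,
                     rmm::device_uvector<typename GraphViewType::vertex_type> const& active_majors,
                     rmm::device_uvector<int32_t> const& active_major_labels)
{
  // edge_labels[i] carries the label of edge_majors[i], so each gathered edge
  // remembers which active major it was requested for.
  auto [edge_majors, edge_minors, edge_labels] =
    detail::gather_one_hop_edgelist(handle, graph_view, active_majors, active_major_labels);
}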