Skip to content

Commit

Permalink
All points membership vector for HDBSCAN (#4800)
Browse files Browse the repository at this point in the history
- [x] All points distance membership vector
- [x] All points outlier membership vector
- [x] All points probability in some cluster
- [x] All points membership vector
- [x] Tests

Authors:
  - Tarang Jain (https://github.com/tarang-jain)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #4800
  • Loading branch information
tarang-jain committed Aug 26, 2022
1 parent 8881928 commit 80621f0
Show file tree
Hide file tree
Showing 13 changed files with 4,986 additions and 29 deletions.
3 changes: 2 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,8 @@ if(BUILD_CUML_CPP_LIBRARY)
src/genetic/program.cu
src/genetic/node.cu
src/hdbscan/hdbscan.cu
src/hdbscan/condensed_hierarchy.cu)
src/hdbscan/condensed_hierarchy.cu
src/hdbscan/prediction_data.cu)
endif()

if(all_algo OR holtwinters_algo)
Expand Down
103 changes: 103 additions & 0 deletions cpp/include/cuml/cluster/hdbscan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,79 @@ class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {

template class CondensedHierarchy<int, float>;

/**
 * Container object for computing and storing intermediate information needed later for computing
 * membership vectors and approximate_predict.
 * @tparam value_idx integer index type (e.g. int)
 * @tparam value_t floating-point value type (e.g. float)
 */
template <typename value_idx, typename value_t>
class PredictionData {
 public:
  /**
   * Construct an empty container; the device buffers start at size 0 and are
   * grown later via allocate() / set_n_clusters().
   * @param handle_ raft handle for ordering cuda operations
   * @param m number of rows (points) in the training data
   * @param n number of columns (features) in the training data
   */
  PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n)
    // NOTE: initializers are listed in member-declaration order (n_rows, n_cols,
    // handle, exemplar_idx, exemplar_label_offsets, n_exemplars,
    // n_selected_clusters, selected_clusters, deaths) so the written order
    // matches the actual initialization order and avoids -Wreorder warnings.
    : n_rows(m),
      n_cols(n),
      handle(handle_),
      exemplar_idx(0, handle.get_stream()),
      exemplar_label_offsets(0, handle.get_stream()),
      n_exemplars(0),
      n_selected_clusters(0),
      selected_clusters(0, handle.get_stream()),
      deaths(0, handle.get_stream())
  {
  }
  size_t n_rows;  // number of points the model was built on
  size_t n_cols;  // dimensionality of the training data

  // Using getters here, making the members private and forcing
  // consistent state with the constructor. This should make
  // it much easier to use / debug.
  value_idx get_n_exemplars() { return n_exemplars; }
  value_idx get_n_selected_clusters() { return n_selected_clusters; }
  value_idx* get_exemplar_idx() { return exemplar_idx.data(); }
  value_idx* get_exemplar_label_offsets() { return exemplar_label_offsets.data(); }
  value_idx* get_selected_clusters() { return selected_clusters.data(); }
  value_t* get_deaths() { return deaths.data(); }

  /**
   * Resize buffers to the required sizes for storing data
   * @param handle raft handle for ordering cuda operations
   * @param n_exemplars_ number of exemplar points
   * @param n_selected_clusters_ number of clusters selected
   */
  void allocate(const raft::handle_t& handle,
                value_idx n_exemplars_,
                value_idx n_selected_clusters_);

  /**
   * Resize buffers for cluster deaths to n_clusters
   * @param handle raft handle for ordering cuda operations
   * @param n_clusters_ new number of clusters; deaths is resized to this count
   */
  void set_n_clusters(const raft::handle_t& handle, value_idx n_clusters_)
  {
    deaths.resize(n_clusters_, handle.get_stream());
  }

 private:
  const raft::handle_t& handle;
  rmm::device_uvector<value_idx> exemplar_idx;             // flat indices of exemplar points
  rmm::device_uvector<value_idx> exemplar_label_offsets;   // per-label offsets into exemplar_idx
  value_idx n_exemplars;
  value_idx n_selected_clusters;
  rmm::device_uvector<value_idx> selected_clusters;        // condensed-tree ids of selected clusters
  rmm::device_uvector<value_t> deaths;                     // per-cluster death lambda values
};

template class PredictionData<int, float>;

void build_prediction_data(const raft::handle_t& handle,
CondensedHierarchy<int, float>& condensed_tree,
int* labels,
int* label_map,
int n_selected_clusters,
PredictionData<int, float>& prediction_data);

}; // namespace Common
}; // namespace HDBSCAN

Expand Down Expand Up @@ -338,6 +411,29 @@ void hdbscan(const raft::handle_t& handle,
HDBSCAN::Common::HDBSCANParams& params,
HDBSCAN::Common::hdbscan_output<int, float>& out);

/**
* Executes HDBSCAN clustering on an mxn-dimensional input array, X and builds the PredictionData
* object which computes and stores information needed later for prediction algorithms.
*
* @param[in] handle raft handle for resource reuse
* @param[in] X array (size m, n) on device in row-major format
* @param m number of rows in X
* @param n number of columns in X
* @param metric distance metric to use
* @param params struct of configuration hyper-parameters
* @param out struct of output data and arrays on device
 * @param prediction_data_ struct for computing and storing information to be used during
 * prediction
*/
void hdbscan(const raft::handle_t& handle,
const float* X,
size_t m,
size_t n,
raft::distance::DistanceType metric,
HDBSCAN::Common::HDBSCANParams& params,
HDBSCAN::Common::hdbscan_output<int, float>& out,
HDBSCAN::Common::PredictionData<int, float>& prediction_data_);

void build_condensed_hierarchy(const raft::handle_t& handle,
const int* children,
const float* delta,
Expand All @@ -359,4 +455,11 @@ void _extract_clusters(const raft::handle_t& handle,
bool allow_single_cluster,
int max_cluster_size,
float cluster_selection_epsilon);

void _all_points_membership_vectors(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
HDBSCAN::Common::PredictionData<int, float>& prediction_data,
float* membership_vec,
const float* X,
raft::distance::DistanceType metric);
} // END namespace ML
90 changes: 90 additions & 0 deletions cpp/src/hdbscan/detail/kernels/soft_clustering.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace ML {
namespace HDBSCAN {
namespace detail {
namespace Predict {

/**
 * Computes, for every (point, selected cluster) pair, the "merge height": the
 * lambda value at which the point's cluster and the selected cluster join in
 * the condensed hierarchy.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element of the
 * m x n_selected_clusters output matrix `heights` (row-major).
 *
 * @param[out] heights m x n_selected_clusters matrix of merge heights
 * @param[in] lambdas condensed-tree lambda values, indexed via index_into_children
 * @param[in] index_into_children maps a node id to its position in the children array
 * @param[in] parents condensed-tree parent ids
 * @param[in] m number of points
 * @param[in] n_selected_clusters number of selected clusters
 * @param[in] selected_clusters condensed-tree ids of the selected clusters
 */
template <typename value_idx, typename value_t, int tpb = 256>
__global__ void merge_height_kernel(value_t* heights,
                                    value_t* lambdas,
                                    value_idx* index_into_children,
                                    value_idx* parents,
                                    size_t m,
                                    value_idx n_selected_clusters,
                                    value_idx* selected_clusters)
{
  // 64-bit global index: blockDim.x * blockIdx.x can overflow a 32-bit
  // value_idx on large grids, and m * n_selected_clusters is a size_t product
  // that must not be narrowed before the bounds check.
  size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  if (idx < m * static_cast<size_t>(n_selected_clusters)) {
    value_idx row = static_cast<value_idx>(idx / n_selected_clusters);
    value_idx col = static_cast<value_idx>(idx % n_selected_clusters);
    value_idx right_cluster = selected_cluster_of(selected_clusters, col);
    value_idx left_cluster  = parents[index_into_children[row]];
    bool took_right_parent = false;
    bool took_left_parent  = false;
    value_idx last_cluster;

    // Walk both nodes up the condensed tree until the two paths meet.
    // NOTE(review): relies on cluster ids increasing with tree depth so the
    // deeper node is always advanced first — confirm against tree construction.
    while (left_cluster != right_cluster) {
      if (left_cluster > right_cluster) {
        took_left_parent = true;
        last_cluster     = left_cluster;
        left_cluster     = parents[index_into_children[left_cluster]];
      } else {
        took_right_parent = true;
        last_cluster      = right_cluster;
        right_cluster     = parents[index_into_children[right_cluster]];
      }
    }

    if (took_left_parent && took_right_parent) {
      // Paths met above both endpoints: height is the lambda of the last
      // cluster visited before convergence.
      heights[idx] = lambdas[index_into_children[last_cluster]];
    }

    else {
      // One endpoint is an ancestor of the other: height is the point's own lambda.
      heights[idx] = lambdas[index_into_children[row]];
    }
  }
}

/**
 * Computes, for every point, the probability that it belongs to some (any)
 * selected cluster: the merge height toward its closest cluster divided by the
 * larger of the point's own lambda and that cluster's death lambda.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per point (m threads total).
 *
 * @param[in] heights m x n_selected_clusters merge-height matrix (row-major)
 * @param[in] height_argmax per-row argmax of heights; stored as value_t but
 *            holds an integer column index (hence the int casts below)
 * @param[in] deaths per-cluster death lambdas, indexed by (cluster id - n_leaves)
 * @param[in] index_into_children maps a node id to its position in the children array
 * @param[in] selected_clusters condensed-tree ids of the selected clusters
 * @param[in] lambdas condensed-tree lambda values
 * @param[out] prob_in_some_cluster per-point probability of membership in any cluster
 * @param[in] n_selected_clusters number of selected clusters
 * @param[in] n_leaves number of leaves (points) in the condensed tree
 * @param[in] m number of points
 */
template <typename value_idx, typename value_t>
__global__ void prob_in_some_cluster_kernel(value_t* heights,
                                            value_t* height_argmax,
                                            value_t* deaths,
                                            value_idx* index_into_children,
                                            value_idx* selected_clusters,
                                            value_t* lambdas,
                                            value_t* prob_in_some_cluster,
                                            value_idx n_selected_clusters,
                                            value_idx n_leaves,
                                            size_t m)
{
  // 64-bit global index to match merge_height_kernel and avoid overflow of
  // blockDim.x * blockIdx.x on large grids.
  size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
  if (idx < m) {
    int best_col = (int)height_argmax[idx];
    // Normalizer: the larger of the point's own lambda and the best cluster's
    // death lambda, so the resulting probability is <= 1.
    value_t max_lambda =
      max(lambdas[index_into_children[idx]], deaths[selected_clusters[best_col] - n_leaves]);
    prob_in_some_cluster[idx] = heights[idx * n_selected_clusters + best_col] / max_lambda;
  }
}

}; // namespace Predict
}; // namespace detail
}; // namespace HDBSCAN
}; // namespace ML

0 comments on commit 80621f0

Please sign in to comment.