Merge pull request #368 from Rhaedonius/predict_score
Predict score
lmcinnes committed Apr 21, 2020
2 parents fd99fbb + 0b259af commit 06450a0
Showing 4 changed files with 147 additions and 19 deletions.
5 changes: 4 additions & 1 deletion hdbscan/__init__.py
@@ -1,6 +1,9 @@
from .hdbscan_ import HDBSCAN, hdbscan
from .robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage
from .validity import validity_index
from .prediction import approximate_predict, membership_vector, all_points_membership_vectors
from .prediction import (approximate_predict,
                         membership_vector,
                         all_points_membership_vectors,
                         approximate_predict_scores)


3 changes: 3 additions & 0 deletions hdbscan/_hdbscan_tree.pyx
@@ -295,6 +295,8 @@ cdef max_lambdas(np.ndarray tree):
            # Initialize
            current_parent = parent
            max_lambda = lambda_

    deaths[current_parent] = max_lambda  # value for last parent

    return deaths_arr

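The added assignment fixes a classic single-pass aggregation gap: the running maximum for a parent is only written out when the scan moves on to the next parent, so the final parent's value was never stored. A minimal sketch of the pattern in plain Python (illustrative names, not the Cython source):

    def max_per_parent(rows):
        # rows: iterable of (parent, lambda_val) pairs, sorted by parent
        deaths = {}
        current_parent, max_lambda = None, 0.0
        for parent, lambda_ in rows:
            if parent == current_parent:
                max_lambda = max(max_lambda, lambda_)
            else:
                if current_parent is not None:
                    deaths[current_parent] = max_lambda  # flush previous group
                current_parent, max_lambda = parent, lambda_
        if current_parent is not None:
            deaths[current_parent] = max_lambda  # flush the last group (the fix above)
        return deaths
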
@@ -571,6 +573,7 @@ cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
        cluster = parent_array[n]
        lambda_max = deaths[cluster]


        if lambda_max == 0.0 or not np.isfinite(lambda_array[n]):
            result[point] = 0.0
        else:
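
The guard added above protects the GLOSH outlier score, score = (lambda_max - lambda_point) / lambda_max, against division by zero and against non-finite point lambdas. A standalone sketch of the guarded computation (a hypothetical helper, not part of the library):

    import numpy as np

    def glosh_score(lambda_point, lambda_max):
        # 0.0 means the point persists as long as its cluster does;
        # values near 1.0 mark points that fall out of the cluster early.
        if lambda_max == 0.0 or not np.isfinite(lambda_point):
            return 0.0
        return (lambda_max - lambda_point) / lambda_max
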
132 changes: 117 additions & 15 deletions hdbscan/prediction.py
@@ -7,7 +7,7 @@

from sklearn.neighbors import KDTree, BallTree
from .dist_metrics import DistanceMetric
from ._hdbscan_tree import compute_stability, labelling_at_cut, recurse_leaf_dfs
from ._hdbscan_tree import recurse_leaf_dfs
from ._prediction_utils import (get_tree_row_with_child,
                                dist_membership_vector,
                                outlier_membership_vector,
@@ -88,8 +88,7 @@ def _clusters_below(self, cluster):
        return result

    def _recurse_leaf_dfs(self, current_node):
        children = self.cluster_tree[self.cluster_tree['parent'] ==
                                     current_node]['child']
        children = self.cluster_tree[self.cluster_tree['parent'] == current_node]['child']
        if len(children) == 0:
            return [current_node, ]
        else:
@@ -111,8 +110,7 @@ def __init__(self, data, condensed_tree, min_samples,
        self.cluster_map = {c: n for n, c in enumerate(sorted(list(selected_clusters)))}
        self.reverse_cluster_map = {n: c for c, n in self.cluster_map.items()}

        self.cluster_tree = raw_condensed_tree[raw_condensed_tree['child_size']
                                               > 1]
        self.cluster_tree = raw_condensed_tree[raw_condensed_tree['child_size'] > 1]
        self.max_lambdas = {}
        self.leaf_max_lambdas = {}
        self.exemplars = []
@@ -126,8 +124,7 @@ def __init__(self, data, condensed_tree, min_samples,

        for cluster in selected_clusters:
            self.max_lambdas[cluster] = \
                raw_condensed_tree['lambda_val'][raw_condensed_tree['parent']
                                                 == cluster].max()
                raw_condensed_tree['lambda_val'][raw_condensed_tree['parent'] == cluster].max()

            for sub_cluster in self._clusters_below(cluster):
                self.cluster_map[sub_cluster] = self.cluster_map[cluster]
@@ -138,8 +135,9 @@ def __init__(self, data, condensed_tree, min_samples,
            leaf_max_lambda = raw_condensed_tree['lambda_val'][
                raw_condensed_tree['parent'] == leaf].max()
            points = raw_condensed_tree['child'][
                (raw_condensed_tree['parent'] == leaf) &
                (raw_condensed_tree['lambda_val'] == leaf_max_lambda)]
                (raw_condensed_tree['parent'] == leaf)
                & (raw_condensed_tree['lambda_val'] == leaf_max_lambda)
            ]
            cluster_exemplars = np.hstack([cluster_exemplars, points])

        self.exemplars.append(self.raw_data[cluster_exemplars])
@@ -245,10 +243,9 @@ def _extend_condensed_tree(tree, neighbor_indices, neighbor_distances,
        else:
            # Find appropriate cluster based on lambda of new point
            while potential_cluster > tree_root and \
                    tree[tree['child'] ==
                         potential_cluster]['lambda_val'] >= lambda_:
                potential_cluster = tree['parent'][tree['child']
                                                   == potential_cluster][0]
                    tree[tree['child']
                         == potential_cluster]['lambda_val'] >= lambda_:
                potential_cluster = tree['parent'][tree['child'] == potential_cluster][0]

            new_tree_row = (potential_cluster, -1, 1, lambda_)

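The while loop above climbs the condensed tree: starting from the neighbor's cluster, it keeps moving to the parent for as long as the candidate cluster was born at a lambda at or above the new point's lambda, stopping at the first ancestor that already existed when the point would have joined. A simplified dictionary-based sketch of that climb (hypothetical names, not the structured-array code above):

    def climb_to_containing_cluster(cluster, lambda_, birth_lambda, parent_of, tree_root):
        # birth_lambda[c]: lambda at which cluster c split off from its parent
        # parent_of[c]: parent cluster id of c
        while cluster > tree_root and birth_lambda[cluster] >= lambda_:
            cluster = parent_of[cluster]
        return cluster
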
@@ -307,8 +304,8 @@ def _find_cluster_and_probability(tree, cluster_tree, neighbor_indices,
        if neighbor_tree_row['lambda_val'] > lambda_:
            # Find appropriate cluster based on lambda of new point
            while potential_cluster > tree_root and \
                    cluster_tree['lambda_val'][cluster_tree['child']
                                               == potential_cluster] >= lambda_:
                    cluster_tree['lambda_val'][cluster_tree['child']
                                               == potential_cluster] >= lambda_:
                potential_cluster = cluster_tree['parent'][cluster_tree['child']
                                                           == potential_cluster][0]

@@ -413,6 +410,111 @@ def approximate_predict(clusterer, points_to_predict):
    return labels, probabilities


def approximate_predict_scores(clusterer, points_to_predict):
"""Predict the outlier score of new points. The returned scores
will be based on the original clustering found by ``clusterer``,
and therefore are not (necessarily) the outlier scores that would
be found by clustering the original data combined with
``points_to_predict``, hence the 'approximate' label.
If you simply wish to calculate the outlier scores for new points
in the 'best' way possible, this is the function to use. If you
want to predict the outlier score of ``points_to_predict`` with
the original data under HDBSCAN the most efficient existing approach
is to simply recluster with the new point(s) added to the original dataset.
Parameters
----------
clusterer : HDBSCAN
A clustering object that has been fit to the data and
either had ``prediction_data=True`` set, or called the
``generate_prediction_data`` method after the fact.
points_to_predict : array, or array-like (n_samples, n_features)
The new data points to predict cluster labels for. They should
have the same dimensionality as the original dataset over which
clusterer was fit.
Returns
-------
scores : array (n_samples,)
The predicted scores of the ``points_to_predict``
See Also
--------
:py:func:`hdbscan.predict.membership_vector`
:py:func:`hdbscan.predict.all_points_membership_vectors`
"""
    try:
        clusterer.prediction_data_
    except AttributeError:
        raise ValueError('Clusterer does not have prediction data!'
                         ' Try fitting with prediction_data=True set,'
                         ' or run generate_prediction_data on the clusterer')

    points_to_predict = np.asarray(points_to_predict)

    if points_to_predict.shape[1] != \
            clusterer.prediction_data_.raw_data.shape[1]:
        raise ValueError('New points dimension does not match fit data!')

    if clusterer.prediction_data_.cluster_tree.shape[0] == 0:
        warn('Clusterer does not have any defined clusters, new data'
             ' will be automatically predicted as outliers.')
        scores = np.ones(points_to_predict.shape[0], dtype=np.float64)
        return scores

    scores = np.empty(points_to_predict.shape[0], dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        clusterer.prediction_data_.tree.query(points_to_predict,
                                              k=2 * min_samples)

    tree = clusterer.condensed_tree_._raw_tree

    parent_array = tree['parent']

    tree_root = parent_array.min()

    # Record the maximum lambda reached directly under each parent ...
    max_lambdas = {}
    for parent in np.unique(tree['parent']):
        max_lambdas[parent] = tree[tree['parent'] == parent]['lambda_val'].max()

    # ... then lift each child cluster's maximum up to its parent.
    for n in np.argsort(parent_array):
        cluster = tree['child'][n]
        if cluster < tree_root:
            # stop at rows whose child is a data point rather than a cluster
            break

        parent = parent_array[n]
        if max_lambdas[cluster] > max_lambdas[parent]:
            max_lambdas[parent] = max_lambdas[cluster]

    for i in range(points_to_predict.shape[0]):
        neigh, lambda_ = _find_neighbor_and_lambda(
            neighbor_indices[i],
            neighbor_distances[i],
            clusterer.prediction_data_.core_distances,
            min_samples
        )

        neighbor_tree_row = get_tree_row_with_child(tree, neigh)
        potential_cluster = neighbor_tree_row['parent']

        if neighbor_distances[i].min() == 0:
            # the point is in the dataset, fix lambda for rounding errors
            lambda_ = neighbor_tree_row['lambda_val']

        max_lambda = max_lambdas[potential_cluster]

        if max_lambda > 0.0:
            scores[i] = (max_lambda - lambda_) / max_lambda
        else:
            scores[i] = 0.0

    return scores


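For orientation, a usage sketch of the new function on toy data; the comparison against ``outlier_scores_`` mirrors what the new test below checks (data and parameter values are illustrative):

    import numpy as np
    from hdbscan import HDBSCAN, approximate_predict_scores

    rng = np.random.RandomState(0)
    X = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(10, 1, (100, 2))])

    clusterer = HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)

    # Scoring the training points approximately reproduces the fitted scores.
    scores = approximate_predict_scores(clusterer, X)
    print(np.allclose(scores, clusterer.outlier_scores_, atol=1e-6))

    # A far-away point should receive a score close to 1 (strong outlier).
    print(approximate_predict_scores(clusterer, np.array([[100.0, 100.0]])))
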
def membership_vector(clusterer, points_to_predict):
"""Predict soft cluster membership. The result produces a vector
for each point in ``points_to_predict`` that gives a probability that
26 changes: 23 additions & 3 deletions hdbscan/tests/test_hdbscan.py
@@ -21,6 +21,7 @@
                     hdbscan,
                     validity_index,
                     approximate_predict,
                     approximate_predict_scores,
                     membership_vector,
                     all_points_membership_vectors)
# from sklearn.cluster.tests.common import generate_clustered_data
@@ -277,17 +278,17 @@ def test_hdbscan_best_balltree_metric():

def test_hdbscan_no_clusters():
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, min_cluster_size=len(X)+1)
        X, min_cluster_size=len(X) + 1)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, 0)

    labels = HDBSCAN(min_cluster_size=len(X)+1).fit(X).labels_
    labels = HDBSCAN(min_cluster_size=len(X) + 1).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, 0)


def test_hdbscan_min_cluster_size():
    for min_cluster_size in range(2, len(X)+1, 1):
    for min_cluster_size in range(2, len(X) + 1, 1):
        labels, p, persist, ctree, ltree, mtree = hdbscan(
            X, min_cluster_size=min_cluster_size)
        true_labels = [label for label in labels if label != -1]
@@ -474,6 +475,25 @@ def test_hdbscan_approximate_predict():
    cluster, prob = approximate_predict(clusterer, np.array([[0.0, 0.0]]))
    assert_equal(cluster, -1)


def test_hdbscan_approximate_predict_score():
    clusterer = HDBSCAN(min_cluster_size=200).fit(X)
    # no prediction data error
    assert_raises(ValueError, approximate_predict_scores, clusterer, X)
    clusterer.generate_prediction_data()
    # wrong dimensions error
    assert_raises(ValueError, approximate_predict_scores, clusterer, np.array([[1, 2, 3]]))
    with warnings.catch_warnings(record=True) as w:
        approximate_predict_scores(clusterer, np.array([[1.5, -1.0]]))
    # no clusters warning
    assert 'Clusterer does not have any defined clusters' in str(w[-1].message)
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    scores = approximate_predict_scores(clusterer, X)
    assert_array_almost_equal(scores, clusterer.outlier_scores_)
    assert scores.min() >= 0
    assert scores.max() <= 1


# def test_hdbscan_membership_vector():
# clusterer = HDBSCAN(prediction_data=True).fit(X)
# vector = membership_vector(clusterer, np.array([[-1.5, -1.0]]))
