Merge pull request #368 from Rhaedonius/predict_score
Predict score
lmcinnes committed Apr 21, 2020
2 parents fd99fbb + 0b259af commit 06450a0
Showing 4 changed files with 147 additions and 19 deletions.
5 changes: 4 additions & 1 deletion hdbscan/__init__.py
@@ -1,6 +1,9 @@
from .hdbscan_ import HDBSCAN, hdbscan
from .robust_single_linkage_ import RobustSingleLinkage, robust_single_linkage
from .validity import validity_index
from .prediction import approximate_predict, membership_vector, all_points_membership_vectors
from .prediction import (approximate_predict,
                         membership_vector,
                         all_points_membership_vectors,
                         approximate_predict_scores)


3 changes: 3 additions & 0 deletions hdbscan/_hdbscan_tree.pyx
@@ -295,6 +295,8 @@ cdef max_lambdas(np.ndarray tree):
            # Initialize
            current_parent = parent
            max_lambda = lambda_

    deaths[current_parent] = max_lambda  # value for last parent

    return deaths_arr

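The added assignment fixes a classic single-pass aggregation gap: the running maximum for a parent is only written out when the scan moves on to the next parent, so the final parent's value was never stored. A minimal sketch of the pattern in plain Python (illustrative names, not the Cython source):

    def max_per_parent(rows):
        # rows: iterable of (parent, lambda_val) pairs, sorted by parent
        deaths = {}
        current_parent, max_lambda = None, 0.0
        for parent, lambda_ in rows:
            if parent == current_parent:
                max_lambda = max(max_lambda, lambda_)
            else:
                if current_parent is not None:
                    deaths[current_parent] = max_lambda  # flush previous group
                current_parent, max_lambda = parent, lambda_
        if current_parent is not None:
            deaths[current_parent] = max_lambda  # flush the last group (the fix above)
        return deaths
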
@@ -571,6 +573,7 @@ cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
        cluster = parent_array[n]
        lambda_max = deaths[cluster]


        if lambda_max == 0.0 or not np.isfinite(lambda_array[n]):
            result[point] = 0.0
        else:
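
The guard added above protects the GLOSH outlier score, score = (lambda_max - lambda_point) / lambda_max, against division by zero and against non-finite point lambdas. A standalone sketch of the guarded computation (a hypothetical helper, not part of the library):

    import numpy as np

    def glosh_score(lambda_point, lambda_max):
        # 0.0 means the point persists as long as its cluster does;
        # values near 1.0 mark points that fall out of the cluster early.
        if lambda_max == 0.0 or not np.isfinite(lambda_point):
            return 0.0
        return (lambda_max - lambda_point) / lambda_max
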
132 changes: 117 additions & 15 deletions hdbscan/prediction.py
@@ -7,7 +7,7 @@

from sklearn.neighbors import KDTree, BallTree
from .dist_metrics import DistanceMetric
from ._hdbscan_tree import compute_stability, labelling_at_cut, recurse_leaf_dfs
from ._hdbscan_tree import recurse_leaf_dfs
from ._prediction_utils import (get_tree_row_with_child,
                                dist_membership_vector,
                                outlier_membership_vector,
@@ -88,8 +88,7 @@ def _clusters_below(self, cluster):
        return result

    def _recurse_leaf_dfs(self, current_node):
        children = self.cluster_tree[self.cluster_tree['parent'] ==
                                     current_node]['child']
        children = self.cluster_tree[self.cluster_tree['parent'] == current_node]['child']
        if len(children) == 0:
            return [current_node, ]
        else:
@@ -111,8 +110,7 @@ def __init__(self, data, condensed_tree, min_samples,
        self.cluster_map = {c: n for n, c in enumerate(sorted(list(selected_clusters)))}
        self.reverse_cluster_map = {n: c for c, n in self.cluster_map.items()}

        self.cluster_tree = raw_condensed_tree[raw_condensed_tree['child_size']
                                               > 1]
        self.cluster_tree = raw_condensed_tree[raw_condensed_tree['child_size'] > 1]
        self.max_lambdas = {}
        self.leaf_max_lambdas = {}
        self.exemplars = []
@@ -126,8 +124,7 @@ def __init__(self, data, condensed_tree, min_samples,

        for cluster in selected_clusters:
            self.max_lambdas[cluster] = \
                raw_condensed_tree['lambda_val'][raw_condensed_tree['parent']
                                                 == cluster].max()
                raw_condensed_tree['lambda_val'][raw_condensed_tree['parent'] == cluster].max()

            for sub_cluster in self._clusters_below(cluster):
                self.cluster_map[sub_cluster] = self.cluster_map[cluster]
@@ -138,8 +135,9 @@ def __init__(self, data, condensed_tree, min_samples,
            leaf_max_lambda = raw_condensed_tree['lambda_val'][
                raw_condensed_tree['parent'] == leaf].max()
            points = raw_condensed_tree['child'][
                (raw_condensed_tree['parent'] == leaf) &
                (raw_condensed_tree['lambda_val'] == leaf_max_lambda)]
                (raw_condensed_tree['parent'] == leaf)
                & (raw_condensed_tree['lambda_val'] == leaf_max_lambda)
            ]
            cluster_exemplars = np.hstack([cluster_exemplars, points])

        self.exemplars.append(self.raw_data[cluster_exemplars])
@@ -245,10 +243,9 @@ def _extend_condensed_tree(tree, neighbor_indices, neighbor_distances,
        else:
            # Find appropriate cluster based on lambda of new point
            while potential_cluster > tree_root and \
                    tree[tree['child'] ==
                         potential_cluster]['lambda_val'] >= lambda_:
                potential_cluster = tree['parent'][tree['child']
                                                   == potential_cluster][0]
                    tree[tree['child']
                         == potential_cluster]['lambda_val'] >= lambda_:
                potential_cluster = tree['parent'][tree['child'] == potential_cluster][0]

            new_tree_row = (potential_cluster, -1, 1, lambda_)

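The while loop above climbs the condensed tree: starting from the neighbor's cluster, it keeps moving to the parent for as long as the candidate cluster was born at a lambda at or above the new point's lambda, stopping at the first ancestor that already existed when the point would have joined. A simplified dictionary-based sketch of that climb (hypothetical names, not the structured-array code above):

    def climb_to_containing_cluster(cluster, lambda_, birth_lambda, parent_of, tree_root):
        # birth_lambda[c]: lambda at which cluster c split off from its parent
        # parent_of[c]: parent cluster id of c
        while cluster > tree_root and birth_lambda[cluster] >= lambda_:
            cluster = parent_of[cluster]
        return cluster
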
@@ -307,8 +304,8 @@ def _find_cluster_and_probability(tree, cluster_tree, neighbor_indices,
        if neighbor_tree_row['lambda_val'] > lambda_:
            # Find appropriate cluster based on lambda of new point
            while potential_cluster > tree_root and \
                    cluster_tree['lambda_val'][cluster_tree['child']
                                               == potential_cluster] >= lambda_:
                    cluster_tree['lambda_val'][cluster_tree['child']
                                               == potential_cluster] >= lambda_:
                potential_cluster = cluster_tree['parent'][cluster_tree['child']
                                                           == potential_cluster][0]

@@ -413,6 +410,111 @@ def approximate_predict(clusterer, points_to_predict):
    return labels, probabilities


def approximate_predict_scores(clusterer, points_to_predict):
"""Predict the outlier score of new points. The returned scores
will be based on the original clustering found by ``clusterer``,
and therefore are not (necessarily) the outlier scores that would
be found by clustering the original data combined with
``points_to_predict``, hence the 'approximate' label.
If you simply wish to calculate the outlier scores for new points
in the 'best' way possible, this is the function to use. If you
want to predict the outlier score of ``points_to_predict`` with
the original data under HDBSCAN the most efficient existing approach
is to simply recluster with the new point(s) added to the original dataset.
Parameters
----------
clusterer : HDBSCAN
A clustering object that has been fit to the data and
either had ``prediction_data=True`` set, or called the
``generate_prediction_data`` method after the fact.
points_to_predict : array, or array-like (n_samples, n_features)
The new data points to predict cluster labels for. They should
have the same dimensionality as the original dataset over which
clusterer was fit.
Returns
-------
scores : array (n_samples,)
The predicted scores of the ``points_to_predict``
See Also
--------
:py:func:`hdbscan.predict.membership_vector`
:py:func:`hdbscan.predict.all_points_membership_vectors`
"""
    try:
        clusterer.prediction_data_
    except AttributeError:
        raise ValueError('Clusterer does not have prediction data!'
                         ' Try fitting with prediction_data=True set,'
                         ' or run generate_prediction_data on the clusterer')

    points_to_predict = np.asarray(points_to_predict)

    if points_to_predict.shape[1] != \
            clusterer.prediction_data_.raw_data.shape[1]:
        raise ValueError('New points dimension does not match fit data!')

    if clusterer.prediction_data_.cluster_tree.shape[0] == 0:
        warn('Clusterer does not have any defined clusters, new data'
             ' will be automatically predicted as outliers.')
        scores = np.ones(points_to_predict.shape[0], dtype=np.float64)
        return scores

    scores = np.empty(points_to_predict.shape[0], dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        clusterer.prediction_data_.tree.query(points_to_predict,
                                              k=2 * min_samples)

    tree = clusterer.condensed_tree_._raw_tree

    parent_array = tree['parent']

    tree_root = parent_array.min()

    # Record the maximum lambda reached directly under each parent ...
    max_lambdas = {}
    for parent in np.unique(tree['parent']):
        max_lambdas[parent] = tree[tree['parent'] == parent]['lambda_val'].max()

    # ... then lift each child cluster's maximum up to its parent.
    for n in np.argsort(parent_array):
        cluster = tree['child'][n]
        if cluster < tree_root:
            # stop at rows whose child is a data point rather than a cluster
            break

        parent = parent_array[n]
        if max_lambdas[cluster] > max_lambdas[parent]:
            max_lambdas[parent] = max_lambdas[cluster]

    for i in range(points_to_predict.shape[0]):
        neigh, lambda_ = _find_neighbor_and_lambda(
            neighbor_indices[i],
            neighbor_distances[i],
            clusterer.prediction_data_.core_distances,
            min_samples
        )

        neighbor_tree_row = get_tree_row_with_child(tree, neigh)
        potential_cluster = neighbor_tree_row['parent']

        if neighbor_distances[i].min() == 0:
            # the point is in the dataset, fix lambda for rounding errors
            lambda_ = neighbor_tree_row['lambda_val']

        max_lambda = max_lambdas[potential_cluster]

        if max_lambda > 0.0:
            scores[i] = (max_lambda - lambda_) / max_lambda
        else:
            scores[i] = 0.0

    return scores


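For orientation, a usage sketch of the new function on toy data; the comparison against ``outlier_scores_`` mirrors what the new test below checks (data and parameter values are illustrative):

    import numpy as np
    from hdbscan import HDBSCAN, approximate_predict_scores

    rng = np.random.RandomState(0)
    X = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(10, 1, (100, 2))])

    clusterer = HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)

    # Scoring the training points approximately reproduces the fitted scores.
    scores = approximate_predict_scores(clusterer, X)
    print(np.allclose(scores, clusterer.outlier_scores_, atol=1e-6))

    # A far-away point should receive a score close to 1 (strong outlier).
    print(approximate_predict_scores(clusterer, np.array([[100.0, 100.0]])))
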
def membership_vector(clusterer, points_to_predict):
"""Predict soft cluster membership. The result produces a vector
for each point in ``points_to_predict`` that gives a probability that
26 changes: 23 additions & 3 deletions hdbscan/tests/test_hdbscan.py
@@ -21,6 +21,7 @@
                     hdbscan,
                     validity_index,
                     approximate_predict,
                     approximate_predict_scores,
                     membership_vector,
                     all_points_membership_vectors)
# from sklearn.cluster.tests.common import generate_clustered_data
@@ -277,17 +278,17 @@ def test_hdbscan_best_balltree_metric():

def test_hdbscan_no_clusters():
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, min_cluster_size=len(X)+1)
        X, min_cluster_size=len(X) + 1)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, 0)

    labels = HDBSCAN(min_cluster_size=len(X)+1).fit(X).labels_
    labels = HDBSCAN(min_cluster_size=len(X) + 1).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, 0)


def test_hdbscan_min_cluster_size():
    for min_cluster_size in range(2, len(X)+1, 1):
    for min_cluster_size in range(2, len(X) + 1, 1):
        labels, p, persist, ctree, ltree, mtree = hdbscan(
            X, min_cluster_size=min_cluster_size)
        true_labels = [label for label in labels if label != -1]
@@ -474,6 +475,25 @@ def test_hdbscan_approximate_predict():
    cluster, prob = approximate_predict(clusterer, np.array([[0.0, 0.0]]))
    assert_equal(cluster, -1)


def test_hdbscan_approximate_predict_score():
    clusterer = HDBSCAN(min_cluster_size=200).fit(X)
    # no prediction data error
    assert_raises(ValueError, approximate_predict_scores, clusterer, X)
    clusterer.generate_prediction_data()
    # wrong dimensions error
    assert_raises(ValueError, approximate_predict_scores, clusterer, np.array([[1, 2, 3]]))
    with warnings.catch_warnings(record=True) as w:
        approximate_predict_scores(clusterer, np.array([[1.5, -1.0]]))
    # no clusters warning
    assert 'Clusterer does not have any defined clusters' in str(w[-1].message)
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    scores = approximate_predict_scores(clusterer, X)
    assert_array_almost_equal(scores, clusterer.outlier_scores_)
    assert scores.min() >= 0
    assert scores.max() <= 1


# def test_hdbscan_membership_vector():
# clusterer = HDBSCAN(prediction_data=True).fit(X)
# vector = membership_vector(clusterer, np.array([[-1.5, -1.0]]))
