diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py index 9b8965f0..e29e2af4 100644 --- a/hdbscan/hdbscan_.py +++ b/hdbscan/hdbscan_.py @@ -71,6 +71,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2, distance_matrix = pairwise_distances(X, metric=metric, p=p) elif metric == 'arccos': distance_matrix = pairwise_distances(X, metric='cosine', **kwargs) + elif metric == 'precomputed': + # Treating this case explicitly, instead of letting + # sklearn.metrics.pairwise_distances handle it, + # enables the usage of numpy.inf in the distance + # matrix to indicate missing distance information. + # TODO: Check if copying is necessary + distance_matrix = X.copy() else: distance_matrix = pairwise_distances(X, metric=metric, **kwargs) @@ -86,6 +93,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2, min_spanning_tree = mst_linkage_core(mutual_reachability_) + # Warn if the MST couldn't be constructed around the missing distances + if np.isinf(min_spanning_tree.T[2]).any(): + warn('The minimum spanning tree contains edge weights with value ' + 'infinity. Potentially, you are missing too many distances ' + 'in the initial distance matrix for the given neighborhood ' + 'size.', UserWarning) + # mst_linkage_core does not generate a full minimal spanning tree # If a tree is required then we must build the edges from the information # returned by mst_linkage_core (i.e. just the order of points to be merged) @@ -282,6 +296,14 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0, return single_linkage_tree, None +def check_precomputed_distance_matrix(X): + """Perform check_array(X) after removing infinite values (numpy.inf) from the given distance matrix. + """ + tmp = X.copy() + tmp[np.isinf(tmp)] = 1 + check_array(tmp) + + def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, metric='minkowski', p=2, leaf_size=40, algorithm='best', memory=Memory(cachedir=None, verbose=0), @@ -464,7 +486,13 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, 'Should be one of: "eom", "leaf"\n') # Checks input and converts to an nd-array where possible - X = check_array(X, accept_sparse='csr') + if metric != 'precomputed' or issparse(X): + X = check_array(X, accept_sparse='csr') + else: + # Only non-sparse, precomputed distance matrices are handled here + # and thereby allowed to contain numpy.inf for missing distances + check_precomputed_distance_matrix(X) + # Python 2 and 3 compliant string_type checking if isinstance(memory, six.string_types): memory = Memory(cachedir=memory, verbose=0) @@ -798,9 +826,16 @@ def fit(self, X, y=None): self : object Returns self """ - X = check_array(X, accept_sparse='csr') if self.metric != 'precomputed': + X = check_array(X, accept_sparse='csr') self._raw_data = X + elif issparse(X): + # Handle sparse precomputed distance matrices separately + X = check_array(X, accept_sparse='csr') + else: + # Only non-sparse, precomputed distance matrices are allowed + # to have numpy.inf values indicating missing distances + check_precomputed_distance_matrix(X) kwargs = self.get_params() # prediction data only applies to the persistent model, so remove