Skip to content

Commit

Permalink
Merge 9cdc513 into f3769a5
Browse files Browse the repository at this point in the history
  • Loading branch information
LGro committed Mar 22, 2018
2 parents f3769a5 + 9cdc513 commit 7839af3
Showing 1 changed file with 37 additions and 2 deletions.
39 changes: 37 additions & 2 deletions hdbscan/hdbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
distance_matrix = pairwise_distances(X, metric=metric, p=p)
elif metric == 'arccos':
distance_matrix = pairwise_distances(X, metric='cosine', **kwargs)
elif metric == 'precomputed':
# Treating this case explicitly, instead of letting
# sklearn.metrics.pairwise_distances handle it,
# enables the usage of numpy.inf in the distance
# matrix to indicate missing distance information.
# TODO: Check if copying is necessary
distance_matrix = X.copy()
else:
distance_matrix = pairwise_distances(X, metric=metric, **kwargs)

Expand All @@ -86,6 +93,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,

min_spanning_tree = mst_linkage_core(mutual_reachability_)

# Warn if the MST couldn't be constructed around the missing distances
if np.isinf(min_spanning_tree.T[2]).any():
warn('The minimum spanning tree contains edge weights with value '
'infinity. Potentially, you are missing too many distances '
'in the initial distance matrix for the given neighborhood '
'size.', UserWarning)

# mst_linkage_core does not generate a full minimal spanning tree
# If a tree is required then we must build the edges from the information
# returned by mst_linkage_core (i.e. just the order of points to be merged)
Expand Down Expand Up @@ -282,6 +296,14 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
return single_linkage_tree, None


def check_precomputed_distance_matrix(X):
"""Perform check_array(X) after removing infinite values (numpy.inf) from the given distance matrix.
"""
tmp = X.copy()
tmp[np.isinf(tmp)] = 1
check_array(tmp)


def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
metric='minkowski', p=2, leaf_size=40,
algorithm='best', memory=Memory(cachedir=None, verbose=0),
Expand Down Expand Up @@ -464,7 +486,13 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
'Should be one of: "eom", "leaf"\n')

# Checks input and converts to an nd-array where possible
X = check_array(X, accept_sparse='csr')
if metric != 'precomputed' or issparse(X):
X = check_array(X, accept_sparse='csr')
else:
# Only non-sparse, precomputed distance matrices are handled here
# and thereby allowed to contain numpy.inf for missing distances
check_precomputed_distance_matrix(X)

# Python 2 and 3 compliant string_type checking
if isinstance(memory, six.string_types):
memory = Memory(cachedir=memory, verbose=0)
Expand Down Expand Up @@ -798,9 +826,16 @@ def fit(self, X, y=None):
self : object
Returns self
"""
X = check_array(X, accept_sparse='csr')
if self.metric != 'precomputed':
X = check_array(X, accept_sparse='csr')
self._raw_data = X
elif issparse(X):
# Handle sparse precomputed distance matrices separately
X = check_array(X, accept_sparse='csr')
else:
# Only non-sparse, precomputed distance matrices are allowed
# to have numpy.inf values indicating missing distances
check_precomputed_distance_matrix(X)

kwargs = self.get_params()
# prediction data only applies to the persistent model, so remove
Expand Down

0 comments on commit 7839af3

Please sign in to comment.