Skip to content

Commit

Permalink
Merge pull request #394 from GregDemand/master
Browse files Browse the repository at this point in the history
Fixed off-by-one errors in min_samples for multiple algorithms
  • Loading branch information
lmcinnes committed Nov 24, 2021
2 parents 54da636 + 2555a38 commit 5425cd2
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 13 deletions.
19 changes: 12 additions & 7 deletions hdbscan/_hdbscan_boruvka.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -454,8 +454,10 @@ cdef class KDTreeBoruvkaAlgorithm (object):
# issues, but we'll get quite a few, and they are the hard ones to
# get, so fill in any we can and then run update components.
for n in range(self.num_points):
- for i in range(1, self.min_samples + 1):
+ for i in range(0, self.min_samples + 1):
m = knn_indices[n, i]
+ if n == m:
+ continue
if self.core_distance[m] <= self.core_distance[n]:
self.candidate_point[n] = n
self.candidate_neighbor[n] = m
Expand Down Expand Up @@ -745,7 +747,7 @@ cdef class KDTreeBoruvkaAlgorithm (object):
# then propagate the results of that computation
# up the tree.
new_bound = min(new_upper_bound,
- new_lower_bound + 2 * node1_info.radius)
+ new_lower_bound + 2 * self.dist._dist_to_rdist(node1_info.radius))
# new_bound = new_upper_bound
if new_bound < self.bounds_ptr[node1]:
self.bounds_ptr[node1] = new_bound
Expand Down Expand Up @@ -1028,33 +1030,36 @@ cdef class BallTreeBoruvkaAlgorithm (object):
knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
delayed(_core_dist_query)
(self.core_dist_tree, points,
- self.min_samples)
+ self.min_samples + 1)
for points in datasets)
knn_dist = np.vstack([x[0] for x in knn_data])
knn_indices = np.vstack([x[1] for x in knn_data])
else:
knn_dist, knn_indices = self.core_dist_tree.query(
self.tree.data,
- k=self.min_samples,
+ k=self.min_samples + 1,
dualtree=True,
breadth_first=True)

- self.core_distance_arr = knn_dist[:, self.min_samples - 1].copy()
+ self.core_distance_arr = knn_dist[:, self.min_samples].copy()
self.core_distance = (<np.double_t[:self.num_points:1]> (
<np.double_t *> self.core_distance_arr.data))

# Since we already computed NN distances for the min_samples closest
# points we can use this to do the first round of boruvka -- we won't
# get every point due to core_distance/mutual reachability distance
# issues, but we'll get quite a few, and they are the hard ones to get,
- # so fill in any we ca and then run update components.
+ # so fill in any we can and then run update components.
for n in range(self.num_points):
- for i in range(self.min_samples - 1, 0):
+ for i in range(0, self.min_samples + 1):
m = knn_indices[n, i]
+ if n == m:
+ continue
if self.core_distance[m] <= self.core_distance[n]:
self.candidate_point[n] = n
self.candidate_neighbor[n] = m
self.candidate_distance[n] = self.core_distance[n]
+ break

self.update_components()

Expand Down
4 changes: 2 additions & 2 deletions hdbscan/_hdbscan_reachability.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,

for i in range(lil_matrix.shape[0]):
sorted_row_data = sorted(lil_matrix.data[i])
- if min_points < len(sorted_row_data):
- core_distance[i] = sorted_row_data[min_points]
+ if min_points - 1 < len(sorted_row_data):
+ core_distance[i] = sorted_row_data[min_points - 1]
else:
core_distance[i] = np.infty

Expand Down
5 changes: 3 additions & 2 deletions hdbscan/hdbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,10 @@ def _hdbscan_prims_kdtree(
dist_metric = DistanceMetric.get_metric(metric, **kwargs)

# Get distance to kth nearest neighbour
- core_distances = tree.query(X, k=min_samples, dualtree=True, breadth_first=True)[0][
+ core_distances = tree.query(X, k=min_samples + 1, dualtree=True, breadth_first=True)[0][
:, -1
].copy(order="C")

# Mutual reachability distance is implicit in mst_linkage_core_vector
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

Expand Down Expand Up @@ -288,7 +289,7 @@ def _hdbscan_prims_balltree(
dist_metric = DistanceMetric.get_metric(metric, **kwargs)

# Get distance to kth nearest neighbour
- core_distances = tree.query(X, k=min_samples, dualtree=True, breadth_first=True)[0][
+ core_distances = tree.query(X, k=min_samples + 1, dualtree=True, breadth_first=True)[0][
:, -1
].copy(order="C")

Expand Down
5 changes: 3 additions & 2 deletions hdbscan/tests/test_flat.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@
std = [0.5, 0.08, 0.06, 0.35, 0.35, 0.35]
X0, y0 = make_blobs(n_samples=[70, 30, 80, 100, 40, 150],
centers=centers,
- cluster_std=std)
- X1, y1 = make_moons(n_samples=300, noise=0.07)
+ cluster_std=std,
+ random_state=1)
+ X1, y1 = make_moons(n_samples=300, noise=0.07, random_state=42)
X1 += 3.
y1 += len(centers)
X = np.vstack((X0, X1))
Expand Down

0 comments on commit 5425cd2

Please sign in to comment.