Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+1] OPTICS fit uses the selected extract_method parameter #12087

Merged
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
3ab2e63
add extract method parameter
adrinjalali Sep 15, 2018
7190098
change fancy to sqlnk
adrinjalali Sep 16, 2018
eb12095
test for invalid extract_method value
adrinjalali Sep 16, 2018
58913d8
test easy dbscan
adrinjalali Sep 16, 2018
4b06ba8
pep8
adrinjalali Sep 16, 2018
c9819d2
add extract_sqlnk
adrinjalali Sep 16, 2018
61b440f
mention which parameter is used for which extact method
adrinjalali Sep 16, 2018
9a65082
add tests for extract methods with no params given
adrinjalali Sep 16, 2018
bff04ee
merge master
adrinjalali Sep 17, 2018
36b8126
Merge remote-tracking branch 'upstream/master' into optics/choose_ext…
adrinjalali Sep 20, 2018
0d6006d
Merge branch 'master' into optics/choose_extractor
adrinjalali Sep 24, 2018
9a63e27
Add the reference for the SQLNK method.
adrinjalali Sep 24, 2018
456bd58
make extract_optics_dbscan and extract_optics_sqlnk public
adrinjalali Sep 24, 2018
36da998
merge upstream
adrinjalali Oct 7, 2018
841b19a
fix references, sync docstrings
adrinjalali Oct 16, 2018
80f3a13
Merge remote-tracking branch 'upstream/master' into optics/choose_ext…
adrinjalali Oct 30, 2018
4afc4f6
fix more morege conflicts
adrinjalali Oct 30, 2018
9811585
Merge remote-tracking branch 'upstream/master' into optics/choose_ext…
adrinjalali Nov 23, 2018
fbaf24b
reorganize the code and get it closer to what we want
adrinjalali Nov 23, 2018
8a9346a
pep8
adrinjalali Nov 23, 2018
7cfb68c
Merge remote-tracking branch 'upstream/master' into optics/choose_ext…
adrinjalali Feb 26, 2019
4a7821e
remove sqlnk, make compute_optics_graph public
adrinjalali Feb 26, 2019
3265c7d
pep8
adrinjalali Feb 26, 2019
79e12cb
remove unused reference
adrinjalali Feb 26, 2019
fa3f89d
make tests pass
adrinjalali Feb 26, 2019
751b33d
fix docstrings
adrinjalali Feb 26, 2019
d4ac534
fix removal of removed parameter
adrinjalali Feb 26, 2019
8b3c4fb
add back min_cluster size, xi needs it
adrinjalali Feb 26, 2019
6eead2f
Merge remote-tracking branch 'upstream/master' into optics/choose_ext…
adrinjalali Feb 26, 2019
1377cb0
pep8
adrinjalali Feb 26, 2019
48d7702
fix docstring issue
adrinjalali Feb 26, 2019
d06e5ff
address Hanmin's comments
adrinjalali Feb 26, 2019
4230567
Merge remote-tracking branch 'upstream/master' into optics/choose_ext…
adrinjalali Feb 26, 2019
fd3e16a
remove core_sample_indices_
adrinjalali Feb 26, 2019
98a34a9
fix comment
adrinjalali Feb 27, 2019
c926c9c
pep8
adrinjalali Feb 27, 2019
a8a40ca
apply comments, remove example
adrinjalali Feb 27, 2019
bb99ab0
add public functions to classes.rst and __init__
adrinjalali Feb 28, 2019
43447a6
address Joel's comments
adrinjalali Mar 1, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
162 changes: 145 additions & 17 deletions sklearn/cluster/optics_.py
Expand Up @@ -22,7 +22,8 @@


def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
p=2, metric_params=None, maxima_ratio=.75,
p=2, metric_params=None, extract_method='sqlnk',
eps=0.5, maxima_ratio=.75,
rejection_ratio=.7, similarity_threshold=0.4,
significant_min=.003, min_cluster_size=.005,
min_maxima_ratio=0.001, algorithm='ball_tree',
Expand Down Expand Up @@ -67,18 +68,29 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
metric_params : dict, optional (default=None)
Additional keyword arguments for the metric function.

extract_method : string, optional (default='sqlnk')
The extraction method used to extract clusters using the calculated
reachability and ordering. Possible values are "dbscan"
and "sqlnk".

eps : float, optional (default=0.5)
The maximum distance between two samples for them to be considered
as in the same neighborhood. Used only when `extract_method='dbscan'`.

maxima_ratio : float, optional (default=.75)
The maximum ratio we allow of average height of clusters on the
right and left to the local maxima in question. The higher the
ratio, the more generous the algorithm is to preserving local
minima, and the more cuts the resulting tree will have.
Used only when `extract_method='sqlnk'`.

rejection_ratio : float, optional (default=.7)
Adjusts the fitness of the clustering. When the maxima_ratio is
exceeded, determine which of the clusters to the left and right to
reject based on rejection_ratio. Higher values will result in points
being more readily classified as noise; conversely, lower values will
result in more points being clustered.
Used only when `extract_method='sqlnk'`.

similarity_threshold : float, optional (default=.4)
Used to check if nodes can be moved up one level, that is, if the
Expand All @@ -89,19 +101,23 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
relative to the average of the reachability values of the parent
node. A lower value for the similarity threshold means less levels
in the tree.
Used only when `extract_method='sqlnk'`.

significant_min : float, optional (default=.003)
Sets a lower threshold on how small a significant maxima can be.
Used only when `extract_method='sqlnk'`.

min_cluster_size : int > 1 or float between 0 and 1 (default=0.005)
Minimum number of samples in an OPTICS cluster, expressed as an
absolute number or a fraction of the number of samples (rounded
to be at least 2).
Used only when `extract_method='sqlnk'`.

min_maxima_ratio : float, optional (default=.001)
Used to determine neighborhood size for minimum cluster membership.
Each local maxima should be a largest value in a neighborhood
of the size `min_maxima_ratio * len(X)` from left and right.
Used only when `extract_method='sqlnk'`.

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
Algorithm used to compute the nearest neighbors:
Expand Down Expand Up @@ -151,7 +167,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
"""

clust = OPTICS(min_samples, max_eps, metric, p, metric_params,
maxima_ratio, rejection_ratio,
extract_method, eps, maxima_ratio, rejection_ratio,
similarity_threshold, significant_min,
min_cluster_size, min_maxima_ratio,
algorithm, leaf_size, n_jobs)
Expand Down Expand Up @@ -197,18 +213,29 @@ class OPTICS(BaseEstimator, ClusterMixin):
metric_params : dict, optional (default=None)
Additional keyword arguments for the metric function.

extract_method : string, optional (default='sqlnk')
The extraction method used to extract clusters using the calculated
reachability and ordering. Possible values are "dbscan"
and "sqlnk".
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sqlnk is not mentioned in the repo. The user has little way to identify its meaning.


eps : float, optional (default=0.5)
The maximum distance between two samples for them to be considered
as in the same neighborhood. Used only when `extract_method='dbscan'`.

maxima_ratio : float, optional (default=.75)
The maximum ratio we allow of average height of clusters on the
right and left to the local maxima in question. The higher the
ratio, the more generous the algorithm is to preserving local
minima, and the more cuts the resulting tree will have.
Used only when `extract_method='sqlnk'`.

rejection_ratio : float, optional (default=.7)
Adjusts the fitness of the clustering. When the maxima_ratio is
exceeded, determine which of the clusters to the left and right to
reject based on rejection_ratio. Higher values will result in points
being more readily classified as noise; conversely, lower values will
result in more points being clustered.
Used only when `extract_method='sqlnk'`.

similarity_threshold : float, optional (default=.4)
Used to check if nodes can be moved up one level, that is, if the
Expand All @@ -219,19 +246,23 @@ class OPTICS(BaseEstimator, ClusterMixin):
relative to the average of the reachability values of the parent
node. A lower value for the similarity threshold means less levels
in the tree.
Used only when `extract_method='sqlnk'`.

significant_min : float, optional (default=.003)
Sets a lower threshold on how small a significant maxima can be.
Used only when `extract_method='sqlnk'`.

min_cluster_size : int > 1 or float between 0 and 1 (default=0.005)
Minimum number of samples in an OPTICS cluster, expressed as an
absolute number or a fraction of the number of samples (rounded
to be at least 2).
Used only when `extract_method='sqlnk'`.

min_maxima_ratio : float, optional (default=.001)
Used to determine neighborhood size for minimum cluster membership.
Each local maxima should be a largest value in a neighborhood
of the size `min_maxima_ratio * len(X)` from left and right.
Used only when `extract_method='sqlnk'`.

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
Algorithm used to compute the nearest neighbors:
Expand Down Expand Up @@ -291,7 +322,8 @@ class OPTICS(BaseEstimator, ClusterMixin):
"""

def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean',
p=2, metric_params=None, maxima_ratio=.75,
p=2, metric_params=None, extract_method='sqlnk',
eps=0.5, maxima_ratio=.75,
rejection_ratio=.7, similarity_threshold=0.4,
significant_min=.003, min_cluster_size=.005,
min_maxima_ratio=0.001, algorithm='ball_tree',
Expand All @@ -310,6 +342,8 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean',
self.metric_params = metric_params
self.p = p
self.leaf_size = leaf_size
self.extract_method = extract_method
self.eps = eps
self.n_jobs = n_jobs

def fit(self, X, y=None):
Expand Down Expand Up @@ -352,6 +386,11 @@ def fit(self, X, y=None):
'number of samples (%d). Got %d' %
(n_samples, self.min_cluster_size))

if self.extract_method not in ['dbscan', 'sqlnk']:
raise ValueError("extract_method should be one of"
" 'dbscan' or 'sqlnk', but is %s" %
self.extract_method)

# Start all points as 'unprocessed' ##
self.reachability_ = np.empty(n_samples)
self.reachability_.fill(np.inf)
Expand All @@ -362,25 +401,34 @@ def fit(self, X, y=None):

nbrs = NearestNeighbors(n_neighbors=self.min_samples,
algorithm=self.algorithm,
leaf_size=self.leaf_size, metric=self.metric,
metric_params=self.metric_params, p=self.p,
leaf_size=self.leaf_size,
metric=self.metric,
metric_params=self.metric_params,
p=self.p,
n_jobs=self.n_jobs)

nbrs.fit(X)
self.core_distances_[:] = nbrs.kneighbors(X,
self.min_samples)[0][:, -1]
self.core_distances_[:] = nbrs.kneighbors(
X, self.min_samples)[0][:, -1]

self.ordering_ = self._calculate_optics_order(X, nbrs)

indices_, self.labels_ = _extract_optics(self.ordering_,
self.reachability_,
self.maxima_ratio,
self.rejection_ratio,
self.similarity_threshold,
self.significant_min,
self.min_cluster_size,
self.min_maxima_ratio)
# Extract clusters from the calculated orders and reachability
if self.extract_method == 'sqlnk':
extract_params = {
'maxima_ratio': self.maxima_ratio,
'rejection_ratio': self.rejection_ratio,
'similarity_threshold': self.similarity_threshold,
'significant_min': self.significant_min,
'min_cluster_size': self.min_cluster_size,
'min_maxima_ratio': self.min_maxima_ratio
}
indices_, labels_ = self.extract_sqlnk(**extract_params)
elif self.extract_method == 'dbscan':
indices_, labels_ = self.extract_dbscan(self.eps)
adrinjalali marked this conversation as resolved.
Show resolved Hide resolved

self.core_sample_indices_ = indices_
self.labels_ = labels_
return self

# OPTICS helper functions
Expand Down Expand Up @@ -430,7 +478,7 @@ def _set_reach_dist(self, point_index, processed, X, nbrs):
return (unproc[quick_scan(np.take(self.reachability_, unproc),
dists)])

def extract_dbscan(self, eps):
def extract_dbscan(self, eps=None):
"""Performs DBSCAN extraction for an arbitrary epsilon.

Extraction runs in linear time. Note that if the `max_eps` OPTICS
Expand All @@ -441,7 +489,7 @@ def extract_dbscan(self, eps):

Parameters
----------
eps : float or int, required
eps : float, optional
DBSCAN `eps` parameter. Must be set to < `max_eps`. Equivalence
with DBSCAN algorithm is achieved if `eps` is < (`max_eps` / 5)

Expand All @@ -454,6 +502,8 @@ def extract_dbscan(self, eps):
The estimated labels.
"""
check_is_fitted(self, 'reachability_')
if eps is None:
eps = self.eps

if eps > self.max_eps:
raise ValueError('Specify an epsilon smaller than %s. Got %s.'
Expand All @@ -469,6 +519,84 @@ def extract_dbscan(self, eps):
return _extract_dbscan(self.ordering_, self.core_distances_,
self.reachability_, eps)

def extract_sqlnk(self, maxima_ratio=None,
                  rejection_ratio=None, similarity_threshold=None,
                  significant_min=None, min_cluster_size=None,
                  min_maxima_ratio=None):
    """Extract clusters automatically from variable density data.

    Every argument left as ``None`` falls back to the attribute of the
    same name stored on this estimator. The estimator must already be
    fitted.

    Parameters
    ----------
    maxima_ratio : float, optional
        Largest allowed ratio between the average height of the clusters
        to the right and left of a local maxima and that maxima itself.
        Higher values preserve more local minima and therefore produce
        more cuts in the resulting tree.

    rejection_ratio : float, optional
        Tunes the fitness of the clustering. Once `maxima_ratio` is
        exceeded, it decides which of the left/right clusters to reject.
        Higher values classify points as noise more readily; lower
        values cluster more points.

    similarity_threshold : float, optional
        Decides whether a node can be moved up one level, i.e. whether a
        newly created cluster is too "similar" to its parent. Similarity
        is judged by 1) the size of the new cluster relative to the
        parent node, or 2) the average reachability of the new cluster
        relative to the average reachability of the parent node. Lower
        values give fewer levels in the tree.

    significant_min : float, optional
        Lower bound on how small a significant maxima can be.

    min_cluster_size : int > 1 or float between 0 and 1
        Minimum number of samples in an OPTICS cluster, given either as
        an absolute count or as a fraction of the number of samples
        (rounded to be at least 2).

    min_maxima_ratio : float, optional
        Used to determine neighborhood size for minimum cluster
        membership.

    Returns
    -------
    core_sample_indices_ : array, shape (n_core_samples,)
        The indices of the core samples.

    labels_ : array, shape (n_samples,)
        The estimated labels.
    """
    check_is_fitted(self, 'reachability_')

    # An explicitly supplied argument wins; otherwise the value stored on
    # the estimator is used.
    resolved = {
        'maxima_ratio': (
            self.maxima_ratio if maxima_ratio is None
            else maxima_ratio),
        'rejection_ratio': (
            self.rejection_ratio if rejection_ratio is None
            else rejection_ratio),
        'similarity_threshold': (
            self.similarity_threshold if similarity_threshold is None
            else similarity_threshold),
        'significant_min': (
            self.significant_min if significant_min is None
            else significant_min),
        'min_cluster_size': (
            self.min_cluster_size if min_cluster_size is None
            else min_cluster_size),
        'min_maxima_ratio': (
            self.min_maxima_ratio if min_maxima_ratio is None
            else min_maxima_ratio),
    }

    return _extract_optics(ordering=self.ordering_,
                           reachability=self.reachability_,
                           **resolved)


def _extract_dbscan(ordering, core_distances, reachability, eps):
"""Performs DBSCAN extraction for an arbitrary epsilon (`eps`).
Expand Down
47 changes: 47 additions & 0 deletions sklearn/cluster/tests/test_optics.py
Expand Up @@ -436,3 +436,50 @@ def test_reach_dists():
else:
# we compare to truncated decimals, so use atol
assert_allclose(clust.reachability_, np.array(v), atol=1e-5)


def test_wrong_extract_method():
    # An unrecognised extraction method must be rejected when fitting.
    estimator = OPTICS(extract_method='superfancy')
    expected_message = "extract_method should be one of "
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(X)


def test_extract_dbscan():
    # Easy DBSCAN scenario: four blobs of equal density, so clusters of
    # differing densities are deliberately not covered here.
    rng = np.random.RandomState(0)
    n_points_per_cluster = 20
    centers = ([-5, -2], [4, -1], [1, 2], [-2, 3])
    # Blobs are drawn one after another so the random stream is consumed
    # in a fixed order.
    blobs = [center + .2 * rng.randn(n_points_per_cluster, 2)
             for center in centers]
    X = np.vstack(blobs)

    clust = OPTICS(extract_method='dbscan', eps=.5)
    clust.fit(X)
    found_labels = np.sort(np.unique(clust.labels_))
    assert_array_equal(found_labels, [0, 1, 2, 3])


def test_extract_dbscan_no_param():
    # Omitting eps must give the same result as passing the estimator's
    # own eps explicitly. A subsample keeps the test fast.
    subsample = X[::10]
    model = OPTICS(extract_method='dbscan').fit(subsample)

    out = model.extract_dbscan()
    implicit = (out[0].copy(), out[1].copy())
    out = model.extract_dbscan(model.eps)
    explicit = (out[0].copy(), out[1].copy())

    for got, want in zip(implicit, explicit):
        assert_array_equal(got, want)


def test_extract_sqlnk_no_param():
    # Omitting every argument must give the same result as passing the
    # estimator's own settings explicitly. A subsample keeps the test fast.
    subsample = X[::10]
    model = OPTICS(extract_method='sqlnk').fit(subsample)

    out = model.extract_sqlnk()
    implicit = (out[0].copy(), out[1].copy())

    stored_settings = (model.maxima_ratio, model.rejection_ratio,
                       model.similarity_threshold, model.significant_min,
                       model.min_cluster_size, model.min_maxima_ratio)
    out = model.extract_sqlnk(*stored_settings)
    explicit = (out[0].copy(), out[1].copy())

    for got, want in zip(implicit, explicit):
        assert_array_equal(got, want)