scikit-learn · GaelVaroquaux · Mar 3, 2019 · Sep 15, 2018 · Sep 16, 2018 · Sep 16, 2018
diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
@@ -22,7 +22,8 @@
 
 
 def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
-           p=2, metric_params=None, maxima_ratio=.75,
+           p=2, metric_params=None, extract_method='sqlnk',
+           eps=0.5, maxima_ratio=.75,
            rejection_ratio=.7, similarity_threshold=0.4,
            significant_min=.003, min_cluster_size=.005,
            min_maxima_ratio=0.001, algorithm='ball_tree',
@@ -67,18 +68,29 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
     metric_params : dict, optional (default=None)
         Additional keyword arguments for the metric function.
 
+    extract_method : string, optional (default='sqlnk')
+        The extraction method used to extract clusters using the calculated
+        reachability and ordering. Possible values are "dbscan"
+        and "sqlnk".
+
+    eps : float, optional (default=0.5)
+        The maximum distance between two samples for them to be considered
+        as in the same neighborhood. Used only when `extract_method='dbscan'`.
+
     maxima_ratio : float, optional (default=.75)
         The maximum ratio we allow of average height of clusters on the
         right and left to the local maxima in question. The higher the
         ratio, the more generous the algorithm is to preserving local
         minima, and the more cuts the resulting tree will have.
+        Used only when `extract_method='sqlnk'`.
 
     rejection_ratio : float, optional (default=.7)
         Adjusts the fitness of the clustering. When the maxima_ratio is
         exceeded, determine which of the clusters to the left and right to
         reject based on rejection_ratio. Higher values will result in points
         being more readily classified as noise; conversely, lower values will
         result in more points being clustered.
+        Used only when `extract_method='sqlnk'`.
 
     similarity_threshold : float, optional (default=.4)
         Used to check if nodes can be moved up one level, that is, if the
@@ -89,19 +101,23 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
         relative to the average of the reachability values of the parent
         node. A lower value for the similarity threshold means less levels
         in the tree.
+        Used only when `extract_method='sqlnk'`.
 
     significant_min : float, optional (default=.003)
         Sets a lower threshold on how small a significant maxima can be.
+        Used only when `extract_method='sqlnk'`.
 
     min_cluster_size : int > 1 or float between 0 and 1 (default=0.005)
         Minimum number of samples in an OPTICS cluster, expressed as an
         absolute number or a fraction of the number of samples (rounded
         to be at least 2).
+        Used only when `extract_method='sqlnk'`.
 
     min_maxima_ratio : float, optional (default=.001)
         Used to determine neighborhood size for minimum cluster membership.
         Each local maxima should be a largest value in a neighborhood
         of the `size min_maxima_ratio * len(X)` from left and right.
+        Used only when `extract_method='sqlnk'`.
 
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         Algorithm used to compute the nearest neighbors:
@@ -151,7 +167,7 @@ def optics(X, min_samples=5, max_eps=np.inf, metric='euclidean',
     """
 
     clust = OPTICS(min_samples, max_eps, metric, p, metric_params,
-                   maxima_ratio, rejection_ratio,
+                   extract_method, eps, maxima_ratio, rejection_ratio,
                    similarity_threshold, significant_min,
                    min_cluster_size, min_maxima_ratio,
                    algorithm, leaf_size, n_jobs)
@@ -197,18 +213,29 @@ class OPTICS(BaseEstimator, ClusterMixin):
     metric_params : dict, optional (default=None)
         Additional keyword arguments for the metric function.
 
+    extract_method : string, optional (default='sqlnk')
+        The extraction method used to extract clusters using the calculated
+        reachability and ordering. Possible values are "dbscan"
+        and "sqlnk".
+
+    eps : float, optional (default=0.5)
+        The maximum distance between two samples for them to be considered
+        as in the same neighborhood. Used only when `extract_method='dbscan'`.
+
     maxima_ratio : float, optional (default=.75)
         The maximum ratio we allow of average height of clusters on the
         right and left to the local maxima in question. The higher the
         ratio, the more generous the algorithm is to preserving local
         minima, and the more cuts the resulting tree will have.
+        Used only when `extract_method='sqlnk'`.
 
     rejection_ratio : float, optional (default=.7)
         Adjusts the fitness of the clustering. When the maxima_ratio is
         exceeded, determine which of the clusters to the left and right to
         reject based on rejection_ratio. Higher values will result in points
         being more readily classified as noise; conversely, lower values will
         result in more points being clustered.
+        Used only when `extract_method='sqlnk'`.
 
     similarity_threshold : float, optional (default=.4)
         Used to check if nodes can be moved up one level, that is, if the
@@ -219,19 +246,23 @@ class OPTICS(BaseEstimator, ClusterMixin):
         relative to the average of the reachability values of the parent
         node. A lower value for the similarity threshold means less levels
         in the tree.
+        Used only when `extract_method='sqlnk'`.
 
     significant_min : float, optional (default=.003)
         Sets a lower threshold on how small a significant maxima can be.
+        Used only when `extract_method='sqlnk'`.
 
     min_cluster_size : int > 1 or float between 0 and 1 (default=0.005)
         Minimum number of samples in an OPTICS cluster, expressed as an
         absolute number or a fraction of the number of samples (rounded
         to be at least 2).
+        Used only when `extract_method='sqlnk'`.
 
     min_maxima_ratio : float, optional (default=.001)
         Used to determine neighborhood size for minimum cluster membership.
         Each local maxima should be a largest value in a neighborhood
         of the `size min_maxima_ratio * len(X)` from left and right.
+        Used only when `extract_method='sqlnk'`.
 
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         Algorithm used to compute the nearest neighbors:
@@ -291,7 +322,8 @@ class OPTICS(BaseEstimator, ClusterMixin):
     """
 
     def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean',
-                 p=2, metric_params=None, maxima_ratio=.75,
+                 p=2, metric_params=None, extract_method='sqlnk',
+                 eps=0.5, maxima_ratio=.75,
                  rejection_ratio=.7, similarity_threshold=0.4,
                  significant_min=.003, min_cluster_size=.005,
                  min_maxima_ratio=0.001, algorithm='ball_tree',
@@ -310,6 +342,8 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean',
         self.metric_params = metric_params
         self.p = p
         self.leaf_size = leaf_size
+        self.extract_method = extract_method
+        self.eps = eps
         self.n_jobs = n_jobs
 
     def fit(self, X, y=None):
@@ -352,6 +386,11 @@ def fit(self, X, y=None):
                              'number of samples (%d). Got %d' %
                              (n_samples, self.min_cluster_size))
 
+        if self.extract_method not in ['dbscan', 'sqlnk']:
+            raise ValueError("extract_method should be one of"
+                             " 'dbscan' or 'sqlnk', but is %s" %
+                             self.extract_method)
+
         # Start all points as 'unprocessed' ##
         self.reachability_ = np.empty(n_samples)
         self.reachability_.fill(np.inf)
@@ -362,25 +401,34 @@ def fit(self, X, y=None):
 
         nbrs = NearestNeighbors(n_neighbors=self.min_samples,
                                 algorithm=self.algorithm,
-                                leaf_size=self.leaf_size, metric=self.metric,
-                                metric_params=self.metric_params, p=self.p,
+                                leaf_size=self.leaf_size,
+                                metric=self.metric,
+                                metric_params=self.metric_params,
+                                p=self.p,
                                 n_jobs=self.n_jobs)
 
         nbrs.fit(X)
-        self.core_distances_[:] = nbrs.kneighbors(X,
-                                                  self.min_samples)[0][:, -1]
+        self.core_distances_[:] = nbrs.kneighbors(
+            X, self.min_samples)[0][:, -1]
 
         self.ordering_ = self._calculate_optics_order(X, nbrs)
 
-        indices_, self.labels_ = _extract_optics(self.ordering_,
-                                                 self.reachability_,
-                                                 self.maxima_ratio,
-                                                 self.rejection_ratio,
-                                                 self.similarity_threshold,
-                                                 self.significant_min,
-                                                 self.min_cluster_size,
-                                                 self.min_maxima_ratio)
+        # Extract clusters from the calculated orders and reachability
+        if self.extract_method == 'sqlnk':
+            extract_params = {
+                'maxima_ratio': self.maxima_ratio,
+                'rejection_ratio': self.rejection_ratio,
+                'similarity_threshold': self.similarity_threshold,
+                'significant_min': self.significant_min,
+                'min_cluster_size': self.min_cluster_size,
+                'min_maxima_ratio': self.min_maxima_ratio
+            }
+            indices_, labels_ = self.extract_sqlnk(**extract_params)
+        elif self.extract_method == 'dbscan':
+            indices_, labels_ = self.extract_dbscan(self.eps)
+
         self.core_sample_indices_ = indices_
+        self.labels_ = labels_
         return self
 
     # OPTICS helper functions
@@ -430,7 +478,7 @@ def _set_reach_dist(self, point_index, processed, X, nbrs):
         return (unproc[quick_scan(np.take(self.reachability_, unproc),
                                   dists)])
 
-    def extract_dbscan(self, eps):
+    def extract_dbscan(self, eps=None):
         """Performs DBSCAN extraction for an arbitrary epsilon.
 
         Extraction runs in linear time. Note that if the `max_eps` OPTICS
@@ -441,7 +489,7 @@ def extract_dbscan(self, eps):
 
         Parameters
         ----------
-        eps : float or int, required
+        eps : float, optional
             DBSCAN `eps` parameter. Must be set to < `max_eps`. Equivalence
             with DBSCAN algorithm is achieved if `eps` is < (`max_eps` / 5)
 
@@ -454,6 +502,8 @@ def extract_dbscan(self, eps):
             The estimated labels.
         """
         check_is_fitted(self, 'reachability_')
+        if eps is None:
+            eps = self.eps
 
         if eps > self.max_eps:
             raise ValueError('Specify an epsilon smaller than %s. Got %s.'
@@ -469,6 +519,84 @@ def extract_dbscan(self, eps):
         return _extract_dbscan(self.ordering_, self.core_distances_,
                                self.reachability_, eps)
 
+    def extract_sqlnk(self, maxima_ratio=None,
+                      rejection_ratio=None, similarity_threshold=None,
+                      significant_min=None, min_cluster_size=None,
+                      min_maxima_ratio=None):
+        """Performs automatic cluster extraction for variable density data.
+        Any parameter left as ``None`` falls back to the value of the
+        attribute of the same name on this estimator.
+
+        Parameters
+        ----------
+        maxima_ratio : float, optional
+            The maximum ratio we allow of average height of clusters on the
+            right and left to the local maxima in question. The higher the
+            ratio, the more generous the algorithm is to preserving local
+            minima, and the more cuts the resulting tree will have.
+
+        rejection_ratio : float, optional
+            Adjusts the fitness of the clustering. When the maxima_ratio is
+            exceeded, determine which of the clusters to the left and right to
+            reject based on rejection_ratio. Higher values will result in
+            points being more readily classified as noise; conversely, lower
+            values will result in more points being clustered.
+
+        similarity_threshold : float, optional
+            Used to check if nodes can be moved up one level, that is, if the
+            new cluster created is too "similar" to its parent, given the
+            similarity threshold. Similarity can be determined by 1) the size
+            of the new cluster relative to the size of the parent node or
+            2) the average of the reachability values of the new cluster
+            relative to the average of the reachability values of the parent
+            node. A lower value for the similarity threshold means less levels
+            in the tree.
+
+        significant_min : float, optional
+            Sets a lower threshold on how small a significant maxima can be.
+
+        min_cluster_size : int > 1 or float between 0 and 1, optional
+            Minimum number of samples in an OPTICS cluster, expressed as an
+            absolute number or a fraction of the number of samples (rounded
+            to be at least 2).
+
+        min_maxima_ratio : float, optional
+            Used to determine neighborhood size for minimum cluster membership.
+
+        Returns
+        -------
+        core_sample_indices_ : array, shape (n_core_samples,)
+            The indices of the core samples.
+
+        labels_ : array, shape (n_samples,)
+            The estimated labels.
+        """
+        check_is_fitted(self, 'reachability_')
+
+        if maxima_ratio is None:
+            maxima_ratio = self.maxima_ratio
+        if rejection_ratio is None:
+            rejection_ratio = self.rejection_ratio
+        if similarity_threshold is None:
+            similarity_threshold = self.similarity_threshold
+        if significant_min is None:
+            significant_min = self.significant_min
+        if min_cluster_size is None:
+            min_cluster_size = self.min_cluster_size
+        if min_maxima_ratio is None:
+            min_maxima_ratio = self.min_maxima_ratio
+
+        return _extract_optics(
+            ordering=self.ordering_,
+            reachability=self.reachability_,
+            maxima_ratio=maxima_ratio,
+            rejection_ratio=rejection_ratio,
+            similarity_threshold=similarity_threshold,
+            significant_min=significant_min,
+            min_cluster_size=min_cluster_size,
+            min_maxima_ratio=min_maxima_ratio
+        )
+
 
 def _extract_dbscan(ordering, core_distances, reachability, eps):
     """Performs DBSCAN extraction for an arbitrary epsilon (`eps`).

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
@@ -436,3 +436,50 @@ def test_reach_dists():
     else:
         # we compare to truncated decimals, so use atol
         assert_allclose(clust.reachability_, np.array(v), atol=1e-5)
+
+
+def test_wrong_extract_method():
+    clust = OPTICS(extract_method='superfancy')
+    with pytest.raises(ValueError, match="extract_method should be one of "):
+        clust.fit(X)
+
+
+def test_extract_dbscan():
+    # Test an easy DBSCAN case: four well-separated blobs that share the
+    # same spread, i.e. no clusters with different densities.
+    rng = np.random.RandomState(0)
+    n_points_per_cluster = 20
+    C1 = [-5, -2] + .2 * rng.randn(n_points_per_cluster, 2)
+    C2 = [4, -1] + .2 * rng.randn(n_points_per_cluster, 2)
+    C3 = [1, 2] + .2 * rng.randn(n_points_per_cluster, 2)
+    C4 = [-2, 3] + .2 * rng.randn(n_points_per_cluster, 2)
+    X = np.vstack((C1, C2, C3, C4))
+
+    clust = OPTICS(extract_method='dbscan', eps=.5).fit(X)
+    assert_array_equal(np.sort(np.unique(clust.labels_)), [0, 1, 2, 3])
+
+
+def test_extract_dbscan_no_param():
+    # run on smaller data for speed
+    locX = X[::10]
+    cl = OPTICS(extract_method='dbscan').fit(locX)
+    res = cl.extract_dbscan()
+    l1, c1 = res[0].copy(), res[1].copy()
+    res = cl.extract_dbscan(cl.eps)
+    l2, c2 = res[0].copy(), res[1].copy()
+    assert_array_equal(l1, l2)
+    assert_array_equal(c1, c2)
+
+
+def test_extract_sqlnk_no_param():
+    # run on smaller data for speed
+    locX = X[::10]
+    cl = OPTICS(extract_method='sqlnk').fit(locX)
+    res = cl.extract_sqlnk()
+    l1, c1 = res[0].copy(), res[1].copy()
+    res = cl.extract_sqlnk(cl.maxima_ratio, cl.rejection_ratio,
+                           cl.similarity_threshold, cl.significant_min,
+                           cl.min_cluster_size, cl.min_maxima_ratio)
+    l2, c2 = res[0].copy(), res[1].copy()
+    assert_array_equal(l1, l2)
+    assert_array_equal(c1, c2)