ENH Improve initialization and learning rate in t-SNE #19491

Merged — 71 commits, Apr 26, 2021 (changes shown from 32 commits)

Commits
bc1df0c
Rescale PCA initialization to std=1e-4
dkobak Feb 18, 2021
699d66b
Add future warning about PCA init
dkobak Feb 18, 2021
06d69d5
Add learning_rate='auto' and comment on factor 4
dkobak Feb 18, 2021
2ca7375
Add future warning for the learning rate
dkobak Feb 18, 2021
bda63c0
Add whitespace
dkobak Feb 18, 2021
0d13285
Remove whitespaces
dkobak Feb 18, 2021
ac33824
Update sklearn/manifold/_t_sne.py
dkobak Feb 19, 2021
84cc82c
Update sklearn/manifold/_t_sne.py
dkobak Feb 19, 2021
66405d1
Future warning for PCA init
dkobak Feb 19, 2021
869cd80
Do not overwrite default attributes
dkobak Feb 22, 2021
6496935
Remove trailing whitespaces
dkobak Feb 22, 2021
22e6870
Ignore PCA/init future warnings in tests
dkobak Feb 23, 2021
1b2988a
Fix whitespaces before inline comments
dkobak Feb 23, 2021
fac85bd
Fix more whitespaces
dkobak Feb 23, 2021
e527e1a
Add tests for new future warnings and for lr=auto
dkobak Feb 23, 2021
5082626
Fix a bug in one test
dkobak Feb 23, 2021
88f08f9
Fix another bug
dkobak Feb 23, 2021
9378f10
Split one test into two
dkobak Feb 23, 2021
1cd2c80
Rename new tests
dkobak Feb 23, 2021
69ab5da
Correctly handle None to invoke default
dkobak Feb 23, 2021
53aa8a6
Fix the auto learning rate text
dkobak Feb 23, 2021
e8fc787
Fix random state for assert_allclose
dkobak Feb 23, 2021
b252ea1
Fix max bug
dkobak Feb 23, 2021
0c18279
Fix max bug
dkobak Feb 23, 2021
8120814
Ignore future warnings in test
dkobak Feb 23, 2021
652b4ef
Ignore future warnings in more tests
dkobak Feb 23, 2021
1730bd1
Import ignore_warnings
dkobak Feb 23, 2021
0281899
Avoid future warnings in docstring test
dkobak Feb 23, 2021
be28ba3
Describe t-SNE changes in whats_new
dkobak Feb 24, 2021
5ce0eab
Update the docstring example to avoid warnings
dkobak Feb 24, 2021
eb6589d
Fix newline for doctest
dkobak Feb 24, 2021
9cb6bfb
Delete the slow test
dkobak Feb 25, 2021
d416f3f
Add a test for negative learning rate
dkobak Feb 25, 2021
cb71422
Add whitespace
dkobak Feb 25, 2021
04f88f2
Remove unused variable
dkobak Feb 25, 2021
3fe3763
Use pytest.raises
dkobak Feb 25, 2021
cff6b2c
Remove obsolete param
dkobak Feb 25, 2021
f64878f
Merge remote-tracking branch 'upstream/main' into tsne-defaults
dkobak Feb 26, 2021
fd09082
Ignore future warnings in check_n_features_in_after_fitting
dkobak Feb 26, 2021
4dfa098
Use random init for precomputed metric in tests
dkobak Mar 1, 2021
d8312c7
Merge branch 'tsne-defaults' of https://github.com/dkobak/scikit-lear…
dkobak Mar 1, 2021
7b00863
Remove double arg
dkobak Mar 1, 2021
6b34891
Fix one test
dkobak Mar 1, 2021
c076ce3
Init can be ndarray, so compare properly
dkobak Mar 1, 2021
1fbd14f
error message on sparse input with pca init
dkobak Mar 8, 2021
56d1b79
linting fix
dkobak Mar 8, 2021
f7f9680
error type fix
dkobak Mar 8, 2021
41f641e
future warning fix
dkobak Mar 8, 2021
cb25483
Update sklearn/manifold/_t_sne.py
dkobak Apr 7, 2021
34c19d4
Update sklearn/manifold/tests/test_t_sne.py
dkobak Apr 7, 2021
bbdcb92
Update sklearn/manifold/tests/test_t_sne.py
dkobak Apr 7, 2021
2b66d21
Update sklearn/manifold/tests/test_t_sne.py
dkobak Apr 7, 2021
f2890bf
Update sklearn/manifold/tests/test_t_sne.py
dkobak Apr 7, 2021
09e8c9a
Update sklearn/manifold/tests/test_t_sne.py
dkobak Apr 7, 2021
3d946cb
Add TODO comments and docstrings for tests
dkobak Apr 7, 2021
486b578
Merge remote-tracking branch 'upstream/main' into tsne-defaults
dkobak Apr 7, 2021
795caa4
Describe learning_rate=auto in t-SNE doc
dkobak Apr 7, 2021
37a25ff
Fix ignoring future warning
dkobak Apr 7, 2021
fa36a12
Update sklearn/manifold/_t_sne.py
dkobak Apr 16, 2021
861a439
Update sklearn/manifold/_t_sne.py
dkobak Apr 16, 2021
d80ad13
Update sklearn/manifold/tests/test_t_sne.py
dkobak Apr 16, 2021
4c6fa58
Update sklearn/manifold/tests/test_t_sne.py
dkobak Apr 16, 2021
8ea825b
linting fix
dkobak Apr 16, 2021
d0c5696
format references
dkobak Apr 16, 2021
682ebde
linting fix
dkobak Apr 16, 2021
3bf131a
replace ignore_warnings with filterwarning
dkobak Apr 16, 2021
a35bc2d
handle future warnings in tests in other files
dkobak Apr 16, 2021
78dc3f0
import pytest
dkobak Apr 16, 2021
6cfa7f9
roll back because of failing test
dkobak Apr 16, 2021
245d72e
Merge remote-tracking branch 'upstream/main' into pr/19491
thomasjpfan Apr 26, 2021
f1ed1d5
CLN Remove comment for check_n_features_in_after_fitting
thomasjpfan Apr 26, 2021
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
@@ -142,6 +142,12 @@ Changelog
during affinity matrix computation for :class:`manifold.TSNE`.
:pr:`19472` by :user:`Dmitry Kobak <dkobak>`.

- |Enhancement| Implement `'auto'` heuristic for the `learning_rate` in
:class:`manifold.TSNE`. It will become default in 1.2. The default
initialization will change to `pca` in 1.2. PCA initialization will
be scaled to have standard deviation 1e-4 in 1.2.
:pr:`19491` by :user:`Dmitry Kobak <dkobak>`.

:mod:`sklearn.metrics`
......................

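As a quick illustration of the changelog entry above, here is a minimal usage sketch assuming a scikit-learn release that includes this PR (1.0 or later); the toy data is made up for the example.

```python
import numpy as np
from sklearn.manifold import TSNE

X = np.random.RandomState(0).randn(100, 20)  # toy data: 100 samples, 20 features

# Opt in explicitly to what becomes the default in 1.2:
# the 'auto' learning-rate heuristic plus PCA initialization.
X_embedded = TSNE(n_components=2, learning_rate="auto", init="pca",
                  random_state=0).fit_transform(X)
print(X_embedded.shape)  # (100, 2)
```

Passing both options explicitly also sidesteps the FutureWarnings this PR adds for the old defaults.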
62 changes: 51 additions & 11 deletions sklearn/manifold/_t_sne.py
@@ -517,13 +517,21 @@ class TSNE(BaseEstimator):
optimization, the early exaggeration factor or the learning rate
might be too high.

learning_rate : float, default=200.0
learning_rate : float or 'auto', default=200.0
The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
the learning rate is too high, the data may look like a 'ball' with any
point approximately equidistant from its nearest neighbours. If the
learning rate is too low, most points may look compressed in a dense
cloud with few outliers. If the cost function gets stuck in a bad local
minimum increasing the learning rate may help.
Note that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE,
etc.) use a definition of learning_rate that is 4 times smaller than
ours. So our learning_rate=200 corresponds to learning_rate=800 in
those other implementations.
The 'auto' option sets the learning_rate to N / early_exaggeration / 4,
where N is the sample size, following Belkina et al. 2019 and
Kobak et al. 2019, Nature Communications (or to 50.0, if
N / early_exaggeration / 4 < 50). This will become default in 1.2.
Member (review comment): Can we move these references into the References section below and link them here?

Contributor (author): Makes sense, fixed.

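As a small worked example of the 'auto' heuristic described in the learning_rate docstring above (the helper function is hypothetical, introduced only for this sketch):

```python
def auto_learning_rate(n_samples, early_exaggeration=12.0):
    # learning_rate='auto': N / early_exaggeration / 4, floored at 50.
    return max(n_samples / early_exaggeration / 4.0, 50.0)

# With N = 10_000 and the default early_exaggeration = 12:
# 10_000 / 12 / 4 ≈ 208.3 in scikit-learn's convention, i.e. roughly
# N / 12 ≈ 833 in the 4x-smaller convention of bhtsne/FIt-SNE/openTSNE.
print(auto_learning_rate(10_000))  # 208.33...
print(auto_learning_rate(100))     # 50.0 -- the floor kicks in
```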
n_iter : int, default=1000
Maximum number of iterations for the optimization. Should be at
@@ -559,7 +567,8 @@ class TSNE(BaseEstimator):
Initialization of embedding. Possible options are 'random', 'pca',
and a numpy array of shape (n_samples, n_components).
PCA initialization cannot be used with precomputed distances and is
usually more globally stable than random initialization.
usually more globally stable than random initialization. It will
become default in 1.2.

verbose : int, default=0
Verbosity level.
@@ -631,7 +640,8 @@ class TSNE(BaseEstimator):
>>> import numpy as np
>>> from sklearn.manifold import TSNE
>>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
>>> X_embedded = TSNE(n_components=2).fit_transform(X)
>>> X_embedded = TSNE(n_components=2, learning_rate='auto',
... init='random').fit_transform(X)
>>> X_embedded.shape
(4, 2)

@@ -656,9 +666,9 @@ class TSNE(BaseEstimator):

@_deprecate_positional_args
def __init__(self, n_components=2, *, perplexity=30.0,
early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
early_exaggeration=12.0, learning_rate="warn", n_iter=1000,
n_iter_without_progress=300, min_grad_norm=1e-7,
metric="euclidean", init="random", verbose=0,
metric="euclidean", init="warn", verbose=0,
random_state=None, method='barnes_hut', angle=0.5,
n_jobs=None, square_distances='legacy'):
self.n_components = n_components
@@ -681,12 +691,35 @@ def __init__(self, n_components=2, *, perplexity=30.0,
def _fit(self, X, skip_num_points=0):
"""Private function to fit the model using X as training data."""

if self.init == 'warn':
# See issue #18018
warnings.warn("The default initialization in TSNE will change "
"from 'random' to 'pca' in 1.2.", FutureWarning)
self._init = 'random'
else:
self._init = self.init
if self.learning_rate == 'warn':
# See issue #18018
warnings.warn("The default learning rate in TSNE will change "
"from 200.0 to 'auto' in 1.2.", FutureWarning)
self._learning_rate = 200.0
else:
self._learning_rate = self.learning_rate

if self.method not in ['barnes_hut', 'exact']:
raise ValueError("'method' must be 'barnes_hut' or 'exact'")
if self.angle < 0.0 or self.angle > 1.0:
raise ValueError("'angle' must be between 0.0 - 1.0")
if self.square_distances not in [True, 'legacy']:
raise ValueError("'square_distances' must be True or 'legacy'.")
if self._learning_rate == 'auto':
# See issue #18018
self._learning_rate = X.shape[0] / self.early_exaggeration / 4
self._learning_rate = np.maximum(self._learning_rate, 50)
else:
if not (self._learning_rate > 0):
raise ValueError("'learning_rate' must be a positive number "
"or 'auto'.")
if self.metric != "euclidean" and self.square_distances is not True:
warnings.warn(
"'square_distances' has been introduced in 0.24 to help phase "
@@ -706,7 +739,7 @@ def _fit(self, X, skip_num_points=0):
X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=[np.float32, np.float64])
if self.metric == "precomputed":
if isinstance(self.init, str) and self.init == 'pca':
if isinstance(self._init, str) and self._init == 'pca':
raise ValueError("The parameter init=\"pca\" cannot be "
"used with metric=\"precomputed\".")
if X.shape[0] != X.shape[1]:
@@ -817,13 +850,20 @@ def _fit(self, X, skip_num_points=0):
P = _joint_probabilities_nn(distances_nn, self.perplexity,
self.verbose)

if isinstance(self.init, np.ndarray):
X_embedded = self.init
elif self.init == 'pca':
if isinstance(self._init, np.ndarray):
X_embedded = self._init
elif self._init == 'pca':
pca = PCA(n_components=self.n_components, svd_solver='randomized',
random_state=random_state)
X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
elif self.init == 'random':
# PCA is rescaled so that PC1 has standard deviation 1e-4 which is
# the default value for random initialization. See issue #18018.
warnings.warn("The PCA initialization in TSNE will change to "
"have the standard deviation of PC1 equal to 1e-4 "
"in 1.2. This will ensure better convergence.",
FutureWarning)
# X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4
elif self._init == 'random':
# The embedding is initialized with iid samples from Gaussians with
# standard deviation 1e-4.
X_embedded = 1e-4 * random_state.randn(
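The FutureWarning and the commented-out line above describe the rescaling planned for 1.2: after PCA, the first embedding dimension is scaled to standard deviation 1e-4, matching the spread of the random (Gaussian, std 1e-4) initialization. A standalone sketch of that rescaling on made-up data, outside the diff:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(200, 10)  # toy data

# PCA initialization as in the diff, then the rescaling announced for 1.2:
X_embedded = PCA(n_components=2, svd_solver="randomized",
                 random_state=rng).fit_transform(X).astype(np.float32)
X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4
print(np.std(X_embedded[:, 0]))  # ~1e-4
```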
@@ -857,7 +897,7 @@ def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded,
"it": 0,
"n_iter_check": self._N_ITER_CHECK,
"min_grad_norm": self.min_grad_norm,
"learning_rate": self.learning_rate,
"learning_rate": self._learning_rate,
"verbose": self.verbose,
"kwargs": dict(skip_num_points=skip_num_points),
"args": [P, degrees_of_freedom, n_samples, self.n_components],