scverse · pavlin-policar · Dec 31, 2020 · Apr 18, 2021 · pavlin-policar · Apr 18, 2021
diff --git a/requirements.txt b/requirements.txt
@@ -22,3 +22,4 @@ umap-learn>=0.3.10,<0.5
 legacy-api-wrap
 packaging
 sinfo
+openTSNE>=0.5.0
diff --git a/scanpy/neighbors/__init__.py b/scanpy/neighbors/__init__.py
@@ -2,6 +2,7 @@
 from typing import Union, Optional, Any, Mapping, Callable, NamedTuple, Generator, Tuple
 
 import numpy as np
+import openTSNE
 import scipy
 from anndata import AnnData
 from scipy.sparse import issparse, coo_matrix, csr_matrix
@@ -163,6 +164,137 @@ def neighbors(
     return adata if copy else None
 
 
+@_doc_params(n_pcs=doc_n_pcs, use_rep=doc_use_rep)
+def neighbors_tsne(
+    adata: AnnData,
+    perplexity: float = 30,
+    n_pcs: Optional[int] = None,
+    use_rep: Optional[str] = None,
+    knn: bool = True,
+    random_state: AnyRandom = 0,
+    metric: Union[_Metric, _MetricFn] = 'euclidean',
+    metric_kwds: Mapping[str, Any] = MappingProxyType({}),
+    key_added: Optional[str] = None,
+    copy: bool = False,
+) -> Optional[AnnData]:
+    """\
+    Compute a neighborhood graph of observations [McInnes18]_.
+
+    The neighbor search efficiency of this heavily relies on UMAP [McInnes18]_,
+    which also provides a method for estimating connectivities of data points -
+    the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
+    connectivities are computed according to [Coifman05]_, in the adaption of
+    [Haghverdi16]_.
+
+    Parameters
+    ----------
+    adata
+        Annotated data matrix.
+    n_neighbors
+        The size of local neighborhood (in terms of number of neighboring data
+        points) used for manifold approximation. Larger values result in more
+        global views of the manifold, while smaller values result in more local
+        data being preserved. In general values should be in the range 2 to 100.
+        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
+        is `False`, a Gaussian kernel width is set to the distance of the
+        `n_neighbors` neighbor.
+    {n_pcs}
+    {use_rep}
+    knn
+        If `True`, use a hard threshold to restrict the number of neighbors to
+        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
+        Kernel to assign low weights to neighbors more distant than the
+        `n_neighbors` nearest neighbor.
+    random_state
+        A numpy random seed.
+    method
+        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_
+        with adaptive width [Haghverdi16]_) for computing connectivities.
+        Use 'rapids' for the RAPIDS implementation of UMAP (experimental, GPU
+        only).
+    metric
+        A known metric’s name or a callable that returns a distance.
+    metric_kwds
+        Options for the metric.
+    key_added
+        If not specified, the neighbors data is stored in .uns['neighbors'],
+        distances and connectivities are stored in .obsp['distances'] and
+        .obsp['connectivities'] respectively.
+        If specified, the neighbors data is added to .uns[key_added],
+        distances are stored in .obsp[key_added+'_distances'] and
+        connectivities in .obsp[key_added+'_connectivities'].
+    copy
+        Return a copy instead of writing to adata.
+
+    Returns
+    -------
+    Depending on `copy`, updates or returns `adata` with the following:
+
+    See `key_added` parameter description for the storage path of
+    connectivities and distances.
+
+    **connectivities** : sparse matrix of dtype `float32`.
+        Weighted adjacency matrix of the neighborhood graph of data
+        points. Weights should be interpreted as connectivities.
+    **distances** : sparse matrix of dtype `float32`.
+        Instead of decaying weights, this stores distances for each pair of
+        neighbors.
+    """
+    start = logg.info('computing neighbors')
+    adata = adata.copy() if copy else adata
+    if adata.is_view:  # we shouldn't need this here...
+        adata._init_as_actual(adata.copy())
+
+    n_neighbors = int(np.ceil(perplexity * 3)) + 1  # +1 because we it includes self
+
+    neighbors = Neighbors(adata)
+    neighbors.compute_neighbors(
+        n_neighbors=n_neighbors, knn=knn, n_pcs=n_pcs, use_rep=use_rep,
+        method="tsne", metric=metric, metric_kwds=metric_kwds,
+        random_state=random_state,
+    )
+
+    if key_added is None:
+        key_added = 'neighbors'
+        conns_key = 'connectivities'
+        dists_key = 'distances'
+    else:
+        conns_key = key_added + '_connectivities'
+        dists_key = key_added + '_distances'
+
+    adata.uns[key_added] = {}
+
+    neighbors_dict = adata.uns[key_added]
+
+    neighbors_dict['connectivities_key'] = conns_key
+    neighbors_dict['distances_key'] = dists_key
+
+    neighbors_dict['params'] = {'n_neighbors': neighbors.n_neighbors, 'method': 'tsne'}
+    neighbors_dict['params']['metric'] = metric
+    if metric_kwds:
+        neighbors_dict['params']['metric_kwds'] = metric_kwds
+    if use_rep is not None:
+        neighbors_dict['params']['use_rep'] = use_rep
+    if n_pcs is not None:
+        neighbors_dict['params']['n_pcs'] = n_pcs
+
+    adata.obsp[dists_key] = neighbors.distances
+    adata.obsp[conns_key] = neighbors.connectivities
+
+    if neighbors.rp_forest is not None:
+        neighbors_dict['rp_forest'] = neighbors.rp_forest
+    logg.info(
+        '    finished',
+        time=start,
+        deep=(
+            f'added to `.uns[{key_added!r}]`\n'
+            f'    `.obsp[{dists_key!r}]`, distances for each pair of neighbors\n'
+            f'    `.obsp[{conns_key!r}]`, weighted adjacency matrix'
+        ),
+    )
+    return adata if copy else None
+
+
 class FlatTree(NamedTuple):
     hyperplanes: None
     offsets: None
@@ -369,6 +501,19 @@ def _compute_connectivities_umap(
     return distances, connectivities.tocsr()
 
 
+def _compute_connectivities_tsne(knn_indices, knn_dists, n_jobs):
+    n_samples, k_neighbors = knn_indices.shape
+    connectivities = openTSNE.affinity.joint_probabilities_nn(
+        knn_indices, knn_dists, [k_neighbors / 3], n_jobs=n_jobs,
+    )
+
+    distances = _get_sparse_matrix_from_indices_distances_umap(
+        knn_indices, knn_dists, n_samples, k_neighbors
+    )
+
+    return distances, connectivities.tocsr()
+
+
 def _get_sparse_matrix_from_indices_distances_numpy(indices, distances, n_obs, n_neighbors):
     n_nonzero = n_obs * n_neighbors
     indptr = np.arange(0, n_nonzero + 1, n_neighbors)
@@ -692,11 +837,11 @@ def compute_neighbors(
         if n_neighbors > self._adata.shape[0]:  # very small datasets
             n_neighbors = 1 + int(0.5*self._adata.shape[0])
             logg.warning(f'n_obs too small: adjusting to `n_neighbors = {n_neighbors}`')
-        if method == 'umap' and not knn:
+        if method in {'umap', 'tsne'} and not knn:
             raise ValueError('`method = \'umap\' only with `knn = True`.')
         if method == 'rapids' and metric != 'euclidean':
             raise ValueError("`method` 'rapids' only supports the 'euclidean' `metric`.")
-        if method not in {'umap', 'gauss', 'rapids'}:
+        if method not in {'umap', 'gauss', 'rapids', 'tsne'}:
             raise ValueError("`method` needs to be 'umap', 'gauss', or 'rapids'.")
         if self._adata.shape[0] >= 10000 and not knn:
             logg.warning('Using high n_obs without `knn=True` takes a lot of memory...')
@@ -743,6 +888,10 @@ def compute_neighbors(
                 self._adata.shape[0],
                 self.n_neighbors,
             )
+        elif method == 'tsne':
+            self._distances, self._connectivities = _compute_connectivities_tsne(
+                knn_indices[:, 1:], knn_distances[:, 1:], n_jobs=-1
+            )
         # overwrite the umap connectivities if method is 'gauss'
         # self._distances is unaffected by this
         if method == 'gauss':

diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py
@@ -9,4 +9,4 @@
 from ._combat import combat
 from ._normalization import normalize_total
 
-from ..neighbors import neighbors
+from ..neighbors import neighbors, neighbors_tsne
diff --git a/scanpy/tests/test_embedding.py b/scanpy/tests/test_embedding.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pytest
-from sklearn.utils.testing import assert_array_almost_equal
+from numpy.testing import assert_array_almost_equal
 
 import scanpy as sc
 
@@ -28,3 +28,8 @@ def test_umap_init_paga(layout):
     sc.tl.paga(pbmc)
     sc.pl.paga(pbmc, layout=layout, show=False)
     sc.tl.umap(pbmc, init_pos="paga")
+
+
+def test_tsne():
+    pbmc = sc.datasets.pbmc3k_processed()
+    sc.tl.tsne(pbmc, n_jobs=4)