Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch t-SNE implementation to openTSNE #1561

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -22,3 +22,4 @@ umap-learn>=0.3.10,<0.5
legacy-api-wrap
packaging
sinfo
openTSNE>=0.5.0
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to be updated to 0.6, when I release it. Until then, CI will fail here.

153 changes: 151 additions & 2 deletions scanpy/neighbors/__init__.py
Expand Up @@ -2,6 +2,7 @@
from typing import Union, Optional, Any, Mapping, Callable, NamedTuple, Generator, Tuple

import numpy as np
import openTSNE
import scipy
from anndata import AnnData
from scipy.sparse import issparse, coo_matrix, csr_matrix
Expand Down Expand Up @@ -163,6 +164,137 @@ def neighbors(
return adata if copy else None


@_doc_params(n_pcs=doc_n_pcs, use_rep=doc_use_rep)
def neighbors_tsne(
adata: AnnData,
perplexity: float = 30,
n_pcs: Optional[int] = None,
use_rep: Optional[str] = None,
knn: bool = True,
random_state: AnyRandom = 0,
metric: Union[_Metric, _MetricFn] = 'euclidean',
metric_kwds: Mapping[str, Any] = MappingProxyType({}),
key_added: Optional[str] = None,
copy: bool = False,
) -> Optional[AnnData]:
"""\
Compute a neighborhood graph of observations [McInnes18]_.

The neighbor search efficiency of this heavily relies on UMAP [McInnes18]_,
which also provides a method for estimating connectivities of data points -
the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
connectivities are computed according to [Coifman05]_, in the adaption of
[Haghverdi16]_.

Parameters
----------
adata
Annotated data matrix.
n_neighbors
The size of local neighborhood (in terms of number of neighboring data
points) used for manifold approximation. Larger values result in more
global views of the manifold, while smaller values result in more local
data being preserved. In general values should be in the range 2 to 100.
If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
is `False`, a Gaussian kernel width is set to the distance of the
`n_neighbors` neighbor.
{n_pcs}
{use_rep}
knn
If `True`, use a hard threshold to restrict the number of neighbors to
`n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
Kernel to assign low weights to neighbors more distant than the
`n_neighbors` nearest neighbor.
random_state
A numpy random seed.
method
Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_
with adaptive width [Haghverdi16]_) for computing connectivities.
Use 'rapids' for the RAPIDS implementation of UMAP (experimental, GPU
only).
metric
A known metric’s name or a callable that returns a distance.
metric_kwds
Options for the metric.
key_added
If not specified, the neighbors data is stored in .uns['neighbors'],
distances and connectivities are stored in .obsp['distances'] and
.obsp['connectivities'] respectively.
If specified, the neighbors data is added to .uns[key_added],
distances are stored in .obsp[key_added+'_distances'] and
connectivities in .obsp[key_added+'_connectivities'].
copy
Return a copy instead of writing to adata.

Returns
-------
Depending on `copy`, updates or returns `adata` with the following:

See `key_added` parameter description for the storage path of
connectivities and distances.

**connectivities** : sparse matrix of dtype `float32`.
Weighted adjacency matrix of the neighborhood graph of data
points. Weights should be interpreted as connectivities.
**distances** : sparse matrix of dtype `float32`.
Instead of decaying weights, this stores distances for each pair of
neighbors.
"""
start = logg.info('computing neighbors')
adata = adata.copy() if copy else adata
if adata.is_view: # we shouldn't need this here...
adata._init_as_actual(adata.copy())

n_neighbors = int(np.ceil(perplexity * 3)) + 1 # +1 because we it includes self
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is really the only functionally different LOC from the original neighbors method.


neighbors = Neighbors(adata)
neighbors.compute_neighbors(
n_neighbors=n_neighbors, knn=knn, n_pcs=n_pcs, use_rep=use_rep,
method="tsne", metric=metric, metric_kwds=metric_kwds,
random_state=random_state,
)

if key_added is None:
key_added = 'neighbors'
conns_key = 'connectivities'
dists_key = 'distances'
else:
conns_key = key_added + '_connectivities'
dists_key = key_added + '_distances'

adata.uns[key_added] = {}

neighbors_dict = adata.uns[key_added]

neighbors_dict['connectivities_key'] = conns_key
neighbors_dict['distances_key'] = dists_key

neighbors_dict['params'] = {'n_neighbors': neighbors.n_neighbors, 'method': 'tsne'}
neighbors_dict['params']['metric'] = metric
if metric_kwds:
neighbors_dict['params']['metric_kwds'] = metric_kwds
if use_rep is not None:
neighbors_dict['params']['use_rep'] = use_rep
if n_pcs is not None:
neighbors_dict['params']['n_pcs'] = n_pcs

adata.obsp[dists_key] = neighbors.distances
adata.obsp[conns_key] = neighbors.connectivities

if neighbors.rp_forest is not None:
neighbors_dict['rp_forest'] = neighbors.rp_forest
logg.info(
' finished',
time=start,
deep=(
f'added to `.uns[{key_added!r}]`\n'
f' `.obsp[{dists_key!r}]`, distances for each pair of neighbors\n'
f' `.obsp[{conns_key!r}]`, weighted adjacency matrix'
),
)
return adata if copy else None


class FlatTree(NamedTuple):
hyperplanes: None
offsets: None
Expand Down Expand Up @@ -369,6 +501,19 @@ def _compute_connectivities_umap(
return distances, connectivities.tocsr()


def _compute_connectivities_tsne(knn_indices, knn_dists, n_jobs):
n_samples, k_neighbors = knn_indices.shape
connectivities = openTSNE.affinity.joint_probabilities_nn(
knn_indices, knn_dists, [k_neighbors / 3], n_jobs=n_jobs,
)

distances = _get_sparse_matrix_from_indices_distances_umap(
knn_indices, knn_dists, n_samples, k_neighbors
)

return distances, connectivities.tocsr()


def _get_sparse_matrix_from_indices_distances_numpy(indices, distances, n_obs, n_neighbors):
n_nonzero = n_obs * n_neighbors
indptr = np.arange(0, n_nonzero + 1, n_neighbors)
Expand Down Expand Up @@ -692,11 +837,11 @@ def compute_neighbors(
if n_neighbors > self._adata.shape[0]: # very small datasets
n_neighbors = 1 + int(0.5*self._adata.shape[0])
logg.warning(f'n_obs too small: adjusting to `n_neighbors = {n_neighbors}`')
if method == 'umap' and not knn:
if method in {'umap', 'tsne'} and not knn:
raise ValueError('`method = \'umap\' only with `knn = True`.')
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ValueError should now probably print method=\'{method}\' and not method=\'umap\'.

if method == 'rapids' and metric != 'euclidean':
raise ValueError("`method` 'rapids' only supports the 'euclidean' `metric`.")
if method not in {'umap', 'gauss', 'rapids'}:
if method not in {'umap', 'gauss', 'rapids', 'tsne'}:
raise ValueError("`method` needs to be 'umap', 'gauss', or 'rapids'.")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add 'tsne' to the ValueError message?

if self._adata.shape[0] >= 10000 and not knn:
logg.warning('Using high n_obs without `knn=True` takes a lot of memory...')
Expand Down Expand Up @@ -743,6 +888,10 @@ def compute_neighbors(
self._adata.shape[0],
self.n_neighbors,
)
elif method == 'tsne':
self._distances, self._connectivities = _compute_connectivities_tsne(
knn_indices[:, 1:], knn_distances[:, 1:], n_jobs=-1
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use n_jobs=-1 meaning all available cores, since this is what the other functions do as well (I guess).

)
# overwrite the umap connectivities if method is 'gauss'
# self._distances is unaffected by this
if method == 'gauss':
Expand Down
2 changes: 1 addition & 1 deletion scanpy/preprocessing/__init__.py
Expand Up @@ -9,4 +9,4 @@
from ._combat import combat
from ._normalization import normalize_total

from ..neighbors import neighbors
from ..neighbors import neighbors, neighbors_tsne
7 changes: 6 additions & 1 deletion scanpy/tests/test_embedding.py
Expand Up @@ -2,7 +2,7 @@

import numpy as np
import pytest
from sklearn.utils.testing import assert_array_almost_equal
from numpy.testing import assert_array_almost_equal
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scikit-learn imported this internally, and was throwing warnings that it will be removed from the scikit-learn public API.


import scanpy as sc

Expand All @@ -28,3 +28,8 @@ def test_umap_init_paga(layout):
sc.tl.paga(pbmc)
sc.pl.paga(pbmc, layout=layout, show=False)
sc.tl.umap(pbmc, init_pos="paga")


def test_tsne():
pbmc = sc.datasets.pbmc3k_processed()
sc.tl.tsne(pbmc, n_jobs=4)