New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Switch t-SNE implementation to openTSNE #1561
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,3 +22,4 @@ umap-learn>=0.3.10,<0.5 | |
legacy-api-wrap | ||
packaging | ||
sinfo | ||
openTSNE>=0.5.0 | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
from typing import Union, Optional, Any, Mapping, Callable, NamedTuple, Generator, Tuple | ||
|
||
import numpy as np | ||
import openTSNE | ||
import scipy | ||
from anndata import AnnData | ||
from scipy.sparse import issparse, coo_matrix, csr_matrix | ||
|
@@ -163,6 +164,137 @@ def neighbors( | |
return adata if copy else None | ||
|
||
|
||
@_doc_params(n_pcs=doc_n_pcs, use_rep=doc_use_rep) | ||
def neighbors_tsne( | ||
adata: AnnData, | ||
perplexity: float = 30, | ||
n_pcs: Optional[int] = None, | ||
use_rep: Optional[str] = None, | ||
knn: bool = True, | ||
random_state: AnyRandom = 0, | ||
metric: Union[_Metric, _MetricFn] = 'euclidean', | ||
metric_kwds: Mapping[str, Any] = MappingProxyType({}), | ||
key_added: Optional[str] = None, | ||
copy: bool = False, | ||
) -> Optional[AnnData]: | ||
"""\ | ||
Compute a neighborhood graph of observations [McInnes18]_. | ||
|
||
The neighbor search efficiency of this heavily relies on UMAP [McInnes18]_, | ||
which also provides a method for estimating connectivities of data points - | ||
the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`, | ||
connectivities are computed according to [Coifman05]_, in the adaption of | ||
[Haghverdi16]_. | ||
|
||
Parameters | ||
---------- | ||
adata | ||
Annotated data matrix. | ||
n_neighbors | ||
The size of local neighborhood (in terms of number of neighboring data | ||
points) used for manifold approximation. Larger values result in more | ||
global views of the manifold, while smaller values result in more local | ||
data being preserved. In general values should be in the range 2 to 100. | ||
If `knn` is `True`, number of nearest neighbors to be searched. If `knn` | ||
is `False`, a Gaussian kernel width is set to the distance of the | ||
`n_neighbors` neighbor. | ||
{n_pcs} | ||
{use_rep} | ||
knn | ||
If `True`, use a hard threshold to restrict the number of neighbors to | ||
`n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian | ||
Kernel to assign low weights to neighbors more distant than the | ||
`n_neighbors` nearest neighbor. | ||
random_state | ||
A numpy random seed. | ||
method | ||
Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_ | ||
with adaptive width [Haghverdi16]_) for computing connectivities. | ||
Use 'rapids' for the RAPIDS implementation of UMAP (experimental, GPU | ||
only). | ||
metric | ||
A known metric’s name or a callable that returns a distance. | ||
metric_kwds | ||
Options for the metric. | ||
key_added | ||
If not specified, the neighbors data is stored in .uns['neighbors'], | ||
distances and connectivities are stored in .obsp['distances'] and | ||
.obsp['connectivities'] respectively. | ||
If specified, the neighbors data is added to .uns[key_added], | ||
distances are stored in .obsp[key_added+'_distances'] and | ||
connectivities in .obsp[key_added+'_connectivities']. | ||
copy | ||
Return a copy instead of writing to adata. | ||
|
||
Returns | ||
------- | ||
Depending on `copy`, updates or returns `adata` with the following: | ||
|
||
See `key_added` parameter description for the storage path of | ||
connectivities and distances. | ||
|
||
**connectivities** : sparse matrix of dtype `float32`. | ||
Weighted adjacency matrix of the neighborhood graph of data | ||
points. Weights should be interpreted as connectivities. | ||
**distances** : sparse matrix of dtype `float32`. | ||
Instead of decaying weights, this stores distances for each pair of | ||
neighbors. | ||
""" | ||
start = logg.info('computing neighbors') | ||
adata = adata.copy() if copy else adata | ||
if adata.is_view: # we shouldn't need this here... | ||
adata._init_as_actual(adata.copy()) | ||
|
||
n_neighbors = int(np.ceil(perplexity * 3)) + 1 # +1 because we it includes self | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is really the only functionally different LOC from the original |
||
|
||
neighbors = Neighbors(adata) | ||
neighbors.compute_neighbors( | ||
n_neighbors=n_neighbors, knn=knn, n_pcs=n_pcs, use_rep=use_rep, | ||
method="tsne", metric=metric, metric_kwds=metric_kwds, | ||
random_state=random_state, | ||
) | ||
|
||
if key_added is None: | ||
key_added = 'neighbors' | ||
conns_key = 'connectivities' | ||
dists_key = 'distances' | ||
else: | ||
conns_key = key_added + '_connectivities' | ||
dists_key = key_added + '_distances' | ||
|
||
adata.uns[key_added] = {} | ||
|
||
neighbors_dict = adata.uns[key_added] | ||
|
||
neighbors_dict['connectivities_key'] = conns_key | ||
neighbors_dict['distances_key'] = dists_key | ||
|
||
neighbors_dict['params'] = {'n_neighbors': neighbors.n_neighbors, 'method': 'tsne'} | ||
neighbors_dict['params']['metric'] = metric | ||
if metric_kwds: | ||
neighbors_dict['params']['metric_kwds'] = metric_kwds | ||
if use_rep is not None: | ||
neighbors_dict['params']['use_rep'] = use_rep | ||
if n_pcs is not None: | ||
neighbors_dict['params']['n_pcs'] = n_pcs | ||
|
||
adata.obsp[dists_key] = neighbors.distances | ||
adata.obsp[conns_key] = neighbors.connectivities | ||
|
||
if neighbors.rp_forest is not None: | ||
neighbors_dict['rp_forest'] = neighbors.rp_forest | ||
logg.info( | ||
' finished', | ||
time=start, | ||
deep=( | ||
f'added to `.uns[{key_added!r}]`\n' | ||
f' `.obsp[{dists_key!r}]`, distances for each pair of neighbors\n' | ||
f' `.obsp[{conns_key!r}]`, weighted adjacency matrix' | ||
), | ||
) | ||
return adata if copy else None | ||
|
||
|
||
class FlatTree(NamedTuple): | ||
hyperplanes: None | ||
offsets: None | ||
|
@@ -369,6 +501,19 @@ def _compute_connectivities_umap( | |
return distances, connectivities.tocsr() | ||
|
||
|
||
def _compute_connectivities_tsne(knn_indices, knn_dists, n_jobs): | ||
n_samples, k_neighbors = knn_indices.shape | ||
connectivities = openTSNE.affinity.joint_probabilities_nn( | ||
knn_indices, knn_dists, [k_neighbors / 3], n_jobs=n_jobs, | ||
) | ||
|
||
distances = _get_sparse_matrix_from_indices_distances_umap( | ||
knn_indices, knn_dists, n_samples, k_neighbors | ||
) | ||
|
||
return distances, connectivities.tocsr() | ||
|
||
|
||
def _get_sparse_matrix_from_indices_distances_numpy(indices, distances, n_obs, n_neighbors): | ||
n_nonzero = n_obs * n_neighbors | ||
indptr = np.arange(0, n_nonzero + 1, n_neighbors) | ||
|
@@ -692,11 +837,11 @@ def compute_neighbors( | |
if n_neighbors > self._adata.shape[0]: # very small datasets | ||
n_neighbors = 1 + int(0.5*self._adata.shape[0]) | ||
logg.warning(f'n_obs too small: adjusting to `n_neighbors = {n_neighbors}`') | ||
if method == 'umap' and not knn: | ||
if method in {'umap', 'tsne'} and not knn: | ||
raise ValueError('`method = \'umap\' only with `knn = True`.') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This ValueError should now probably print |
||
if method == 'rapids' and metric != 'euclidean': | ||
raise ValueError("`method` 'rapids' only supports the 'euclidean' `metric`.") | ||
if method not in {'umap', 'gauss', 'rapids'}: | ||
if method not in {'umap', 'gauss', 'rapids', 'tsne'}: | ||
raise ValueError("`method` needs to be 'umap', 'gauss', or 'rapids'.") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add |
||
if self._adata.shape[0] >= 10000 and not knn: | ||
logg.warning('Using high n_obs without `knn=True` takes a lot of memory...') | ||
|
@@ -743,6 +888,10 @@ def compute_neighbors( | |
self._adata.shape[0], | ||
self.n_neighbors, | ||
) | ||
elif method == 'tsne': | ||
self._distances, self._connectivities = _compute_connectivities_tsne( | ||
knn_indices[:, 1:], knn_distances[:, 1:], n_jobs=-1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
) | ||
# overwrite the umap connectivities if method is 'gauss' | ||
# self._distances is unaffected by this | ||
if method == 'gauss': | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
|
||
import numpy as np | ||
import pytest | ||
from sklearn.utils.testing import assert_array_almost_equal | ||
from numpy.testing import assert_array_almost_equal | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. scikit-learn imported this internally, and was throwing warnings that it will be removed from the scikit-learn public API. |
||
|
||
import scanpy as sc | ||
|
||
|
@@ -28,3 +28,8 @@ def test_umap_init_paga(layout): | |
sc.tl.paga(pbmc) | ||
sc.pl.paga(pbmc, layout=layout, show=False) | ||
sc.tl.umap(pbmc, init_pos="paga") | ||
|
||
|
||
def test_tsne(): | ||
pbmc = sc.datasets.pbmc3k_processed() | ||
sc.tl.tsne(pbmc, n_jobs=4) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This needs to be updated to 0.6, when I release it. Until then, CI will fail here.