-
Notifications
You must be signed in to change notification settings - Fork 342
/
_datasets.py
677 lines (557 loc) · 21.1 KB
/
_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
from __future__ import annotations
import anndata
from scvi._types import AnnOrMuData
from ._built_in_data._brain_large import _load_brainlarge_dataset
from ._built_in_data._cellxgene import _load_cellxgene_dataset
from ._built_in_data._cite_seq import (
_load_pbmc_seurat_v4_cite_seq,
_load_pbmcs_10x_cite_seq,
_load_spleen_lymph_cite_seq,
)
from ._built_in_data._cortex import _load_cortex
from ._built_in_data._csv import _load_breast_cancer_dataset, _load_mouse_ob_dataset
from ._built_in_data._dataset_10x import _load_dataset_10x
from ._built_in_data._heartcellatlas import _load_heart_cell_atlas_subsampled
from ._built_in_data._loom import (
_load_annotation_simulation,
_load_frontalcortex_dropseq,
_load_prefrontalcortex_starmap,
_load_retina,
)
from ._built_in_data._pbmc import _load_pbmc_dataset, _load_purified_pbmc_dataset
from ._built_in_data._smfish import _load_smfish
from ._built_in_data._synthetic import _generate_synthetic
def pbmc_dataset(
    save_path: str = "data/",
    remove_extracted_data: bool = True,
) -> anndata.AnnData:
    """Load the two-batch PBMC dataset.

    The data are scRNA-seq measurements from two batches of peripheral blood
    mononuclear cells (PBMCs) from a healthy donor (4K PBMCs and 8K PBMCs).
    Quality control metrics were derived with the cellrangerRkit R package
    (v. 1.1.0) and extracted from CellRanger through the molecule specific
    information file. After filtering, 12,039 cells with 10,310 sampled genes
    remain and yield biologically meaningful clusters with the software Seurat.
    Genes that could not be matched with the bulk data used for differential
    expression are then removed, leaving g = 3346.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    remove_extracted_data
        If true, will remove the folder the data was extracted to.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``), label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.pbmc_dataset()
    """
    # Thin public wrapper: every argument is forwarded unchanged by keyword.
    loader_options = {
        "save_path": save_path,
        "remove_extracted_data": remove_extracted_data,
    }
    return _load_pbmc_dataset(**loader_options)
def dataset_10x(
    dataset_name: str | None = None,
    filename: str | None = None,
    save_path: str = "data/10X",
    url: str | None = None,
    return_filtered: bool = True,
    remove_extracted_data: bool = False,
    **scanpy_read_10x_kwargs,
) -> anndata.AnnData:
    """Loads a file from `10x <http://cf.10xgenomics.com/>`_ website.

    Parameters
    ----------
    dataset_name
        Name of the dataset file. Has to be one of:
        "frozen_pbmc_donor_a", "frozen_pbmc_donor_b", "frozen_pbmc_donor_c", "fresh_68k_pbmc_donor_a",
        "cd14_monocytes", "b_cells", "cd34", "cd56_nk", "cd4_t_helper", "regulatory_t", "naive_t",
        "memory_t", "cytotoxic_t", "naive_cytotoxic", "pbmc8k", "pbmc4k", "t_3k", "t_4k", "neuron_9k",
        "pbmc_1k_protein_v3", "pbmc_10k_protein_v3", "malt_10k_protein_v3", "pbmc_1k_v2", "pbmc_1k_v3",
        "pbmc_10k_v3", "hgmm_1k_v2", "hgmm_1k_v3", "hgmm_5k_v3", "hgmm_10k_v3", "neuron_1k_v2",
        "neuron_1k_v3", "neuron_10k_v3", "heart_1k_v2", "heart_1k_v3", "heart_10k_v3", "5k_pbmc_protein_v3",
        "5k_pbmc_protein_v3_nextgem", "1M_neurons".
    filename
        Manual override of the filename to write to.
    save_path
        Location to use when saving/loading the data.
    url
        Manual override of the download remote location.
        Note that we already provide urls for most 10X datasets,
        which are automatically formed only using the ``dataset_name``.
    return_filtered
        Either `filtered` data or `raw` data.
    remove_extracted_data
        Whether to remove extracted archives in the case of `.tar.gz` downloads.
    **scanpy_read_10x_kwargs
        Kwargs for scanpy's read_10x function.

    Returns
    -------
    adata initialized with 10x data

    Examples
    --------
    >>> import scvi
    >>> neuron = scvi.data.dataset_10x("neuron_9k")
    """
    # All arguments, including the extra scanpy kwargs, are forwarded unchanged.
    return _load_dataset_10x(
        dataset_name=dataset_name,
        filename=filename,
        save_path=save_path,
        url=url,
        return_filtered=return_filtered,
        remove_extracted_data=remove_extracted_data,
        **scanpy_read_10x_kwargs,
    )
def cellxgene(
    url: str,
    filename: str | None = None,
    save_path: str = "data/",
    return_path: bool = False,
) -> anndata.AnnData | str:
    """Load a dataset from the `cellxgene <https://cellxgene.cziscience.com/>`_ portal.

    Parameters
    ----------
    url
        URL to cellxgene session.
    filename
        Manual override of the filename to write to.
    save_path
        Location to use when saving/loading the data.
    return_path
        Whether to return the path to the downloaded file.

    Returns
    -------
    adata initialized with cellxgene data. The return annotation also allows a
    ``str`` -- presumably the file path when ``return_path`` is true; confirm
    against the loader.
    """
    # Gather the call arguments once, then delegate to the private loader.
    request = {
        "url": url,
        "filename": filename,
        "save_path": save_path,
        "return_path": return_path,
    }
    return _load_cellxgene_dataset(**request)
def smfish(
    save_path: str = "data/",
    use_high_level_cluster: bool = True,
) -> anndata.AnnData:
    """Load osmFISH data of mouse cortex cells from the Linnarsson lab.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    use_high_level_cluster
        If True, use higher-level agglomerate clusters.
        The resulting cell types are "Astrocytes", "Endothelials", "Inhibitory",
        "Microglias", "Oligodendrocytes" and "Pyramidals".

    Returns
    -------
    AnnData with batch info (``.obs['batch']``), label info (``.obs['labels']``),
    spatial info (``.obs['x_coord']``, ``.obs['y_coord']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.smfish()
    """
    # Delegate to the private loader; arguments pass through by keyword.
    adata = _load_smfish(save_path=save_path, use_high_level_cluster=use_high_level_cluster)
    return adata
def purified_pbmc_dataset(
    save_path: str = "data/",
    subset_datasets: list[str] | None = None,
) -> anndata.AnnData:
    """Load the purified PBMC dataset.

    From: "Massively parallel digital transcriptional profiling of single cells".

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    subset_datasets
        Index for subsetting the following list of datasets
        which are used to form the ``PurifiedPBMCDataset``:
        "cd4_t_helper", "regulatory_t", "naive_t", "memory_t", "cytotoxic_t", "naive_cytotoxic",
        "b_cells", "cd4_t_helper", "cd34", "cd56_nk", "cd14_monocytes".

    Returns
    -------
    AnnData with batch info (``.obs['batch']``) and label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.purified_pbmc_dataset()
    """
    # Pure pass-through to the private loader.
    loader_options = {"save_path": save_path, "subset_datasets": subset_datasets}
    return _load_purified_pbmc_dataset(**loader_options)
def prefrontalcortex_starmap(save_path: str = "data/") -> anndata.AnnData:
    """Load a starMAP dataset of mouse pre-frontal cortex (Wang et al., 2018).

    The dataset has 3,704 cells and 166 genes.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``), label info (``.obs['labels']``),
    spatial info (``.obs['x_coord']``, ``.obs['y_coord']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.prefrontalcortex_starmap()
    """
    # Delegate to the private loom loader.
    adata = _load_prefrontalcortex_starmap(save_path=save_path)
    return adata
def frontalcortex_dropseq(save_path: str = "data/") -> anndata.AnnData:
    """Load the cells from the mouse frontal cortex sequenced by the Dropseq technology (Saunders et al., 2018).

    Load the 71639 annotated cells located in the frontal cortex of adult mice among the 690,000 cells
    studied by (Saunders et al., 2018) using the Drop-seq method. The gene expression matrix is
    71639 x 7611.

    This function accepts no gene-subsetting argument: ``save_path`` is the only value
    forwarded to the loader.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``) and label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.frontalcortex_dropseq()
    """
    return _load_frontalcortex_dropseq(save_path=save_path)
def annotation_simulation(name: str, save_path: str = "data/") -> anndata.AnnData:
    """Simulated datasets for scANVI tutorials.

    Parameters
    ----------
    name
        One of "1", "2", or "3".
    save_path
        Location to use when saving/loading the data.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``) and label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.annotation_simulation("1")
    """
    return _load_annotation_simulation(name=name, save_path=save_path)
def retina(save_path: str = "data/") -> anndata.AnnData:
    """Loads retina dataset.

    The dataset of bipolar cells contains, after the original authors' filtering pipeline,
    27,499 cells and 13,166 genes coming from two batches. We use the cluster annotation
    from 15 cell-types from the author. We also extract their normalized data with Combat
    and use it for benchmarking.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``) and label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.retina()
    """
    return _load_retina(save_path=save_path)
def mouse_ob_dataset(save_path: str = "data/") -> anndata.AnnData:
    """Load the mouse ob dataset.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``) and label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.mouse_ob_dataset()
    """
    # Delegate to the private CSV loader.
    adata = _load_mouse_ob_dataset(save_path=save_path)
    return adata
def breast_cancer_dataset(save_path: str = "data/") -> anndata.AnnData:
    """Load the breast cancer dataset.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``) and label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.breast_cancer_dataset()
    """
    # Delegate to the private CSV loader.
    adata = _load_breast_cancer_dataset(save_path=save_path)
    return adata
def pbmcs_10x_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
) -> anndata.AnnData:
    """Load filtered PBMCs from 10x Genomics profiled with RNA and protein.

    Datasets were filtered for doublets and other outliers as in
    https://github.com/YosefLab/totalVI_reproducibility/blob/master/data/data_filtering_scripts/pbmc_10k/pbmc_10k.py

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``),
    and protein expression (``.obsm["protein_expression"]``)

    Missing protein values are zero when ``protein_join == "outer"``, and are
    identified during ``AnnData`` setup.

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.pbmcs_10x_cite_seq()
    """
    # Pure pass-through to the private CITE-seq loader.
    loader_options = {"save_path": save_path, "protein_join": protein_join}
    return _load_pbmcs_10x_cite_seq(**loader_options)
def spleen_lymph_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    remove_outliers: bool = True,
) -> anndata.AnnData:
    """Load immune cells from the murine spleen and lymph nodes :cite:p:`GayosoSteier21`.

    This dataset was used throughout the totalVI manuscript, and named SLN-all.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins.
    remove_outliers
        Whether to remove clusters annotated as doublet or low quality.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``), label info (``.obs['cell_types']``),
    protein expression (``.obsm["protein_expression"]``), and tissue (``.obs['tissue']``).

    Missing protein values are zero when ``protein_join == "outer"``, and are
    identified during ``AnnData`` setup.

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.spleen_lymph_cite_seq()
    """
    # Pure pass-through to the private CITE-seq loader.
    loader_options = {
        "save_path": save_path,
        "protein_join": protein_join,
        "remove_outliers": remove_outliers,
    }
    return _load_spleen_lymph_cite_seq(**loader_options)
def brainlarge_dataset(
    save_path: str = "data/",
    sample_size_gene_var: int = 10000,
    max_cells_to_keep: int | None = None,
    n_genes_to_keep: int = 720,
    loading_batch_size: int = 100000,
) -> anndata.AnnData:
    """Load the brain-large dataset.

    This dataset contains 1.3 million brain cells from
    `10x Genomics <https://support.10xgenomics.com/single-cell-gene-expression/datasets>`_.
    We randomly shuffle the data to get a 1M subset of cells and order genes by
    variance to retain first 10,000 and then 720 sampled variable genes.
    This dataset is then sampled multiple times in cells for the runtime and
    goodness-of-fit analysis. We report imputation scores on the 10k cells and
    720 genes samples only.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    sample_size_gene_var
        Number of cells to use to estimate gene variances.
    max_cells_to_keep
        Maximum number of cells to keep.
    n_genes_to_keep
        Number of genes to keep, ordered by decreasing variance.
    loading_batch_size
        Number of cells to use for each chunk loaded.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``) and label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.brainlarge_dataset()
    """
    # Pure pass-through to the private loader, by keyword.
    loader_options = {
        "save_path": save_path,
        "sample_size_gene_var": sample_size_gene_var,
        "max_cells_to_keep": max_cells_to_keep,
        "n_genes_to_keep": n_genes_to_keep,
        "loading_batch_size": loading_batch_size,
    }
    return _load_brainlarge_dataset(**loader_options)
def cortex(save_path: str = "data/") -> anndata.AnnData:
    """Load the cortex dataset.

    The
    `Mouse Cortex Cells dataset <https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt>`_
    contains 3005 mouse cortex cells and gold-standard labels for seven distinct
    cell types. Each cell type corresponds to a cluster to recover.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.

    Returns
    -------
    AnnData with batch info (``.obs['batch']``) and label info (``.obs['labels']``)

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.cortex()
    """
    # ``save_path`` is passed positionally, exactly as the loader is called upstream.
    adata = _load_cortex(save_path)
    return adata
def synthetic_iid(
    batch_size: int = 200,
    n_genes: int = 100,
    n_proteins: int = 100,
    n_regions: int = 100,
    n_batches: int = 2,
    n_labels: int = 3,
    dropout_ratio: float = 0.7,
    sparse_format: str | None = None,
    return_mudata: bool = False,
) -> AnnOrMuData:
    """Synthetic multimodal dataset.

    RNA and accessibility data are generated from a zero-inflated negative binomial,
    while protein data is generated from a negative binomial distribution. This dataset
    is just for testing purposes and not meant for modeling or research. Each value is
    independently and identically distributed.

    Parameters
    ----------
    batch_size
        The number of cells per batch such that the total number of cells in the data is
        `batch_size * n_batches`.
    n_genes
        The number of genes to generate. Must be at least 1.
    n_proteins
        The number of proteins to generate.
    n_regions
        The number of accessibility regions to generate.
    n_batches
        The number of batches to generate. Must be at least 1.
    n_labels
        The number of cell type labels, distributed uniformly across batches.
    dropout_ratio
        The expected percentage of zeros artificially added into the data for RNA
        and accessibility data.
    sparse_format
        Whether to store RNA, accessibility, and protein data as sparse arrays. One of
        the following:

        * `None`: Store as a dense :class:`numpy.ndarray`.
        * `"csr_matrix"`: Store as a :class:`scipy.sparse.csr_matrix`.
        * `"csc_matrix"`: Store as a :class:`scipy.sparse.csc_matrix`.
    return_mudata
        Returns a :class:`~mudata.MuData` if `True`, else :class:`~anndata.AnnData`.

    Returns
    -------
    :class:`~anndata.AnnData` (if `return_mudata=False`) with the following fields:

    * `.obs["batch"]`: Categorical batch labels in the format `batch_{i}`.
    * `.obs["labels"]`: Categorical cell type labels in the format `label_{i}`.
    * `.obsm["protein_expression"]`: Protein expression matrix.
    * `.uns["protein_names"]`: Array of protein names.
    * `.obsm["accessibility"]`: Accessibility expression matrix.

    :class:`~mudata.MuData` (if `return_mudata=True`) with the following fields:

    * `.obs["batch"]`: Categorical batch labels in the format `batch_{i}`.
    * `.obs["labels"]`: Categorical cell type labels in the format `label_{i}`.
    * `.mod["rna"]`: RNA expression data.
    * `.mod["protein_expression"]`: Protein expression data.
    * `.mod["accessibility"]`: Accessibility expression data.

    Raises
    ------
    ValueError
        If `n_batches` or `n_genes` is smaller than 1.

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.synthetic_iid()
    """
    # Reject degenerate sizes up front, before any data is generated.
    if n_batches < 1:
        raise ValueError("`n_batches` must be greater than 0")
    if n_genes < 1:
        raise ValueError("`n_genes` must be greater than 0")

    return _generate_synthetic(
        batch_size=batch_size,
        n_genes=n_genes,
        n_proteins=n_proteins,
        n_regions=n_regions,
        n_batches=n_batches,
        n_labels=n_labels,
        dropout_ratio=dropout_ratio,
        sparse_format=sparse_format,
        return_mudata=return_mudata,
    )
def heart_cell_atlas_subsampled(
    save_path: str = "data/",
    remove_nuisance_clusters: bool = True,
) -> anndata.AnnData:
    """Load combined single cell and single nuclei RNA-Seq data of 485K cardiac cells with annotations.

    Dataset was filtered down randomly to 20k cells using :meth:`~scanpy.pp.subsample`.
    The original data can be downloaded from https://www.heartcellatlas.org/#DataSources.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    remove_nuisance_clusters
        Remove doublets and unassigned cells.

    Returns
    -------
    AnnData

    Notes
    -----
    The data were filtered using the following sequence::

        >>> adata = anndata.read_h5ad(path_to_anndata)
        >>> bdata = sc.pp.subsample(adata, n_obs=20000, copy=True)
        >>> sc.pp.filter_genes(bdata, min_counts=3)
        >>> bdata.write_h5ad(path, compression="gzip")

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.heart_cell_atlas_subsampled()
    """
    # Pure pass-through to the private loader.
    loader_options = {
        "save_path": save_path,
        "remove_nuisance_clusters": remove_nuisance_clusters,
    }
    return _load_heart_cell_atlas_subsampled(**loader_options)
def pbmc_seurat_v4_cite_seq(
    save_path: str = "data/",
    apply_filters: bool = True,
    aggregate_proteins: bool = True,
    mask_protein_batches: int = 0,
) -> anndata.AnnData:
    """Load the dataset of PBMCs measured with CITE-seq (161764 cells).

    This dataset was first presented in the Seurat v4 paper:

    https://doi.org/10.1016/j.cell.2021.04.048

    It contains 8 volunteers in an HIV vaccine trial measured
    at 3 time points; thus, there are 24 batches in this dataset.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    apply_filters
        Apply filters at cell and protein level. At the cell level,
        this filters on protein library size, number proteins detected,
        percent mito, and removes cells labeled as doublets.
    aggregate_proteins
        Antibodies targeting the same surface protein are added together,
        and isotype controls are removed. See the source code for full details.
    mask_protein_batches
        Set proteins in this many batches to be all zero (considered missing
        for :class:`~scvi.model.TOTALVI`). This improves transfer learning
        with this dataset.

    Returns
    -------
    AnnData

    Notes
    -----
    This is not the same exact dataset as can be downloaded from:

    https://satijalab.org/seurat/articles/multimodal_reference_mapping.html

    This is due to the fact that the object linked in the tutorial above does
    not contain the actual UMI count data for RNA. UMI counts had to be separately
    downloaded from GEO (GSE164378). The counts in that object are an output of the
    scTransform method and should not be treated like UMI counts.

    Examples
    --------
    >>> import scvi
    >>> adata = scvi.data.pbmc_seurat_v4_cite_seq()
    """
    # Pure pass-through to the private CITE-seq loader.
    loader_options = {
        "save_path": save_path,
        "apply_filters": apply_filters,
        "aggregate_proteins": aggregate_proteins,
        "mask_protein_batches": mask_protein_batches,
    }
    return _load_pbmc_seurat_v4_cite_seq(**loader_options)