In [None]:
import tiledbsoma

In [None]:
from tiledbsoma.io.ingest import from_h5ad

In [None]:
# Ingest the example H5AD file into a SOMA experiment, registering its contents
# under the measurement name "test_exp_name". The target is the absolute
# location that the rest of this notebook opens, so the notebook does not
# depend on the kernel's working directory being /opt/TileDB-SOMA.
e = from_h5ad("/opt/TileDB-SOMA/test", "/opt/cellxgene/example-dataset/pbmc3k.h5ad", "test_exp_name")
# Release the handle returned by the ingest; the experiment is reopened for reading below.
e.close()

## Experiment

An `Experiment` is a class that represents a single-cell experiment. It always contains two objects:
1. `obs`: A `DataFrame` with primary annotations on the observation axis.
2. `ms`: A `Collection` of measurements.

In [83]:
# Open the experiment read-only (the default mode, spelled out here) and
# display its summary as the cell output.
experiment = tiledbsoma.open("/opt/TileDB-SOMA/test/", mode="r")
experiment

<Experiment '/opt/TileDB-SOMA/test/' (open for 'r') (2 items)
    'ms': 'file:///opt/TileDB-SOMA/test/ms' (unopened)
    'obs': 'file:///opt/TileDB-SOMA/test/obs' (unopened)>

Each of these objects is opened when it is accessed as an attribute of the experiment:

In [85]:
# Accessing `ms` opens the measurements collection, which was listed as unopened above.
experiment.ms

<Collection 'file:///opt/TileDB-SOMA/test/ms' (open for 'r') (2 items)
    'raw': 'file:///opt/TileDB-SOMA/test/ms/raw' (unopened)
    'test_exp_name': 'file:///opt/TileDB-SOMA/test/ms/test_exp_name' (unopened)>

In [86]:
# Accessing `obs` opens the observation-axis DataFrame.
experiment.obs

<DataFrame 'file:///opt/TileDB-SOMA/test/obs' (open for 'r')>

Note that by default an `Experiment` is opened lazily, i.e., only the objects that are actually requested are opened.

Also, opening an object doesn't mean that it will be fetched entirely into memory. It only returns a pointer to the object on disk.

## DataFrame

A `DataFrame` is a multi-column table with a user-defined schema. The schema is expressed as an Arrow Schema, and defines the column names and value types.

As an example, let's take a look at `obs`, which is represented as a DataFrame.

We can inspect the schema using `.schema`:

In [100]:
# Keep a handle to the observation DataFrame and display its Arrow schema.
obs = experiment.obs
obs.schema

soma_joinid: int64
obs_id: large_string
n_genes: int64
percent_mito: float
n_counts: float
louvain: large_string

Note that `soma_joinid` is a field that exists in each `DataFrame` and acts as a join key for other objects, such as `SparseNDArray` (more on this later).

When a `DataFrame` is accessed, only metadata is retrieved, not actual data. This is important since a DataFrame can be very large and might not fit in memory.

To load the dataframe (or a subset of it) into memory, we call `df.read()` and materialize the result.

If the dataframe is small, we can convert it to an in-memory Pandas object like this:

In [103]:
# Load the whole `obs` dataframe into memory and show it as a Pandas object.
(
    obs.read()      # iterator over the stored rows
    .concat()       # materialize all rows in memory
    .to_pandas()    # hand the result to Pandas
)

Unnamed: 0,soma_joinid,obs_id,n_genes,percent_mito,n_counts,louvain
0,0,AAACATACAACCAC-1,781,0.030178,2419.0,CD4 T cells
1,1,AAACATTGAGCTAC-1,1352,0.037936,4903.0,B cells
2,2,AAACATTGATCAGC-1,1131,0.008897,3147.0,CD4 T cells
3,3,AAACCGTGCTTCCG-1,960,0.017431,2639.0,CD14+ Monocytes
4,4,AAACCGTGTATGCG-1,522,0.012245,980.0,NK cells
...,...,...,...,...,...,...
2633,2633,TTTCGAACTCTCAT-1,1155,0.021104,3459.0,CD14+ Monocytes
2634,2634,TTTCTACTGAGGCA-1,1227,0.009294,3443.0,B cells
2635,2635,TTTCTACTTCCTCG-1,622,0.021971,1684.0,B cells
2636,2636,TTTGCATGAGAGGC-1,454,0.020548,1022.0,B cells


Here, `read()` returns an iterator, `concat()` materializes all rows in memory as a single Arrow table, and `to_pandas()` converts that table to a Pandas DataFrame.

If the dataframe is bigger, we can select only a subset of it before materializing. Only the required subset is retrieved from disk into memory, so very large dataframes can be queried this way. In this example, we will select only the first 10 rows:

In [109]:
# Read only the requested row slice instead of the whole dataframe.
(
    obs.read((slice(0, 10),))   # coordinates are passed as a tuple holding one slice
    .concat()
    .to_pandas()
)

Unnamed: 0,soma_joinid,obs_id,n_genes,percent_mito,n_counts,louvain
0,0,AAACATACAACCAC-1,781,0.030178,2419.0,CD4 T cells
1,1,AAACATTGAGCTAC-1,1352,0.037936,4903.0,B cells
2,2,AAACATTGATCAGC-1,1131,0.008897,3147.0,CD4 T cells
3,3,AAACCGTGCTTCCG-1,960,0.017431,2639.0,CD14+ Monocytes
4,4,AAACCGTGTATGCG-1,522,0.012245,980.0,NK cells
5,5,AAACGCACTGGTAC-1,782,0.016644,2163.0,CD8 T cells
6,6,AAACGCTGACCAGT-1,783,0.038161,2175.0,CD8 T cells
7,7,AAACGCTGGTTCTT-1,790,0.030973,2260.0,CD8 T cells
8,8,AAACGCTGTAGCCA-1,533,0.011765,1275.0,CD4 T cells
9,9,AAACGCTGTTTCTG-1,550,0.029012,1103.0,FCGR3A+ Monocytes


We can also select a subset of the columns:

In [122]:
# Same row slice as before, but only the two named columns are retrieved.
(
    obs.read(
        (slice(0, 10),),
        column_names=["obs_id", "n_genes"],
    )
    .concat()
    .to_pandas()
)

Unnamed: 0,obs_id,n_genes
0,AAACATACAACCAC-1,781
1,AAACATTGAGCTAC-1,1352
2,AAACATTGATCAGC-1,1131
3,AAACCGTGCTTCCG-1,960
4,AAACCGTGTATGCG-1,522
5,AAACGCACTGGTAC-1,782
6,AAACGCTGACCAGT-1,783
7,AAACGCTGGTTCTT-1,790
8,AAACGCTGTAGCCA-1,533
9,AAACGCTGTTTCTG-1,550


Finally, we can use `value_filter` to retrieve a filtered subset of rows that match a certain condition.

In [131]:
# No row restriction (slice(None)); rows are selected by the value filter instead.
(
    obs.read(
        (slice(None),),
        value_filter="n_genes > 1500",
    )
    .concat()
    .to_pandas()
)

Unnamed: 0,soma_joinid,obs_id,n_genes,percent_mito,n_counts,louvain
0,26,AAATCAACCCTATT-1,1545,0.024313,5676.0,CD4 T cells
1,59,AACCTACTGTGAGG-1,1652,0.015839,5682.0,CD14+ Monocytes
2,107,AAGCACTGGTTCTT-1,1717,0.023566,6153.0,B cells
3,109,AAGCCATGAACTGC-1,1877,0.014015,7064.0,Dendritic cells
4,247,ACCCAGCTGTTAGC-1,1547,0.020600,5534.0,CD14+ Monocytes
...,...,...,...,...,...,...
70,2508,TTACTCGACGCAAT-1,1603,0.024851,5030.0,Dendritic cells
71,2530,TTATGGCTTATGGC-1,1783,0.022064,6164.0,Dendritic cells
72,2597,TTGAGGACTACGCA-1,1794,0.024440,6342.0,Dendritic cells
73,2623,TTTAGCTGTACTCT-1,1567,0.021160,5671.0,Dendritic cells


## Collection

## Measurement

## DenseNDArray

## SparseNDArray

In [None]:
# DataFrame: bind `obs` to the experiment's observation DataFrame and display it.
# NOTE(review): `obs` is already assigned the same way earlier in the notebook.
obs = experiment.obs
obs

In [None]:
# Collection: bind `ms` to the experiment's collection of measurements and display it.
ms = experiment.ms
ms

In [94]:
# Measurement: look up the ingested measurement by name in the `ms` collection.
meas = ms["test_exp_name"]
meas

<Measurement 'file:///opt/TileDB-SOMA/test/ms/test_exp_name' (open for 'r') (5 items)
    'obsm': 'file:///opt/TileDB-SOMA/test/ms/test_exp_name/obsm' (unopened)
    'varm': 'file:///opt/TileDB-SOMA/test/ms/test_exp_name/varm' (unopened)
    'X': Collection 'file:///opt/TileDB-SOMA/test/ms/test_exp_name/X' (open for 'r') (1 item)
        'data': DenseNDArray 'file:///opt/TileDB-SOMA/test/ms/test_exp_name/X/data' (open for 'r')
    'obsp': 'file:///opt/TileDB-SOMA/test/ms/test_exp_name/obsp' (unopened)
    'var': DataFrame 'file:///opt/TileDB-SOMA/test/ms/test_exp_name/var' (open for 'r')>

In [None]:
# DenseNDArray: the "data" entry of the measurement's `X` collection.
meas.X["data"]

In [None]:
# For SparseNDArray, convert the original h5ad to use a sparse matrix for X and re-open it

import anndata as ad