In [1]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
# Print the installed anndata version so the outputs below can be tied to a release.
print(ad.__version__)

0.10.7


In [30]:
# Build a minimal AnnData object from synthetic count data.
# Seed NumPy's legacy global RNG first so that this cell, and the later cells
# that also draw from np.random, produce the same values on every
# Restart Kernel -> Run All.
np.random.seed(0)
# Poisson(1) draws for 100 observations x 2000 variables, stored as a sparse
# float32 CSR matrix.
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
adata

AnnData object with n_obs × n_vars = 100 × 2000

In [31]:
# Inspect the core data matrix; the repr below shows it is held as a sparse CSR matrix.
adata.X

<100x2000 sparse matrix of type '<class 'numpy.float32'>'
	with 126677 stored elements in Compressed Sparse Row format>

In [32]:
# Label the two axes: observations (cells) and variables (genes) each get a
# string name built from their integer position.
adata.obs_names = list(map("Cell_{:d}".format, range(adata.n_obs)))
adata.var_names = list(map("Gene_{:d}".format, range(adata.n_vars)))
# Peek at the first ten observation labels.
print(adata.obs_names[:10])

Index(['Cell_0', 'Cell_1', 'Cell_2', 'Cell_3', 'Cell_4', 'Cell_5', 'Cell_6',
       'Cell_7', 'Cell_8', 'Cell_9'],
      dtype='object')


In [33]:
# Subset by label: the first list selects observations by name, the second
# selects variables by name. The repr below shows the result is a view.
adata[["Cell_1", "Cell_10"], ["Gene_5", "Gene_1900"]]

View of AnnData object with n_obs × n_vars = 2 × 2

In [34]:
# Attach observation-level metadata: one randomly drawn cell-type label per cell.
ct = np.random.choice(["B", "T", "Monocyte"], size=adata.n_obs)
# Store the labels as a pandas Categorical (preferred for efficiency).
adata.obs["cell_type"] = pd.Categorical(ct)
adata.obs

Unnamed: 0,cell_type
Cell_0,B
Cell_1,Monocyte
Cell_2,T
Cell_3,Monocyte
Cell_4,B
...,...
Cell_95,T
Cell_96,B
Cell_97,B
Cell_98,T


In [9]:
# Re-display the object; its summary now lists the obs column added above.
adata

AnnData object with n_obs × n_vars = 100 × 2000
    obs: 'cell_type'

In [10]:
# Multi-dimensional annotations, filled with standard-normal noise as placeholders.
# obsm: one row per observation; two columns that we can interpret as a UMAP.
adata.obsm["X_umap"] = np.random.normal(loc=0, scale=1, size=(adata.n_obs, 2))
# varm: one row per variable; five arbitrary columns.
adata.varm["gene_stuff"] = np.random.normal(loc=0, scale=1, size=(adata.n_vars, 5))
adata.obsm

AxisArrays with keys: X_umap

In [11]:
# Unstructured metadata: store an arbitrary Python object (here a plain list)
# under a key in adata.uns, then display the mapping.
adata.uns["random"] = [1, 2, 3]
adata.uns

OrderedDict([('random', [1, 2, 3])])

In [13]:
# Layers: different forms of the original core data, kept under a name.
# Here log(1 + x) of X is stored as the "log_transformed" layer.
adata.layers["log_transformed"] = np.log1p(adata.X)
adata

AnnData object with n_obs × n_vars = 100 × 2000
    obs: 'cell_type'
    uns: 'random'
    obsm: 'X_umap'
    varm: 'gene_stuff'
    layers: 'log_transformed'

In [15]:
# Conversion to a DataFrame: export the chosen layer with observation names as
# the row index and variable names as the columns (see the table below).
adata.to_df(layer="log_transformed")

Unnamed: 0,Gene_0,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5,Gene_6,Gene_7,Gene_8,Gene_9,...,Gene_1990,Gene_1991,Gene_1992,Gene_1993,Gene_1994,Gene_1995,Gene_1996,Gene_1997,Gene_1998,Gene_1999
Cell_0,0.693147,0.000000,0.000000,1.098612,0.000000,0.693147,1.098612,0.693147,0.000000,1.098612,...,0.693147,0.693147,0.693147,0.693147,1.386294,0.693147,0.693147,0.000000,1.098612,0.000000
Cell_1,0.000000,0.000000,0.693147,0.693147,0.693147,0.693147,0.000000,0.000000,0.693147,0.000000,...,1.098612,1.098612,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,1.098612,0.693147
Cell_2,1.386294,0.000000,0.693147,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.000000,0.693147,1.098612,0.693147,1.098612,0.000000,0.000000,1.386294,1.386294
Cell_3,0.693147,0.693147,0.000000,0.000000,0.693147,0.000000,0.693147,0.000000,0.000000,0.000000,...,0.000000,0.693147,0.693147,0.693147,0.693147,0.000000,0.693147,0.693147,0.000000,0.693147
Cell_4,0.693147,0.000000,1.386294,1.098612,1.386294,1.098612,0.000000,1.386294,0.693147,1.609438,...,0.000000,0.000000,0.000000,0.000000,1.098612,0.000000,0.693147,0.000000,0.693147,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cell_95,1.098612,0.693147,0.693147,1.386294,0.000000,0.693147,1.098612,0.000000,0.000000,0.000000,...,0.693147,0.693147,1.098612,1.098612,0.000000,1.386294,0.693147,1.098612,0.000000,1.098612
Cell_96,1.098612,1.098612,0.693147,0.000000,0.693147,0.000000,1.098612,1.098612,0.000000,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.000000,0.693147,0.000000,1.386294,0.000000,0.000000
Cell_97,1.098612,1.098612,1.098612,0.000000,0.000000,0.693147,1.098612,1.098612,0.693147,1.098612,...,0.000000,0.693147,1.386294,0.000000,0.000000,0.693147,0.693147,0.000000,0.693147,1.098612
Cell_98,0.000000,0.000000,0.693147,0.693147,1.098612,1.098612,0.000000,1.609438,1.098612,0.693147,...,0.000000,1.098612,1.098612,0.693147,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000


In [None]:
# Writing the results to disk as a gzip-compressed .h5ad file.
# Left commented out; uncomment to actually create the file.
#adata.write('my_results.h5ad', compression="gzip")

In [37]:
# Another metadata use case: build several per-observation annotations as one
# DataFrame, then copy its columns into adata.obs.
obs_meta = pd.DataFrame(
    data={
        "time_yr": np.random.choice([0, 2, 4, 8], adata.n_obs),
        "subject_id": np.random.choice(
            ["subject 1", "subject 2", "subject 4", "subject 8"], adata.n_obs
        ),
        "instrument_type": np.random.choice(["type a", "type b"], adata.n_obs),
        "site": np.random.choice(["site x", "site y"], adata.n_obs),
    },
    # Reuse the existing observation IDs so the rows line up with adata.obs.
    index=adata.obs.index,
)
# Alternative: construct a fresh object instead, e.g.
#   ad.AnnData(adata.X, obs=obs_meta, var=adata.var)
# Here the columns are added to the existing obs table in a single assignment
# (each target column receives the same-named obs_meta column, in order).
adata.obs[list(obs_meta.columns)] = obs_meta

In [38]:
# Show the combined observation metadata. A bare last expression gives the
# notebook's rich DataFrame display (as used for adata.obs earlier) instead of
# the plain-text dump that print() produces.
adata.obs

        cell_type  time_yr subject_id instrument_type    site
Cell_0          B        4  subject 4          type a  site y
Cell_1   Monocyte        4  subject 1          type a  site y
Cell_2          T        8  subject 1          type b  site y
Cell_3   Monocyte        4  subject 4          type b  site y
Cell_4          B        2  subject 8          type a  site y
...           ...      ...        ...             ...     ...
Cell_95         T        0  subject 2          type a  site y
Cell_96         B        8  subject 2          type a  site x
Cell_97         B        2  subject 2          type b  site y
Cell_98         T        4  subject 1          type a  site x
Cell_99         T        8  subject 4          type b  site y

[100 rows x 5 columns]


In [None]:
# Partial reading of large data: pass backed='r' to open the .h5ad file in
# read-only backed mode rather than loading it fully into memory.
# read_h5ad is the current reader name; ad.read is its deprecated alias.
#adata = ad.read_h5ad('my_results.h5ad', backed='r')