In [None]:
import numpy as np
import anndata as ad
import scanpy as sc
from rosa.preprocessing import (
    clean_cells_genes,
)

RAW_ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features.h5ad"
EMBEDS_ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_with_embeds_new_norm.h5ad"


In [None]:
adata = ad.read_h5ad(EMBEDS_ADATA_PT)

In [None]:
adata.layers['counts'].sum(axis=1)

In [None]:
adata.layers['counts_normalized'] = adata.layers['counts'].copy()
sc.pp.normalize_total(adata, target_sum=1e5, layer='counts_normalized')

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.hist(adata.layers['counts_normalized'].flatten());

In [None]:
from rosa.preprocessing import bin_expression, reconstruct_expression

In [None]:
bin_expression(adata, 128)

In [None]:
reconstruct_expression(adata)

In [None]:
((adata.X - adata.layers['reconstructed'])**2).mean()

In [None]:
# Plot residuals
plt.hist((adata.layers['reconstructed'] - adata.X).ravel(), bins=1000);
plt.xlim([-.25, .25]);

In [None]:
# Identify cells and genes not trained on (when possible).
# NOTE: 'prediction' is bound to the same object as 'reconstructed' (no copy is made here).
adata.layers['prediction'] = adata.layers['reconstructed']
test_genes = np.logical_not(adata.var["train"])
test_cells = np.logical_not(adata.obs["train"])
# Slicing an AnnData returns a view; take an explicit copy so that the
# dendrogram result is written to an independent object instead of forcing
# an implicit view-to-actual conversion.
adata_test = adata[test_cells, test_genes].copy()
sc.tl.dendrogram(adata_test, groupby="label", use_rep="X")


In [None]:
from rosa.plotting import plot_marker_gene_heatmap

In [None]:
# Map each label to its marker feature name: index the obs table by label,
# pick the single column of interest, then turn that column into a dict.
marker_genes_dict = adata_test.obs.set_index('label')['marker_feature_name'].to_dict()
plot_marker_gene_heatmap(adata_test, marker_genes_dict)

In [None]:
plt.hist(adata.layers['binned'].flatten(), bins=25, density=True);

In [None]:
plt.hist(adata.X.flatten(), bins=250, density=True);
plt.ylim([0, 1]);
plt.xlim([0, 8])

In [None]:
adata.X

In [None]:
from typing import Union

In [None]:
adata.layers['X']

In [None]:
np.empty((10, 0))[0]

In [None]:
from enum import Enum, auto

class EmbeddingType(Enum):
    """Enumeration with three members: ``JOINT``, ``VAR`` and ``OBS``.

    The numeric values are the ones ``auto()`` would assign (1, 2, 3 in
    declaration order); they are spelled out explicitly here.
    """

    JOINT = 1
    VAR = 2
    OBS = 3

In [None]:
EmbeddingType.JOINT

In [None]:
list(EmbeddingType.__members__)

In [None]:
adata.X = np.ceil(adata.X)
sc.pp.filter_genes(adata, min_cells=1)
sc.experimental.pp.normalize_pearson_residuals(adata)
adata.X[adata.X<0] = 0

In [None]:
2**10

In [None]:
import torch

In [None]:
np.isinf(adata.X).sum()

In [None]:
np.isnan(adata.X).sum()

In [None]:
sc.pp.log1p(adata)

In [None]:
adata.X

In [None]:
torch.tensor([0])

In [None]:
adata.X = np.ceil(adata.X)
adata.layers["counts"] = adata.X.copy()

In [None]:
sc.pp.filter_genes(adata, min_cells=1)

In [None]:
sc.pp.filter_cells(adata, min_genes=1)

In [None]:
adata

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.hist(adata.layers['counts'].sum(axis=1));

In [None]:
print(adata.layers['counts'].sum(axis=1).mean() / 1e5)
print(adata.layers['counts'].sum(axis=1).var() / 1e10)

In [None]:
adata.layers["counts_normalized_total"] = adata.X.copy()
sc.pp.normalize_total(adata, 1e5, layer="counts_normalized_total")

In [None]:
print(adata.layers['counts_normalized_total'].sum(axis=1).mean() / 1e5)
print(adata.layers['counts_normalized_total'].sum(axis=1).var() / 1e10)

In [None]:
adata.layers["counts_normalized_pearson"] = adata.X.copy()
adata.layers['counts_normalized_pearson'] = np.ceil(adata.layers['counts_normalized_pearson'])
sc.experimental.pp.normalize_pearson_residuals(adata, layer="counts_normalized_pearson", theta=1e2)

In [None]:
np.isnan(adata.layers['counts_normalized_pearson']).sum()

In [None]:
print(adata.layers['counts_normalized_pearson'].sum(axis=1).mean())
print(adata.layers['counts_normalized_pearson'].sum(axis=1).var())

In [None]:
plt.hist(adata.layers['counts_normalized_pearson'].flatten(), np.linspace(0, 100, 1000));

In [None]:
plt.hist(adata.layers['counts'].flatten(), np.linspace(0, 100, 1000));

In [None]:
np.log1p(0.1)

In [None]:
(adata.layers['counts_normalized_pearson'] - adata.layers['counts']).max()

In [None]:
adata.layers['counts_normalized_pearson'] = np.round(adata.layers['counts_normalized_pearson'])

In [None]:
np.isnan(adata.layers['counts_normalized_pearson'])[0]

In [None]:
adata.layers['counts_normalized_pearson'].min()

In [None]:
adata.uns['pearson_residuals_normalization']

In [None]:
TABULA_SAPIENS_BY_CELL_TYPE_WITH_EMBEDS_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_with_embeds_new_norm.h5ad"

In [None]:
adata_norm = ad.read_h5ad(TABULA_SAPIENS_BY_CELL_TYPE_WITH_EMBEDS_PT)

In [None]:
# Invert log1p: expm1 computes exp(x) - 1 without the cancellation error
# that `np.exp(x) - 1` suffers for values close to zero.
y = np.expm1(adata_norm.X)

In [None]:
y

In [None]:
import scanpy as sc

In [None]:
adata_norm.X = adata_norm.layers['counts'].copy()

In [None]:
sc.pp.normalize_total(adata_norm, 1e5)

In [None]:
(abs(adata_norm.X - y)).max()

In [None]:
# from scipy.special import kl_div

# y_hat = np.asarray(adata[keep_cells].X.flatten())
# y = np.asarray(adata[keep_cells].layers['prediction'].flatten())

# kl_div(y, y_hat).mean()

In [None]:
from scipy.stats import kstest, poisson

y_hat = np.asarray(adata[keep_cells].X.flatten())
y = np.asarray(adata[keep_cells].layers['prediction'].flatten())

kstest(y_hat, y)

In [None]:
kstest(y, 'poisson', args=(np.mean(y),))

In [None]:
from scipy.stats import kstest, poisson

poisson_dist = poisson(np.mean(y))

In [None]:
y_p = poisson_dist.rvs(size=10000)

In [None]:
hist, _ = np.histogram(y, bins=bins)
hist_hat, _ = np.histogram(y_hat, bins=bins)
plt.bar(bins[:-1], hist/hist.sum(), width=10/1000)
plt.bar(bins[:-1], hist_hat/hist_hat.sum(), alpha=0.5, width=10/1000)
plt.ylim([0, 1e-2]);
plt.xlim([0, 8]);

In [None]:
# Draw samples from a Poisson whose rate is the mean of `y`, then overlay
# the normalised histogram of the samples on the normalised histogram of `y`.
# Relies on `y`, `bins`, `poisson` and `plt` from earlier cells.
poisson_dist = poisson(np.mean(y))
y_new = poisson_dist.rvs(size=10000)

hist, _ = np.histogram(y, bins=bins)
hist_new, _ = np.histogram(y_new, bins=bins)
plt.bar(bins[:-1], hist/hist.sum(), width=10/1000)
# True division: floor division (`//`) would turn every bin fraction into 0
# (or 1 when all samples share a bin) and flatten the overlay.
plt.bar(bins[:-1], hist_new/hist_new.sum(), alpha=0.5, width=10/1000)
plt.ylim([0, 1e-2]);
plt.xlim([0, 8]);

In [None]:
# Starting point (r1, p1, r2, p2) for the optimiser. The last entry was
# written as `0.0.001`, which is not a valid literal; 0.001 matches the value
# used for the second negative binomial in the sampling cell.
# `minimize` and `negative_binomial` are defined in a later cell, so that
# cell has to be run first.
initial_params = [.2, 0.1, 1.2, 0.001]

result = minimize(negative_binomial, initial_params, args=(y,), method='Nelder-Mead')

In [None]:
# Extract the optimal parameters
r1, p1, r2, p2 = result.x
data = y

In [None]:
from scipy.stats import nbinom

r1, p1, r2, p2 = (0.1, 0.1, 1.1, 0.1)

# nbinom_dist_1 = nbinom(9.1, 0.6)
# nbinom_dist_2 = nbinom(1.2, .001)
y_new = (nbinom.rvs(.2, 0.1, size=10000) + nbinom.rvs(1.2, 0.001, size=10000)) / 1000


hist, _ = np.histogram(y, bins=bins)
hist_new, _ = np.histogram(y_new, bins=bins)
plt.bar(bins[:-1], hist/hist.sum(), width=10/1000)
plt.bar(bins[:-1], hist_new/hist_new.sum(), alpha=0.5, width=10/1000)
plt.ylim([0, 1e-2]);
plt.xlim([0, 8]);

In [None]:
import numpy as np
from scipy.optimize import minimize
from scipy.stats import nbinom
import matplotlib.pyplot as plt

# Generate sample data
data = y

# Define the negative binomial function
def negative_binomial(params, data):
    """Negative log of the summed PMFs of two negative binomials.

    Parameters
    ----------
    params : sequence of four floats
        ``(r1, p1, r2, p2)``: the ``n`` and ``p`` arguments of the two
        ``scipy.stats.nbinom`` distributions.
    data : array-like
        Observations; they are multiplied by 1000 before being passed to
        the PMF.

    Returns
    -------
    float
        ``-sum(log(pmf1 + pmf2))`` evaluated at ``1000 * data``.

    Notes
    -----
    The sum is evaluated in log space (``logpmf`` + ``logaddexp``) so that
    points whose individual PMFs underflow to 0 still contribute a finite
    value instead of ``log(0) = -inf``.
    """
    r1, p1, r2, p2 = params
    scaled = 1000 * data
    log_pmf1 = nbinom.logpmf(scaled, r1, p1)
    log_pmf2 = nbinom.logpmf(scaled, r2, p2)
    # log(pmf1 + pmf2) == logaddexp(log pmf1, log pmf2)
    return -np.logaddexp(log_pmf1, log_pmf2).sum()

# Define the initial values for the parameters, ordered (r1, p1, r2, p2)
# as unpacked inside negative_binomial.
initial_params = (0.1, 0.1, 1.1, 0.1)


# Minimize the negative binomial function using the Nelder-Mead algorithm
result = minimize(negative_binomial, initial_params, args=(data,), method='Nelder-Mead')

# Extract the optimal parameters
r1, p1, r2, p2 = result.x

# Plot the histogram of the data (normalised to a density)
plt.hist(data, bins=30, density=True, alpha=0.5, label='Data')

# Plot the sum of the negative binomials
# NOTE(review): the objective evaluates the PMFs at 1000 * data, while the
# curve below is evaluated at x in the unscaled units of `data`, so the two
# are presumably on different scales — confirm before reading the overlay.
# NOTE(review): pmf1 + pmf2 is an unweighted sum of two PMFs, whereas the
# histogram is a density; the curve is likely not directly comparable.
x = np.arange(0, data.max())
pmf1 = nbinom.pmf(x, r1, p1)
pmf2 = nbinom.pmf(x, r2, p2)
plt.plot(x, pmf1 + pmf2, 'r-', lw=2, label='Sum of Negative Binomials')

plt.xlabel('x')
plt.ylabel('Probability')
plt.legend()
plt.show()


In [None]:
try:
  print(x)
except NameError:
  print("Variable x is not defined")

In [None]:
import torch

In [None]:
torch.long

In [None]:
a = torch.tensor(2.3).type(torch.long)

In [None]:
adata.varm['embedding'].shape

In [None]:
import matplotlib.pyplot as plt

plt.hist(adata.varm['embedding'].ravel(), bins=2000);
plt.xlim([-0.5, 0.5]);

In [None]:
adata.varm['embedding']

In [None]:
from sklearn.decomposition import PCA

In [None]:
# fit pca on training data
pca = PCA()
pca.fit(adata.varm['embedding'])


In [None]:
E = pca.transform(adata.varm['embedding'])

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_));
plt.xlim([0, 3042])

In [None]:
# Cumulative explained variance of the first 512 components. cumsum is
# 0-indexed, so entry 511 covers components 0..511, matching E[:, :512] below.
np.cumsum(pca.explained_variance_ratio_)[511]

In [None]:
plt.hist(E[:, :512].ravel(), bins=2000);
plt.xlim([-1.5, 1.5]);

In [None]:
E.shape

In [None]:
import anndata as ad
from rosa.preprocessing import (
    calculate_gene_embeddings_pca,
)


EMBEDS_ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_with_embeds_new_norm.h5ad"

In [None]:
adata = ad.read_h5ad(EMBEDS_ADATA_PT)
adata = calculate_gene_embeddings_pca(adata, 256)

In [None]:
adata.uns["embedding_pca"]

In [None]:
adata.write_h5ad(EMBEDS_ADATA_PT)

In [None]:
adata.varm['embedding_pca'].shape

In [None]:
adata.varm

In [None]:
ADATA_BULK_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_pbulk.h5ad"
adata = ad.read_h5ad(ADATA_BULK_PT)


In [None]:
adata.var.set_index("feature_id")

In [None]:
base_pt = '/home/ec2-user/enformer/Homo_sapiens.GRCh38.genes.enformer_embeddings'
var_id = 'ENSG00000280445'
full_pt = f'{base_pt}/{var_id}.pt'

In [None]:
import torch

In [None]:
# Load the same embedding file 64 times. Each iteration overwrites `var`, so
# only the result of the final load survives the loop.
# NOTE(review): presumably a rough stand-in for reading a batch of 64 files —
# confirm; the loop index `i` is unused.
for i in range(64):
    # torch.load unpickles the file: only use it on trusted files.
    var = torch.load(full_pt, map_location='cpu')['embedding']
    # from_numpy requires a NumPy array, so the stored 'embedding' entry is an
    # ndarray; it is converted to a float32 tensor here.
    var = torch.from_numpy(var).type(torch.float32)

In [None]:
type(var)

In [None]:
896 // 2

In [None]:
a = var.unsqueeze(0).unsqueeze(-3)
a.shape

In [None]:
fc = torch.nn.Conv2d(1, 10, (896, 1))

In [None]:
fc(a).view(a.shape[0], -1).shape

In [None]:
# `view` is a tensor method, not a free function; calling it bare raises
# NameError. Flatten the convolution output per batch item, as in the
# previous cell.
fc(a).view(a.shape[0], -1)

In [None]:
import torch.nn.functional as F


class ToTensor(torch.nn.Module):
    """Convert ``numpy.ndarray`` to tensor.

    Parameters
    ----------
    dtype : torch.dtype
        Data type of the returned tensor (default ``torch.float32``).
    """
    def __init__(self, dtype: torch.dtype = torch.float32) -> None:
        super().__init__()
        self.dtype = dtype

    def forward(self, tensor: np.ndarray) -> torch.Tensor:
        """Return ``tensor`` as a ``torch.Tensor`` of type ``self.dtype``."""
        # Honour the configured dtype; it was previously ignored and the
        # result was always float32.
        return torch.from_numpy(tensor).type(self.dtype)

class CountNormalize(torch.nn.Module):
    """Normalize a tensor to a fixed total counts.

    The input is L1-normalized with ``torch.nn.functional.normalize`` and
    then multiplied by ``total_counts``.

    Parameters
    ----------
    total_counts : number
        Scale applied after L1 normalization (default 1).
    """
    def __init__(self, total_counts=1):
        super().__init__()
        self.total_counts = total_counts

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        """Return ``tensor`` rescaled so its absolute values sum to ``total_counts``.

        Inputs with two or more dimensions are normalized along dim 1, as
        before. 1-D inputs are normalized over their only dimension;
        previously they raised because ``F.normalize`` defaults to ``dim=1``.
        """
        dim = 1 if tensor.dim() > 1 else 0
        return self.total_counts * F.normalize(tensor, p=1.0, dim=dim, eps=1e-12)


class Log1p(torch.nn.Module):
    """Apply ``log(1 + x)`` element-wise to a tensor.

    The module has no parameters or configuration, so the inherited
    constructor is used as-is.
    """

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        """Return ``log1p`` of ``tensor``."""
        log_transformed = torch.log1p(tensor)
        return log_transformed


class QuantileNormalize(torch.nn.Module):
    """Normalize a tensor by quantiles.

    Values are replaced by the index of the quantile bucket they fall into.

    Parameters
    ----------
    n_bins : int
        Number of evenly spaced quantile levels in ``[0, 1]`` used as
        bucket boundaries.
    """
    def __init__(self, n_bins):
        super().__init__()
        self.n_bins = n_bins

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        """Return the bucket index of every element of ``tensor``.

        Boundaries are the quantiles of the whole (flattened) input at
        ``n_bins`` evenly spaced levels.
        """
        # torch.quantile needs the levels to share the input's dtype (and
        # device); build them to match instead of the float32/CPU default.
        levels = torch.linspace(
            0, 1, self.n_bins, dtype=tensor.dtype, device=tensor.device
        )
        boundaries = torch.quantile(tensor, levels)
        return torch.bucketize(tensor, boundaries)

In [None]:
class ExpressionTransform(torch.nn.Sequential):
    """Sequential pipeline of expression transforms selected by a config.

    The pipeline always starts with ``ToTensor``. The following stages are
    appended, in this order, when enabled on ``cfg``:

    * ``CountNormalize(cfg.total_counts)`` if ``cfg.total_counts`` is not None
    * ``Log1p()`` if ``cfg.log1p`` is truthy
    * ``QuantileNormalize(cfg.n_bins)`` if ``cfg.n_bins`` is not None
    """
    def __init__(self, cfg):
        # (enabled?, factory) pairs, listed in application order.
        optional_stages = (
            (cfg.total_counts is not None, lambda: CountNormalize(cfg.total_counts)),
            (cfg.log1p, lambda: Log1p()),
            (cfg.n_bins is not None, lambda: QuantileNormalize(cfg.n_bins)),
        )

        pipeline = [ToTensor()]
        for enabled, build_stage in optional_stages:
            if enabled:
                pipeline.append(build_stage())

        super().__init__(*pipeline)


from dataclasses import dataclass
from typing import Optional


@dataclass
class ExpressionTransformConfig:
    """Options read by ``ExpressionTransform``; every field defaults to None."""

    # Passed to CountNormalize when not None.
    total_counts: Optional[int] = None
    # Log1p stage is added when this is truthy.
    log1p: Optional[bool] = None
    # Passed to QuantileNormalize when not None.
    n_bins: Optional[int] = None

In [None]:
exp_cfg = ExpressionTransformConfig(n_bins=10)

In [None]:
tf = ExpressionTransform(exp_cfg)

In [None]:
torch.quantile(ToTensor()(X).unsqueeze(0), torch.linspace(0, 1, 5), dim=-1, keepdim=True).shape

In [None]:
ToTensor()(X).unsqueeze(0).shape

In [None]:
tf(np.random.rand(20))

In [None]:
X[1]

In [None]:
a = torch.nn.Sequential(ToTensor(), CountNormalize(10), Log1p(), QuantileNormalize(10))
b = torch.nn.Sequential(ToTensor(), QuantileNormalize(10))

In [None]:
X = np.random.rand(3, 30)
X [0, :3] = 0
X [0, :] = X [0, :] * 1000
# X = torch.randint(1, 5, size=(3, 3))

In [None]:
X

In [None]:
a(X)

In [None]:
b(X)

In [None]:
a(X).sum(dim=1)

In [None]:
20 * 512 / 1e3

In [None]:
type(torch.float32)

In [None]:
import numpy as np

In [None]:
np.unravel_index(10, (9, 2))

In [None]:
EMBEDS_ADATA_PT = '/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_with_embeds_new_norm.h5ad'

In [None]:
EMBEDS_ADATA_PT_2 = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_raw.h5ad"


In [None]:
import anndata as ad

In [None]:
adata = ad.read_h5ad(EMBEDS_ADATA_PT)
adata_2 = ad.read_h5ad(EMBEDS_ADATA_PT_2)

In [None]:
adata.layers['counts'][0, 0]

In [None]:
adata_2.X.sum(axis=1).mean()

In [None]:
adata.X[0, 0]

In [None]:
adata.layers['binned'][0, 0]

In [None]:
adata_2.X[0, 0]

In [None]:
from enum import Enum, auto
from typing import Optional


class ExpressionActivations(Enum):
    """Enumeration with two members: ``SOFTPLUS`` and ``SOFTMAX``.

    Values are written out explicitly and equal what ``auto()`` assigns
    (1 and 2 in declaration order).
    """

    SOFTPLUS = 1
    SOFTMAX = 2

In [None]:
str(ExpressionActivations.SOFTPLUS)

In [None]:
ExpressionActivations.SOFTPLUS.name.lower()

In [None]:
import torch.nn as nn

In [None]:
a=nn.Sequential()

In [None]:
a.append(nn.Linear(1, 1))

In [None]:
body = nn.Identity()

In [None]:
body(torch.rand(10, 10)).shape

In [None]:
# Apply each module to its matching tensor. `map` expects a callable as its
# first argument; passing the tuple of modules there produced a lazy map
# object that fails as soon as it is iterated.
[module(tensor) for module, tensor in zip((body, body), (torch.rand(10, 10), torch.rand(10, 10)))]

In [None]:
# torch.add takes its two operands as separate arguments, not as one tuple.
torch.add(torch.rand(10, 10), torch.rand(10, 10)).shape

In [None]:
torch.cat((torch.rand(10, 10), torch.rand(10, 10)), dim=-1).shape

In [None]:
from typing import Tuple

class AttentionEmbeds(nn.Module):
    """Scale a learned vector by the GELU of a pairwise dot product.

    Parameters
    ----------
    in_dim : Tuple[int, int]
        Accepted for interface reasons; not used by this implementation.
    out_dim : int
        Length of the learned ``value`` vector, and therefore of the last
        output dimension.
    """

    def __init__(self, in_dim: Tuple[int, int], out_dim) -> None:
        super().__init__()

        # Learned output direction, initialised from a standard normal.
        self.value = nn.Parameter(torch.randn(out_dim))
        self.activation = nn.GELU()
        self.out_dim = out_dim

    def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
        """Return ``gelu(<x[0], x[1]>)[..., None] * value``.

        The dot product is taken over the last dimension of the two inputs.
        """
        first, second = x
        # Dot product over the trailing axis -> one score per leading index.
        score = torch.einsum('...i, ...i ->...', first, second)
        gated = self.activation(score)
        # Outer product of the scores with the learned vector.
        return torch.einsum('..., i -> ...i', gated, self.value)

In [None]:
x_1 = torch.rand((10, 20))
x_2 = torch.rand((10, 20))
v = torch.rand(20)

In [None]:
atten = torch.einsum('...i, ...i ->...', x_1, x_2)

In [None]:
out = torch.einsum('..., i -> ...i', atten, v)

In [None]:
nn.Parameter(torch.randn(20))

In [None]:
a = AttentionEmbeds((20, 20), 30)

In [None]:
a((x_1, x_2)).shape

In [None]:
X = np.random.rand(100, 896, 3072)

In [None]:
PT = '/Users/nsofroniew/Documents/data/multiomics/enformer/scratch'

In [None]:
import zarr

In [None]:
z1 = zarr.open(PT + '/example.zarr', mode='w', shape=(1000, 896, 3072), chunks=(1, None, None), dtype='float32')

In [None]:
for i in range(10):
    z1[i * 100: (i+1)*100] = X

In [None]:
z2 = zarr.open(PT + '/example.zarr', mode='r')

In [None]:
from time import time

In [None]:
start = time()
z2[893]
stop = time()
print(stop - start)

In [None]:
torch.save({'results': X[0]}, PT + '/example_0.pt')

In [None]:
start = time()
torch.load(PT + '/example_0.pt')
stop = time()
print(stop - start)

In [None]:
class ZarrDataset(torch.utils.data.Dataset):
    """Map-style dataset backed by a zarr array opened read-only.

    Parameters
    ----------
    path : str
        Location passed to ``zarr.open``.
    """

    def __init__(self, path: str):
        super().__init__()
        self.path = path
        # Opened once at construction and kept for all later lookups.
        self.array = zarr.open(path, mode='r')

    def __len__(self) -> int:
        """Number of items, i.e. the size of the array's first dimension."""
        n_items = self.array.shape[0]
        return n_items

    def __getitem__(self, idx):
        """Return ``array[idx]`` unchanged."""
        item = self.array[idx]
        return item

In [None]:
ds = ZarrDataset(PT + '/example.zarr')
dl = torch.utils.data.DataLoader(ds, shuffle=False, num_workers=2)

In [None]:
start = time()
for batch in iter(dl):
    pass
stop = time()
print(stop - start)

In [None]:
from rosa.datasets import RosaObsDataset, ToTensor, RosaObsVarDataset, RosaJointDataset
from rosa.config import ExpressionTransformConfig

from torch.utils.data import default_collate


In [None]:
# isinstance(ds, RosaJointDataset)

In [None]:
import anndata as ad

ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_with_embeds_new_norm.h5ad"


adata = ad.read_h5ad(ADATA_PT)

In [None]:
from torch import Tensor
from typing import Optional, Tuple, List

In [None]:
ds = RosaObsDataset(adata, obs_input='embedding')

In [None]:
ds[0][0].shape

In [None]:
ds[0][1].shape

In [None]:
adata.varm['embedding_pca'].shape

In [None]:
2**14 / 19429

In [None]:
import torch

torch.empty((0, 0))

In [None]:
# class RosaObsVarDataset(RosaJointDataset):
#     def __init__(
#         self,
#         adata: ad.AnnData,
#         *,
#         var_input: str,
#         obs_input: str,
#         expression_layer: Optional[str] = None,
#         expression_transform_config: Optional[ExpressionTransformConfig] = None,
#     ) -> None:
#         super(RosaObsVarDataset, self).__init__(adata, obs_input=obs_input, var_input=var_input, expression_layer=expression_layer, expression_transform_config=expression_transform_config)

#     def __len__(self) -> int:
#         return self.expression.shape[0]

#     def __getitem__(self, idx: int) -> Tuple[Tuple[Tensor, Tensor], Tensor]:
#         obs_input = self.input[0][idx]
#         expression = self.expression[idx]
#         full_input = (obs_input.expand((self.input[1].shape[0], obs_input.shape[0])), torch.empty((self.input[1].shape[0], 0)))
#         return full_input, expression

#     def collate_fn(self, batch: List[Tuple[Tuple[Tensor, Tensor], Tensor]]) -> Tuple[Tuple[Tensor, Tensor], Tensor]:
#         (x0, _), y = default_collate(batch)
#         x1 = self.input[1].expand((x0.shape[0],) + self.input[1].shape)
#         return (x0.view(-1, x0.shape[-1]), x1.view(-1, x1.shape[-1])), y.view(-1)

In [None]:
ds = RosaObsVarDataset(adata, obs_input='embedding', var_input='embedding_pca')

In [None]:
ds[0][0][0].shape

In [None]:
ds[0][0][1].shape

In [None]:
ds[0][1].shape

In [None]:
from torch.utils.data import DataLoader


dl = DataLoader(
            ds,
            batch_size=64,
            shuffle=False,
            num_workers=0,
            # collate_fn=ds.collate_fn,
        )

In [None]:
batch = next(iter(dl))

In [None]:
batch[0][0].shape

In [None]:
batch[0][1][0].expand(batch[0][1].shape).shape

In [None]:
batch[1].shape

In [None]:
from torch.utils.data import default_collate


def obsvar_collate(batch):
    """Collate ``((x0, x1), y)`` samples and merge the leading two axes.

    The batch is first stacked with ``default_collate``. Both inputs are
    then reshaped to 2-D, keeping their last dimension, and the target is
    flattened to 1-D.

    Returns
    -------
    ((Tensor, Tensor), Tensor)
        The reshaped inputs and the flattened target.
    """
    inputs, targets = default_collate(batch)
    first, second = inputs
    flat_first = first.view(-1, first.shape[-1])
    flat_second = second.view(-1, second.shape[-1])
    return (flat_first, flat_second), targets.view(-1)

In [None]:
x = ds[0][0][0]

In [None]:
x.expand((2000, 110)).shape

In [None]:
import torch

torch.arange(100)

In [None]:
z = torch.multinomial(torch.arange(100).float(), 10).long()

In [None]:
x = torch.rand(100, 1000)

In [None]:
x[z].shape

In [None]:
from pytorch_lightning.utilities.rank_zero import LightningDeprecationWarning

In [None]:
import scanpy as sc

In [None]:
ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_pbulk.h5ad"

In [None]:
import anndata as ad

In [None]:
adata = ad.read_h5ad(ADATA_PT)

In [None]:
adata_2 = ad.concat([adata, adata])

In [None]:
adata_2

In [None]:
# NOTE(review): the 'base' entry is set on `adata`, but the ranking below runs
# on `adata_2` (the concatenation of `adata` with itself). Presumably the
# setting was meant for the object being ranked — confirm, and check whether
# `adata_2.uns` carries a 'log1p' entry at all after `ad.concat`.
adata.uns['log1p']['base'] = None
# Rank genes per 'label' group with the Wilcoxon method.
sc.tl.rank_genes_groups(adata_2, 'label', method='wilcoxon')

In [None]:
adata_3 = adata_2[:adata.n_obs]

In [None]:
import pandas as pd

# Tabulate the ranking: for every group, one column of gene names
# ("<group>_n") and one column of p-values ("<group>_p").
result = adata_3.uns['rank_genes_groups']
groups = result['names'].dtype.names
df = pd.DataFrame({
    f"{group}_{key[:1]}": result[key][group]
    for group in groups
    for key in ('names', 'pvals')
})

In [None]:
adata_3.shape

In [None]:
df

In [None]:
# NOTE(review): `dc` is not imported anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. It is presumably the decoupler
# package — confirm and import it in the imports cell before running.
# The call also expects 'cell_type' and 'disease' columns on `adata`;
# verify they exist for the dataset loaded above.
logFCs, pvals = dc.get_contrast(adata,
                                group_col='cell_type',
                                condition_col='disease',
                                condition='COVID-19',
                                reference='normal',
                                method='t-test'
                               )

In [1]:
ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens.h5ad"

In [2]:
import anndata as ad
import scanpy as sc

In [60]:
adata = ad.read_h5ad(ADATA_PT)

In [61]:
adata.var = adata.var.set_index('feature_id')

In [62]:
# Work on the first 20000 cells and 1000 genes. Slicing returns a view, and
# the in-place preprocessing below would otherwise trigger an implicit
# view-to-actual conversion (and its warning); copy explicitly instead.
adata = adata[:20000, :1000].copy()

sc.pp.normalize_total(adata, target_sum=1e5)
sc.pp.log1p(adata)
sc.tl.rank_genes_groups(adata, 'cell_type', method='t-test')

  view_to_actual(adata)
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_ind

In [74]:
adata.uns['rank_genes_groups']['names'][5]

('ENSG00000107362', 'ENSG00000114779', 'ENSG00000146386', 'ENSG00000248487', 'ENSG00000204574', 'ENSG00000268205', 'ENSG00000265206', 'ENSG00000099204', 'ENSG00000248487', 'ENSG00000140526', 'ENSG00000275700', 'ENSG00000164163', 'ENSG00000128274', 'ENSG00000268205', 'ENSG00000285486', 'ENSG00000128274', 'ENSG00000099204', 'ENSG00000121410', 'ENSG00000117528', 'ENSG00000175164', 'ENSG00000168792', 'ENSG00000140526', 'ENSG00000103222', 'ENSG00000265206', 'ENSG00000275700', 'ENSG00000108798', 'ENSG00000228434', 'ENSG00000163322', 'ENSG00000114779', 'ENSG00000181409', 'ENSG00000108798', 'ENSG00000248487', 'ENSG00000106077', 'ENSG00000284874', 'ENSG00000099204', 'ENSG00000136379', 'ENSG00000008311', 'ENSG00000081760', 'ENSG00000204574')

In [101]:
group_names = adata.uns['rank_genes_groups']['names'].dtype.names

In [104]:
adata.uns['rank_genes_groups']['names']['B cell'].shape

(1000,)

In [105]:
adata.uns['rank_genes_groups']['scores']['B cell']

array([ 1.54892235e+01,  9.45605564e+00,  8.55731010e+00,  7.09016609e+00,
        7.07610321e+00,  7.05922461e+00,  6.76698065e+00,  6.33519459e+00,
        6.15257215e+00,  5.25063181e+00,  4.97421932e+00,  4.91893673e+00,
        4.68113804e+00,  4.37397194e+00,  4.00625134e+00,  3.97034192e+00,
        3.93682933e+00,  3.90745640e+00,  3.70588017e+00,  3.54302192e+00,
        3.52923155e+00,  3.48039198e+00,  3.37411690e+00,  3.01259518e+00,
        2.80566049e+00,  2.68220973e+00,  2.60377240e+00,  2.55142856e+00,
        2.42716408e+00,  2.34950280e+00,  2.17168593e+00,  2.08693242e+00,
        2.07405901e+00,  1.86138582e+00,  1.78469217e+00,  1.73268902e+00,
        1.72764540e+00,  1.72319615e+00,  1.71792126e+00,  1.69958472e+00,
        1.69796348e+00,  1.65862465e+00,  1.64955914e+00,  1.59679508e+00,
        1.57159758e+00,  1.57047760e+00,  1.54041409e+00,  1.48922098e+00,
        1.48906887e+00,  1.47987556e+00,  1.46436965e+00,  1.44366527e+00,
        1.41408217e+00,  

In [109]:
adata.uns['rank_genes_groups']['logfoldchanges']['B cell']

array([ 5.2094507e+00,  4.8909335e+00,  2.8564801e+00,  2.3101621e+00,
        2.3332534e+00,  2.5318687e+00,  2.1926687e+00,  2.7848787e+00,
        2.0098469e+00,  1.6829197e+00,  1.8435516e+00,  1.5641372e+00,
        3.8566883e+00,  1.4610239e+00,  1.2858261e+00,  1.2994709e+00,
        1.2295771e+00,  1.2332904e+00,  1.4777619e+00,  1.9577641e+00,
        4.0750661e+00,  1.2231256e+00,  1.9953816e+00,  1.2196643e+00,
        1.4211061e+00,  1.1131777e+00,  1.3143189e+00,  1.3172687e+00,
        1.4634844e+00,  8.8572782e-01,  1.7527899e+00,  1.4204648e+00,
        1.7760317e+00,  8.0562890e-01,  7.0459825e-01,  1.4335918e+00,
        5.8261824e-01,  1.8081175e+00,  1.0666671e+00,  5.7121527e-01,
        2.1054552e+00,  5.4306853e-01,  1.6674036e+00,  5.6704587e-01,
        1.3554325e+00,  1.2369736e+00,  2.1650369e+00,  2.0095432e+00,
        1.6270305e+00,  1.9844443e+00,  4.8799771e-01,  7.7658045e-01,
        2.5273359e+01,  2.2831244e+00,  1.2996391e+00,  9.2486668e-01,
      

In [112]:
adata.uns['rank_genes_groups']['names']['B cell'][:10]

array(['ENSG00000265206', 'ENSG00000005471', 'ENSG00000114779',
       'ENSG00000146386', 'ENSG00000146109', 'ENSG00000107362',
       'ENSG00000164163', 'ENSG00000237513', 'ENSG00000099204',
       'ENSG00000158122'], dtype=object)

In [102]:
group_names

('B cell',
 'CD4-positive, alpha-beta T cell',
 'CD4-positive, alpha-beta memory T cell',
 'CD8-positive, alpha-beta T cell',
 'CD8-positive, alpha-beta cytokine secreting effector T cell',
 'CD141-positive myeloid dendritic cell',
 'T cell',
 'basal cell',
 'basophil',
 'ciliated cell',
 'classical monocyte',
 'double-positive, alpha-beta thymocyte',
 'endothelial cell',
 'endothelial cell of hepatic sinusoid',
 'erythrocyte',
 'fibroblast',
 'hematopoietic stem cell',
 'hepatocyte',
 'intrahepatic cholangiocyte',
 'ionocyte',
 'liver dendritic cell',
 'macrophage',
 'mast cell',
 'mature NK T cell',
 'memory B cell',
 'monocyte',
 'mucus secreting cell',
 'naive B cell',
 'naive thymus-derived CD4-positive, alpha-beta T cell',
 'neutrophil',
 'non-classical monocyte',
 'plasma cell',
 'plasmacytoid dendritic cell',
 'platelet',
 'secretory cell',
 'serous cell of epithelium of trachea',
 'smooth muscle cell',
 'tracheal goblet cell',
 'type I NK T cell')

In [100]:
len(adata.uns['rank_genes_groups']['names'].dtype.names)

39

In [63]:
len(adata.uns['rank_genes_groups']['names'])

1000

In [32]:
len(adata.uns['rank_genes_groups']['names'][1])

160

In [57]:
adata.var.set_index('feature_id')

Unnamed: 0_level_0,soma_joinid,feature_name,feature_length
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000121410,0,A1BG,3999
ENSG00000268895,1,A1BG-AS1,3374
ENSG00000148584,2,A1CF,9603
ENSG00000175899,3,A2M,6318
ENSG00000245105,4,A2M-AS1,2948
...,...,...,...
ENSG00000286406,60633,RP11-1277N13.1,4210
ENSG00000287924,60634,RP11-732B24.1,1109
ENSG00000288153,60635,CTA-963H5.7,335
ENSG00000287751,60636,RP11-434D2.14,507


In [37]:
import numpy as np

np.unique(adata.uns['rank_genes_groups']['names'][0])

array([('17566', '17566', '18556', '18556', '18556', '12500', '23580', '11122', '13950', '17566', '13950', '27300', '29331', '17566', '13158', '31887', '13950', '27538', '11116', '14173', '19390', '13299', '13565', '19367', '27930', '6684', '17935', '17952', '30586', '29604', '30312', '15214', '21910', '15072', '17623', '13299', '11771', '19390', '30509', '18239', '19394', '25126', '12614', '12614', '17952', '17952', '27484', '29965', '28917', '36488', '25188', '34819', '17145', '13299', '15214', '17300', '17300', '12024', '7046', '30587', '19634', '14173', '36488', '14173', '36488', '26296', '17952', '27302', '10595', '13950', '13950', '31433', '25254', '34819', '15214', '24931', '34819', '9889', '19394', '27462', '27866', '13706', '23014', '19361', '17568', '19298', '38762', '12180', '28917', '31433', '30701', '11122', '21826', '11303', '12630', '17566', '14173', '27866', '17566', '21910', '15072', '11632', '31433', '17559', '14755', '12112', '29604', '18087', '27538', '18556', '1112

In [39]:
type(adata.uns['rank_genes_groups']['names'][1])

numpy.void

In [56]:
adata.uns['rank_genes_groups']['logfoldchanges']

(5.881035, 7.205521, 6.43951, 6.727491, 6.1591105, 6.761284, 8.745455, 1.7795879, 4.956738, 6.8631206, 5.3377604, 2.862865, 6.4997497, 8.401167, 8.230047, 3.4946938, 4.5523076, 4.3963137, 9.129167, 9.724476, 9.29179, 6.536508, 12.214147, 7.9974265, 8.399494, 10.637169, 7.6713486, 5.4094505, 13.255723, 10.627888, 13.459865, 6.5944223, 8.022387, 6.506741, 2.854675, 7.483746, 7.670808, 10.706106, 2.3602629, 5.400573, 7.3365636, 13.430292, 5.6759124, 5.8657303, 4.2147803, 4.888867, 8.478251, 6.3236265, 7.2032638, 5.902944, 10.565244, 3.4587846, 4.6770825, 7.0351105, 6.2996755, 16.338106, 15.4042015, 10.227097, 16.038496, 12.70416, 12.378877, 9.206963, 6.2106586, 9.326288, 4.9740214, 1.8804623, 3.9595134, 1.9119495, 6.3516264, 4.9400215, 5.212997, 5.848224, 9.114564, 4.2106786, 8.072676, 11.309373, 3.2371514, 7.1129985, 8.435274, 8.793837, 6.0914145, 9.231887, 13.09219, 8.813252, 10.285228, 16.430239, 7.8629756, 10.1334915, 8.104783, 6.4785833, 13.814871, 1.7891961, 4.6703143, 4.148885, 5.0

In [12]:
adata.uns['rank_genes_groups']['scores'].shape

(60638,)

In [18]:
len(adata.uns['rank_genes_groups']['logfoldchanges'][0])

160

In [22]:
len(adata.uns['rank_genes_groups']['names'])

60638

In [23]:
len(adata.uns['rank_genes_groups']['names'][0])

160

In [27]:
adata.uns['rank_genes_groups']['names'][2]

('12640', '17559', '27296', '12614', '11122', '26327', '17299', '11506', '26327', '17559', '27300', '27244', '17600', '14072', '22046', '19388', '11506', '27301', '28566', '11768', '29573', '32106', '30701', '15072', '17952', '30681', '12265', '28917', '23097', '23102', '31190', '15072', '27473', '32106', '27307', '19390', '13463', '29573', '29127', '12020', '28566', '36488', '13950', '11506', '17935', '30154', '27835', '17952', '30154', '25254', '34842', '25188', '12589', '27464', '12589', '17298', '17298', '17301', '16804', '32241', '21910', '13463', '10594', '13781', '25254', '27036', '16793', '26531', '31313', '22009', '27296', '15904', '15214', '15072', '19406', '27616', '27975', '27835', '29953', '25070', '15825', '16335', '29962', '29067', '17566', '11116', '36488', '21789', '17952', '13869', '29127', '23580', '26811', '19406', '12633', '12626', '10594', '17952', '12639', '13879', '23007', '18020', '12569', '10521', '12112', '11632', '23102', '12640', '27230', '13950', '12500', 

In [29]:
adata.var.loc['34842']

soma_joinid                 34842
feature_id        ENSG00000211459
feature_name              MT-RNR1
feature_length                954
Name: 34842, dtype: object

In [None]:
adata_raw = ad.read_h5ad(ADATA_PT)

In [None]:
adata_raw.uns['rank_genes_groups'] = adata.uns['rank_genes_groups']

In [None]:
adata_raw.write_h5ad(ADATA_PT)