In [None]:
import numpy as np
import anndata as ad
import scanpy as sc
from rosa.preprocessing import (
    clean_cells_genes,
)

# Absolute, machine-specific locations of the input AnnData (.h5ad) files.
# NOTE(review): these paths only resolve on the original author's machine;
# adjust them before running elsewhere.
RAW_ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features.h5ad"
EMBEDS_ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_with_embeds_new_norm.h5ad"


In [None]:
# Load the AnnData file that EMBEDS_ADATA_PT points to.
adata = ad.read_h5ad(EMBEDS_ADATA_PT)

In [None]:
# Row-wise totals of the 'counts' layer (one value per row of the layer).
adata.layers['counts'].sum(axis=1)

In [None]:
# Work on a copy so the 'counts' layer itself is left untouched, then let
# scanpy rescale that copy with a per-row target total of 1e5.
adata.layers['counts_normalized'] = adata.layers['counts'].copy()
sc.pp.normalize_total(adata, target_sum=1e5, layer='counts_normalized')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of every entry of the normalized layer, using matplotlib's default
# binning. The trailing ';' suppresses the echoed return value in the notebook.
plt.hist(adata.layers['counts_normalized'].flatten());

In [None]:
from rosa.preprocessing import bin_expression, reconstruct_expression

In [None]:
# Project helper from rosa.preprocessing, called with 128 as its second argument.
# NOTE(review): presumably discretizes expression into 128 bins and fills
# adata.layers['binned'] (read by a later cell) — confirm against rosa.
bin_expression(adata, 128)

In [None]:
# Project helper from rosa.preprocessing.
# NOTE(review): presumably fills adata.layers['reconstructed'], which the
# following cells read — confirm against rosa.
reconstruct_expression(adata)

In [None]:
# Mean squared difference between adata.X and the 'reconstructed' layer.
((adata.X - adata.layers['reconstructed'])**2).mean()

In [None]:
# Plot residuals: histogram of (reconstructed - X) over all entries, 1000 bins,
# with the x-axis zoomed to [-0.25, 0.25].
plt.hist((adata.layers['reconstructed'] - adata.X).ravel(), bins=1000);
plt.xlim([-.25, .25]);

In [None]:
# Identify cells and genes not trained on (when possible)
# Expose the reconstruction under the 'prediction' layer name; no explicit copy
# of the data is made here.
adata.layers['prediction'] = adata.layers['reconstructed']
# Held-out genes/cells are the ones whose boolean "train" flag is False.
test_genes = np.logical_not(adata.var["train"])
test_cells = np.logical_not(adata.obs["train"])
# Subsetting an AnnData returns a view. sc.tl.dendrogram stores its result on
# the object it is given, and writing to a view triggers an implicit copy with
# a warning, so materialize the subset explicitly before computing on it.
adata_test = adata[test_cells, test_genes].copy()
sc.tl.dendrogram(adata_test, groupby="label", use_rep="X")


In [None]:
from rosa.plotting import plot_marker_gene_heatmap

In [None]:
# Build a {label: marker_feature_name} lookup from the test observations and
# hand it to the project heatmap helper. Only the one needed column is turned
# into a dict; for repeated labels the last row wins, as before.
label_indexed_obs = adata_test.obs.set_index('label')
marker_genes_dict = label_indexed_obs['marker_feature_name'].to_dict()
plot_marker_gene_heatmap(adata_test, marker_genes_dict)

In [None]:
# Density histogram (25 bins) of every entry of the 'binned' layer.
plt.hist(adata.layers['binned'].flatten(), bins=25, density=True);

In [None]:
# Density histogram of all X entries (250 bins), cropped to x in [0, 8] and
# y in [0, 1]. The last line has no trailing ';', so its return value is echoed.
plt.hist(adata.X.flatten(), bins=250, density=True);
plt.ylim([0, 1]);
plt.xlim([0, 8])

In [None]:
# Display the current contents of adata.X.
adata.X

In [None]:
from typing import Union

In [None]:
# Look up a layer literally named 'X' (this is not the same as the adata.X attribute).
# NOTE(review): no visible cell creates this layer — this only works if the
# loaded file already contains it.
adata.layers['X']

In [None]:
# Scratch check: the first row of a 10x0 array is an empty 1-D array.
np.empty((10, 0))[0]

In [None]:
from enum import Enum, auto

class EmbeddingType(Enum):
    """Enumeration of the embedding kinds: JOINT, VAR and OBS.

    Values are the consecutive integers starting at 1, in definition order.
    """

    JOINT = 1
    VAR = 2
    OBS = 3

In [None]:
# Display a single enum member.
EmbeddingType.JOINT

In [None]:
# Member names of the enum, in definition order.
list(EmbeddingType.__members__)

In [None]:
# Round X up to integers, filter genes with min_cells=1, replace X with
# scanpy's Pearson-residual normalization, then clamp negative values to zero.
# NOTE(review): every step overwrites adata.X (and the gene filter changes the
# shape of adata), so re-running this cell operates on already-transformed data.
adata.X = np.ceil(adata.X)
sc.pp.filter_genes(adata, min_cells=1)
sc.experimental.pp.normalize_pearson_residuals(adata)
adata.X[adata.X<0] = 0

In [None]:
# Scratch arithmetic: 2 to the power 10.
2**10

In [None]:
import torch

In [None]:
# Number of infinite entries in X.
np.isinf(adata.X).sum()

In [None]:
# Number of NaN entries in X.
np.isnan(adata.X).sum()

In [None]:
# Apply scanpy's log1p transform to adata; the return value is not captured,
# so this relies on the call updating adata itself.
sc.pp.log1p(adata)

In [None]:
# Display X again after the preceding transform.
adata.X

In [None]:
# Scratch check: build a one-element tensor holding 0.
torch.tensor([0])

In [None]:
# Round X up to integers and snapshot the result as the 'counts' layer.
# NOTE(review): this replaces whatever the 'counts' layer held before, and the
# value of X at this point depends on which earlier cells were run.
adata.X = np.ceil(adata.X)
adata.layers["counts"] = adata.X.copy()

In [None]:
# Filter genes with scanpy using a threshold of min_cells=1.
sc.pp.filter_genes(adata, min_cells=1)

In [None]:
# Filter cells with scanpy using a threshold of min_genes=1.
sc.pp.filter_cells(adata, min_genes=1)

In [None]:
# Display the AnnData summary after filtering.
adata

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of the row-wise totals of the 'counts' layer.
plt.hist(adata.layers['counts'].sum(axis=1));

In [None]:
# Mean and variance of the row-wise 'counts' totals, expressed in units of
# 1e5 and 1e10 respectively. The totals are computed once and reused.
counts_row_totals = adata.layers['counts'].sum(axis=1)
print(counts_row_totals.mean() / 1e5)
print(counts_row_totals.var() / 1e10)

In [None]:
# Copy X into a new layer and let scanpy rescale that layer with a per-row
# target total of 1e5 (passed positionally).
adata.layers["counts_normalized_total"] = adata.X.copy()
sc.pp.normalize_total(adata, 1e5, layer="counts_normalized_total")

In [None]:
# Mean and variance of the row-wise totals of the total-count-normalized layer,
# in units of 1e5 and 1e10 respectively. The totals are computed once and reused.
normalized_row_totals = adata.layers['counts_normalized_total'].sum(axis=1)
print(normalized_row_totals.mean() / 1e5)
print(normalized_row_totals.var() / 1e10)

In [None]:
# Copy X into a new layer, round that layer up to integers, then replace it
# with scanpy's Pearson-residual normalization using theta=1e2.
adata.layers["counts_normalized_pearson"] = adata.X.copy()
adata.layers['counts_normalized_pearson'] = np.ceil(adata.layers['counts_normalized_pearson'])
sc.experimental.pp.normalize_pearson_residuals(adata, layer="counts_normalized_pearson", theta=1e2)

In [None]:
# Number of NaN entries in the Pearson-normalized layer.
np.isnan(adata.layers['counts_normalized_pearson']).sum()

In [None]:
# Mean and variance of the row-wise totals of the Pearson-normalized layer
# (unscaled). The totals are computed once and reused.
pearson_row_totals = adata.layers['counts_normalized_pearson'].sum(axis=1)
print(pearson_row_totals.mean())
print(pearson_row_totals.var())

In [None]:
# Histogram of all Pearson-normalized entries; the second positional argument
# supplies the bin edges: 1000 evenly spaced edges over [0, 100].
plt.hist(adata.layers['counts_normalized_pearson'].flatten(), np.linspace(0, 100, 1000));

In [None]:
# Histogram of all 'counts' entries on the same bin edges as the previous
# cell's plot (1000 evenly spaced edges over [0, 100]).
plt.hist(adata.layers['counts'].flatten(), np.linspace(0, 100, 1000));

In [None]:
# Scratch check: log(1 + 0.1).
np.log1p(0.1)

In [None]:
# Largest entry-wise difference (Pearson-normalized minus counts).
(adata.layers['counts_normalized_pearson'] - adata.layers['counts']).max()

In [None]:
# Round the Pearson-normalized layer to the nearest integer, overwriting it.
adata.layers['counts_normalized_pearson'] = np.round(adata.layers['counts_normalized_pearson'])

In [None]:
# NaN mask for the first row of the Pearson-normalized layer.
np.isnan(adata.layers['counts_normalized_pearson'])[0]

In [None]:
# Smallest entry of the Pearson-normalized layer.
adata.layers['counts_normalized_pearson'].min()

In [None]:
# Display the 'pearson_residuals_normalization' entry of adata.uns.
# NOTE(review): presumably written by normalize_pearson_residuals — confirm.
adata.uns['pearson_residuals_normalization']

In [None]:
# Same file path as EMBEDS_ADATA_PT in the first cell, under a different name.
# NOTE(review): the constant says BY_CELL_TYPE while the file name says
# by_features — confirm which is intended.
TABULA_SAPIENS_BY_CELL_TYPE_WITH_EMBEDS_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_with_embeds_new_norm.h5ad"

In [None]:
# Load a fresh AnnData from disk into a separate variable, leaving `adata` as is.
adata_norm = ad.read_h5ad(TABULA_SAPIENS_BY_CELL_TYPE_WITH_EMBEDS_PT)

In [None]:
# Compute exp(X) - 1 for the stored matrix (the inverse of a log1p transform).
# np.expm1 evaluates this in one step and stays accurate for entries close to
# zero, where exp(x) - 1 loses precision to cancellation.
y = np.expm1(adata_norm.X)

In [None]:
# Display the back-transformed values.
y

In [None]:
import scanpy as sc

In [None]:
# Replace X with a copy of the 'counts' layer so it can be re-normalized.
adata_norm.X = adata_norm.layers['counts'].copy()

In [None]:
# Total-count normalization of X with a per-row target of 1e5.
sc.pp.normalize_total(adata_norm, 1e5)

In [None]:
# Largest absolute difference between the re-normalized X and `y`.
(abs(adata_norm.X - y)).max()

In [None]:
# from scipy.special import kl_div

# y_hat = np.asarray(adata[keep_cells].X.flatten())
# y = np.asarray(adata[keep_cells].layers['prediction'].flatten())

# kl_div(y, y_hat).mean()

In [None]:
from scipy.stats import kstest, poisson

# Flatten X and the 'prediction' layer for the selected cells and compare the
# two samples with scipy's Kolmogorov-Smirnov test.
# Note the naming: `y_hat` holds X, while `y` holds the prediction layer.
# NOTE(review): `keep_cells` is not assigned in any visible cell — it must
# already exist in the kernel for this to run.
y_hat = np.asarray(adata[keep_cells].X.flatten())
y = np.asarray(adata[keep_cells].layers['prediction'].flatten())

kstest(y_hat, y)

In [None]:
# One-sample KS test of `y` against a Poisson distribution whose rate is mean(y).
# NOTE(review): the KS test is usually stated for continuous distributions —
# confirm it is meaningful against a discrete Poisson reference here.
kstest(y, 'poisson', args=(np.mean(y),))

In [None]:
from scipy.stats import kstest, poisson

# Frozen Poisson distribution with rate equal to the mean of `y`.
poisson_dist = poisson(np.mean(y))

In [None]:
# Draw 10000 Poisson samples. No random_state is passed, so the draw differs
# between runs.
y_p = poisson_dist.rvs(size=10000)

In [None]:
# Overlay the normalized histograms of `y` and `y_hat` on shared bin edges,
# cropped to x in [0, 8] and y in [0, 1e-2].
# NOTE(review): `bins` is not assigned in any visible cell — it must already
# exist in the kernel. The bar width of 10/1000 suggests 1000 bins over
# [0, 10], but that is a guess to confirm.
hist, _ = np.histogram(y, bins=bins)
hist_hat, _ = np.histogram(y_hat, bins=bins)
plt.bar(bins[:-1], hist/hist.sum(), width=10/1000)
plt.bar(bins[:-1], hist_hat/hist_hat.sum(), alpha=0.5, width=10/1000)
plt.ylim([0, 1e-2]);
plt.xlim([0, 8]);

In [None]:
# Compare the empirical distribution of `y` with 10000 samples drawn from a
# Poisson distribution whose rate equals the mean of `y`.
poisson_dist = poisson(np.mean(y))
y_new = poisson_dist.rvs(size=10000)


# NOTE(review): `bins` is not assigned in any visible cell — it must already
# exist in the kernel for this to run.
hist, _ = np.histogram(y, bins=bins)
hist_new, _ = np.histogram(y_new, bins=bins)
plt.bar(bins[:-1], hist/hist.sum(), width=10/1000)
# Use true division to get per-bin fractions: floor division of the integer
# counts by their total yields zero for every bin that does not hold all samples.
plt.bar(bins[:-1], hist_new/hist_new.sum(), alpha=0.5, width=10/1000)
plt.ylim([0, 1e-2]);
plt.xlim([0, 8]);

In [None]:
# Starting point (r1, p1, r2, p2) for the two-component negative binomial fit.
# The last entry was written as `0.0.001`, which is not a valid literal; 0.001
# matches the value used for the second component when sampling further down.
initial_params = [.2, 0.1, 1.2, 0.001]

# NOTE(review): `minimize` and `negative_binomial` are imported/defined in a
# later cell of this notebook, so that cell has to be run before this one.
result = minimize(negative_binomial, initial_params, args=(y,), method='Nelder-Mead')

In [None]:
# Extract the optimal parameters from the optimizer result and alias `y` as
# `data` (same object, no copy).
r1, p1, r2, p2 = result.x
data = y

In [None]:
from scipy.stats import nbinom

# NOTE(review): this rebinds r1, p1, r2, p2 (overwriting any fitted values) but
# nothing in this cell reads them; the sampling below uses literal parameters.
r1, p1, r2, p2 = (0.1, 0.1, 1.1, 0.1)

# nbinom_dist_1 = nbinom(9.1, 0.6)
# nbinom_dist_2 = nbinom(1.2, .001)
# Sum of draws from two negative binomials, divided by 1000, 10000 samples.
# No random_state is passed, so the draw differs between runs.
y_new = (nbinom.rvs(.2, 0.1, size=10000) + nbinom.rvs(1.2, 0.001, size=10000)) / 1000


# Overlay normalized histograms of `y` and the simulated sample.
# NOTE(review): `bins` is not assigned in any visible cell.
hist, _ = np.histogram(y, bins=bins)
hist_new, _ = np.histogram(y_new, bins=bins)
plt.bar(bins[:-1], hist/hist.sum(), width=10/1000)
plt.bar(bins[:-1], hist_new/hist_new.sum(), alpha=0.5, width=10/1000)
plt.ylim([0, 1e-2]);
plt.xlim([0, 8]);

In [None]:
import numpy as np
from scipy.optimize import minimize
from scipy.stats import nbinom
import matplotlib.pyplot as plt

# Generate sample data
# `data` is an alias of the existing `y` array; nothing is generated here.
data = y

# Define the negative binomial function
def negative_binomial(params, data):
    """Objective for fitting a sum of two negative binomial pmfs to `data`.

    Args:
        params: Sequence of four values ``(r1, p1, r2, p2)``; each ``(r, p)``
            pair parameterizes one ``scipy.stats.nbinom`` distribution.
        data: Values to score; they are multiplied by 1000 before the pmfs
            are evaluated.

    Returns:
        The negated sum, over all entries, of the log of the two pmfs added
        together (unweighted).
    """
    shape_a, prob_a, shape_b, prob_b = params
    # The pmfs are evaluated on the data rescaled by a factor of 1000.
    scaled = 1000 * data
    component_a = nbinom.pmf(scaled, shape_a, prob_a)
    component_b = nbinom.pmf(scaled, shape_b, prob_b)
    log_terms = np.log(component_a + component_b)
    return -log_terms.sum()

# Define the initial values for the parameters, ordered (r1, p1, r2, p2)
initial_params = (0.1, 0.1, 1.1, 0.1)


# Minimize the negative binomial function using the Nelder-Mead algorithm
result = minimize(negative_binomial, initial_params, args=(data,), method='Nelder-Mead')

# Extract the optimal parameters
r1, p1, r2, p2 = result.x

# Plot the histogram of the data (30 bins, normalized to a density)
plt.hist(data, bins=30, density=True, alpha=0.5, label='Data')

# Plot the sum of the negative binomials
# NOTE(review): the objective evaluates the pmfs at 1000 * data, but here they
# are evaluated at integers in [0, data.max()) without that factor and drawn
# over a density histogram of the unscaled data — check that the two curves are
# on comparable scales. The unweighted pmf1 + pmf2 is also not normalized to 1.
x = np.arange(0, data.max())
pmf1 = nbinom.pmf(x, r1, p1)
pmf2 = nbinom.pmf(x, r2, p2)
plt.plot(x, pmf1 + pmf2, 'r-', lw=2, label='Sum of Negative Binomials')

plt.xlabel('x')
plt.ylabel('Probability')
plt.legend()
plt.show()


In [None]:
# Probe whether a variable named `x` exists in the kernel namespace; print it
# if so, otherwise print a message instead of raising.
try:
  print(x)
except NameError:
  print("Variable x is not defined")

In [None]:
import torch

In [None]:
# Scratch check: display the torch.long dtype object.
torch.long

In [None]:
# Scratch check: cast a float scalar tensor (2.3) to the torch.long dtype.
a = torch.tensor(2.3).type(torch.long)

In [None]:
# Shape of the 'embedding' array stored in adata.varm.
adata.varm['embedding'].shape

In [None]:
import matplotlib.pyplot as plt

# Histogram of all embedding entries (2000 bins), zoomed to [-0.5, 0.5].
plt.hist(adata.varm['embedding'].ravel(), bins=2000);
plt.xlim([-0.5, 0.5]);

In [None]:
# Display the raw 'embedding' array from adata.varm.
adata.varm['embedding']

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA with default settings (all components kept) on the embeddings.
# NOTE(review): the fit uses every row of adata.varm['embedding']; no train
# mask is applied here, despite the original "training data" wording.
pca = PCA()
pca.fit(adata.varm['embedding'])


In [None]:
# Project the embeddings onto the fitted principal components.
E = pca.transform(adata.varm['embedding'])

In [None]:
# Cumulative explained-variance ratio as a function of component count.
# NOTE(review): 3042 is a hard-coded x-axis limit — presumably the number of
# components available in this dataset; confirm. No trailing ';' on the last
# line, so its return value is echoed.
plt.plot(np.cumsum(pca.explained_variance_ratio_));
plt.xlim([0, 3042])

In [None]:
# Fraction of variance explained by the first 512 principal components.
# Entry k - 1 of the cumulative sum covers k components, so index 511 is the
# one that matches the `E[:, :512]` slice plotted in the following cell
# (index 512 would cover 513 components).
np.cumsum(pca.explained_variance_ratio_)[511]

In [None]:
# Histogram of the first 512 projected components (2000 bins), zoomed to
# [-1.5, 1.5].
plt.hist(E[:, :512].ravel(), bins=2000);
plt.xlim([-1.5, 1.5]);

In [None]:
# Shape of the projected embedding array.
E.shape

In [1]:
import anndata as ad
from rosa.preprocessing import (
    calculate_gene_embeddings_pca,
)


# Absolute, machine-specific path of the AnnData file that the following cells
# load and later write back to (same value as in the first cell).
EMBEDS_ADATA_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_by_features_with_embeds_new_norm.h5ad"

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [2]:
# Reload the AnnData from disk and pass it through the project PCA helper,
# rebinding `adata` to whatever the helper returns.
# NOTE(review): presumably 256 is the number of components and the helper
# fills varm['embedding_pca'] and uns['embedding_pca'] (both read by later
# cells) — confirm against rosa.
adata = ad.read_h5ad(EMBEDS_ADATA_PT)
adata = calculate_gene_embeddings_pca(adata, 256)

In [3]:
# Display the 'embedding_pca' entry of adata.uns; in the recorded run it is a
# dict with an 'explained_variance' value.
adata.uns["embedding_pca"]

{'explained_variance': 0.918266625236611}

In [4]:
# Write the updated AnnData back to the same path it was loaded from,
# overwriting that file.
adata.write_h5ad(EMBEDS_ADATA_PT)

In [None]:
# Shape of the 'embedding_pca' array stored in adata.varm.
adata.varm['embedding_pca'].shape

In [None]:
# Display the varm mapping to see which arrays it holds.
adata.varm

In [10]:
# Absolute, machine-specific path of another AnnData file. Loading it rebinds
# `adata`, so the previously loaded object is no longer reachable by that name.
ADATA_BULK_PT = "/Users/nsofroniew/Documents/data/multiomics/cell_census/tabula_sapiens_pbulk.h5ad"
adata = ad.read_h5ad(ADATA_BULK_PT)


In [16]:
# Display adata.var re-indexed by its "feature_id" column. The result is not
# assigned, so adata.var itself keeps its current index.
adata.var.set_index("feature_id")

Unnamed: 0_level_0,soma_joinid,feature_name,feature_length
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000121410,0,A1BG,3999
ENSG00000268895,1,A1BG-AS1,3374
ENSG00000118017,10,A4GNT,1779
ENSG00000129968,100,ABHD17A,6163
ENSG00000236469,1000,AC007040.8,570
...,...,...,...
ENSG00000256789,9995,RP11-697H9.2,423
ENSG00000255015,9996,RP11-716H6.1,507
ENSG00000255219,9997,RP11-716H6.2,571
ENSG00000255183,9998,LINC02711,679
