In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
from matplotlib import colors
from matplotlib import patches
import seaborn as sns
import batchglm
import diffxpy.api as de
import patsy as pat
from statsmodels.stats.multitest import multipletests
import logging, warnings
import statsmodels.api as sm

In [2]:
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()
de.__version__

logging.getLogger("tensorflow").setLevel(logging.ERROR)
logging.getLogger("batchglm").setLevel(logging.INFO)
logging.getLogger("diffxpy").setLevel(logging.INFO)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 35)
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

-----
anndata     0.8.0
scanpy      1.8.2
sinfo       0.3.1
-----
PIL                 8.0.1
anndata             0.8.0
asciitree           NA
asttokens           NA
backcall            0.2.0
batchglm            v0.7.4
cairo               1.20.0
cffi                1.14.3
cloudpickle         1.6.0
colorama            0.4.4
comm                0.1.2
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                2.30.0
dateutil            2.8.2
debugpy             1.6.6
decorator           4.4.2
diffxpy             v0.7.4
executing           1.2.0
fasteners           0.19
h5py                3.1.0
igraph              0.9.1
ipykernel           6.21.3
jedi                0.17.2
joblib              1.1.0
kiwisolver          1.3.1
leidenalg           0.8.3
llvmlite            0.34.0
louvain             0.6.1
matplotlib          3.5.1
matplotlib_inline   0.1.6
mpl_toolkits        NA
natsort             7.1.0
numba               0.51.2
numcodecs           0.12.1

In [3]:
sc.settings.n_jobs = 40
sc.set_figure_params(figsize=(4, 4), vector_friendly = True)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

In [4]:
adata = sc.read_h5ad("Data/Output_231123_adata_scvi_random_sampleID_annot2.h5ad")

In [59]:
adata.X = adata.layers["logcounts"]

In [5]:
adata

AnnData object with n_obs × n_vars = 393060 × 49133
    obs: 'batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI', 'Hemisphere', 'Library', 'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'DF_classification', 'cluster_original', 'cluster_main', 'n_genes', 'Stage2', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets', '_scvi_batch', '_scvi_labels', 'leiden_scvi', 'leiden_0.5', 'leiden_0.7', 'leiden_1.0', 'Brain_Region2', 'Brain_Region3', 'Brain_Region_Unit', 'cluster_number', 'Leiden'
    uns: 'Brain_Region3_colors', 'Brain_Region_colors', 'Leiden_colors', 'Stage2_colors', 'Stage_colors', 'cluster_main2_colors', 'cluster_main_colors', 'leiden_scvi_colors', 'log1p'
    obsm: 'X_scVI_sampleID', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts', 'logcounts', 'scaled'

In [11]:
adata.obs["Stage2"].value_counts()

Adult (40-60Y)           74482
Fetal (1st trimester)    68108
Adult (60-80Y)           55500
Adult (20-40Y)           45656
Neonatal                 37254
Fetal (2nd trimester)    35183
Childhood (1-6Y)         30726
Adolescence (12-20Y)     14010
Childhood (6-12Y)        13739
Adult (>80Y)             11967
Fetal (3rd trimester)     6435
Name: Stage2, dtype: int64

In [12]:
adata.obs["Stage3"] =  adata.obs['Stage2'].astype(str)

adata.obs.loc[(adata.obs['Stage2'] == 'Fetal (1st trimester)'), 'Stage3'] = 'Fetal'
adata.obs.loc[(adata.obs['Stage2'] == 'Fetal (2nd trimester)'), 'Stage3'] = 'Fetal'
adata.obs.loc[(adata.obs['Stage2'] == 'Fetal (3rd trimester)'), 'Stage3'] = 'Fetal'

adata.obs.loc[(adata.obs['Stage2'] == 'Childhood (1-6Y)'), 'Stage3'] = 'Childhood'
adata.obs.loc[(adata.obs['Stage2'] == 'Childhood (6-12Y)'), 'Stage3'] = 'Childhood'

adata.obs.loc[(adata.obs['Stage2'] == 'Adolescence (12-20Y)'), 'Stage3'] = 'Adolescence'

adata.obs.loc[(adata.obs['Stage2'] == 'Adult (20-40Y)'), 'Stage3'] = 'Adult'
adata.obs.loc[(adata.obs['Stage2'] == 'Adult (40-60Y)'), 'Stage3'] = 'Adult'
adata.obs.loc[(adata.obs['Stage2'] == 'Adult (60-80Y)'), 'Stage3'] = 'Adult'
adata.obs.loc[(adata.obs['Stage2'] == 'Adult (>80Y)'), 'Stage3'] = 'Adult'

adata.obs.Stage3.value_counts()

Adult          187605
Fetal          109726
Childhood       44465
Neonatal        37254
Adolescence     14010
Name: Stage3, dtype: int64

# C11 Subset

In [60]:
adata1 = adata[adata.obs["cluster_number"] == "C11"].copy()

In [61]:
adata1.obs['total_counts_scaled'] = adata1.obs['total_counts']/adata1.obs['total_counts'].mean()

## ESR1

In [62]:
formula = "1 + Sex + Stage3 + Brain_Region + n_genes_by_counts + pct_counts_mt"

In [63]:
adata1.obs["Stage3"].value_counts()

Adult          7866
Neonatal       2334
Childhood      1641
Fetal           774
Adolescence     281
Name: Stage3, dtype: int64

In [64]:
adata1.obs["Dataset"].value_counts()

Herring     5372
AllenM1     4175
Turecki     1632
ZhangPD     1123
Morabito     311
Hardwick     280
Braun          3
Name: Dataset, dtype: int64

In [65]:
adata1.obs["Sex"].value_counts()

M          7907
F          4986
Unknown       3
Name: Sex, dtype: int64

In [66]:
adata1.obs["Brain_Region"].value_counts()

BA9                  7106
M1                   4175
BA8                   667
BA46                  415
prefrontal cortex     311
BA10                  219
Brain                   3
Name: Brain_Region, dtype: int64

In [67]:
adata1 = adata1[(adata1.obs["Brain_Region"] != "Brain") & (adata1.obs["Brain_Region"] != "prefrontal cortex")]

In [68]:
dmat1 = de.utils.design_matrix(
    data=adata1,
    formula="~" + formula,
    as_numeric=["n_genes_by_counts", "pct_counts_mt"],
    return_type="patsy"
)

In [69]:
if np.linalg.matrix_rank(np.asarray(dmat1[0])) < np.min(dmat1[0].shape):
        print(f'Cannot test as design matrix is not full rank.')

In [70]:
np.linalg.matrix_rank(np.asarray(dmat1[0]))

12

In [71]:
np.min(dmat1[0].shape)

12

In [72]:
adata1 = adata1[:, 'ESR1']

In [73]:
adata1

View of AnnData object with n_obs × n_vars = 12582 × 1
    obs: 'batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI', 'Hemisphere', 'Library', 'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'DF_classification', 'cluster_original', 'cluster_main', 'n_genes', 'Stage2', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets', '_scvi_batch', '_scvi_labels', 'leiden_scvi', 'leiden_0.5', 'leiden_0.7', 'leiden_1.0', 'Brain_Region2', 'Brain_Region3', 'Brain_Region_Unit', 'cluster_number', 'Leiden', 'Stage3', 'total_counts_scaled'
    uns: 'Brain_Region3_colors', 'Brain_Region_colors', 'Leiden_colors', 'Stage2_colors', 'Stage_colors', 'cluster_main2_colors', 'cluster_main_colors', 'leiden_scvi_colors', 'log1p'
    obsm: 'X_scVI_sampleID', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts', 'logcounts', 'scaled'

# GLM model

In [74]:
for i, gene in enumerate(adata1.var_names):
    # Specify model
    pois_model = sm.GLM(
        endog=adata1.X[:, i].todense(), #[idx_train, :], 
        exog=dmat1[0], 
        offset=np.log(adata1.obs['total_counts_scaled'].values),
        family=sm.families.Poisson(),   
    )
    # Fit the model
    pois_results1 = pois_model.fit() # i enumerate 하지 않으면 error가 발생.

    # Get the covariance matrix
    cov_mat1 = pois_results1.cov_params()

In [75]:
p_val = pois_results1.pvalues
p_val

array([5.76092690e-007, 2.02425058e-002, 3.42365261e-002, 2.21229220e-005,
       1.82171745e-013, 2.40695049e-007, 3.92038436e-015, 8.10610249e-005,
       3.20724819e-001, 5.14164670e-097, 2.30223666e-115, 8.01752419e-015])

In [76]:
formatted_p_values = [format(p, '.4f') for p in p_val]
formatted_p_values

['0.0000',
 '0.0202',
 '0.0342',
 '0.0000',
 '0.0000',
 '0.0000',
 '0.0000',
 '0.0001',
 '0.3207',
 '0.0000',
 '0.0000',
 '0.0000']

In [77]:
summary_with_names1 = pois_results1.summary(xname=dmat1[1])
summary_with_names1

0,1,2,3
Dep. Variable:,y,No. Observations:,12582.0
Model:,GLM,Df Residuals:,12570.0
Model Family:,Poisson,Df Model:,11.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4667.9
Date:,"Thu, 07 Dec 2023",Deviance:,4937.0
Time:,13:17:58,Pearson chi2:,8510.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.6252,0.125,4.999,0.000,0.380,0.870
Sex[T.M],0.1633,0.070,2.322,0.020,0.025,0.301
Stage3[T.Adult],-0.2364,0.112,-2.117,0.034,-0.455,-0.018
Stage3[T.Childhood],0.4943,0.117,4.242,0.000,0.266,0.723
Stage3[T.Fetal],-2.6373,0.358,-7.361,0.000,-3.339,-1.935
Stage3[T.Neonatal],-0.6607,0.128,-5.165,0.000,-0.911,-0.410
Brain_Region[T.BA9],-0.5949,0.076,-7.857,0.000,-0.743,-0.446
Brain_Region[T.BA10],0.4633,0.118,3.941,0.000,0.233,0.694
Brain_Region[T.BA46],0.0889,0.089,0.993,0.321,-0.087,0.264


In [83]:
res = (summary_with_names1.tables[1])
res = pd.DataFrame(res[1:], columns=res[0])

round_scientific = lambda x: '{:.2e}'.format(x)
res["pvalue"] = pd.Series(p_val).apply(round_scientific)
res

Unnamed: 0,Unnamed: 1,coef,std err,z,P>|z|,[0.025,0.975],pvalue
0,Intercept,0.6252,0.125,4.999,0.0,0.38,0.87,5.76e-07
1,Sex[T.M],0.1633,0.07,2.322,0.02,0.025,0.301,0.0202
2,Stage3[T.Adult],-0.2364,0.112,-2.117,0.034,-0.455,-0.018,0.0342
3,Stage3[T.Childhood],0.4943,0.117,4.242,0.0,0.266,0.723,2.21e-05
4,Stage3[T.Fetal],-2.6373,0.358,-7.361,0.0,-3.339,-1.935,1.82e-13
5,Stage3[T.Neonatal],-0.6607,0.128,-5.165,0.0,-0.911,-0.41,2.41e-07
6,Brain_Region[T.BA9],-0.5949,0.076,-7.857,0.0,-0.743,-0.446,3.92e-15
7,Brain_Region[T.BA10],0.4633,0.118,3.941,0.0,0.233,0.694,8.11e-05
8,Brain_Region[T.BA46],0.0889,0.089,0.993,0.321,-0.087,0.264,0.321
9,Brain_Region[T.M1],-2.5529,0.122,-20.902,0.0,-2.792,-2.314,5.14e-97


In [84]:
res.to_csv("Summary_ESR1_GLM_forshare.csv")