In [1]:
import pandas as pd
from ALLCools.mcds import MCDS
from scipy.stats import f_oneway, ttest_ind, kruskal
import joblib
import anndata
from wmb import mm10
import pingouin as pg
import xarray as xr
import joblib
import numpy as np

In [2]:
# Gene name <-> gene ID lookup tables, stored on disk as joblib dumps.
gene_name_to_id, gene_id_to_name = (
    joblib.load(dump_path)
    for dump_path in (
        '240112_DMG_CEF/gene_name_to_id_dict',
        '240112_DMG_CEF/gene_id_to_name_dict',
    )
)

  **kwargs
  **kwargs


1. Calculate a per-gene p-value across the age groups with `f_oneway` (one-way ANOVA).
2. Apply an FDR correction to the p-values.

In [3]:
# Run parameters: `_gender` picks the input files ('Female' or 'Male');
# `mc_type` is the methylation context selected from the MCDS.
_gender, mc_type = 'Female', 'CHN'

In [4]:
# Select the input files for the chosen sex. Each branch sets:
#   mcds_paths - path of the gene-level MCDS to open
#   var_dim    - name of the gene dimension inside that MCDS
#   meta       - metadata table, indexed by the first CSV column
#   use_cts    - joblib-stored collection of AgingMajorType labels to keep
if _gender == 'Female':
    mcds_paths = "Female.genes_frac.mcds"
    var_dim = "gene"
    meta = pd.read_csv('/home/qzeng/project/aging/metadata/240104_m3C_META.csv', index_col = 0)
    use_cts = joblib.load("/home/qzeng/project/aging/metadata/m3c_use_cts")
elif _gender == 'Male':
    mcds_paths = "Male.genes_frac.mcds"
    var_dim = "geneslop2k"
    meta = pd.read_csv('/home/qzeng/project/aging/metadata/240104_mC_META.csv', index_col = 0)
    use_cts = joblib.load("/home/qzeng/project/aging/metadata/mc_use_cts")
else:
    # Fail here with a clear message rather than with a NameError on `meta`
    # or `mcds_paths` in a later cell.
    raise ValueError(f"Unsupported _gender {_gender!r}; expected 'Female' or 'Male'")

In [5]:
# Keep only the rows whose AgingMajorType is one of the selected cell types.
use_meta = meta.loc[meta['AgingMajorType'].isin(use_cts)]

In [6]:
# Open the MCDS restricted to the selected observations, keep a single
# methylation context, and load the result into memory.
mcds = (
    MCDS.open(mcds_paths, var_dim=var_dim, use_obs=use_meta.index)
    .sel(mc_type=mc_type)
    .load()
)

In [7]:
# Attach the metadata columns used for grouping as coordinates on the dataset.
for meta_column in ('AgingMajorType', 'Age'):
    mcds.coords[meta_column] = use_meta[meta_column]

In [8]:
# Index of the gene dimension; used below for the Xist membership check and
# to label the per-gene results.
genes = mcds.get_index(var_dim)

In [9]:
# Sanity check: the ID that gene_name_to_id gives for 'Xist' should be
# present in the gene index.
gene_name_to_id['Xist'] in genes

True

In [10]:
def _anova_across_ages(gene_frac, cell_age, age_groups=('8wk', '9mo', '18mo')):
    """Test every gene for a difference between age groups within one cell type.

    Parameters
    ----------
    gene_frac : pandas.DataFrame
        Cells x genes matrix of the values to compare.
    cell_age : pandas.Series
        Age label of each cell, indexed like the rows of ``gene_frac``.
    age_groups : sequence of str
        Age labels whose cells form the samples of the one-way ANOVA.

    Returns
    -------
    pandas.DataFrame
        One row per gene with the columns ``pvalue`` (from ``f_oneway``),
        ``corrected_pvalue`` (from ``pg.multicomp`` with ``method="fdr_by"``)
        and one mean column per age label present in ``cell_age``.
    """
    # One sample (cells x genes) per age group; the test runs per gene column.
    samples = [gene_frac.loc[cell_age[cell_age == age].index] for age in age_groups]
    _, p_value = f_oneway(*samples, axis=0)

    # Label the p-values with the columns they were computed from instead of
    # relying on a global index happening to be in the same order.
    gene_stats = pd.DataFrame({"pvalue": p_value}, index=gene_frac.columns)

    # fdr correction
    _, pvals_corr = pg.multicomp(gene_stats['pvalue'], method="fdr_by")
    gene_stats['corrected_pvalue'] = pvals_corr

    # Group by the aligned age labels directly, so no non-gene 'Age' column
    # is inserted into the gene matrix.
    mean_by_age = gene_frac.groupby(cell_age).mean().T
    mean_by_age.columns = [f'{age}' for age in mean_by_age.columns]

    return pd.concat([gene_stats, mean_by_age], axis=1)


# Parallel lists consumed by the next cell: df_list[i] holds the result table
# for cell_types[i].
df_list = []
cell_types = []

for mt, tmp_mcds in mcds.groupby('AgingMajorType'):
    tmp_meta = tmp_mcds['Age'].to_pandas()
    tmp_gene_frac = tmp_mcds[f"{var_dim}_da_frac"].to_pandas()

    df_list.append(_anova_across_ages(tmp_gene_frac, tmp_meta))
    cell_types.append(mt)
    print(mt)

Astro-NT NN
Astro-TE NN
CA1-ProS Glut
CA3 Glut
CEA-BST Gaba
CS-PRNr-DR En1 Sox2 Gaba
DG Glut
DG-PIR Ex IMN
L2/3 IT CTX Glut
L2/3 IT ENT Glut
L2/3 IT PPP Glut
L4/5 IT CTX Glut
L5 ET CTX Glut
L5 IT CTX Glut
L5 NP CTX Glut
L6 CT CTX Glut
L6 IT CTX Glut
L6b/CT ENT Glut
LA-BLA-BMA-PA Glut
LDT-PCG-CS Gata3 Lhx1 Gaba
Lamp5 Gaba
MEA-BST Gaba
MEA-COA Glut
MRN-VTN-PPN Pax5 Cdh23 Gaba
Microglia NN
OPC NN
Oligo NN
PAG Glut
PAG-PPN Pax5 Sox21 Gaba
PB Evx2 Glut
Pvalb Gaba
STR D1 Gaba
STR D1 Sema5a Gaba
STR D2 Gaba
STR Gaba
STR-PAL Chst9 Gaba
SUB-ProS Glut
Sst Gaba
VLMC NN
Vip Gaba


In [11]:
# Wrap each per-cell-type table in a (1, genes, columns) DataArray labelled
# with its cell type, concatenate them along "cell_type", and write to zarr.
data_arrays = [
    xr.DataArray(
        np.array([result_table]),
        dims=['cell_type', 'genes', 'columns'],
        coords={
            'cell_type': [type_label],
            'genes': result_table.index,
            'columns': result_table.columns,
        },
    )
    for type_label, result_table in zip(cell_types, df_list)
]
stacked_results = xr.concat(data_arrays, dim="cell_type")
total_data = xr.Dataset({"anova_results": stacked_results})
total_data.to_zarr(f'{mc_type}.{_gender}.AgingMajorType.Anova.Result.zarr', mode='w')

<xarray.backends.zarr.ZarrStore at 0x7f34c4b107d0>