In [None]:
# conda activate anndata

import numpy as np
import pandas as pd
import anndata as ad

Here I create pseudobulk data per donor and cell type in order to perform differential expression (DE) analysis for each cell type.

In [None]:
import os

# Work from the project directory so the relative "data/..." paths used in
# the following cells resolve against it.
project_dir = "/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM"
os.chdir(project_dir)

In [125]:
# Load the AnnData object used for the rest of the notebook.
# NOTE(review): the file name suggests STAR gene counts processed with scVI — confirm.
h5ad_path = "data/tasic_2018_ALM_STAR_model/tasic_2018_ALM_STAR_gene_counts_scVI.h5ad"
adata = ad.read_h5ad(h5ad_path)

In [None]:
# I want to find DE genes for each cell type. The best way to do this is by
# pseudobulking cell types by donor.

# First, let's see how the cell-type makeup breaks down by donor.
# value_counts() sorts counts in descending order within each donor, so
# head(2) keeps each donor's two most frequent cell subclasses.
subclass_counts = adata.obs.groupby("donor_id")["cell_subclass"].value_counts()
pd.DataFrame(subclass_counts.groupby(level=0).head(2))


Unnamed: 0_level_0,Unnamed: 1_level_0,count
donor_id,cell_subclass,Unnamed: 2_level_1
228567,Sst,39
228567,Pvalb,1
228568,Sst,113
228568,Pvalb,2
228844,L5 IT,37
...,...,...
ANM382288,Astro,0
ANM384860,L5 PT,92
ANM384860,Astro,0
ANM386332,L5 PT,91


In [127]:
# Looks like donor is confounded with cell type (by design): in the counts
# above, each donor's cells fall almost entirely into a single subclass.

# Let's try pseudobulking anyway.

In [None]:
# Build one pseudobulk profile per (cell subclass, donor) pair by summing,
# gene by gene, the values in adata.raw over all cells sharing both labels.
df_list = []    # per cell subclass: table with one row per donor, one column per gene
meta_list = []  # per cell subclass: matching sample annotations (Cell_type, Donor)

for ctype in np.unique(adata.obs['cell_subclass']):
    print(f"Starting {ctype}...")

    # Keep only the cells of this subclass; .copy() turns the view into an
    # independent AnnData object.
    adata_subset = adata[adata.obs['cell_subclass'] == ctype].copy()

    # from_spmatrix below requires a scipy sparse matrix.
    # NOTE(review): presumably .raw holds the unnormalized counts — confirm.
    X = adata_subset.raw.X

    df = pd.DataFrame.sparse.from_spmatrix(
        X,
        index=adata_subset.obs_names,
        columns=adata.raw.var_names
    )

    # observed=True: when donor_id is categorical, only donors that actually
    # have cells of this subclass produce a row. Without it the result depends
    # on unused categories having been pruned from the subset (otherwise
    # absent donors would appear as all-zero samples), and recent pandas
    # versions warn about the implicit default.
    df_bulked = df.groupby(adata_subset.obs['donor_id'], observed=True).sum()

    meta_list.append(pd.DataFrame({
        'Cell_type': ctype,
        'Donor': df_bulked.index.astype(str).values
    }))

    # Sample IDs have the form "<cell type>_<donor>" so that they stay unique
    # once the per-subclass tables are stacked.
    df_bulked.index = ctype + "_" + df_bulked.index.astype(str)
    df_list.append(df_bulked)

In [None]:
# Stack the per-cell-type tables row-wise into one samples-by-genes table,
# and the per-cell-type annotations into one table with a fresh 0..n-1 index.
df_all = pd.concat(df_list)
meta = pd.concat(meta_list, ignore_index=True)

In [None]:
# Clear the index name left on the stacked table, then record each row label
# of df_all as the sample's ID. This relies on meta and df_all having been
# concatenated in the same order.
df_all.index.name = None
meta['Sample_ID'] = df_all.index.to_numpy()

In [None]:
# Save
df_all.T.to_csv("data/tasic_2018_ALM_STAR_pseudobulk.csv")
meta.to_csv("data/tasic_2018_ALM_STAR_pseudobulk_sampleinfo.csv", index=False)

In [None]:
# Uncomment to reload the saved pseudobulk table and sample info from disk:
# pseudobulk = pd.read_csv("data/tasic_2018_ALM_STAR_pseudobulk.csv", index_col=0)
# pseudobulk_meta = pd.read_csv("data/tasic_2018_ALM_STAR_pseudobulk_sampleinfo.csv")