In [1]:
# conda activate anndata

import numpy as np
import pandas as pd
import anndata as ad

Here I create pseudobulk data per donor and cell type so that DE analysis can be performed for each cell type.

In [2]:
import os
# Work from the project directory so the relative "data/..." paths used below resolve.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
os.chdir("/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM")

In [3]:
# Load the AnnData object; the cells below use its obs columns 'cell_subclass' and
# 'donor_id' and the matrix stored under `.raw`.
adata = ad.read_h5ad("data/tasic_2018_ALM_STAR_model/tasic_2018_ALM_STAR_gene_counts_scVI.h5ad")

In [4]:
# # I want to find DE genes for each cell type. The best way to do this is by pseudobulking cell types by donor.

# # First let's see how the makeup of cell types breaks down by donor:

# pd.DataFrame(adata.obs.groupby("donor_id")["cell_subclass"].value_counts().groupby(level=0).head(1))

In [5]:
# Looks like donor is confounded with cell type (by design). 

# Let's try pseudobulking anyways.

In [6]:
# Cell subclass used for the single-subclass walkthrough in the cells that follow.
ctype = "Pvalb"

In [7]:
# Keep only the cells annotated with the chosen subclass, as an independent copy.
adata_subset = adata[adata.obs['cell_subclass'] == ctype].copy()
# Matrix stored under `.raw` for those cells (presumably unnormalized counts — TODO confirm).
X = adata_subset.raw.X

# (n_cells, n_genes) for this subclass
X.shape

(896, 40301)

In [8]:
# Wrap the sparse matrix in a labelled DataFrame: rows are cells, columns are genes.
# The gene labels are taken from `adata_subset.raw`, the same object `X` was read
# from, so the labels and the matrix cannot drift apart.
df = pd.DataFrame.sparse.from_spmatrix(
    X,
    index=adata_subset.obs_names,
    columns=adata_subset.raw.var_names,
)

In [9]:
df.head()

Unnamed: 0,4933401J01Rik,Gm26206,Xkr4,Gm18956,Gm37180,Gm37363,Gm37686,Gm1992,Gm37329,Gm7341,...,mt-Nd4,mt-Th,mt-Ts2,mt-Tl2,mt-Nd5,mt-Nd6,mt-Te,mt-Cytb,mt-Tt,mt-Tp
SRR7312420,0,0,2.0,5.0,21.0,0.0,0.0,0,0,0,...,715.0,0,0,0,771.0,128.0,0.0,2601.0,0,1.0
SRR7312429,0,0,57.0,0.0,0.0,8.0,0.0,0,0,0,...,888.0,0,0,0,918.0,185.0,0.0,3512.0,0,0.0
SRR7312533,0,0,0.0,0.0,0.0,1.0,0.0,0,0,0,...,1390.0,0,0,0,1375.0,296.0,1.0,3638.0,0,1.0
SRR7312534,0,0,2.0,0.0,0.0,1.0,0.0,0,0,0,...,577.0,0,0,0,442.0,132.0,0.0,2028.0,0,1.0
SRR7312535,0,0,6.0,0.0,0.0,3.0,11.0,0,0,0,...,804.0,0,0,0,810.0,145.0,1.0,3260.0,0,1.0


In [64]:
# Per-gene total: sum of each column of `df` over all of its rows (cells).
gene_cts = df.sum(axis=0)

In [69]:
# Keep only the genes whose summed count is positive.
# NOTE(review): this overwrites `df`, so `gene_cts` no longer lines up with its
# columns afterwards — rebuild `df` before re-running this cell.
df = df.loc[:, gene_cts > 0]

In [70]:
df.shape

(896, 33534)

In [71]:
# Pearson correlation between every pair of rows of `df` (i.e. between cells).
corr_mat = np.corrcoef(df)

# Find the pair of cells with the smallest correlation and report its (row, column) position.
flat_idx = corr_mat.argmin()
row_idx, col_idx = np.unravel_index(flat_idx, corr_mat.shape)
print(row_idx, col_idx)

775 870


In [72]:
# Row label (cell ID) of the first member of the least-correlated pair.
df.index[row_idx]

'SRR7317735'

In [73]:
# Position of the cell whose mean correlation with all cells (itself included) is lowest.
np.argmin(np.mean(corr_mat, axis=0))

775

In [76]:
# Mean correlation of the cell at `row_idx` with all cells (itself included).
np.mean(corr_mat, axis=0)[row_idx]

0.48168652955452684

In [None]:
# TODO: remove samples that are weakly correlated with the other samples of the same cell type

775

In [84]:
# Row labels (cells) belonging to each donor. With a categorical donor_id,
# observed=True leaves out donors that have no cells in this subset.
groups = df.groupby(adata_subset.obs['donor_id'], observed=True).groups

# Collapse each donor's cells into one per-gene sum, in the same order as `groups`.
bulked = []
for donor in groups:
    print(donor)
    bulked.append(df.loc[groups[donor]].sum())

228567
228568
231501
235087
236444
236445
241108
241109
242140
242963
242964
243795
244400
245825
245826
246135
246257
247662
247666
248192
248193
248194
249705
250737
252097
252926
254187
254188
259268
260510
262848
264921
273951
277143
278730
290917
296791
297025
301139
307570
309437
311661
341959
345741
350651
353652
359707
363499


In [96]:
# Donor IDs in the order the per-donor sums were appended to `bulked`.
column_names = list(groups)

# Put the per-donor sums side by side (genes x donors) and label each column with its donor.
df_bulked = pd.concat(bulked, axis=1).set_axis(column_names, axis=1)

In [97]:
df_bulked.head()

Unnamed: 0,228567,228568,231501,235087,236444,236445,241108,241109,242140,242963,...,301139,307570,309437,311661,341959,345741,350651,353652,359707,363499
4933401J01Rik,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
Gm26206,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Xkr4,0,36.0,332.0,142.0,341.0,255.0,198.0,19.0,1193.0,580.0,...,1397.0,96.0,7.0,414.0,83.0,38.0,297.0,69.0,30.0,64.0
Gm18956,0,0.0,26.0,0.0,9.0,13.0,0.0,0.0,2.0,0.0,...,83.0,0.0,3.0,23.0,0.0,0.0,7.0,0.0,0.0,0.0
Gm37180,0,0.0,152.0,192.0,252.0,184.0,56.0,0.0,589.0,123.0,...,821.0,1.0,0.0,212.0,85.0,8.0,105.0,7.0,6.0,0.0


In [123]:
# Donor-by-donor correlation of the pseudobulk profiles (one column of df_bulked per donor).
# NOTE: this reuses the name `corr_mat`, replacing the cell-level matrix computed earlier.
corr_mat = df_bulked.corr()

# Average correlation of each donor with every donor; the matrix is symmetric,
# so averaging down the columns or across the rows gives the same values.
col_means = corr_mat.mean()
col_means.sort_values()

228568    0.805597
228567    0.840690
252097    0.904633
245825    0.909584
236445    0.914508
363499    0.921279
245826    0.930034
252926    0.931323
246257    0.934918
243795    0.935696
297025    0.938313
235087    0.939542
290917    0.940643
254188    0.941474
264921    0.943029
231501    0.943631
254187    0.944390
307570    0.945447
345741    0.949012
241108    0.950581
353652    0.951718
309437    0.952175
260510    0.952689
278730    0.952921
259268    0.953365
242963    0.953709
241109    0.957064
359707    0.957889
350651    0.958233
236444    0.958295
242964    0.959639
262848    0.959837
242140    0.960339
244400    0.960763
247666    0.961112
311661    0.961634
341959    0.962106
273951    0.963044
296791    0.963172
249705    0.964731
277143    0.964912
246135    0.965123
248194    0.965675
250737    0.966105
247662    0.966145
248193    0.966221
248192    0.966306
301139    0.966380
dtype: float64

In [114]:
# Per-donor pseudobulk in a single groupby (donors x genes).
# observed=True keeps only donors that actually have cells in this subset and avoids
# the pandas FutureWarning about the changing default of `observed`.
df_bulked_v2 = df.groupby(adata_subset.obs['donor_id'], observed=True).sum()

  df_bulked_v2 = df.groupby(adata_subset.obs['donor_id']).sum()


In [116]:
df_bulked_v2.T.head()

donor_id,228567,228568,231501,235087,236444,236445,241108,241109,242140,242963,...,301139,307570,309437,311661,341959,345741,350651,353652,359707,363499
4933401J01Rik,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
Gm26206,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Xkr4,0,36.0,332.0,142.0,341.0,255.0,198.0,19.0,1193.0,580.0,...,1397.0,96.0,7.0,414.0,83.0,38.0,297.0,69.0,30.0,64.0
Gm18956,0,0.0,26.0,0.0,9.0,13.0,0.0,0.0,2.0,0.0,...,83.0,0.0,3.0,23.0,0.0,0.0,7.0,0.0,0.0,0.0
Gm37180,0,0.0,152.0,192.0,252.0,184.0,56.0,0.0,589.0,123.0,...,821.0,1.0,0.0,212.0,85.0,8.0,105.0,7.0,6.0,0.0


In [122]:
corr_mat_pd = df_bulked_v2.T.corr()   # shape: (n_donors, n_donors); donors are the columns after .T

col_means_pd = corr_mat_pd.mean(axis=1)   # or axis=0, same here since it's symmetric
col_means_pd.sort_values()

donor_id
228568    0.805597
228567    0.840690
252097    0.904633
245825    0.909584
236445    0.914508
363499    0.921279
245826    0.930034
252926    0.931323
246257    0.934918
243795    0.935696
297025    0.938313
235087    0.939542
290917    0.940643
254188    0.941474
264921    0.943029
231501    0.943631
254187    0.944390
307570    0.945447
345741    0.949012
241108    0.950581
353652    0.951718
309437    0.952175
260510    0.952689
278730    0.952921
259268    0.953365
242963    0.953709
241109    0.957064
359707    0.957889
350651    0.958233
236444    0.958295
242964    0.959639
262848    0.959837
242140    0.960339
244400    0.960763
247666    0.961112
311661    0.961634
341959    0.962106
273951    0.963044
296791    0.963172
249705    0.964731
277143    0.964912
246135    0.965123
248194    0.965675
250737    0.966105
247662    0.966145
248193    0.966221
248192    0.966306
301139    0.966380
dtype: float64

In [124]:
corr_mat_pd

donor_id,228567,228568,231501,235087,236444,236445,241108,241109,242140,242963,...,301139,307570,309437,311661,341959,345741,350651,353652,359707,363499
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
228567,1.0,0.91337,0.864833,0.932839,0.809096,0.723803,0.861707,0.858837,0.855031,0.804028,...,0.863778,0.79692,0.858216,0.847946,0.837099,0.793841,0.855771,0.838196,0.815268,0.740514
228568,0.91337,1.0,0.803981,0.894153,0.755161,0.671888,0.828453,0.825714,0.800473,0.743549,...,0.821568,0.804617,0.854378,0.809682,0.806249,0.786542,0.819172,0.802338,0.775416,0.73217
231501,0.864833,0.803981,1.0,0.938446,0.975302,0.937889,0.947362,0.952274,0.984451,0.973953,...,0.958289,0.912051,0.923069,0.94256,0.941237,0.925408,0.93026,0.929784,0.940431,0.881603
235087,0.932839,0.894153,0.938446,1.0,0.931939,0.870593,0.945337,0.955897,0.948071,0.920684,...,0.97007,0.924779,0.96139,0.959827,0.957675,0.917162,0.964495,0.952766,0.943715,0.882943
236444,0.809096,0.755161,0.975302,0.931939,1.0,0.965696,0.956363,0.966603,0.984519,0.992452,...,0.97903,0.95271,0.948301,0.971202,0.973352,0.958211,0.959987,0.960997,0.975817,0.927568
236445,0.723803,0.671888,0.937889,0.870593,0.965696,1.0,0.903777,0.923937,0.94747,0.951502,...,0.92537,0.918052,0.895438,0.918684,0.929056,0.917943,0.91102,0.902756,0.933953,0.900656
241108,0.861707,0.828453,0.947362,0.945337,0.956363,0.903777,1.0,0.971922,0.957934,0.954999,...,0.96742,0.95056,0.9568,0.96748,0.963259,0.950143,0.963464,0.943715,0.956634,0.918438
241109,0.858837,0.825714,0.952274,0.955897,0.966603,0.923937,0.971922,1.0,0.966897,0.959834,...,0.976164,0.953596,0.966994,0.976516,0.970666,0.957545,0.967741,0.954785,0.963536,0.937641
242140,0.855031,0.800473,0.984451,0.948071,0.984519,0.94747,0.957934,0.966897,1.0,0.984277,...,0.977073,0.947181,0.952048,0.967375,0.965805,0.957141,0.959431,0.95976,0.965776,0.917583
242963,0.804028,0.743549,0.973953,0.920684,0.992452,0.951502,0.954999,0.959834,0.984277,1.0,...,0.972342,0.942361,0.936217,0.964279,0.964709,0.957212,0.952897,0.953656,0.966419,0.921587


In [None]:
# Pseudobulk every cell subclass: for each subclass, sum the `.raw` matrix over the
# cells of each donor. `df_list` collects one (donor x gene) table per subclass and
# `meta_list` the matching (Cell_type, Donor) rows, in the same order.
# NOTE: the loop reuses the names `ctype`, `adata_subset`, `X`, `df` and `df_bulked`
# from the single-subclass cells above, so their earlier values are overwritten.
df_list = []
meta_list = []

for ctype in np.unique(adata.obs['cell_subclass']):
    print(f"Starting {ctype}...")

    adata_subset = adata[adata.obs['cell_subclass'] == ctype].copy()

    X = adata_subset.raw.X

    # Matrix and gene labels both come from `adata_subset.raw`, so they always match.
    df = pd.DataFrame.sparse.from_spmatrix(
        X,
        index=adata_subset.obs_names,
        columns=adata_subset.raw.var_names,
    )

    # observed=True: only donors with at least one cell of this subclass get a row
    # (no all-zero pseudobulk samples for a categorical donor_id) and the pandas
    # FutureWarning about the default of `observed` is avoided.
    df_bulked = df.groupby(adata_subset.obs['donor_id'], observed=True).sum()

    # Donor labels as strings, shared by the metadata rows and the sample IDs below.
    donor_ids = df_bulked.index.astype(str)

    meta_list.append(pd.DataFrame({
        'Cell_type': ctype,
        'Donor': donor_ids.values
    }))

    # Sample ID = "<subclass>_<donor>", which keeps rows distinct across subclasses.
    df_bulked.index = ctype + "_" + donor_ids
    df_list.append(df_bulked)

In [None]:
# Stack the per-subclass pseudobulk tables row-wise (one row per subclass/donor sample).
df_all = pd.concat(df_list)

# Stack the matching metadata rows and renumber them 0..n-1.
meta = pd.concat(meta_list, ignore_index=True)

In [None]:
# Attach the pseudobulk row labels as sample IDs. This relies on `meta` and `df_all`
# having been built in the same order (both are filled by the same loop).
meta['Sample_ID'] = df_all.index.values
# Clear the index name (presumably 'donor_id' left over from the groupby — TODO confirm).
df_all.index.name = None

In [None]:
# Save the pseudobulk matrix transposed (genes as rows, samples as columns)
# and the sample metadata table (one row per sample, no index column).
df_all.T.to_csv("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk.csv")
meta.to_csv("data/tasic_2018_ALM_STAR_donor_cell_type_pseudobulk_sampleinfo.csv", index=False)