In [1]:
#pip install tf-nightly
#pip install tfp-nightly

pseudobulks

In [1]:
import os 
os.getcwd()

'/home/daveg/Work/cruk/pseudobulks'

In [2]:
#%reset

In [3]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import numpy as np
import anndata
import itertools
import gc
#from diffexpr.py_deseq import py_DESeq2
#from rpy2.robjects import Formula

In [4]:
q = sc.read_h5ad('../atlas/Atlas_adatas_June2021_Atlas_final_May2021.h5ad')

In [82]:
q.obs.columns

Index(['samplename', 'n_genes', 'n_molecules', 'doublet_score', 'percent_mito',
       'leiden', 'louvain', 'diagnosis', 'phase', 'sample_diagnosis',
       'patient', 'treatment', 'procedure', 'hcl_refined', 'hcl_celltype',
       'hcl_score', 'CLid', 'CL_name', 'nobatch_leiden', 'nobatch_louvain',
       'cnv_avg', 'has_cnv'],
      dtype='object')

In [83]:
def build_design(q, qci):
    # build design matrix
    patient_ids = ([x[0:3] for x in qci.obs.samplename])
    full_sample_df = pd.DataFrame({'patient':patient_ids, 'biosample':qci.obs.samplename, 'procedure':qci.obs.procedure, 'n_molecules':qci.obs.n_molecules, 'dx':qci.obs.diagnosis})
        
    # get average n_molecules
    sample_df_molecules=full_sample_df.groupby('biosample').agg(avg_molecules=pd.NamedAgg(column='n_molecules', aggfunc='mean'))
    sample_df_molecules['biosample'] = sample_df_molecules.index
    sample_df_molecules.index = range(0,len(sample_df_molecules))
    
    # get the number of cells from each sample
    cell_counts = pd.DataFrame(full_sample_df.biosample.value_counts())
    cell_counts.columns = ['cell_counts']
    cell_counts['biosample'] = cell_counts.index
    
    # merge in the cell counts and molecules
    full_sample_df = full_sample_df.merge(cell_counts)
    full_sample_df.drop('n_molecules', axis=1, inplace=True)
    
    # the list of biosamples in this cluster
    biosample_list = list(set(full_sample_df.biosample))
    
    # and the order of cells as index
    index = np.array(full_sample_df.biosample.tolist())
    
    # then we make the design matrix
    sample_df = full_sample_df.drop_duplicates()
    sample_df = sample_df.merge(sample_df_molecules)
    sample_df.loc[:,'binned_cell_counts'] = pd.cut(sample_df.cell_counts, bins=[0,8,32,128,512,2048,100000]) #((sample_df.cell_counts - np.mean(sample_df.cell_counts)) / np.std(sample_df.cell_counts))
    
    return( (biosample_list, index, sample_df) )

In [84]:
def build_count_matrix(biosample_list, index, qci, sample_df):
    # sum within samples
    res0 = pd.DataFrame()
    for bsl in biosample_list:
        idx = np.argwhere(index == bsl).flatten()
        mat = qci.X[idx,:].sum(axis=0)
        cnt_sum = mat.flatten().tolist()[0]
        if len(res0) == 0:
            res0 = pd.DataFrame(cnt_sum, columns=[bsl])
        else:
            res0 = res0.join(pd.DataFrame(cnt_sum, columns=[bsl]))
    count_matrix = res0.loc[:, sample_df.biosample.tolist()]
    count_matrix['id'] = qci.var.index.tolist()
    count_matrix.index = qci.var.index
    return(count_matrix)

In [89]:
cellclusters = dict(
    gi_epithelial=['0','3','4','6','8','13','19','20','28','31','33','34','35','36','38','39','40'], # 78049 cells, # 32 is heptoid
    squamous_epithelial=['16','18','21'],
    fibroblasts=['7'],
    myofibroblasts=['12'],
    endothelial=['15','5', '30', '26'],
    neuroendocrine=['17'],
    parietal=['29'],
    stromal=['7','12','15','5', '30', '26'],
    neutrophils=['22'],
    monocytes_macs_DCs=['10'],  # and macs and dcs
    B_cells=['11','23'],
    cd4_Tcells=['2'],
    cd8_Tcells=['1','25'],
    NK_cells=['14'],
    mast_cells=['9'],
    hepatoid=['27'],
    naive_T_cells=['24']
)


In [90]:
for leiden_label in cellclusters.keys():
    print(leiden_label)

gi_epithelial
squamous_epithelial
fibroblasts
myofibroblasts
endothelial
neuroendocrine
parietal
stromal
neutrophils
monocytes_macs_DCs
B_cells
cd4_Tcells
cd8_Tcells
NK_cells
mast_cells
hepatoid
naive_T_cells


In [91]:
clusterlabs = list(set(q.obs.leiden))
clusterlabs.sort()
res_df = pd.DataFrame()
for leiden_label in cellclusters.keys(): #clusterlabs:
    # subset the anndata to this cluster
    print('leiden cluster: ' + leiden_label)
    ### subset data to this cluster
    clusterlabels = cellclusters[leiden_label]
    qci = q[q.obs.leiden.isin(clusterlabels)]
    #qci = qci[qci.obs.diagnosis.isin(['NE', 'NS', 'M', 'D', 'T'])] # use all samples
    ###
    (biosample_list, index, sample_df) = build_design(q, qci)
    # building the pseudobulk count matrix
    count_matrix = build_count_matrix(biosample_list, index, qci, sample_df)
    print('columns in count matrix: ' + str(len(count_matrix.columns)))
    sample_df.binned_cell_counts = [str(x) for x in sample_df.binned_cell_counts]
    #write it out
    count_matrix.to_csv('pseudobulks_cluster_level/'+leiden_label+'_pseudobulk.csv')
    sample_df.to_csv('pseudobulks_cluster_level/'+leiden_label+'_metadata.csv')    
    del qci
    gc.collect()


leiden cluster: gi_epithelial
columns in count matrix: 57
leiden cluster: squamous_epithelial
columns in count matrix: 46
leiden cluster: fibroblasts
columns in count matrix: 57
leiden cluster: myofibroblasts
columns in count matrix: 57
leiden cluster: endothelial
columns in count matrix: 57
leiden cluster: neuroendocrine
columns in count matrix: 47
leiden cluster: parietal
columns in count matrix: 21
leiden cluster: stromal
columns in count matrix: 57
leiden cluster: neutrophils
columns in count matrix: 45
leiden cluster: monocytes_macs_DCs
columns in count matrix: 54
leiden cluster: B_cells
columns in count matrix: 56
leiden cluster: cd4_Tcells
columns in count matrix: 57
leiden cluster: cd8_Tcells
columns in count matrix: 57
leiden cluster: NK_cells
columns in count matrix: 57
leiden cluster: mast_cells
columns in count matrix: 57
leiden cluster: hepatoid
columns in count matrix: 32
leiden cluster: naive_T_cells
columns in count matrix: 54


In [92]:
cellnames = list(set(q.obs.CL_name))
cellnames

['pulmonary ionocyte',
 'pancreatic D cell',
 'respiratory basal cell',
 'neuron',
 'unknown',
 'natural killer cell',
 'pancreatic A cell',
 'hepatocyte',
 'mucous cell of stomach',
 'T cell',
 'parietal cell',
 'neutrophil',
 'chromaffin cell',
 'epithelial cell of stomach',
 'plasmacytoid dendritic cell',
 'lymphocyte',
 'myofibroblast cell',
 'peritubular myoid cell',
 'endothelial cell',
 'goblet cell',
 'antibody secreting cell',
 'fibroblast',
 'pancreatic PP cell',
 'Schwann cell',
 'luminal epithelial cell of mammary gland',
 'smooth muscle cell',
 'macrophage',
 'hematopoietic precursor cell',
 'enterocyte',
 'ciliated columnar cell of tracheobronchial tree',
 'peptic cell',
 'striated muscle cell',
 'granulocyte',
 'pancreatic epsilon cell',
 'mucus neck cell of gastric gland',
 'epithelial cell of esophagus',
 'lung secretory cell',
 'pancreatic ductal cell',
 'B cell',
 'pancreatic stellate cell',
 'mast cell',
 'epithelial cell of lung',
 'dendritic cell']

In [93]:

def buildPM(qci,q,cell_label):
    #qci = qci[qci.obs.diagnosis.isin(['NE', 'NS', 'M', 'D', 'T'])]
    ###
    (biosample_list, index, sample_df) = build_design(q, qci)
    # building the pseudobulk count matrix
    count_matrix = build_count_matrix(biosample_list, index, qci, sample_df)
    sample_df.binned_cell_counts = [str(x) for x in sample_df.binned_cell_counts]
    #write it out
    count_matrix.to_csv('pseudobulks_celltype_level/'+cell_label+'_pseudobulk.csv')
    sample_df.to_csv('pseudobulks_celltype_level/'+cell_label+'_metadata.csv')


res_df = pd.DataFrame()
for cell_label in cellnames: #clusterlabs:
    # subset the anndata to this cluster
    print('celltype: ' + cell_label)
    if cell_label == 'neutrophil':
        ### subset data to this cluster
        print('***monocytes***')
        qci = q[(q.obs.CL_name == cell_label) & (q.obs.leiden == '10')]
        buildPM(qci,q,'monocytes')
        print('***neutrophils***')
        qci = q[(q.obs.CL_name == cell_label) & (q.obs.leiden == '22')]
        buildPM(qci,q,'neutrophils')
    else:
        ### subset data to this cluster
        qci = q[q.obs.CL_name == cell_label]
        buildPM(qci,q,cell_label)
        
    del qci
    gc.collect()


celltype: pulmonary ionocyte
celltype: pancreatic D cell
celltype: respiratory basal cell
celltype: neuron
celltype: unknown
celltype: natural killer cell
celltype: pancreatic A cell
celltype: hepatocyte
celltype: mucous cell of stomach
celltype: T cell
celltype: parietal cell
celltype: neutrophil
***monocytes***
***neutrophils***
celltype: chromaffin cell
celltype: epithelial cell of stomach
celltype: plasmacytoid dendritic cell
celltype: lymphocyte
celltype: myofibroblast cell
celltype: peritubular myoid cell
celltype: endothelial cell
celltype: goblet cell
celltype: antibody secreting cell
celltype: fibroblast
celltype: pancreatic PP cell
celltype: Schwann cell
celltype: luminal epithelial cell of mammary gland
celltype: smooth muscle cell
celltype: macrophage
celltype: hematopoietic precursor cell
celltype: enterocyte
celltype: ciliated columnar cell of tracheobronchial tree
celltype: peptic cell
celltype: striated muscle cell
celltype: granulocyte
celltype: pancreatic epsilon cell

In [94]:
samplenames = list(set(q.obs.samplename))
samplenames

['E26B',
 'E24D',
 'E11C',
 'E07A',
 'E11E',
 'E26D',
 'E24C',
 'E19D',
 'E07D1',
 'E08C',
 'E21C',
 'E26C',
 'E23B',
 'E21E',
 'E20A',
 'E26E',
 'E20D',
 'E11D',
 'E08D',
 'E08B',
 'E23C',
 'E24A',
 'E07C',
 'E07D2',
 'E12C',
 'E19C',
 'E17C',
 'E21A',
 'E23E',
 'E14B',
 'E21F',
 'E08A',
 'E14A',
 'E12B',
 'E17D',
 'E23A',
 'E12D',
 'E19B',
 'E20C',
 'E20B',
 'E14C',
 'E11B',
 'E17E',
 'E26A',
 'E17B',
 'E21B',
 'E17A',
 'E19E',
 'E07B',
 'E14D',
 'E24B',
 'E21D',
 'E12A',
 'E23D',
 'E19F',
 'E19A']

In [95]:


res_df = pd.DataFrame()

for sample_label in samplenames: #clusterlabs:
    # subset the anndata to this cluster
    print('sample name: ' + sample_label)
    ### subset data to this cluster
    qci = q[q.obs.samplename == sample_label]
    mat = qci.X.sum(axis=0)
    cnt_sum = mat.flatten().tolist()[0]
    if len(res_df) == 0:
        res_df = pd.DataFrame(cnt_sum, columns=[sample_label])
    else:
        res_df = res_df.join(pd.DataFrame(cnt_sum, columns=[sample_label]))
    del qci
    gc.collect()

res_df.to_csv('atlas_samplenames_all_genes_pseudobulk.csv')


sample name: E26B
sample name: E24D
sample name: E11C
sample name: E07A
sample name: E11E
sample name: E26D
sample name: E24C
sample name: E19D
sample name: E07D1
sample name: E08C
sample name: E21C
sample name: E26C
sample name: E23B
sample name: E21E
sample name: E20A
sample name: E26E
sample name: E20D
sample name: E11D
sample name: E08D
sample name: E08B
sample name: E23C
sample name: E24A
sample name: E07C
sample name: E07D2
sample name: E12C
sample name: E19C
sample name: E17C
sample name: E21A
sample name: E23E
sample name: E14B
sample name: E21F
sample name: E08A
sample name: E14A
sample name: E12B
sample name: E17D
sample name: E23A
sample name: E12D
sample name: E19B
sample name: E20C
sample name: E20B
sample name: E14C
sample name: E11B
sample name: E17E
sample name: E26A
sample name: E17B
sample name: E21B
sample name: E17A
sample name: E19E
sample name: E07B
sample name: E14D
sample name: E24B
sample name: E21D
sample name: E12A
sample name: E23D
sample name: E19F
sample n

In [96]:
res_df

Unnamed: 0,E26B,E24D,E11C,E07A,E11E,E26D,E24C,E19D,E07D1,E08C,...,E17A,E19E,E07B,E14D,E24B,E21D,E12A,E23D,E19F,E19A
0,1306.0,742.0,744.0,428.0,370.0,2378.0,410.0,1501.0,849.0,1253.0,...,830.0,1179.0,701.0,652.0,815.0,404.0,776.0,402.0,2491.0,1519.0
1,1989.0,206.0,168.0,110.0,49.0,135.0,953.0,597.0,188.0,143.0,...,269.0,337.0,81.0,477.0,55.0,1819.0,49.0,522.0,177.0,170.0
2,3863.0,7503.0,1817.0,207.0,1798.0,30727.0,2525.0,4718.0,3392.0,14001.0,...,5828.0,5712.0,4104.0,10383.0,5875.0,613.0,3152.0,433.0,20421.0,9317.0
3,317.0,136.0,200.0,178.0,4061.0,1344.0,70.0,278.0,129.0,253.0,...,378.0,418.0,166.0,151.0,190.0,95.0,2516.0,103.0,1333.0,423.0
4,22.0,19.0,75.0,5.0,48.0,297.0,0.0,47.0,10.0,78.0,...,13.0,21.0,8.0,16.0,11.0,10.0,63.0,12.0,77.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35602,2021.0,866.0,1591.0,1048.0,352.0,2216.0,837.0,2079.0,764.0,1671.0,...,1025.0,1477.0,706.0,1101.0,1565.0,934.0,688.0,704.0,2920.0,2098.0
35603,3624.0,1609.0,2798.0,646.0,413.0,2414.0,1158.0,2034.0,1142.0,2622.0,...,1698.0,3296.0,1594.0,1441.0,3586.0,1828.0,509.0,1206.0,2184.0,1204.0
35604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
