In [1]:
#pip install tf-nightly
#pip install tfp-nightly

pseudobulks

In [2]:
#%reset

In [3]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import numpy as np
import anndata
import itertools
import gc
from diffexpr.py_deseq import py_DESeq2
from rpy2.robjects import Formula

In [4]:
q = sc.read_h5ad('../../atlas/Atlas_adatas_June2021_Atlas_final_May2021.h5ad')

In [5]:
def build_design(q, qci):
    # build design matrix
    patient_ids = ([x[0:3] for x in qci.obs.samplename])
    full_sample_df = pd.DataFrame({'patient':patient_ids, 'biosample':qci.obs.samplename, 'dx':qci.obs.diagnosis})
    # get the number of cells from each sample
    cell_counts = pd.DataFrame(full_sample_df.biosample.value_counts())
    cell_counts.columns = ['cell_counts']
    cell_counts['biosample'] = cell_counts.index
    # merge in the cell counts
    full_sample_df = full_sample_df.merge(cell_counts)
    # the list of biosamples in this cluster
    biosample_list = list(set(full_sample_df.biosample))
    # and the order of cells as index
    index = np.array(full_sample_df.biosample.tolist())
    # then we make the design matrix
    sample_df = full_sample_df.drop_duplicates()
    sample_df.loc[:,'binned_cell_counts'] = pd.cut(sample_df.cell_counts, bins=[0,5,10,20,40,80,160,100000]) #((sample_df.cell_counts - np.mean(sample_df.cell_counts)) / np.std(sample_df.cell_counts))
    return( (biosample_list, index, sample_df) )

In [6]:
def build_count_matrix(biosample_list, index, qci, sample_df):
    # sum within samples
    res0 = pd.DataFrame()
    for bsl in biosample_list:
        idx = np.argwhere(index == bsl).flatten()
        mat = qci.X[idx,:].sum(axis=0)
        cnt_sum = mat.flatten().tolist()[0]
        if len(res0) == 0:
            res0 = pd.DataFrame(cnt_sum, columns=[bsl])
        else:
            res0 = res0.join(pd.DataFrame(cnt_sum, columns=[bsl]))
    count_matrix = res0.loc[:, sample_df.biosample.tolist()]
    count_matrix['id'] = qci.var.index.tolist()
    count_matrix.index = qci.var.index
    return(count_matrix)

In [7]:
# batch
cellclusters = dict(
    epithelial=['0','3','4','6','8','13','19','20','28','31','33','34','35','36','38','39','40'], # 78049 cells, # 32 is heptoid
    squamous=['16','18','21'],
    fibroblasts=['7'],
    myofibroblasts=['15'],
    endothelial=['12','15','5', '30', '26'],
    stromal=['7','12','15','5', '30', '26'],
    neutrophils=['22'],
    Bcells=['11','23'],  # 23 plasma?
    monocytes=['10'],  # and macs and dcs
    cd4_Tcells=['2'],
    cd8_Tcells=['1','25'],
    NKcells=['14'],
    mastcells=['9'],
    parietal_cell=['42'],
    chief_cell=['4','6']
)

In [None]:
# non batch
cellclusters = dict(
    epithelial=['0','3','4','6','8','13','19','20','28','31','33','34','35','36','38','39','40'], # 78049 cells, # 32 is heptoid
    squamous=['16','18','21'],
    fibroblasts=['7'],
    myofibroblasts=['15'],
    endothelial=['12','15','5', '30', '26'],
    stromal=['7','12','15','5', '30', '26'],
    neutrophils=['22'],
    Bcells=['11','23'],  # 23 plasma?
    monocytes=['10'],  # and macs and dcs
    cd4_Tcells=['2'],
    cd8_Tcells=['1','25'],
    NKcells=['14'],
    mastcells=['9'],
    parietal_cell=['42'],
    chief_cell=['4','6']
)

In [8]:
for leiden_label in cellclusters.keys():
    print(leiden_label)

epithelial
fibroblasts
myofibroblasts
endothelial
stromal
neutrophils
Bcells
monocytes
cd4_Tcells
cd8_Tcells
NKcells
mastcells
gastric


In [9]:
clusterlabs = list(set(q.obs.leiden))
clusterlabs.sort()
res_df = pd.DataFrame()
for leiden_label in cellclusters.keys(): #clusterlabs:
    # subset the anndata to this cluster
    print('leiden cluster: ' + leiden_label)
    ### subset data to this cluster
    clusterlabels = cellclusters[leiden_label]
    qci = q[q.obs.leiden.isin(clusterlabels)]
    qci = qci[qci.obs.diagnosis.isin(['NE', 'NS', 'M', 'D', 'T'])]
    ###
    (biosample_list, index, sample_df) = build_design(q, qci)
    # building the pseudobulk count matrix
    count_matrix = build_count_matrix(biosample_list, index, qci, sample_df)
    sample_df.binned_cell_counts = [str(x) for x in sample_df.binned_cell_counts]
    #write it out
    count_matrix.to_csv('pseudobulks/'+leiden_label+'_pseudobulk.csv')
    sample_df.to_csv('pseudobulks/'+leiden_label+'_metadata.csv')    
    del qci
    gc.collect()


leiden cluster: epithelial


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


leiden cluster: fibroblasts
leiden cluster: myofibroblasts
leiden cluster: endothelial
leiden cluster: stromal
leiden cluster: neutrophils
leiden cluster: Bcells
leiden cluster: monocytes
leiden cluster: cd4_Tcells
leiden cluster: cd8_Tcells
leiden cluster: NKcells
leiden cluster: mastcells
leiden cluster: gastric


In [13]:
cellnames = list(set(q.obs.CL_name))
cellnames

['hematopoietic precursor cell',
 'epithelial cell of esophagus',
 'pancreatic ductal cell',
 'epithelial cell of stomach',
 'pancreatic stellate cell',
 'Schwann cell',
 'plasmacytoid dendritic cell',
 'pancreatic A cell',
 'pancreatic PP cell',
 'hepatocyte',
 'luminal epithelial cell of mammary gland',
 'pancreatic D cell',
 'dendritic cell',
 'natural killer cell',
 'goblet cell',
 'pulmonary ionocyte',
 'macrophage',
 'respiratory basal cell',
 'striated muscle cell',
 'mucus neck cell of gastric gland',
 'lymphocyte',
 'epithelial cell of lung',
 'antibody secreting cell',
 'pancreatic epsilon cell',
 'mucous cell of stomach',
 'neuron',
 'smooth muscle cell',
 'ciliated columnar cell of tracheobronchial tree',
 'neutrophil',
 'endothelial cell',
 'fibroblast',
 'T cell',
 'chromaffin cell',
 'peptic cell',
 'peritubular myoid cell',
 'enterocyte',
 'lung secretory cell',
 'unknown',
 'myofibroblast cell',
 'granulocyte',
 'B cell',
 'parietal cell',
 'mast cell']

In [17]:

def buildPM(qci,q,cell_label):
    qci = qci[qci.obs.diagnosis.isin(['NE', 'NS', 'M', 'D', 'T'])]
    ###
    (biosample_list, index, sample_df) = build_design(q, qci)
    # building the pseudobulk count matrix
    count_matrix = build_count_matrix(biosample_list, index, qci, sample_df)
    sample_df.binned_cell_counts = [str(x) for x in sample_df.binned_cell_counts]
    #write it out
    count_matrix.to_csv('../pseudobulks_celltype_level/'+cell_label+'_pseudobulk.csv')
    sample_df.to_csv('../pseudobulks_celltype_level/'+cell_label+'_metadata.csv')    


res_df = pd.DataFrame()
for cell_label in cellnames: #clusterlabs:
    # subset the anndata to this cluster
    print('celltype: ' + cell_label)
    if cell_label == 'neutrophil':
        ### subset data to this cluster
        print('***monocytes***')
        qci = q[(q.obs.CL_name == cell_label) & (q.obs.leiden == '10')]
        buildPM(qci,q,'monocytes')
        print('***neutrophils***')
        qci = q[(q.obs.CL_name == cell_label) & (q.obs.leiden == '22')]
        buildPM(qci,q,'neutrophils')
    else:
        ### subset data to this cluster
        qci = q[q.obs.CL_name == cell_label]
        buildPM(qci,q,cell_label)
        
    del qci
    gc.collect()


celltype: hematopoietic precursor cell


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


celltype: epithelial cell of esophagus
celltype: pancreatic ductal cell
celltype: epithelial cell of stomach
celltype: pancreatic stellate cell
celltype: Schwann cell
celltype: plasmacytoid dendritic cell
celltype: pancreatic A cell
celltype: pancreatic PP cell
celltype: hepatocyte
celltype: luminal epithelial cell of mammary gland
celltype: pancreatic D cell
celltype: dendritic cell
celltype: natural killer cell
celltype: goblet cell
celltype: pulmonary ionocyte
celltype: macrophage
celltype: respiratory basal cell
celltype: striated muscle cell
celltype: mucus neck cell of gastric gland
celltype: lymphocyte
celltype: epithelial cell of lung
celltype: antibody secreting cell
celltype: pancreatic epsilon cell
celltype: mucous cell of stomach
celltype: neuron
celltype: smooth muscle cell
celltype: ciliated columnar cell of tracheobronchial tree
celltype: neutrophil
***monocytes***
***neutrophils***
celltype: endothelial cell
celltype: fibroblast
celltype: T cell
celltype: chromaffin cel