In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc
import decoupler as dc
import regex as re
# Project root: every "./data/..." path used in this notebook is resolved
# relative to this directory. Set the MS_PROJECT_DIR environment variable to
# run the notebook on another machine; without it the original location is used.
os.chdir(os.environ.get("MS_PROJECT_DIR", "/Users/ricardoramirez/Dropbox/PostDoc/Research/MS"))

In [2]:
# Load the AnnData object used by every later cell
# (the path is relative to the current working directory).
sc_dat = sc.read_h5ad("./data/adata_raw_cellbender.h5ad")

In [10]:
# List the per-cell annotation columns available in `sc_dat.obs`.
sc_dat.obs.columns.tolist()

['n_genes',
 'n_genes_by_counts',
 'total_counts',
 'total_counts_mt',
 'pct_counts_mt',
 'doublet_score',
 'predicted_doublet',
 'diss_score',
 'patient_id',
 'sample_id',
 'Condition',
 'lesion_type',
 'Age',
 'Sex',
 'RIN',
 'Batch',
 'visium',
 'snRNA-seq',
 'batch',
 'leiden',
 'cell_types']

In [11]:
# Columns of `sc_dat.obs` that are extracted and exported as the metadata table.
meta_data_cols = [
    "patient_id",
    "sample_id",
    "lesion_type",
    "Age",
    "Sex",
    "RIN",
    "Batch",
]

In [12]:
# One row per distinct `cell_types` label; the index is the barcode of the
# first cell in `sc_dat.obs` that carries each label.
available_cells = sc_dat.obs.loc[:, ["cell_types"]].drop_duplicates()
available_cells

Unnamed: 0,cell_types
CO37_ACCTACCGTATGCTAC-1,Neurons
CO37_TCGCAGGGTCCAACGC-1,OPC
CO37_CATGGTAAGCTACTAC-1,Oligos
CO37_ACCATTTTCAGACTGT-1,Astros
CO37_GTTACGAGTCGGTGTC-1,Astros_c
CO37_GCTGCAGTCTCTAGGA-1,Endothelia
CO37_GATCACATCTCGAGTA-1,Microglia
CO37_ACAGGGATCTCATAGG-1,Stroma
CO37_ACCATTTAGTGCCAGA-1,T_cells
CO41_CTTCGGTTCCCTCTCC-1,B_cells


In [13]:
# Collapse the per-cell annotations to the distinct combinations of the
# metadata columns; the index keeps the barcode of the first matching cell.
meta_data = sc_dat.obs.loc[:, meta_data_cols].drop_duplicates()

In [14]:
# Display the deduplicated metadata rows for a visual check before export.
meta_data

Unnamed: 0,patient_id,sample_id,lesion_type,Age,Sex,RIN,Batch
CO37_ACCTACCGTATGCTAC-1,CO37 P5B3,CO37,Ctrl,87,M,6.1,4
CO40_CCTCTCCGTGTGAATA-1,PDCO40 A1B2,CO40,Ctrl,61,F,7.8,1
CO41_ACCAAACTCTCTCTTC-1,CO41 A1C4,CO41,Ctrl,54,M,7.2,4
CO45_TCACTATGTCTCACAA-1,CO45 A1D4,CO45,Ctrl,77,M,6.4,4
CO74_GAAGGACAGAAACACT-1,CO74 A1A2,CO74,Ctrl,84,F,7.0,2
CO85_CTTACCGTCACATTGG-1,CO85 A3C2,CO85,Ctrl,81,F,5.9,3
MS197_TATCTTGGTTTCTTAC-1,MS197 P2D3,MS197,CA,52,F,9.0,4
MS229_TTTATGCCACCCTAAA-1,MS229 P2C2,MS229,CA,53,M,7.0,4
MS371N_TCTATACGTGGCACTC-1,MS371 A3D6,MS371N,A,40,M,7.6,4
MS377I_AGGGCCTAGCTTAGTC-1,MS377 A2D4,MS377I,CA,50,F,6.5,1


In [15]:
# Write the metadata table to disk (the index, i.e. the representative cell
# barcode of each row, is written as the first CSV column).
meta_data.to_csv(path_or_buf="./data/meta_data.csv")

In [16]:
# Number of cells for every (sample_id, cell_types) combination, as a long
# table with columns: sample_id, cell_types, counts.
#
# `observed=False` is passed explicitly: the table is expected to contain
# zero-count combinations (e.g. a sample without any B_cells), which only
# appear for categorical keys when unobserved categories are kept. Recent
# pandas versions deprecate the implicit default and will switch it, which
# would silently drop those rows and leave NaN counts after the later
# left merge onto the pseudobulk profiles.
cell_type_numbers = (
    sc_dat.obs
    .groupby(["sample_id", "cell_types"], observed=False)["cell_types"]
    .count()
    .to_frame()
    .rename(columns={"cell_types": "counts"})
    .reset_index()
)

In [17]:
# Display the per-sample, per-cell-type cell counts computed in the previous cell.
cell_type_numbers

Unnamed: 0,sample_id,cell_types,counts
0,CO37,Astros,746
1,CO37,Astros_c,16
2,CO37,B_cells,0
3,CO37,Endothelia,37
4,CO37,Microglia,396
...,...,...,...
185,MS586,Neurons,39
186,MS586,OPC,188
187,MS586,Oligos,4599
188,MS586,Stroma,21


In [18]:
# Expose the matrix currently stored in `.X` under the 'counts' layer so the
# pseudobulk step can address it by name. This is a reference, not a copy.
sc_dat.layers['counts'] = sc_dat.X

# Build one pseudobulk profile per (sample_id, cell_types) pair by summing the
# 'counts' layer over the cells of each pair.
# NOTE(review): min_prop=0 / min_smpls=0 presumably switch off decoupler's
# gene filtering at this stage - confirm against the installed decoupler version.
padata = dc.get_pseudobulk(
    sc_dat,
    layer='counts',
    mode='sum',
    sample_col='sample_id',
    groups_col='cell_types',
    min_prop=0,
    min_smpls=0,
)

In [None]:
# Turn the pseudobulk matrix into a labelled table: rows are the pseudobulk
# profiles (`padata.obs` index), columns are the features (`padata.var` index).
pb_dat = pd.DataFrame(
    padata.X,
    index=padata.obs.index.values,
    columns=padata.var.index.values,
)

# Persist the labelled matrix; the profile names go into the first CSV column.
pb_dat.to_csv("./data/pb_data.csv")

In [None]:
# Independent copy of the pseudobulk sample annotations, with the profile
# names (the index of `padata.obs`) duplicated into a regular "colname" column.
pb_coldata = padata.obs.assign(colname=padata.obs.index.values)

In [None]:
# Attach the number of cells behind each pseudobulk profile. A left join keeps
# every profile of `pb_coldata`, matched on cell type and sample.
# NOTE(review): this cell overwrites `pb_coldata` with its own merge result,
# so running it a second time without re-running the previous cell would
# merge the counts in again.
pb_coldata = pd.merge(
    pb_coldata,
    cell_type_numbers,
    how="left",
    on=["cell_types", "sample_id"],
)

In [None]:
# Display the pseudobulk annotation table after the cell counts were merged in.
pb_coldata

In [None]:
# Write the pseudobulk annotation table (including the "colname" and "counts"
# columns) to disk.
pb_coldata.to_csv(path_or_buf="./data/pb_coldata.csv")