In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy
import gzip
from pathlib import Path
import seaborn as sns
import pickle

In [None]:
# CellChat gene annotation table (this file can be downloaded from CellChat).
geneInfo = pd.read_csv("../CellChatMouseDB/geneInfo_input_CellChatDB.csv")
# Every gene symbol listed by CellChat, as a plain Python list.
signaling_genes = geneInfo['Symbol'].tolist()
# Working candidate list; starts out as the very same list object.
signaling_genes_to_analyze = signaling_genes

In [None]:
# Expression table taken from the hspc seurat file (gene names are the row index).
hspc_genes = pd.read_csv("../Tabula Muris Senis/bone_marrow_types/gene_expr_df_CLP.csv", index_col = 0)
# Rebuild the index from a plain list of its labels.
hspc_genes.index = list(hspc_genes.index)
# Expression table taken from the boneMarrowNiche seurat file.
non_hema_genes = pd.read_csv("../Tabula Muris Senis/bone_marrow_types/gene_expr_df_EC-Arteriar.csv", index_col = 0)
# Expression table taken from the bloodAndImmune seurat file.
immune_genes = pd.read_csv("../Tabula Muris Senis/bone_marrow_types/gene_expr_df_B cell.csv", index_col = 0)
# Keep only the candidate genes that are present in all three expression tables.
signaling_genes_to_analyze = list(
    set(hspc_genes.index).intersection(
        signaling_genes_to_analyze,
        non_hema_genes.index,
        immune_genes.index,
    )
)

In [None]:
# This data is taken from Tabula Muris Senis (https://tabula-muris-senis.sf.czbiohub.org/)
dir_path = Path('../Tabula Muris Senis/')
# iterdir() yields entries in arbitrary filesystem order; sorting makes every
# later pass over `files` visit the atlas files in the same order on any machine.
files = sorted(f for f in dir_path.iterdir() if f.is_file())

# Narrow the candidate genes to those measured in every atlas file.
atlas_shared_genes = set(signaling_genes_to_analyze)
for f in files:
    # Reading h5ad files from Tabula Muris
    scanpy_obj = scanpy.read_h5ad(f)
    # Only the gene names are needed here, so read them from var_names instead
    # of requiring a 'means' column to exist in .var.
    atlas_shared_genes &= set(scanpy_obj.var_names)

signaling_genes_to_analyze = list(atlas_shared_genes)

In [None]:
# One accumulator per analysed gene: running 'max' and 'min' plus a
# 'hist_array' list that the atlas loop fills in.
signaling_gene_dict = {
    gene: {"max": 0.0, "min": 10000, 'hist_array': []}
    for gene in signaling_genes_to_analyze
}
# Counter incremented by the atlas loop.
n_total = 0

In [None]:
# Start from clean accumulators so that re-running this cell does not append
# the same atlas files to 'hist_array' (or to n_total) a second time.
signaling_gene_dict = {
    g: {"max": 0.0, "min": 10000, 'hist_array': []}
    for g in signaling_genes_to_analyze
}
n_total = 0

for f in files:
    scanpy_obj = scanpy.read_h5ad(f)
    scanpy.pp.normalize_total(scanpy_obj, target_sum = 10000)
    scanpy.pp.log1p(scanpy_obj)

    # Subset to the signaling genes BEFORE densifying: converting the full
    # matrix with toarray() first allocates a dense array for every gene.
    signaling_view = scanpy_obj[:, signaling_genes_to_analyze]
    counts_mat = pd.DataFrame(
        signaling_view.X.toarray(),
        index = signaling_view.obs_names,
        columns = signaling_view.var_names,
    )

    # Undo the log, rescale every row (observation) so that its signaling
    # genes sum to 10000, then log-transform again.
    counts_mat = np.expm1(counts_mat)
    counts_mat = counts_mat.div(counts_mat.sum(axis=1), axis=0)
    counts_mat = np.log1p(counts_mat * 10000)

    n_total += len(counts_mat)

    # Mean expression per cell ontology class. observed=True keeps unused
    # categorical levels from producing all-NaN rows, and classes without any
    # usable value are dropped as whole rows; whole-row removal keeps every
    # gene's 'hist_array' aligned position by position.
    mean_df = counts_mat.groupby(scanpy_obj.obs['cell_ontology_class'], observed=True).mean()
    mean_df = mean_df.dropna(how='all')

    for g in mean_df.columns:
        gene_means = mean_df[g]
        entry = signaling_gene_dict[g]
        # max()/min() keep the stored value unless the new one is strictly
        # larger/smaller, so a NaN never overwrites a running extreme.
        entry['max'] = max(entry['max'], gene_means.max())
        entry['min'] = min(entry['min'], gene_means.min())
        entry['hist_array'].extend(gene_means.tolist())

# One row per gene with columns 'max', 'min' and 'hist_array'.
signaling_gene_atlas = pd.DataFrame(signaling_gene_dict).T

In [None]:
# Populations that have an expression table on disk. The six HSPC populations
# (HSC, MPP, GMP, CMP, CLP, MEP) are part of this list, so their columns are
# computed by the loop below.
bm_cell_names = [
    "B cell", "Dendritic cells", "EoBaso prog.",
    "Ery prog.", "EryMk prog.", "Erythroblasts",
    "GranMono prog.", "LMPPs", "Mk prog.",
    "Mono prog.", "Monocytes", "NK cells",
    "Neutro prog.", "Neutrophils", "T cells",
    "large pre-B.", "pro-B", "small pre-B.",

    "Chondrocytes", "EC-Arteriar", "EC-Arteriolar",
    "EC-Sinusoidal", "Fibroblasts", "MSPC-Adipo",
    "MSPC-Osteo", "Myofibroblasts", "Osteo",
    "Osteoblasts", "Pericytes", "Schwann-cells",
    "Smooth-muscle", "HSC", 'MPP', 'GMP', 'CMP', 'CLP', 'MEP'
]

bm_means = {}
for cellName in bm_cell_names:
    cell_df = pd.read_csv("../Tabula Muris Senis/bone_marrow_types/gene_expr_df_" + cellName + ".csv", index_col = 0)
    cell_df = cell_df.loc[signaling_genes_to_analyze]

    # Undo the log, rescale every column to sum to 10000 over the signaling
    # genes, then log-transform again.
    cell_df = np.expm1(cell_df)
    cell_df = cell_df.div(cell_df.sum(axis=0), axis = 1)
    cell_df = np.log1p(cell_df * 10000)

    # Mean over the columns: one value per gene for this population.
    bm_means[cellName] = cell_df.mean(axis = 1)

# Genes as rows, one column per population. The previous version also
# concatenated `total_hspc_df` here, but that frame is only created by a later
# cell (NameError on a fresh "Run All") and it merely repeated the six HSPC
# columns that are already present; row-wise maxima are unaffected.
whole_bm_df = pd.DataFrame(bm_means)

In [None]:
# Mean renormalised expression of every analysed gene in each HSPC population
# (genes as rows, one column per population).
hspc_means = {}
for hspc_name in ["HSC", 'MPP', 'GMP', 'CMP', 'CLP', 'MEP']:
    # Directory name fixed ("Tablua" -> "Tabula") to match every other path
    # used in this notebook.
    hspc_df = pd.read_csv("../Tabula Muris Senis/bone_marrow_types/gene_expr_df_" + hspc_name + ".csv", index_col = 0)
    hspc_df = hspc_df.loc[signaling_genes_to_analyze]

    # Undo the log, rescale every column to sum to 10000 over the signaling
    # genes, then log-transform again.
    hspc_df = np.expm1(hspc_df)
    hspc_df = hspc_df.div(hspc_df.sum(axis=0), axis = 1)
    hspc_df = np.log1p(hspc_df * 10000)

    hspc_means[hspc_name] = hspc_df.mean(axis = 1)

total_hspc_df = pd.DataFrame(hspc_means)

In [None]:
# These files are taken from the CellChat database
interaction_genes_file = "../CellChatMouseDB/interaction_input_CellChatDB.csv"
complex_input_file = "../CellChatMouseDB/complex_input_CellChatDB.csv"
ig = pd.read_csv(interaction_genes_file)
complex_input = pd.read_csv(complex_input_file, index_col = 0)

# pathway name -> [ligand genes, receptor genes]
pathway_gene_dict = {}

for i in range(len(ig)):
    path = ig['pathway_name'][i]
    ligand = ig['ligand'][i]
    receptor = ig['receptor'][i]

    gene_lists = pathway_gene_dict.setdefault(path, [[], []])

    # Position 0 collects ligands, position 1 collects receptors.
    for side, partner in enumerate((ligand, receptor)):
        if "_" in partner or " " in partner:
            # A name containing "_" or " " is looked up in the complex table
            # and replaced by every entry of its row.
            gene_lists[side].extend(list(complex_input.loc[partner,]))
        else:
            gene_lists[side].append(partner)

# De-duplicate each gene list and drop the 'nan' placeholders that come from
# the empty cells of the complex table.
for pathway_name, gene_lists in pathway_gene_dict.items():
    for side in (0, 1):
        gene_lists[side] = [x for x in set(gene_lists[side]) if str(x) != 'nan']

In [None]:
def calculate_percentile_BM_array(gene_list, bm_array):
    """Rank several expression values against a gene's atlas distribution.

    For each value ``b`` in ``bm_array`` the result is the fraction of entries
    in ``signaling_gene_dict[g]['hist_array']`` that are ``<= b``.

    Parameters
    ----------
    gene_list : sequence of gene names (keys of ``signaling_gene_dict``).
        Every gene is looked up, but only the percentiles of the LAST gene are
        returned (behaviour kept from the original implementation).
    bm_array : iterable of values to rank.

    Returns
    -------
    list of float, one percentile in [0, 1] per value of ``bm_array``.

    Raises
    ------
    ValueError
        If ``gene_list`` is empty or a gene has an empty ``hist_array``.
    KeyError
        If a gene is missing from ``signaling_gene_dict``.
    """
    if len(gene_list) == 0:
        raise ValueError("gene_list must contain at least one gene")

    # Materialise once so bm_array can be re-used for every gene.
    values = list(bm_array)
    percentiles = []
    for g in gene_list:
        atlas_means = signaling_gene_dict[g]['hist_array']
        if len(atlas_means) == 0:
            raise ValueError("no atlas values recorded for gene %r" % (g,))
        # Deliberately NOT sorted in place: 'hist_array' is shared state whose
        # positions must stay aligned across genes (other code zips the arrays
        # of several genes together), and counting does not need sorted input.
        n_values = len(atlas_means)
        percentiles = [
            sum(1 for x in atlas_means if x <= b) / n_values
            for b in values
        ]
    return percentiles

In [None]:
def calculate_percentile_BM(gene_list, val):
    """Rank one expression value against a gene's atlas distribution.

    The result is the fraction of entries in
    ``signaling_gene_dict[g]['hist_array']`` that are ``<= val``.

    Parameters
    ----------
    gene_list : sequence of gene names (keys of ``signaling_gene_dict``).
        Every gene is looked up, but only the percentile of the LAST gene is
        returned (behaviour kept from the original implementation).
    val : value to rank.

    Returns
    -------
    float in [0, 1].

    Raises
    ------
    ValueError
        If ``gene_list`` is empty or a gene has an empty ``hist_array``.
    KeyError
        If a gene is missing from ``signaling_gene_dict``.
    """
    if len(gene_list) == 0:
        raise ValueError("gene_list must contain at least one gene")

    percentile = None
    for g in gene_list:
        atlas_means = signaling_gene_dict[g]['hist_array']
        if len(atlas_means) == 0:
            raise ValueError("no atlas values recorded for gene %r" % (g,))
        # Deliberately NOT sorted in place: 'hist_array' is shared state whose
        # positions must stay aligned across genes (other code zips the arrays
        # of several genes together), and counting does not need sorted input.
        rank = sum(1 for x in atlas_means if x <= val)
        percentile = rank / len(atlas_means)
    return percentile

In [None]:
def calculate_percentile_hspc_pathway(pathway_name, direction, hspc_name):
    """Percentile of an HSPC population's pathway expression within the atlas.

    The pathway's ligand genes (``direction == 'sending'``) or receptor genes
    (any other ``direction``) are restricted to ``signaling_genes_to_analyze``.
    The HSPC score is the largest ``total_hspc_df.loc[gene, hspc_name]`` over
    those genes. The atlas scores are the element-wise maxima of the genes'
    ``hist_array`` lists, which relies on those lists being aligned position by
    position. The result is the fraction of atlas scores that are ``<=`` the
    HSPC score.

    Parameters
    ----------
    pathway_name : key of ``pathway_gene_dict``.
    direction : ``'sending'`` selects ligands, anything else selects receptors.
    hspc_name : column of ``total_hspc_df``.

    Returns
    -------
    float in [0, 1].

    Raises
    ------
    ValueError
        If no gene of the pathway/direction is analysed, or if the selected
        genes have no atlas values (previously an opaque ZeroDivisionError).
    """
    side = 0 if direction == 'sending' else 1

    # Set lookup instead of scanning the full gene list for every candidate.
    analysed_genes = set(signaling_genes_to_analyze)
    gene_list = [g for g in pathway_gene_dict[pathway_name][side] if g in analysed_genes]
    if not gene_list:
        raise ValueError(
            "pathway %r has no analysed genes for direction %r" % (pathway_name, direction)
        )

    # Computed once; it does not depend on the atlas value being compared.
    hspc_max = max(total_hspc_df.loc[g, hspc_name] for g in gene_list)

    per_gene_means = [signaling_gene_dict[g]['hist_array'] for g in gene_list]
    # zip(*...) pairs up the i-th entry of every gene; the maximum is the
    # pathway-level score of that atlas entry.
    atlas_scores = [max(values) for values in zip(*per_gene_means)]
    if not atlas_scores:
        raise ValueError("no atlas values recorded for pathway %r" % (pathway_name,))

    rank = sum(1 for x in atlas_scores if x <= hspc_max)
    return rank / len(atlas_scores)

In [None]:
def _expand_complex(name):
    """Return the non-missing entries of a complex's row, or [name] for a single gene."""
    if "_" in name or " " in name:
        # Names containing "_" or " " are looked up in the CellChat complex table.
        return [x for x in list(complex_input.loc[name,]) if str(x) != 'nan']
    return [name]


hspc_columns = ['HSC', "MPP", 'CMP', 'CLP', 'GMP', 'MEP']
analysed_genes = set(signaling_genes_to_analyze)

# One record per (ligand gene, receptor gene, direction) of every pathway.
# Records are collected in a list and turned into a DataFrame once at the end.
dynamic_range_rows = []
for pathway_name in pathway_gene_dict.keys():
    ligand_receptor_interactions = ig[ig['pathway_name'] == pathway_name]

    # Expand complexes on both sides: every ligand gene is paired with every
    # receptor gene of the same interaction.
    gene_pairs = [
        (ligand_gene, receptor_gene)
        for L, R in zip(ligand_receptor_interactions['ligand'],
                        ligand_receptor_interactions['receptor'])
        for ligand_gene in _expand_complex(L)
        for receptor_gene in _expand_complex(R)
    ]

    # Only pairs whose two genes are both analysed produce a row. Previously
    # the row assignment sat outside this check, so a filtered-out pair wrote
    # the previous row again (or raised NameError when it came first).
    valid_pairs = [
        (ligand_gene, receptor_gene)
        for ligand_gene, receptor_gene in gene_pairs
        if ligand_gene in analysed_genes and receptor_gene in analysed_genes
    ]
    if not valid_pairs:
        continue

    for direction in ('sending', 'receiving'):
        # These percentiles depend only on pathway, direction and population,
        # so they are computed once and shared by every pair.
        hspc_dr = [
            calculate_percentile_hspc_pathway(pathway_name, direction, hspc)
            for hspc in hspc_columns
        ]
        for ligand_gene, receptor_gene in valid_pairs:
            # 'sending' rows rank the receptor gene, 'receiving' rows rank the
            # ligand gene: its largest mean over all populations of whole_bm_df,
            # expressed as a percentile of that gene's atlas values.
            partner_gene = receptor_gene if direction == 'sending' else ligand_gene
            max_bm = calculate_percentile_BM([partner_gene], max(list(whole_bm_df.loc[partner_gene,])))
            dynamic_range_rows.append(
                [ligand_gene, receptor_gene, direction, pathway_name] + hspc_dr + [max_bm]
            )

pathway_dynamic_range = pd.DataFrame(
    dynamic_range_rows,
    columns = ['Ligand', 'Receptor', 'Direction', 'Pathway',
               'HSC', "MPP", 'CMP', 'CLP', 'GMP', 'MEP', 'maxBM'],
)
# Kept for compatibility with code that reads the row counter.
nRow = len(pathway_dynamic_range)

In [None]:
# Specificity index per "<direction> <pathway>" row and HSPC population: the
# largest HSPC percentile of the pathway/direction multiplied by the largest
# 'maxBM' among the rows that reach that percentile.
hspc_pathway_specificity_index = pd.DataFrame(columns = ['HSC', 'MPP', 'CMP', 'CLP', 'GMP', 'MEP'])
for pathway_name in list(set(pathway_dynamic_range['Pathway'])):
    in_pathway = pathway_dynamic_range['Pathway'] == pathway_name
    for d in ['sending','receiving']:
        pathway_subset = pathway_dynamic_range[in_pathway & (pathway_dynamic_range['Direction'] == d)]

        # Nothing recorded for this pathway/direction: no row is written.
        if len(pathway_subset) == 0:
            continue

        list_to_append = []
        for hspc in ['HSC', 'MPP', 'CMP', 'CLP', 'GMP', 'MEP']:
            hspcMax = pathway_subset[hspc].max()
            at_hspc_max = pathway_subset[hspc] == hspcMax
            bmMax = pathway_subset.loc[at_hspc_max, 'maxBM'].max()
            list_to_append.append(hspcMax*bmMax)

        hspc_pathway_specificity_index.loc[d + " " + pathway_name,] = list_to_append