In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import math
from statsmodels.stats.multitest import multipletests
import scipy

non_barcodes = ["Chondrocytes","EC-Arteriar","EC-Arteriolar","EC-Sinusoidal",
                        "Fibroblasts","MSPC-Adipo","MSPC-Osteo","Myofibroblasts","Osteo",
                        "Osteoblasts","Pericytes","Schwann-cells","Smooth-muscle", 
                       'CLP', 'lin-', 'MEP', 'CMP', 'GMP', 'MPP', 'HSC', 'B cell', 'Dendritic cells',
                        'Eo/Baso prog.', 'Ery prog.', 'Ery/Mk prog.', 'Erythroblasts', 'Gran/Mono prog.',
                        'LMPPs', 'Mk prog.', 'Mono prog.', 'Monocytes', 'NK cells', 'Neutro prog.', 'Neutrophils',
                        'T cells', 'large pre-B.', 'pro-B', 'small pre-B.']

type_to_ct = {'Non-hematopoietic':["Chondrocytes","ECArteriar","ECArteriolar","ECSinusoidal",
                        "Fibroblasts","MSPCAdipo","MSPCOsteo","Myofibroblasts","Osteo",
                        "Osteoblasts","Pericytes","Schwanncells","Smoothmuscle"],
              'HPC' : ['CLP', 'lin', 'MEP', 'CMP', 'GMP', 'MPP', 'HSC','EoBaso prog', 'Ery prog', 'EryMk prog','GranMono prog',
                        'LMPPs', 'Mk prog', 'Mono prog','Neutro prog'],
              'Blood+Immune' : ['B cell', 'Dendritic cells','Erythroblasts','Monocytes', 'NK cells','Neutrophils','T cells',
                                'large preB', 'proB', 'small preB']}

ct_to_type = {}
for i in type_to_ct.keys():
    for j in type_to_ct[i]:
        ct_to_type[j] = i 
    
def process_csv(filename):
    pp = pd.read_csv(filename, header  = 0)
    pp = pp.set_index('Unnamed: 0')
    pp.index.names = [None]
    return pp
    
def make_corr(df):
    
    corr_df = pd.DataFrame()
    pval_df = pd.DataFrame()
    col_names = list(df.columns)
    comps = 0
    
    for i in range(len(col_names)):
        
        toAdd = []
        toAdd_p = []
        
        for k in range(i):
                toAdd.append(0)
                toAdd_p.append(np.nan)
                
        for j in range(i, len(col_names)):
            
            pathwayi, pathwayj = col_names[i].split(" ")[-1] ,col_names[j].split(" ")[-1]
            
            if pathwayi != pathwayj:
                corr, pval = scipy.stats.spearmanr(df[col_names[i]],df[col_names[j]], alternative = 'two-sided')
                toAdd.append(corr)
                toAdd_p.append(pval)
                comps += 1
            else:
                toAdd.append(0)
                toAdd_p.append(np.nan)
            
        corr_df[col_names[i]] = toAdd
        pval_df[col_names[i]] = toAdd_p
            
    return corr_df, pval_df

For calculating new correlation and p-values...<br>
https://en.wikipedia.org/wiki/Fisher_transformation <br>
https://www.tandfonline.com/doi/abs/10.1080/00221309809595548 <br>
https://support.minitab.com/en-us/minitab/21/help-and-how-to/statistics/basic-statistics/how-to/correlation/methods-and-formulas/methods-and-formulas/

In [2]:
from scipy import stats
def calc_new_pval(adj_corr, n = 500):
    
    t_stat = adj_corr*(np.sqrt(n - 2)) / np.sqrt(1 - adj_corr*adj_corr)
    
    p_val = stats.t.sf(np.abs(t_stat), n - 1)*2
    
    return p_val

In [3]:
# Gene table pulled from CellChat
interaction_genes_file = "../interaction_genes.csv"
ig = pd.read_csv(interaction_genes_file)

pathway_gene_dict = {}

for i in range(len(ig)):
    path = ig['pathway_name'][i]
    ligand = ig['ligand'][i]
    receptor = ig['receptor'][i]
    
    if path not in pathway_gene_dict.keys():
        pathway_gene_dict[path] = [[],[]]
        
        
    pathway_gene_dict[path][0].append(ligand)
    pathway_gene_dict[path][1].append(receptor)

In [5]:
def find_first_pathway(s):
    s = s.split()

    try:
        re = s.index('recieve')
    except:
        re = 100
    try:
        se = s.index('send')
    except:
        se = 100
    if re < se:
        return s[re:re+2]
    else:
        return s[se:se+2]

def find_first_celltype(s):
    s = s.split()

    try:
        re = s.index('recieve')
    except:
        re = 100
    try:
        se = s.index('send')
    except:
        se = 100
    if re < se:
        return " ".join(s[0:re])
    else:
        return " ".join(s[0:se])
    
def find_second_celltype(s):
    s = s.split()
    andi = s.index('and')
    s = s[andi+1:]
    
    return find_first_celltype(" ".join(s))

def get_pathway_genes(p):
    p1sorr = 1 if p[0] == 'send' else 0
    p1gs = pathway_gene_dict[p[1]][p1sorr]
    return list(set(p1gs))

def filter_same_gene_pathways(p1, p2):

    p1gs = get_pathway_genes(p1)
    p2gs = get_pathway_genes(p2)
    
    return bool(set(p1gs) & set(p2gs))

def total_pathway_genes(p1, p2):

    p1gs = get_pathway_genes(p1)
    p2gs = get_pathway_genes(p2)
    
    return len(p1gs) + len(p2gs)

def make_signaling_df(parent_path):
    
    file_names = ["Chondrocytes", "ECArteriar", "ECArteriolar", "ECSinusoidal","Fibroblasts",
                  "MSPCAdipo", "MSPCOsteo", "Myofibroblasts", "Osteo", "Osteoblasts",
                  "Pericytes", "Schwanncells", "Smoothmuscle","CLP","lin","MEP","CMP","GMP","MPP",
                  "HSC","B cell","Dendritic cells","EoBaso prog","Ery prog","EryMk prog",
                  "Erythroblasts","GranMono prog","LMPPs","Mk prog","Mono prog","Monocytes",
                  "NK cells","Neutro prog","Neutrophils","T cells","large preB","proB","small preB"]

    path_path = parent_path + "all_pathways/"
    
    filepath_names_s = [path_path + 'hsc_to_' + f + ".csv" for f in file_names]
    
    filepath_names_r = [path_path + f + "_to_hsc.csv" for f in file_names]
        
    signaling_df = pd.DataFrame()
    for filepath, fn in zip(filepath_names_s, file_names):
        new_add = process_csv(filepath)
        new_add.columns = [fn + " recieve " + i for i in new_add.columns]
        signaling_df = pd.concat([signaling_df, new_add], axis = 1)
    
    for filepath, fn in zip(filepath_names_r, file_names):
        new_add = process_csv(filepath)
        new_add.columns = [fn + " send " + i for i in new_add.columns]
        signaling_df = pd.concat([signaling_df, new_add], axis = 1)
    
    signaling_df = signaling_df.loc[:, (signaling_df.sum(axis=0) != 0)]
    
    # REMOVE NICHE CELLS FROM ROWS SO YOU JUST HAVE SINGLE CELL BARCODES IN THE ROWS
    signaling_df = signaling_df.T
    allc = list(signaling_df.columns)
    for i in non_barcodes:
        allc.remove(i)
    signaling_df = signaling_df[allc]
    signaling_df = signaling_df.T
    
    return signaling_df

def reduce_to_pathway_pathway_correlations(_cor, _pval, _path_cor):

    df = sorted(zip(_cor, _pval, _path_cor))

    df = pd.DataFrame(df, columns = ['Correlation', 'Adj. P-val', 'Pathways'])
    df["Pathway1"] = [find_first_pathway(i) for i in df["Pathways"]]
    df["Pathway2"] = [i.split()[-2:] for i in df["Pathways"]]
    
#
# Choosing which pathway to group first so they are easily comparable across HSPCs and +/-
# and rename pathways based on direction of signaling...
#
    pp1 = [i[1] for i in df['Pathway1']]
    pp2 = [i[1] for i in df['Pathway2']]
    whichOne = [True if j[0]>i[0] else False for i,j in zip(pp1, pp2)]
    df['whichOne'] = whichOne

    df['same_gene'] = [filter_same_gene_pathways(p1, p2) for p1,p2 in zip(list(df['Pathway1']),list(df['Pathway2']))]
    df['geneCount'] = [total_pathway_genes(p1, p2) for p1,p2 in zip(list(df['Pathway1']),list(df['Pathway2']))]
    df = df.drop(df[df['same_gene'] == True].index)
    df['combo'] = df["Pathway1"] + df["Pathway2"]
    df['combo2'] = df["Pathway2"] + df["Pathway1"]
    df = df.replace(0.0, 1e-323)
    combo_name = []
    whichOne = list(df['whichOne'])
    for i in range(len(whichOne)):

        if whichOne[i]:
            newname = " ".join(list(df['combo'])[i])
        else:
            newname = " ".join(list(df['combo2'])[i])

        newname = newname.replace('recieve', 'temp')
        newname = newname.replace('send', 'receive')
        newname = newname.replace('temp', 'send')
        combo_name.append(newname)

    df['TrueName'] = combo_name

#
# Get cell types in cell-type pathway correlations
#
    df['Cell Type 1'] = [find_first_celltype(i) for i in df["Pathways"]]
    df['Cell Type 2'] = [find_second_celltype(i) for i in df["Pathways"]]  

    mean_corr = []
    mean_adjp = []
    name = []
    ct1 = []
    ct2 = []
    for i in range(len(combo_name)):
        if combo_name[i] not in name:
            sub = df[df['TrueName'] == combo_name[i]]
            ct1s = list(sub['Cell Type 1'])
            ct2s = list(sub['Cell Type 2'])

            ct1s.extend(ct1s)
            ct1s = list(set(ct1s))

            ct2s.extend(ct2s)
            ct2s = list(set(ct2s))

#
# Calculate average correlation and p-value
#
            adj_r = np.tanh(np.arctanh(sub['Correlation']).mean())
            mean_corr.append(adj_r)

            mean_adjp.append(calc_new_pval(adj_r))

            name.append(combo_name[i])
            ct1.append(ct1s)
            ct2.append(ct2s)
#
# Export into own dataframe
#
    mean_df = pd.DataFrame()
    mean_df['Correlation'] = mean_corr
    mean_df['Adj. P-val'] = mean_adjp
    mean_df['Name'] = name
    mean_df["Cell Type 1"] = ct1
    mean_df["Cell Type 2"] = ct2
    mean_df = mean_df.sort_values(by = 'Correlation')
    
    return mean_df

In [6]:
# SELECT PATHWAY WHERE THE 'all_pathways' FOLDER IS
parent_dir = "general_niche/"

# SINGLE CELL HSPC TYPE (FOR FILE NAME AND PLOTTING)
n = "hsc"


In [7]:
df =make_signaling_df(parent_dir)

In [8]:
# NORMALIZE INTERACTION INTENSITY SCORES
df_max_scaled = df.copy()
for column in df_max_scaled.columns:
    df_max_scaled[column] = df_max_scaled[column] /df_max_scaled[column].abs().max()

In [9]:
# REMOVE INTERACTIONS THAT AREN'T FREQUENT
to_remove = []
for col in df_max_scaled.columns:
    interaction = list(df_max_scaled[col])
    
    # fraction that are 0 > 95% (fraction that are non-zero < 5%)
    
    if (interaction.count(0)/len(interaction)) > 0.95:
        to_remove.append(col)
        
df_max_scaled_filtered = df_max_scaled.drop(columns = to_remove)

In [10]:
# CALCULATE CELL-TYPE PATHWAY CORRELATIONS FOR EVERY INTERACTION ACROSS SINGLE CELL HSPCS
corr_df, pval_df = make_corr(df_max_scaled_filtered)
corr_df.index = corr_df.columns
pval_df.index = pval_df.columns

  corr_df[col_names[i]] = toAdd
  pval_df[col_names[i]] = toAdd_p


520984


In [13]:
# Get list of all pvalues
pvals = []
for i in range(len(pval_df.columns)):
    for j in range(i+1, len(pval_df.columns)):
        if np.isnan(pval_df.iloc[j,i]):
            pvals.append(1)
        else:
            pvals.append(pval_df.iloc[j,i])

# Do Benjamini and Hochberg correction
bhc = multipletests(pvals, method = 'fdr_bh')

pvals_bhc = pval_df.copy()

In [14]:
pos_cor = []
neg_cor = []
pos_pval = []
neg_pval = []
path_pos_cor = []
path_neg_cor = []

In [15]:
# Correlation significance threshold...
threshold = 0.05

k = 0
for i in range(len(pvals_bhc.columns)):
    for j in range(i+1, len(pvals_bhc.columns)):
        if bhc[1][k] < threshold:
            
            pvals_bhc.iloc[j,i] = bhc[1][k]
            
            if corr_df.iloc[j,i] > 0:
                pos_cor.append(corr_df.iloc[j,i])
                pos_pval.append(pvals_bhc.iloc[j,i])
                path_pos_cor.append(pvals_bhc.columns[j] + " and " + pvals_bhc.columns[i])
            else:
                neg_cor.append(corr_df.iloc[j,i])
                neg_pval.append(pvals_bhc.iloc[j,i])
                path_neg_cor.append(pvals_bhc.columns[j] + " and " + pvals_bhc.columns[i])
        else:
            pvals_bhc.iloc[j,i] = np.nan
        k+=1

### Reduce cell-type pathway correlations to pathway-pathway correlations...

In [16]:
mean_pos = reduce_to_pathway_pathway_correlations(pos_cor, pos_pval, path_pos_cor)

In [18]:
mean_neg = reduce_to_pathway_pathway_correlations(neg_cor, neg_pval, path_neg_cor)