# Correlation-based gene rankings
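We compute the Pearson correlation coefficient between each gene's expression and each phenotype. Each gene's correlations are then converted to percentile ranks per phenotype, and these ranks are averaged across phenotypes.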

In [12]:
import gc
import h5py
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
from scipy import stats
from scipy.stats import spearmanr, pearsonr, rankdata
import matplotlib
import pickle
import sys

from matplotlib import pyplot as plt
import matplotlib as mpl

%matplotlib inline
import seaborn as sns

from functools import reduce
import numpy.ma as ma

# The `configs` module lives one directory above this notebook; add that
# directory to the import path so it can be imported.
path_to_configs = "../"
sys.path.append(path_to_configs)
# Star import: presumably the source of the path constants used in later cells
# (path_to_MDAD_data_folders, path_to_gene_rankings) -- verify against configs.py.
from configs import * 

In [13]:
# Read the expression matrix, label matrix and their name vectors from the
# HDF5 file, then print the datasets stored in the file.
with h5py.File(path_to_configs + path_to_MDAD_data_folders + "ACT_MSBBRNA_ROSMAP.h5", 'r') as hf:
    gene_symbols = hf["gene_symbols"][:].astype(str)
    ge = hf["ge"][:]
    Y = hf["labels"][:]
    labels_names = hf["labels_names"][:].astype(str)
    print(list(hf.keys()))

# One column per label name. Values are stored as strings here and are cast
# to float at the point of use. `labels_names` is already a str array, so no
# second cast is needed.
labels_df = pd.DataFrame(Y, columns=labels_names, dtype="str")

# Output directory for the rankings produced by this notebook.
path_to_save_scores = path_to_configs + path_to_gene_rankings + "Correlations/"


['ge', 'gene_symbols', 'labels', 'labels_names']


In [14]:
# Label columns that each gene's expression is correlated against.
phenotypes = [
    "CERAD",
    "BRAAK",
    "PLAQUES",
    "TANGLES",
    "ABETA_IHC",
    "TAU_IHC",
]

In [15]:
# Pearson correlation between every column of `ge` and every phenotype.
# rs_phen / ps_phen map phenotype name -> list of correlation coefficients /
# p-values, one entry per column of `ge`, in column order.
rs_phen = {}
ps_phen = {}

# Whether an expression value is finite does not depend on the phenotype, so
# the mask is computed once for the whole matrix instead of once per gene
# for every phenotype.
ge_is_finite = np.isfinite(ge)

for phenotype in phenotypes:
    print(phenotype)
    # Labels are stored as strings; cast to float so missing values become NaN.
    phen_values = np.asarray(labels_df[phenotype].astype(float))
    phen_is_finite = np.isfinite(phen_values)

    rs = []
    ps = []
    # Iterating over the transposes walks `ge` (and its mask) column by column.
    for expr, expr_is_finite in zip(ge.T, ge_is_finite.T):
        # Use only samples where both the phenotype and the expression value
        # are finite (pairwise deletion).
        msk = phen_is_finite & expr_is_finite

        r, p = stats.pearsonr(phen_values[msk], expr[msk])
        rs.append(r)
        ps.append(p)

    rs_phen[phenotype] = rs
    ps_phen[phenotype] = ps

CERAD
BRAAK
PLAQUES
TANGLES
ABETA_IHC
TAU_IHC


In [16]:
# Phenotype groups used to build the combined scores: every phenotype, and
# the two subsets labelled "abeta" and "tau".
phen_dict = {
    "all": ["CERAD", "BRAAK", "PLAQUES", "TANGLES", "ABETA_IHC", "TAU_IHC"],
    "abeta": ["CERAD", "PLAQUES", "ABETA_IHC"],
    "tau": ["BRAAK", "TANGLES", "TAU_IHC"],
}


In [17]:
# One row per gene: the raw correlation with each phenotype, the gene symbol,
# and a "<phenotype>_percentiles" column holding the percentile rank of that
# gene's correlation among all genes.
RANK_DF = pd.DataFrame.from_dict(rs_phen).assign(gene=gene_symbols)
RANK_DF = RANK_DF.assign(
    **{"%s_percentiles" % name: RANK_DF[name].rank(pct=True) for name in phenotypes}
)


In [18]:
# For each phenotype group: average the member phenotypes' percentile columns
# per gene, re-rank that average as a percentile, then shift so the
# lowest-scoring gene sits at exactly 0.
for group_name, group_members in phen_dict.items():
    score_col = "%s-related_score" % group_name
    member_cols = ["%s_percentiles" % member for member in group_members]
    group_rank = RANK_DF[member_cols].mean(axis=1).rank(pct=True)
    RANK_DF[score_col] = group_rank - group_rank.min()

In [19]:
### SAVE rnk FILES (used for gseapy)
### SAVE FINAL RANKINGS FOR EACH PHENOTYPE AND COMBINED ACROSS GROUPS
# exist_ok=True creates the directory if needed without a separate
# check-then-create step.
os.makedirs(path_to_save_scores, exist_ok=True)

# Iterate over the same groups that were scored, so every
# "<group>-related_score" column gets a matching .rnk file.
for p_group in phen_dict:
    score_col = "%s-related_score" % p_group

    # Two columns (gene, score), highest score first, written as a
    # tab-separated file with no header and no index.
    scores_df = RANK_DF[["gene", score_col]].sort_values(score_col, ascending=False)
    scores_df = scores_df.reset_index(drop=True)
    scores_df.to_csv("%s%s-related.rnk" % (path_to_save_scores, p_group), sep="\t", header=False, index=False)

In [20]:
# Write the full table (correlations, percentiles and group scores) to CSV and
# report the output directory.
consensus_csv_path = path_to_save_scores + "ALL_CONSENSUS_SCORES.csv"
RANK_DF.to_csv(consensus_csv_path)
print("Saved rankings to %s" % path_to_save_scores)

Saved rankings to ../../../Pipeline_Output_Submitted/gene_rankings/Correlations/
