https://presciencegpu.cs.washington.edu:9991/notebooks/bigdrive/MD-AD_public/Analyses_Top_Genes/TMP_CLEAN_MAYBE_final_genes_from_correlations-Copy1.ipynb#CHECKING-MD-AD

# CORRELATION-BASED GENE RANKING

In [4]:
import gc
import h5py
import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
from scipy import stats
from scipy.stats import spearmanr, pearsonr, rankdata
import matplotlib
import pickle

from matplotlib import pyplot as plt
import matplotlib as mpl

%matplotlib inline
import seaborn as sns

from functools import reduce
import numpy.ma as ma

In [5]:
# Read the four datasets stored in the combined ACT / MSBB-RNA / ROSMAP HDF5 file.
# Each dataset is copied fully into memory ([:]) so the file can be closed right away.
with h5py.File("../../md-ad_public_repo_data/DATA/MTL_data/ACT_MSBBRNA_ROSMAP.h5", 'r') as hf:
    ge = hf["ge"][:]
    Y = hf["labels"][:]
    # Symbols and label names are converted to str arrays for use as column/row identifiers.
    gene_symbols = hf["gene_symbols"][:].astype(str)
    labels_names = hf["labels_names"][:].astype(str)
    print(list(hf.keys()))

# One column per label name; values are kept as strings here and cast to float
# where they are used numerically.
labels_df = pd.DataFrame(Y, columns=labels_names, dtype="str")



['ge', 'gene_symbols', 'labels', 'labels_names']


In [6]:
# Label columns that every gene's expression is correlated against.
phenotypes = [
    "CERAD",
    "BRAAK",
    "PLAQUES",
    "TANGLES",
    "ABETA_IHC",
    "TAU_IHC",
]

In [7]:
# For every phenotype, compute the Pearson correlation (and its p-value) between the
# phenotype values and each column of `ge`, using only the samples where both the
# phenotype and the expression value are finite.
#   rs_phen[phenotype] -> list of correlation coefficients, one per column of `ge`
#   ps_phen[phenotype] -> list of matching p-values
rs_phen, ps_phen = {}, {}
for phenotype in phenotypes:
    print(phenotype)
    # Labels are stored as strings; cast to float and mask NaN/inf entries.
    phen_values = ma.masked_invalid(labels_df[phenotype].astype(float))

    corr_coefs, p_values = [], []
    # Iterating over ge.T walks the columns of `ge` one at a time.
    for expr_column in ge.T:
        expr_values = ma.masked_invalid(expr_column)

        # Keep only positions that are valid in both vectors.
        both_valid = (~phen_values.mask & ~expr_values.mask)

        coef, p_value = stats.pearsonr(phen_values[both_valid], expr_values[both_valid])
        corr_coefs.append(coef)
        p_values.append(p_value)

    rs_phen[phenotype] = corr_coefs
    ps_phen[phenotype] = p_values

CERAD
BRAAK
PLAQUES
TANGLES
ABETA_IHC
TAU_IHC


In [8]:
# Phenotype groups used to build combined scores: every phenotype, the three
# amyloid-beta-related ones, and the three tau-related ones.
phen_dict = {
    "all": [
        "CERAD", "BRAAK", "PLAQUES", "TANGLES", "ABETA_IHC", "TAU_IHC",
    ],
    "abeta": ["CERAD", "PLAQUES", "ABETA_IHC"],
    "tau": ["BRAAK", "TANGLES", "TAU_IHC"],
}


In [9]:
# One row per gene: the raw correlation with each phenotype, the gene symbol, and
# the percentile rank of each correlation among all genes.
RANK_DF = pd.DataFrame(rs_phen)
RANK_DF["gene"] = gene_symbols

# Rank every phenotype column in one call (ranking is done per column), then attach
# the results one by one so the column order matches the phenotype order.
phen_percentiles = RANK_DF[phenotypes].rank(pct=True)
for phen in phenotypes:
    RANK_DF["%s_percentiles" % phen] = phen_percentiles[phen]


In [10]:
# Group score = percentile rank of the per-gene mean of the member phenotypes'
# percentile columns.
for phen_group, members in phen_dict.items():
    member_cols = ["%s_percentiles" % member for member in members]
    mean_percentile = RANK_DF[member_cols].mean(axis=1)
    RANK_DF["%s_score" % phen_group] = mean_percentile.rank(pct=True)

In [11]:
# Save the per-gene correlations, percentiles and group scores as a CSV in the
# current working directory.
RANK_DF.to_csv("correlation_based_rankings.csv")

In [26]:
# Load the previously stored ranking: a tab-delimited file read without a header row,
# with its two columns named "gene" and "all_score".
orig = pd.read_csv("paper_results/finalgenes_190819/percentiles/all_corrs.rnk", names=["gene", "all_score"], delimiter="\t")

In [30]:
# Correlate the recomputed "all" score with the stored ranking, pairing rows by gene.
# Both inputs have an "all_score" column, so the merge renames them to
# "all_score_new" (recomputed) and "all_score_old" (stored).
# Only those two numeric columns are passed to .corr(): the merged frame still holds
# the string-valued "gene" column, which DataFrame.corr() raises on in pandas >= 2.0
# (older versions dropped it silently).
(
    RANK_DF[["gene", "all_score"]]
    .merge(orig, on="gene", suffixes=("_new", "_old"))
    [["all_score_new", "all_score_old"]]
    .corr()
)

Unnamed: 0,all_score_new,all_score_old
all_score_new,1.0,1.0
all_score_old,1.0,1.0


### Note:
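The ranking computed above is perfectly correlated (r = 1.0) with the stored `all_corrs.rnk` ranking, i.e. this way of calculating the scores matches the correlation-based ranking used in the paper.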

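# CHECKING MD-AD CONSISTENCY

`paper_results/CONSENSUS_SCORES_190909/ALL_CONSENSUS_SCORES.csv` agrees with `paper_results/ALL_CONSENSUS_SCORES.csv`, except for tiny numerical differences (mean per-column difference on the order of 1e-12) attributed to Excel issues.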

In [33]:
# Path of the top-level consensus-score table.
# NOTE(review): the next cell rebinds this same name to the loaded DataFrame (using the
# literal path again), so this string value is not used by any visible cell.
MDAD_rank_in_plots = "paper_results/ALL_CONSENSUS_SCORES.csv"

In [41]:
# Load the top-level consensus-score table.
# NOTE(review): the variable name suggests this is the version used for the plots — confirm.
MDAD_rank_in_plots = pd.read_csv("paper_results/ALL_CONSENSUS_SCORES.csv")

In [42]:
# Load the copy of the consensus scores stored in the CONSENSUS_SCORES_190909 directory,
# sorted by gene symbol, for comparison against MDAD_rank_in_plots.
rnk = pd.read_csv("paper_results/CONSENSUS_SCORES_190909/ALL_CONSENSUS_SCORES.csv").sort_values("gene")

In [44]:
# Join the two consensus-score tables on gene. Score columns present in both get the
# default merge suffixes: "_x" for MDAD_rank_in_plots and "_y" for rnk.
tmp = MDAD_rank_in_plots.merge(rnk, on="gene")

In [49]:
# For every score column (everything except "gene"), print the column name followed by
# the mean signed difference between the two merged versions (_x minus _y).
score_columns = [name for name in rnk.columns if name != "gene"]
for score_col in score_columns:
    print(score_col)
    signed_diff = tmp[score_col + "_x"] - tmp[score_col + "_y"]
    print(signed_diff.mean())


all-related_score
2.8940180542642383e-12
abeta-related_score
-2.05346121352966e-12
tau-related_score
1.8004454978796048e-12
CERAD_score
-1.431971085034711e-12
BRAAK_score
5.040922651934725e-12
PLAQUES_score
-4.1570491007617157e-13
TANGLES_score
1.1397125300393897e-12
ABETA_IHC_score
-2.184608757313869e-12
TAU_IHC_score
-1.8090215254791454e-12


In [47]:
# Display the merged table to inspect the paired _x / _y score columns side by side.
tmp

Unnamed: 0,gene,all-related_score_x,abeta-related_score_x,tau-related_score_x,CERAD_score_x,BRAAK_score_x,PLAQUES_score_x,TANGLES_score_x,ABETA_IHC_score_x,TAU_IHC_score_x,all-related_score_y,abeta-related_score_y,tau-related_score_y,CERAD_score_y,BRAAK_score_y,PLAQUES_score_y,TANGLES_score_y,ABETA_IHC_score_y,TAU_IHC_score_y
0,LTF,0.995121,0.993645,0.996597,0.981301,0.998584,0.999791,0.997354,0.999841,0.993852,0.995121,0.993645,0.996597,0.981301,0.998584,0.999791,0.997354,0.999841,0.993852
1,GFAP,0.994333,0.991481,0.997186,0.986780,0.997004,0.998143,0.998194,0.989519,0.996359,0.994333,0.991481,0.997186,0.986780,0.997004,0.998143,0.998194,0.989519,0.996359
2,PLCE1,0.991745,0.987057,0.996434,0.982739,0.995163,0.988401,0.996219,0.990032,0.997919,0.991745,0.987057,0.996434,0.982739,0.995163,0.988401,0.996219,0.990032,0.997919
3,FPR3,0.989980,0.980407,0.999553,0.978097,0.999462,0.972853,0.999623,0.990271,0.999575,0.989980,0.980407,0.999553,0.978097,0.999462,0.972853,0.999623,0.990271,0.999575
4,C4B,0.989063,0.991055,0.987071,0.984019,0.993599,0.997385,0.991936,0.991760,0.975679,0.989063,0.991055,0.987071,0.984019,0.993599,0.997385,0.991936,0.991760,0.975679
5,ZNF98,0.984010,0.976052,0.991967,0.933536,0.980056,0.998811,0.996997,0.995808,0.998849,0.984010,0.976052,0.991967,0.933536,0.980056,0.998811,0.996997,0.995808,0.998849
6,NPNT,0.983867,0.971643,0.996090,0.948458,0.994306,0.982954,0.997123,0.983518,0.996840,0.983867,0.971643,0.996090,0.948458,0.994306,0.982954,0.997123,0.983518,0.996840
7,SLC28A2,0.981122,0.977736,0.984507,0.957477,0.976636,0.990499,0.980450,0.985232,0.996436,0.981122,0.977736,0.984507,0.957477,0.976636,0.990499,0.980450,0.985232,0.996436
8,EFHB,0.978717,0.959888,0.997546,0.952314,0.999065,0.966233,0.998459,0.961117,0.995113,0.978717,0.959888,0.997546,0.952314,0.999065,0.966233,0.998459,0.961117,0.995113
9,KNSTRN,0.978502,0.974868,0.982137,0.964855,0.982410,0.988427,0.983199,0.971322,0.980801,0.978502,0.974868,0.982137,0.964855,0.982410,0.988427,0.983199,0.971322,0.980801


In [34]:
# List the files available in the CONSENSUS_SCORES_190909 results directory.
os.listdir("paper_results/CONSENSUS_SCORES_190909/")

['tau-related.csv',
 'PLAQUES.csv',
 'ALL_CONSENSUS_SCORES.csv',
 'abeta-related.csv',
 'TAU_IHC.csv',
 'all-related.csv',
 'TANGLES.csv',
 'rnk_files',
 'BRAAK.csv',
 'CERAD.csv',
 'ABETA_IHC.csv']

### Temporary check: disagreements with previous result files

In [46]:
# Load an earlier ranking: a tab-delimited file read without a header row, with its
# two columns named "gene" and "score_190819".
# NOTE(review): this is an absolute path into one user's Downloads folder, so the cell
# will not run on any other machine. A file with the same name (all_corrs.rnk) is read
# from paper_results/finalgenes_190819/percentiles/ earlier in this notebook — confirm
# whether it is the same file and, if so, switch to that repo-relative path.
res190819 = pd.read_csv("/Users/nbbwang/Downloads/all_corrs.rnk", delimiter="\t", names=['gene', 'score_190819'])

In [47]:
# Pair the earlier score with the recomputed "all" score for every gene present in both
# tables ("gene" is the only column the two inputs share).
recomputed_all_score = RANK_DF[["gene", "all_score"]]
tmp2 = res190819.merge(recomputed_all_score, on="gene")

In [48]:
# Correlation between the earlier score and the recomputed score.
# The two numeric columns are selected explicitly: tmp2 also holds the string-valued
# "gene" column, which DataFrame.corr() raises on in pandas >= 2.0 (older versions
# dropped it silently).
tmp2[["score_190819", "all_score"]].corr()

Unnamed: 0,score_190819,all_score
score_190819,1.0,1.0
all_score,1.0,1.0


In [49]:
# Positions (row offsets within tmp2) where the two scores give a gene a different rank.
rank_190819 = tmp2["score_190819"].rank()
rank_recomputed = tmp2["all_score"].rank()
disagreements = np.where(rank_190819 != rank_recomputed)

In [50]:
# For each of the two score columns, derive:
#   abs_score*  - distance of the percentile score from 0.5, rescaled to [0, 1]
#   gene_sign*  - " +" when the score is strictly above 0.5, "  - " otherwise
# The recomputed score gets un-suffixed column names; the earlier one gets "_190819".
for score_col, suffix in (("all_score", ""), ("score_190819", "_190819")):
    scores = tmp2[score_col]
    tmp2["abs_score" + suffix] = (scores - .5).abs() * 2
    tmp2["gene_sign" + suffix] = (scores > .5).map({True: " +", False: "  - "})

In [51]:
# Number of genes shared by the two top-100 lists (ranked by absolute score, descending).
top100_190819 = tmp2.sort_values("abs_score_190819", ascending=False)["gene"].values[:100]
top100_recomputed = tmp2.sort_values("abs_score", ascending=False)["gene"].values[:100]
np.intersect1d(top100_190819, top100_recomputed).shape

(98,)