In [1]:
import pickle
import numpy as np
import pandas as pd
import os
import h5py
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import matplotlib

%matplotlib inline

import seaborn as sns

from scipy.stats import spearmanr, pearsonr, rankdata

  from ._conv import register_converters as _register_converters


In [2]:
# Root directory of the analysis pipeline. The default is the location the
# notebook was originally run from; set the AD_PIPELINE_ROOT environment
# variable to point at another checkout without editing this cell.
pipeline_root = os.environ.get(
    "AD_PIPELINE_ROOT",
    "/Users/nbbwang/Documents/Lee Lab/AD_project/analyses/NEW_FULL_PIPELINE/",
)

# Directory holding one sub-folder per method and replicate
# (<method>/<rep>/outputs/G_by_output_type.txt is read from here later on).
path_to_average_IGs = os.path.join(
    pipeline_root,
    "fgsea/data/IG_weights_forR/origGE/reps/weighted_avg_high_vs_low_path/",
)

# HDF5 file from which the gene symbols are read.
path_to_expression_h5 = os.path.join(
    pipeline_root,
    "Data_Processing/ProcessedData/combined_files/01_resid_postcombat/ACT_MSBBRNA_ROSMAP.h5",
)
with h5py.File(path_to_expression_h5, 'r') as hf:
    # Cast the stored dataset to a numpy array of Python strings.
    gene_symbols = hf["gene_symbols"][:].astype(str)

# Output directory for the .rnk files and the combined CSV (relative to the
# working directory of the kernel).
path_to_save_scores = "gene_rankings/MTL/"

# Phenotypes that get a ranking; later cells index the first axis of each
# replicate's matrix with positions in this list.
phenotypes = ['CERAD','BRAAK','PLAQUES','TANGLES','ABETA_IHC','TAU_IHC']
method = "MTL"

In [3]:
# Named groups of phenotypes: "all" lists every phenotype, "abeta" and "tau"
# list two disjoint subsets of it. Used when averaging scores across a group.
phen_dict = dict(
    all=['CERAD', 'BRAAK', 'PLAQUES', 'TANGLES', 'ABETA_IHC', 'TAU_IHC'],
    abeta=['CERAD', 'PLAQUES', 'ABETA_IHC'],
    tau=['BRAAK', 'TANGLES', 'TAU_IHC'],
)

In [None]:
# Convert weights to 0-based ranks: the top entry for the chosen direction gets rank 0.
def weights_to_rankings(weights, direction="positive"):
    """Rank ``weights`` so that the "best" entry receives the lowest rank.

    The result is ``len(weights) - rankdata(key)``, so ranks run from 0 (best)
    up to ``len(weights) - 1`` (worst). Tied entries share the averaged rank
    that ``scipy.stats.rankdata`` assigns by default, so ranks can be
    fractional.

    Parameters
    ----------
    weights : array-like of numbers
        Values to rank (a numpy array in this notebook).
    direction : str
        ``"positive"``: the largest weight gets rank 0.
        ``"negative"``: the smallest (most negative) weight gets rank 0.
        Any other value: the weight with the largest absolute value gets rank 0.

    Returns
    -------
    numpy.ndarray
        One rank per entry of ``weights``, in the original order.
    """
    if direction == "positive":
        sort_key = weights
    elif direction == "negative":
        sort_key = -1*weights
    else:
        sort_key = abs(weights)
    return len(weights) - rankdata(sort_key)

In [14]:
# Sanity check: shape of one replicate's matrix after transposing.
# Replicate 0 is named explicitly; the previous version used a loop variable
# `i` that only exists after a later cell has run, so it failed with a
# NameError on a fresh kernel.
np.loadtxt(path_to_average_IGs + "%s/%i/outputs/G_by_output_type.txt"%(method,0)).T.shape

(9, 14591)

In [None]:
### COMBINE REPS

# Number of replicate runs to load for the selected method.
if method=="MTL":
    num_reps = 101
elif method == "MLP_baselines":
    num_reps = 10
else:
    # Fail here with a clear message instead of a NameError on num_reps below.
    raise ValueError("Unknown method %r: expected 'MTL' or 'MLP_baselines'"%method)


num_cats_overlaps = num_reps

# Load every replicate's matrix. Each file is transposed so that the first
# axis is indexed by output type and the second by gene.
rep_gws = []
for i in range(num_reps):
    if i%10==0:
        print(i)

    p_by_g_gw = np.loadtxt(path_to_average_IGs + "%s/%i/outputs/G_by_output_type.txt"%(method,i)).T
    rep_gws.append(p_by_g_gw)
rep_gws = np.array(rep_gws)

# The ranking loop below indexes rep_gws[rep, phenotype, gene]; check the
# stacked array really has that layout before filling rep_ranks.
# NOTE(review): only the first len(phenotypes) rows of each replicate are
# ranked -- confirm the rows of G_by_output_type.txt follow the order of
# `phenotypes`.
if (rep_gws.ndim != 3
        or rep_gws.shape[1] < len(phenotypes)
        or rep_gws.shape[2] != len(gene_symbols)):
    raise ValueError(
        "Unexpected replicate array shape %s for %d phenotypes and %d genes"
        % (rep_gws.shape, len(phenotypes), len(gene_symbols)))

# rep_ranks[direction][rep, phenotype, gene] holds the 0-based rank of each
# gene within one replicate and phenotype (rank 0 = top gene for that direction).
rep_ranks = {}
for direction in ["positive", "negative"]:
    rep_ranks[direction] = np.zeros([len(rep_gws), len(phenotypes), len(gene_symbols)])
    for i in range(num_reps):
        for p in range(len(phenotypes)):
            # astype(int) truncates the fractional (averaged) ranks that tied
            # weights receive.
            rep_ranks[direction][i,p,:] = weights_to_rankings(rep_gws[i,p,:], direction).astype(int)

0
10
20
30
40
50
60
70
80


In [None]:
### PLOT THE CONSISTENCY OF RANKINGS AS THE NUMBER OF RUNS INCREASES

# Sizes of the "top genes" sets whose stability is tracked.
tops = [100,500,1000,2000]

# agreement_fracs[direction][t][k]: fraction of the top tops[t] genes (by mean
# rank over the first k+1 replicates) that were also in the top set computed
# from the first k replicates. Entry 0 is always 0 because it is compared
# against an empty starting set.
agreement_fracs = {}
for direction in ["positive", "negative"]:
    print(direction)

    # Mean rank over phenotypes for every replicate, shape (reps, genes).
    # Computed once here because it does not depend on top_num or currep.
    rep_mean_ranks = np.mean(rep_ranks[direction], axis=1)
    # Use the number of replicates actually loaded rather than a fixed 100,
    # so the curve neither drops the last replicate nor pads with repeats.
    n_reps_available = rep_mean_ranks.shape[0]

    agreement_fracs[direction] = []
    for top_num in tops:
        print(top_num)
        agreement_frac = []
        cur_top = []

        for currep in range(1, n_reps_available + 1):
            # Genes with the lowest mean rank across the first `currep` replicates.
            new_top = np.argsort(np.mean(rep_mean_ranks[:currep,:],axis=0))[:top_num]
            agreement_frac.append(len(np.intersect1d(cur_top,new_top))/top_num)

            cur_top=new_top
        agreement_fracs[direction].append(agreement_frac)

# Build a warm and a cold gradient from entries of the tab20 palette.
cmap=plt.cm.tab20
cmaplist = [cmap(i) for i in range(cmap.N)]
warm_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", [cmaplist[3], cmaplist[6]])
cold_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", [cmaplist[5], cmaplist[0]])

# One colour per cutoff in `tops`, evenly spaced along each gradient
# (follows len(tops) instead of assuming exactly four cutoffs).
color_positions = np.linspace(0, 1, len(tops))
colors = {"negative": [cold_cmap(x) for x in color_positions],
          "positive": [warm_cmap(x) for x in color_positions]}

f,ax=plt.subplots(2,1,figsize=(8,8))
for i,direction in enumerate(agreement_fracs.keys()):
    max_reps = 1
    for j,a in enumerate(agreement_fracs[direction]):
        # a[k] is the agreement after averaging k+1 replicates; a[0] compares
        # against an empty set (always 0) and is skipped, so the plotted
        # points correspond to 2 .. len(a) replicates.
        ax[i].plot(np.arange(2, len(a)+1), a[1:], c=colors[direction][j])
        max_reps = max(max_reps, len(a))

    # Axis cosmetics are applied once per panel, after all curves are drawn.
    ax[i].set_ylim(.6,1.05)
    ax[i].set_xlim(0, max_reps + 1)
    ax[i].set_title("%s direction"%direction)
    ax[i].set_xlabel("Number of replicates averaged")
    ax[i].set_ylabel("Fraction of top genes shared with previous step")
    ax[i].legend(tops, title="Top N genes", bbox_to_anchor=(1.2, 0.5), frameon=False)
    ax[i].spines['right'].set_visible(False)
    ax[i].spines['top'].set_visible(False)

In [None]:
# Divide the "negative"-direction ranks by (number of genes - 1), then average
# over the replicate axis to get one score per (phenotype, gene).
# With 0-based ranks this puts scores in [0, 1]; the most negative weight of a
# replicate maps to 0, so a higher score means a more positive weight.
rank_scale = len(gene_symbols)-1
scores = rep_ranks["negative"]/rank_scale
consensus_scores = scores.mean(axis=0)

In [None]:
### SAVE rnk FILES (used for gseapy)
### SAVE FINAL RANKINGS FOR EACH PHENOTYPE AND COMBINED ACROSS GROUPS


def _save_rnk(genes, gene_scores, rnk_path):
    """Write a header-less, tab-separated ``gene<TAB>score`` file sorted by descending score.

    Parameters
    ----------
    genes : sequence of str
        Gene identifiers, one per score.
    gene_scores : sequence of float
        Score of each gene, aligned with ``genes``.
    rnk_path : str
        Destination file path.

    Returns
    -------
    pandas.DataFrame
        The sorted two-column table that was written.
    """
    # Build the columns directly so the scores stay numeric; stacking them
    # with the gene names would first cast every score to a string.
    ranked = pd.DataFrame(
        {"gene": genes, "score": np.asarray(gene_scores, dtype=float)},
        columns=["gene", "score"],
    )
    ranked = ranked.sort_values("score", ascending=False).reset_index(drop=True)
    ranked.to_csv(rnk_path, sep="\t", header=False, index=False)
    return ranked


# Create the output directory if needed (no error when it already exists).
os.makedirs(path_to_save_scores, exist_ok=True)

# One file per individual phenotype: <phenotype>.rnk
for i,p in enumerate(phenotypes):
    to_save_c_scores = consensus_scores[i]
    scores_df = _save_rnk(gene_symbols, to_save_c_scores, "%s%s.rnk"%(path_to_save_scores,p))

# One file per phenotype group: <group>-related.rnk, scored by the mean of the
# consensus scores of the phenotypes in that group.
for p_group in ["all", "abeta", "tau"]:
    p_idx = np.array(
        [k for k, name in enumerate(phenotypes) if name in phen_dict[p_group]],
        dtype=int,
    )
    to_save_c_scores = np.mean(consensus_scores[p_idx],axis=0)
    scores_df = _save_rnk(gene_symbols, to_save_c_scores, "%s%s-related.rnk"%(path_to_save_scores,p_group))

In [None]:
# Start from the "all-related" ranking, then left-join every other score file
# on the gene column so each group/phenotype contributes one "<name>_score" column.
all_related_path = "%s%s-related.rnk"%(path_to_save_scores,"all")
current_ranking = pd.read_csv(all_related_path, names=["gene", "all-related_score"], sep="\t")

other_groups = [x+"-related" for x in ["abeta", "tau"]] + phenotypes
for group in other_groups:
    new_df = pd.read_csv(
        "%s%s.rnk"%(path_to_save_scores,group),
        sep="\t",
        names=["gene", "%s_score"%group],
    )
    current_ranking = pd.merge(current_ranking, new_df, how="left", on="gene")

In [None]:
# Write the merged table of all score columns next to the .rnk files
# (default to_csv settings: comma-separated, with the DataFrame index as the first column).
consensus_csv_path = path_to_save_scores + "ALL_CONSENSUS_SCORES.csv"
current_ranking.to_csv(consensus_csv_path)
print("Saved rankings to %s"%path_to_save_scores)