In [1]:
import gc
import h5py
import numpy as np
import pandas as pd
import scipy
import datetime 
import h5py
import sys
import pickle
import os
from scipy import stats 
import seaborn as sns

from matplotlib import pyplot as plt
%matplotlib inline

path_to_configs = "../"
sys.path.append(path_to_configs)
from configs import *

  from ._conv import register_converters as _register_converters


In [2]:
# Models whose predictions are collected by the cells below; `mod` starts out
# as the multi-task model name and is reused as the loop variable later on.
mods = ["Linear", "MLP_baselines", "MTL"]
mod = "MTL"

# Phenotype column names (note: later cells overwrite `phens` with the column
# list of the prediction files they read).
phens = [
    "CERAD",
    "BRAAK",
    "PLAQUES",
    "TANGLES",
    "ABETA_IHC",
    "TAU_IHC",
]

# Root folders for results and external-validation data. The two folder names
# are not defined in this notebook -- presumably they come from
# `from configs import *`.
path_to_results = path_to_configs + path_to_ext_val_results
path_to_ext_val_data = path_to_configs + path_to_ext_val_data_folder

## Human

In [3]:
# For every model in `mods`, stack labels, covariates and predictions of all
# human datasets found in the predictions folder.
# Produces:
#   dfs[mod]     -- stacked DataFrame for that model
#   ge_pca_data  -- rows loaded from ge_pca/, stacked in the same dataset order
#   embeddings   -- stacked rows of the MTL embedding files (only set for "MTL")
dpath = ""
t_stat = {}
t_ps = {}
dfs = {}

# Entries of the predictions folder that are skipped (loop-invariant).
exclude_list = ["Mouse",'Mayo_CER.tsv', 'Mayo_TCX.tsv', 'HBTRC_synapse.tsv', 'HBTRC_GEO_CER.tsv', 'ROSMAP_GE2.tsv']

# Composite scores: row-wise mean of percentile ranks of the listed columns.
# Order matters: "AVG" is built from the two *_RELATED columns defined before it.
composite_scores = [
    ("AB_RELATED", ["ABETA_IHC", "PLAQUES", "CERAD"]),
    ("TAU_RELATED", ["TAU_IHC", "TANGLES", "BRAAK"]),
    ("AVG", ["AB_RELATED", "TAU_RELATED"]),
    ("B_C", ["BRAAK", "CERAD"]),
]

predictions_dir = "%spredictions%s/"%(path_to_results,dpath)

for mod in mods:
    ge_list = []
    df_list = []
    embedding_list = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the stacked outputs reproducible between runs and machines.
    for dset in sorted(os.listdir(predictions_dir)):
        if dset in exclude_list:
            continue

        AD_labs = pd.read_csv("%sprocessed/all_human/all_human_labels/%s"%(path_to_ext_val_data,dset), delimiter="\t")
        covars = pd.read_csv("%sprocessed/all_human/all_human_covars/%s"%(path_to_ext_val_data,dset), delimiter="\t", index_col=None)
        processed_ge_data = np.loadtxt("%sprocessed/all_human/ge_pca/%s"%(path_to_ext_val_data,dset))
        AD_labs = AD_labs.merge(covars, how="left", on="sample_name")
        preds = pd.read_csv("%spredictions%s/%s/%s/final.csv"%(path_to_results, dpath,dset,mod), index_col="Unnamed: 0")
        combined_df = pd.concat([AD_labs,preds],axis=1)
        combined_df["source"]=dset

        # FILTERS: keep rows with age > 61. The explicit .copy() makes the
        # filtered frame independent, so the column assignments below do not
        # write into a slice (SettingWithCopyWarning).
        combined_df = combined_df[combined_df["age"]>61].copy()
        combined_df["age_groups"] = combined_df["age"].apply(lambda x: "<75" if x < 75 else \
                                                             ("[75-85)" if x < 85 else "85+"))

        if dset=="MayoRNASeq.tsv":
            # Drop the rows whose sample name contains "_CER".
            combined_df = combined_df[combined_df["sample_name"].apply(lambda x: "_CER" not in x)].copy()

        # Ranks are computed within this dataset, after all row filters.
        for score_name, score_cols in composite_scores:
            combined_df[score_name] = combined_df[score_cols].rank(pct=True).mean(axis=1)

        df_list.append(combined_df)

        # The surviving index labels are reused as row positions into the
        # per-dataset arrays -- assumes the frame index is the 0..n-1 row
        # number of those files; TODO confirm.
        kept_rows_idx = np.asarray(combined_df.index)
        ge_list.append(processed_ge_data[kept_rows_idx])

        if mod=="MTL":
            embedding = np.loadtxt("%smodel_transformations%s/all_human/%s/MTL/1/test/0.txt"%(path_to_results, dpath,dset))
            embedding_list.append(embedding[kept_rows_idx])

    df = pd.concat(df_list)
    # "study" = first two "_"-separated tokens of the dataset file name.
    df["study"] = ["_".join(x.split("_")[:2]) for x in df["source"]]

    dfs[mod] = df
    t_stat[mod] = {}
    t_ps[mod] = {}

    ge_pca_data = np.vstack(ge_list)
    if mod=="MTL":
        embeddings = np.vstack(embedding_list)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [4]:
# Write the stacked human prediction table of every model to
# Cleaned/Model_Predictions/<model>/all_human.csv.
for mod in mods:
    predictions_out_dir = "%sCleaned/Model_Predictions/%s/"%(path_to_results,mod)
    if not os.path.isdir(predictions_out_dir):
        os.makedirs(predictions_out_dir)

    dfs[mod].to_csv("%s%s.csv"%(predictions_out_dir,"all_human"))

# `embeddings` is only built for the "MTL" model, so name that model explicitly
# instead of relying on whatever value the loop variable `mod` ended on (that
# was only correct while "MTL" happened to be the last entry of `mods`).
embedding_mod = "MTL"
embeddings_out_dir = "%sCleaned/test_embeddings/%s/"%(path_to_results,embedding_mod)
if not os.path.isdir(embeddings_out_dir):
    os.makedirs(embeddings_out_dir)
np.savetxt("%sall_human.csv"%(embeddings_out_dir),embeddings)


## Mouse

In [None]:
# For every model in `mods`, join the Mouse labels with that model's
# predictions, add composite scores and grouping columns, drop the
# "Cerebellum" rows and store the result in dfs[mod]. For "MTL" the matching
# embedding rows are also written to Cleaned/test_embeddings/.
dfs = {}

dset = "Mouse"
dpath = "_intersection"

# Composite scores: row-wise mean of percentile ranks of the listed columns.
# Order matters: "AVG" is built from the two *_RELATED columns defined before it.
composite_scores = [
    ("AB_RELATED", ["ABETA_IHC", "PLAQUES", "CERAD"]),
    ("TAU_RELATED", ["TAU_IHC", "TANGLES", "BRAAK"]),
    ("AVG", ["AB_RELATED", "TAU_RELATED"]),
    ("B_C", ["BRAAK", "CERAD"]),
]

# The labels file does not depend on the model, so it is read once instead of
# once per loop iteration.
if dpath == "_intersection":
    labels = pd.read_csv("%sprocessed%s/%s/labels_test.csv"%(path_to_ext_val_data,dpath,dset), index_col="Unnamed: 0")
else:
    labels = pd.read_csv("%sprocessed%s/%s/labels.csv"%(path_to_ext_val_data,dpath,dset), index_col="Unnamed: 0")

for mod in mods:
    final_preds = pd.read_csv("%spredictions%s/%s/%s/final.csv"%(path_to_results,dpath,dset,mod), index_col="Unnamed: 0")
    phens = list(final_preds.columns)

    df = pd.concat([labels,final_preds],axis=1)

    for score_name, score_cols in composite_scores:
        df[score_name] = df[score_cols].rank(pct=True).mean(axis=1)

    # Every strain other than "WILD" is collapsed into the single label "AD".
    df["AD"] = df["strain"].apply(lambda x: x if x=="WILD" else "AD")
    # Drops the last 6 characters of the age string before converting to int
    # -- presumably a unit suffix such as "months"; TODO confirm.
    df["time"] = df["age"].apply(lambda x: int(x[:-6] ))
    df["AD-region"] = df["AD"].values+"_"+labels["region"].values
    df["strain-region"] = df["strain"].values+"_"+labels["region"].values

    # .copy() so that adding "idx" does not write into a slice of `df`
    # (SettingWithCopyWarning).
    non_cerebellum_df = df[df["region"]!="Cerebellum"].copy()
    non_cerebellum_df["idx"] = non_cerebellum_df.index
    dfs[mod] = non_cerebellum_df

    if mod=="MTL":
        embedding = np.loadtxt("%smodel_transformations%s/%s/MTL/1/test/0.txt"%(path_to_results, dpath,dset))
        # Index labels are used as row positions into the embedding array --
        # assumes the label index is the 0..n-1 row number; TODO confirm.
        to_keep_idx = dfs[mod]["idx"].values
        embeddings_out_dir = "%sCleaned/test_embeddings/%s/"%(path_to_results,mod)
        if not os.path.isdir(embeddings_out_dir):
            os.makedirs(embeddings_out_dir)
        np.savetxt("%s%s.csv"%(embeddings_out_dir,dset),embedding[to_keep_idx])

In [None]:
# Write dfs[<model>] to Cleaned/Model_Predictions/<model>/<dset>.csv for each
# model. Relies on `dfs` and `dset` as left by the preceding cell.
for mod in mods:
    out_dir = "%sCleaned/Model_Predictions/%s/"%(path_to_results,mod)
    # Create the per-model output folder on first use.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    out_file = "%sCleaned/Model_Predictions/%s/%s.csv"%(path_to_results,mod,dset)
    dfs[mod].to_csv(out_file)

## Blood

In [None]:
# For every model in `mods` and each of the two Blood datasets: join labels
# with predictions, keep AD / MCI / CTL rows, add composite scores and age
# bins, write the per-dataset CSV (plus the MTL embedding rows), and finally
# stack both datasets into dfs[mod].
dpath = "_intersection"
dfs = {}

# Composite scores: row-wise mean of percentile ranks of the listed columns.
# Order matters: "AVG" is built from the two *_RELATED columns defined before it.
composite_scores = [
    ("AB_RELATED", ["ABETA_IHC", "PLAQUES", "CERAD"]),
    ("TAU_RELATED", ["TAU_IHC", "TANGLES", "BRAAK"]),
    ("AVG", ["AB_RELATED", "TAU_RELATED"]),
    ("B_C", ["BRAAK", "CERAD"]),
]

# Diagnosis labels that are kept; every other status value is dropped.
kept_statuses = ["AD", "MCI", "CTL"]

for mod in mods:

    df_prestacked = []

    for dset in ['Blood_GSE63060', 'Blood_GSE63061']:
        if dpath == "_intersection":
            labels = pd.read_csv("%sprocessed%s/%s/labels_test.csv"%(path_to_ext_val_data,dpath,dset), index_col="Unnamed: 0")
        else:
            labels = pd.read_csv("%sprocessed%s/%s/labels.csv"%(path_to_ext_val_data,dpath,dset), index_col="Unnamed: 0")
        final_preds = pd.read_csv("%spredictions%s/%s/%s/final.csv"%(path_to_results,dpath,dset,mod), index_col="Unnamed: 0")
        phens = list(final_preds.columns)

        df = pd.concat([labels,final_preds],axis=1)
        # Strip leading whitespace from the status labels. The .str accessor
        # leaves missing values as NaN (a per-element x.lstrip() would raise on
        # them); such rows are then removed by the status filter below.
        df["status"] = df["status"].str.lstrip()
        # .copy() so the column assignments below do not write into a slice
        # of the unfiltered frame (SettingWithCopyWarning).
        df = df[df["status"].isin(kept_statuses)].copy()

        # Ranks are computed within this dataset, after the status filter.
        for score_name, score_cols in composite_scores:
            df[score_name] = df[score_cols].rank(pct=True).mean(axis=1)

        df["source"] = dset

        # Two alternative age binnings of the same "age" column.
        df["age_groups1"] = df["age"].apply(lambda x: "<75" if x < 75 else \
                                                         ("[75-85)" if x < 85 else \
                                                         ("85+")))

        df["age_groups2"] = df["age"].apply(lambda x: "<74" if x < 74 else \
                                                         ("[74-80)" if x < 80 else \
                                                         ("80+")))
        df_prestacked.append(df)

        predictions_out_dir = "%sCleaned/Model_Predictions/%s/"%(path_to_results,mod)
        if not os.path.isdir(predictions_out_dir):
            os.makedirs(predictions_out_dir)
        df.to_csv("%s%s.csv"%(predictions_out_dir,dset))

        if mod=="MTL":
            embedding = np.loadtxt("%smodel_transformations%s/%s/MTL/1/test/0.txt"%(path_to_results, dpath,dset))
            # Index labels are used as row positions into the embedding array
            # -- assumes the label index is the 0..n-1 row number; TODO confirm.
            to_keep_index = np.asarray(df.index)
            embeddings_out_dir = "%sCleaned/test_embeddings/%s/"%(path_to_results,mod)
            if not os.path.isdir(embeddings_out_dir):
                os.makedirs(embeddings_out_dir)
            np.savetxt("%s%s.csv"%(embeddings_out_dir,dset),embedding[to_keep_index])

    df_stacked = pd.concat(df_prestacked)
    dfs[mod] = df_stacked


In [None]:
# Write the stacked Blood prediction table of every model to
# Cleaned/Model_Predictions/<model>/Blood.csv.
for mod in mods:
    # Every other cell builds this path as "%sCleaned/..." (path_to_results
    # is used as a prefix that already ends with a separator); the extra "/"
    # previously used here produced a doubled separator, so it is dropped to
    # keep the output paths consistent.
    predictions_out_dir = "%sCleaned/Model_Predictions/%s/"%(path_to_results,mod)
    if not os.path.isdir(predictions_out_dir):
        os.makedirs(predictions_out_dir)

    dfs[mod].to_csv("%s%s.csv"%(predictions_out_dir,"Blood"))
    
    