### NOTE: 
The sections of the notebook are for calculating integrated gradients (IG) weights for the consensus model:
- 100 runs of MD-AD training --> Gene to phenotype importance
- Consensus embeddings --> Gene to (consensus) node importance 

The IG output files take up a LOT of disk space (around 350 GB in total).

### Use this code to get IG weights for each sample:

Output layer: 
save 1 x N x G matrix for each phenotype

Last shared layer:
save 1 x N x G matrix for each node separately 

In [1]:
import gc
import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Input, Dense, Dropout
from keras import optimizers, regularizers, losses

from keras.models import Model
from keras import backend as K
from keras.callbacks import CSVLogger
from keras import metrics

import scipy
import datetime 

import keras

import sys
import pickle


import os
# Restrict TensorFlow to the first GPU. This must be set before the GPU is initialised below.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# Project-level configuration (paths, dataset names, ...) and model helpers (custom losses/metrics).
path_to_configs = "../"
sys.path.append(path_to_configs)
from configs import * 
from models import * 

# Build a session that allocates GPU memory on demand and register it with Keras.
# Without K.set_session the session below is never used by Keras, so the
# allow_growth setting would have no effect on the models loaded in this notebook.
# NOTE(review): this is done *before* the GPU query further down because that query
# appears to initialise the GPU itself (with default options) -- confirm for the
# installed Keras/TF versions.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)

# Sanity check that Keras can see the GPU (private Keras API; return value is not used).
K.tensorflow_backend._get_available_gpus()

# Local copy of the IntegratedGradients package.
sys.path.append("../../packages")
from IntegratedGradients import IntegratedGradients as IG

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# PCA file: read the "PCA_components_" matrix and the "gene_symbols" stored alongside it.
pca_h5_path = path_to_configs + path_to_MDAD_data_folders + "%s.h5"%(full_pca_dataset)
with h5py.File(pca_h5_path, 'r') as pca_file:
    PCA_components = pca_file["PCA_components_"][:]
    gene_symbols = pca_file["gene_symbols"][:]


# Full dataset file: expression matrix ("ge", cast to float64), labels and gene symbols.
expression_h5_path = path_to_configs + path_to_MDAD_data_folders + "%s.h5"%(full_dataset)
with h5py.File(expression_h5_path, 'r') as expression_file:
    raw_X = expression_file["ge"][:].astype(np.float64)
    raw_Y = expression_file["labels"][:]
    raw_gene_symbols = expression_file["gene_symbols"][:]


In [3]:
def get_model_layers(model_file, num_layers):
    """Load a saved Keras model and return a version truncated after `num_layers` layers.

    Parameters
    ----------
    model_file : path of the saved model, passed to `keras.models.load_model`.
    num_layers : number of leading layers to keep; the output of
        `model.layers[num_layers - 1]` becomes the output of the returned model.

    Returns
    -------
    A compiled Keras `Model` sharing the input of the loaded model.
    """
    # The custom losses/metrics only need to be registered so the file can be
    # deserialised; they are not used afterwards.
    custom_objects = {
        "ordloss_cur_params": ordloss(0),
        "ignorenans_mse": ignorenans_mse,
        "cat_acc": ignorenans_categorical_accuracy(0),
        "ignorenans_scaled_mse": ignorenans_scaled_mse,
    }
    full_model = keras.models.load_model(model_file, custom_objects=custom_objects)

    # Cut the network off at the requested layer.
    last_kept_layer = full_model.layers[num_layers - 1]
    truncated_model = Model(inputs=full_model.input, outputs=last_kept_layer.output)

    # Again, an optimizer and loss have to be given to compile, but they are not used
    # since the model is never retrained.
    truncated_model.compile(optimizer=optimizers.adam(), loss="mse")

    return truncated_model


def get_gene_weight_output(model, X):
    """Compute integrated-gradients attributions for every output unit of `model`.

    Parameters
    ----------
    model : model whose `output.shape[1]` gives the number of output units (L).
    X : 2-D array-like of samples (N rows, G features); each row is explained separately.

    Returns
    -------
    numpy array of shape (L, N, G): entry [l, n] is `ig.explain(X[n], outc=l)`.
    """
    # int() so that a TF dimension object and a plain int are handled alike.
    n_outputs = int(model.output.shape[1])

    # Size the result from X itself rather than from a notebook-level global, so the
    # function works for any input matrix it is given.
    n_samples, n_features = len(X), np.shape(X)[1]
    IG_L_by_N_by_G = np.zeros([n_outputs, n_samples, n_features])

    ig = IG.integrated_gradients(model)

    for lvar in range(n_outputs):
        # Light-weight progress log: every 10th output unit with a timestamp.
        if lvar%10 == 0:
            print(lvar, datetime.datetime.now())

        IG_L_by_N_by_G[lvar] = np.array([ig.explain(x, outc=lvar) for x in X])

    return IG_L_by_N_by_G



def get_gene_weight_latent_node(model, X, node):
    """Compute integrated-gradients attributions for a single output unit of `model`.

    Parameters
    ----------
    model : model handed to `IG.integrated_gradients`.
    X : iterable of samples; each sample is explained separately.
    node : index of the output unit to explain (passed as `outc`).

    Returns
    -------
    numpy array stacking `ig.explain(x, outc=node)` for every x in X.
    """
    # Only one node is explained, so no L x N x G buffer is needed here.
    ig = IG.integrated_gradients(model)

    return np.array([ig.explain(x, outc=node) for x in X])

                
def get_PCA_stacked_model(model, raw_X, method, fname, phenotype=None):
    """Wrap `model` so that it takes inputs with `raw_X.shape[1]` features.

    Two linear Dense layers are put in front of `model`:
      * `submean`  - identity kernel, bias = minus the column means of `raw_X`;
      * `pcatrans` - kernel = first `num_components` columns of `PCA_components.T`, zero bias.
    `num_components` and `PCA_components` are read from the notebook namespace.

    Parameters
    ----------
    model : Keras model to stack on top of the PCA layers. NOTE: the first entry of
        `model.layers` is popped, i.e. the model passed in is modified.
    raw_X : 2-D numpy array used for the input width and the column means.
    method : if "MTL", `model` is treated as multi-output and only the output that
        belongs to `phenotype` is kept; otherwise the model output is used as is.
    fname : model name; its "_"-separated fields [-2] and [-4] are parsed as the
        gradient clip norm and the learning rate for the optimizer.
    phenotype : phenotype whose output is selected when method == "MTL". When left as
        None, the notebook-level variable `phenotype` is used (previous behaviour).

    Returns
    -------
    Compiled Keras `Model` mapping the new input to the selected output.
    """
    n_features = raw_X.shape[1]

    main_input = Input(shape=(n_features,), dtype='float', name='main_input')
    # Keep handles on the two new layers so their weights can be set directly,
    # without relying on their position in `model_w_PCA.layers`.
    submean_layer = Dense(n_features, activation="linear", name='submean')
    pcatrans_layer = Dense(num_components, activation="linear", name='pcatrans')
    submean = submean_layer(main_input)
    pcatrans = pcatrans_layer(submean)
    model.layers.pop(0)
    out_model = model(pcatrans)

    if method == "MTL":
        if phenotype is None:
            # Backward compatibility: callers that do not pass `phenotype` rely on
            # the variable of the same name set in the notebook namespace.
            phenotype = globals()["phenotype"]
        MTL_phenotype_output_mapping = {"BRAAK":0, "CERAD":1, "PLAQUES":2, "TANGLES":3, "ABETA_IHC":4, "TAU_IHC":5}
        model_w_PCA = Model(inputs=[main_input], outputs=[out_model[MTL_phenotype_output_mapping[phenotype]]])
    else:
        model_w_PCA = Model(inputs=[main_input], outputs=[out_model])

    # Mean-centering: x -> x - mean(raw_X).
    submean_layer.set_weights([np.identity(n_features), -1*raw_X.mean(axis=0)])
    # PCA projection; sized with `num_components` so it always matches the layer width.
    pcatrans_layer.set_weights([PCA_components.T[:,:num_components], np.zeros(num_components)])

    # Optimizer settings are recovered from the model name; a compiled model is
    # required even though no training happens in this notebook.
    name_fields = fname.split("_")
    grad_clip_norm = float(name_fields[-2])
    learning_rate = float(name_fields[-4])
    opt = optimizers.adam(clipnorm=grad_clip_norm, lr=learning_rate)
    model_w_PCA.compile(optimizer=opt, loss = "mse")

    return model_w_PCA

In [4]:
# Names of the final chosen models, as stored by the model-selection step.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this pipeline.
# `with` makes sure the file handles are closed again after loading.
with open(path_to_configs + path_to_final_models_chosen + "MTL/final.p", "rb") as pickled_file:
    MTL_final_final_model = pickle.load(pickled_file)
with open(path_to_configs + path_to_final_models_chosen + "MLP_baselines/final.p", "rb") as pickled_file:
    baselines_final_final_model = pickle.load(pickled_file)

method = "MTL"

# GENE WEIGHTS ON OUTPUTS

In [17]:
# For every training run and every phenotype: load the trained model, put the
# mean-centering + PCA layers in front of it, compute IG weights on raw_X and
# save them to <IG_save_path>/<SPECIFIC_FOLDER>/<method>/<run>/outputs/<phenotype>.h5
method = "MTL"
for i in range(100):
    print("************RUN %i*************"%i)
    # All phenotypes of a run share one output folder; build its path once.
    run_output_dir = "%s%s/%s/%i/outputs/"%(path_to_configs + IG_save_path, SPECIFIC_FOLDER,method,i)

    for phenotype in ["CERAD", "BRAAK", "PLAQUES", "TANGLES", "ABETA_IHC", "TAU_IHC"]:

        print("---Saving IG weights for %s"%phenotype)
        if method == "MTL":
            fname = MTL_final_final_model
            path_to_model = path_to_configs + final_models_save_path + "models/MTL/ACT_MSBBRNA_ROSMAP_PCA/%s/%i/200.hdf5"%(MTL_final_final_model,i)

        else:
            fname = baselines_final_final_model[phenotype]

            path_to_model = path_to_configs + final_models_save_path + "models/MLP_baselines/%s/%s/%s/%i/200.hdf5"%("ACT_MSBBRNA_ROSMAP_PCA", phenotype, fname,i)

        # Custom losses/metrics are only needed so the saved model can be deserialised.
        model = keras.models.load_model(path_to_model, custom_objects={"ordloss_cur_params": ordloss(0), \
            "ignorenans_mse": ignorenans_mse, "cat_acc": ignorenans_categorical_accuracy(0), \
            "ignorenans_scaled_mse": ignorenans_scaled_mse})

        # get_PCA_stacked_model picks the output for the current `phenotype` (MTL case).
        model_w_PCA = get_PCA_stacked_model(model, raw_X, method, fname)

        IG_weights = get_gene_weight_output(model_w_PCA, raw_X)

        # exist_ok avoids the separate isdir check (and the race it implies).
        os.makedirs(run_output_dir, exist_ok=True)

        output_file = run_output_dir + "%s.h5"%phenotype
        with h5py.File(output_file, 'w') as hf:
            hf.create_dataset("gene_weights", data=IG_weights)

        print("Saved to %s"%output_file)

        # Drop the Keras graph/session state and collect garbage before the next model.
        K.clear_session()
        gc.collect()

************RUN 0*************
---Saving IG weights for CERAD


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Evaluated output channel (0-based index): All
Building gradient functions
Progress: 100.0%
Done.
0 2021-01-18 05:53:30.588650
Saved to ../../../Pipeline_Outputs_Submitted/IG_weights/origGE/MTL/0/outputs/CERAD.h5
---Saving IG weights for BRAAK
Evaluated output channel (0-based index): All
Building gradient functions
Progress: 100.0%
Done.
0 2021-01-18 05:54:12.680579
Saved to ../../../Pipeline_Outputs_Submitted/IG_weights/origGE/MTL/0/outputs/BRAAK.h5
---Saving IG weights for PLAQUES
Evaluated output channel (0-based index): All
Building gradient functions
Progress: 100.0%
Done.
0 2021-01-18 05:54:55.789050


KeyboardInterrupt: 

## Consensus nodes: Get IG weights for centroid nodes

In [7]:
# Table of consensus (medoid) nodes; the columns "cluster", "run" and "node_idx" are used below.
path_to_centroid_info = "%s1/normed_KMeans_medoids/MTL_50_medoids_info.csv"%(path_to_configs + final_rep_consensus_embeddings_savepath)
centroid_info = pd.read_csv(path_to_centroid_info).sort_values("cluster")
method = "MTL"

consenus_IG_weights = np.zeros([len(centroid_info), len(raw_X), len(gene_symbols)])

# The save folder does not depend on the node, so create it once up front.
savepath = "%sconsensus/%s/%s/last_shared/"%(path_to_configs + IG_save_path, SPECIFIC_FOLDER, method)
os.makedirs(savepath, exist_ok=True)

# NOTE(review): `i` is the DataFrame index label from the CSV, not the position after
# sorting by "cluster". It is used both as the row of `consenus_IG_weights` and as the
# output file name -- confirm that the labels coincide with the cluster ids.
for i,row in centroid_info.iterrows():
    # iterrows may upcast values; make sure run / node index are plain ints.
    run = int(row["run"])
    node_idx = int(row["node_idx"])

    path_to_model = path_to_configs + final_models_save_path + "models/MTL/ACT_MSBBRNA_ROSMAP_PCA/%s/%i/200.hdf5"%(MTL_final_final_model, run)
    # Keep only the first 4 layers (presumably up to the last shared layer -- TODO confirm).
    MTL_up_to_latent = get_model_layers(path_to_model, 4)

    # Reuse the shared helper instead of repeating its body here. The truncated model
    # has a single output, so a method name other than "MTL" is passed to skip the
    # per-phenotype output selection.
    model_w_PCA = get_PCA_stacked_model(MTL_up_to_latent, raw_X, "last_shared", MTL_final_final_model)

    print("********CLUSTER NODE %i (node %i from run %i)******"%(i, node_idx, run))

    consenus_IG_weights[i] = get_gene_weight_latent_node(model_w_PCA, raw_X, node_idx)

    with h5py.File(savepath + "%i.h5"%(i), 'w') as hf:
        hf.create_dataset("gene_weights", data=consenus_IG_weights[i])

    # Drop the Keras graph/session state and collect garbage before the next model.
    K.clear_session()
    gc.collect()
    


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


********CLUSTER NODE 0 (node 39 from run 72)******
Evaluated output channel (0-based index): All
Building gradient functions
Progress: 100.0%
Done.
********CLUSTER NODE 1 (node 9 from run 66)******
Evaluated output channel (0-based index): All
Building gradient functions
Progress: 100.0%
Done.


KeyboardInterrupt: 