## Downstream prediction using MLP

Run downstream prediction with an MLP on top of different embeddings, replicating a subset of the XGBoost (XGB) experiments with an MLP as the downstream model.

In [2]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from matplotlib import cm, pyplot as plt
from sklearn import metrics
from os.path import expanduser as eu
from os.path import isfile, join
from os import listdir
import sklearn.metrics as metrics
import numpy as np
import random
import time
import keras
import os

def load_min_model_helper(MPATH):
    """Load the checkpoint with the lowest recorded validation loss in ``MPATH``.

    Checkpoint file names are expected to embed the loss as
    ``val_loss:<float>_...``; files without that marker are ignored.
    Checkpoints whose recorded loss is NaN are skipped, because a NaN would
    otherwise poison ``min()`` and could be selected as the "best" model.

    Args:
        MPATH: Directory that holds the saved checkpoint files.

    Returns:
        Whatever ``load_model`` returns for the selected checkpoint.

    Raises:
        ValueError: If ``MPATH`` contains no checkpoint with a usable loss.
    """
    import math  # local import: only needed for the NaN filter below

    print("[PROGRESS] Starting load_min_model_helper()")
    print("[DEBUG] MPATH {}".format(MPATH))

    candidates = []
    for fname in os.listdir(MPATH):
        if "val_loss:" not in fname:
            continue
        loss = float(fname.split("val_loss:")[1].split("_")[0])
        if math.isnan(loss):
            continue
        candidates.append((loss, fname))

    if not candidates:
        raise ValueError(
            "no checkpoint with a usable val_loss found in {}".format(MPATH))

    # min() keeps the first of several equal minima, matching the
    # tie-breaking of the previous list.index(min(...)) lookup.
    _, best_file = min(candidates, key=lambda pair: pair[0])
    min_mod_name = os.path.join(MPATH, best_file)
    # DEBUG is a module-level flag set by the experiment cells.
    if DEBUG: print("[DEBUG] min_mod_name {}".format(best_file))
    return load_model(min_mod_name)

def train_mlp_model(RESDIR,trainvalX,trainvalY,data_type,label_type,hosp_data):
    """Train a two-hidden-layer MLP and save a checkpoint after every epoch.

    The first 90% of the rows are used for training and the remaining 10% for
    validation; the split is taken before any shuffling. The network is
    Dense(100, relu) -> Dropout(0.5) -> Dense(100, relu) -> Dropout(0.5) ->
    Dense(1, sigmoid), trained with Adam (learning rate 1e-5) on binary
    cross-entropy for 200 epochs with batch size 1000.

    Checkpoints and a tab-separated ``loss.txt`` log are written under
    ``PATH + "models/" + <model name> + "/"``, where ``PATH`` is a module-level
    variable set by the experiment cells.

    Args:
        RESDIR: Unused here; kept so the signature matches the callers.
        trainvalX: 2-D array of features, one row per sample.
        trainvalY: Labels indexed in step with the rows of ``trainvalX``.
        data_type: Data/embedding identifier, embedded in the model name.
        label_type: Label identifier, embedded in the model name.
        hosp_data: Hospital data identifier, embedded in the model name.

    Returns:
        The directory (with trailing slash) holding the saved checkpoints.
    """
    import gc  # local import: ``gc`` is not among this file's visible imports

    # Hold out the trailing 10% of rows for validation.
    train_ratio = 0.9
    n_rows = trainvalX.shape[0]
    nine_tenths_ind = int(train_ratio*n_rows)
    X_train = trainvalX[0:nine_tenths_ind,:]
    y_train = trainvalY[0:nine_tenths_ind]
    X_valid = trainvalX[nine_tenths_ind:n_rows,:]
    y_valid = trainvalY[nine_tenths_ind:n_rows]
    del trainvalX
    gc.collect()

    # Randomize the row order of each split independently. The index arrays
    # are 1-D, so random.shuffle's element swaps are safe on them.
    indices = np.arange(0,X_train.shape[0])
    random.shuffle(indices)
    X_train = X_train[indices,:]
    y_train = y_train[indices]

    indices = np.arange(0,X_valid.shape[0])
    random.shuffle(indices)
    X_valid = X_valid[indices,:]
    y_valid = y_valid[indices]

    print("[PROGRESS] Starting create_model()")
    b_size = 1000; epoch_num = 200; lr = 0.00001
    opt_name = "adam"
    opt = keras.optimizers.Adam(lr)
    loss_func = "binary_crossentropy"
    mod_name = "multivariate_mlp_label{}_dtype{}_hd{}".format(label_type,data_type,hosp_data)
    mod_name += "_{}ep_{}ba_{}opt_{}loss".format(epoch_num,b_size,opt_name,loss_func)

    model = Sequential()
    model.add(Dense(100, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss=loss_func, optimizer=opt)

    MODDIR = PATH+"models/"+mod_name+"/"
    if not os.path.exists(MODDIR): os.makedirs(MODDIR)

    with open(MODDIR+"loss.txt", "w") as f:
        f.write("%s\t%s\t%s\t%s\n" % ("i", "train_loss", "val_loss", "epoch_time"))

    # Every epoch visits all training rows in a fresh random order. The
    # previous code started from a sample size of 300000 and only ever raised
    # it to the row count, so a split with fewer than 300000 rows made
    # np.random.choice(..., replace=False) fail; larger splits already used
    # every row, which is the behaviour kept here.
    n_train = X_train.shape[0]
    per_iter_size = n_train
    start_time = time.time()
    for i in range(0,epoch_num):
        inds = np.random.choice(n_train,per_iter_size,replace=False)
        curr_x = X_train[inds,]; curr_y = y_train[inds,]
        # Use b_size so the batch size matches the value baked into mod_name.
        history = model.fit(curr_x, curr_y, epochs=1, batch_size=b_size,
                            validation_data=(X_valid,y_valid))

        # Log the losses; "epoch_time" is the time elapsed since training
        # started, not the duration of this single epoch.
        train_loss = history.history['loss'][0]
        val_loss = history.history['val_loss'][0]
        epoch_time = time.time() - start_time
        with open(MODDIR+"loss.txt", "a") as f:
            f.write("%d\t%f\t%f\t%f\n" % (i, train_loss, val_loss, epoch_time))

        # Save the model each iteration; the val_loss embedded in the file
        # name is what load_min_model_helper parses to pick a checkpoint.
        model.save("{}val_loss:{}_epoch:{}_{}.h5".format(MODDIR,val_loss,i,mod_name))
    return(MODDIR)

def load_mlp_model_and_test(RESDIR,MODDIR,X_test,y_test,data_type,label_type,hosp_data):
    """Evaluate the best checkpoint in ``MODDIR`` on the test set.

    Loads the checkpoint chosen by ``load_min_model_helper``, saves the
    predictions and labels, then bootstraps the test set (100 resamples with
    replacement, NumPy seed 231) to estimate average precision and ROC AUC.
    The mean and twice the standard deviation of each metric are appended to
    per-hospital summary text files, and the raw bootstrap values are saved
    next to the predictions.

    Args:
        RESDIR: Root results directory (expected to end with a slash).
        MODDIR: Directory holding the checkpoints written during training.
        X_test: Test features, one row per sample.
        y_test: Test labels; must support NumPy integer-array indexing.
        data_type: Data/embedding identifier, used in output paths and logs.
        label_type: Unused here; kept so the signature matches the callers.
        hosp_data: Hospital data identifier, used in the output paths.

    Note:
        ``np.random.seed(231)`` resets NumPy's global random state, which
        also affects any NumPy-based randomness that runs afterwards.
    """
    model = load_min_model_helper(MODDIR)
    save_path = RESDIR+"hosp{}_data/{}/".format(hosp_data,data_type)
    if not os.path.exists(save_path): os.makedirs(save_path)
    # The model comes from MODDIR; save_path is only where results are written.
    print("[DEBUG] Loaded model from {}; saving results to {}".format(MODDIR,save_path))
    ypred = model.predict(X_test)
    np.save(save_path+"ypred.npy",ypred)
    np.save(save_path+"y_test.npy",y_test)

    # Bootstrap the test set to get a spread for both metrics.
    np.random.seed(231)
    auc_lst = []
    roc_auc_lst = []
    n_test = X_test.shape[0]
    for i in range(0,100):
        inds = np.random.choice(n_test, n_test, replace=True)
        auc_lst.append(metrics.average_precision_score(y_test[inds], ypred[inds]))
        roc_auc_lst.append(metrics.roc_auc_score(y_test[inds], ypred[inds]))
    auc_lst = np.array(auc_lst)
    roc_auc_lst = np.array(roc_auc_lst)
    print("[DEBUG] auc_lst.mean(): {}".format(auc_lst.mean()))
    print("[DEBUG] roc_auc_lst.mean(): {}".format(roc_auc_lst.mean()))

    # Append "<data_type>, <mean>+-<2*std>" to the per-hospital summaries.
    # Round after doubling so the written half-width is itself rounded to
    # four decimals instead of carrying float artefacts.
    SP = RESDIR+"hosp{}_data/".format(hosp_data)
    with open('{}conf_int_hospdata{}_prauc.txt'.format(SP,hosp_data),'a') as f:
        f.write("{}, {}+-{}\n".format(data_type,auc_lst.mean().round(4),
                                      np.round(2*np.std(auc_lst),4)))
    with open('{}conf_int_hospdata{}_rocauc.txt'.format(SP,hosp_data),'a') as f:
        f.write("{}, {}+-{}\n".format(data_type,roc_auc_lst.mean().round(4),
                                      np.round(2*np.std(roc_auc_lst),4)))
    np.save("{}auc_lst".format(save_path), auc_lst)
    np.save("{}roc_auc_lst".format(save_path), roc_auc_lst)

In [None]:
import sys
sys.path.append("..")
from xgb_setup import *
from sklearn.preprocessing import StandardScaler
# Lower this process's scheduling priority.
os.nice(5)
# PATH and DEBUG are read as module-level globals by the functions above.
PATH = "/projects/leelab2/hughchen/RELIC/repr_learning/"
DPATH = "/homes/gws/hughchen/phase/downstream_prediction/"
RESULTPATH = PATH+"/results/"; MODELPATH = PATH+"/models/"
lookback = 60
DEBUG = False

# Each entry is (label_type, eta, curr_feat); eta is ignored by this cell.
# label_type_eta_currfeat_lst = [("desat_bool92_5_nodesat",0.02,"SAO2"),
#                                ("nibpm60",0.1,"NIBPM"), 
#                                ("etco235",0.1,"ETCO2")]

# label_type_eta_currfeat_lst = [("nibpm60",0.1,"NIBPM"), 
#                                ("desat_bool92_5_nodesat",0.02,"SAO2")]
label_type_eta_currfeat_lst = [("desat_bool92_5_nodesat",0.02,"SAO2")]

for label_type, _, curr_feat in label_type_eta_currfeat_lst:
    print("\n[Progress] label_type: {}, eta: {}, curr_feat {}".format(label_type, "NA", curr_feat))

    xgb_type = "mlp_{}_top15".format(label_type)
    RESDIR = '{}{}/'.format(RESULTPATH, xgb_type)
    if not os.path.exists(RESDIR): os.makedirs(RESDIR)

    for hosp_data in [1]:
        print("\n[Progress] hosp_data {}".format(hosp_data))

        # Baseline inputs, followed by the embeddings of each hosp_model.
        dt_lst = ["ema[top15]+nonsignal",
                  "raw[top15]+nonsignal",
                  "randemb[top15]+nonsignal"]
        for hosp_model in [0,1,"P"]:
            # The "P" model is only used for desat labels.
            if hosp_model == "P" and "desat" not in label_type: continue

            print("\n[Progress] hosp_model {}".format(hosp_model))
            # The same three embedding types are added for every hosp_model
            # (the former if/else on hosp_model == 0 had identical branches).
            dt_lst += ["nextfive_{}[top15]+nonsignal".format(hosp_model),
                       "auto_{}[top15]+nonsignal".format(hosp_model),
                       "min5_{}[top15]+nonsignal".format(hosp_model)]

            # Plus the one embedding type that matches the label family.
            if label_type.startswith("desat"):
                dt_lst.append("hypox_{}[top15]+nonsignal".format(hosp_model))
            elif label_type.startswith("etco2"):
                dt_lst.append("hypoc_{}[top15]+nonsignal".format(hosp_model))
            elif label_type.startswith("nibpm"):
                dt_lst.append("hypot_{}[top15]+nonsignal".format(hosp_model))

        for data_type in dt_lst:
            print("\n[Progress] data_type {}".format(data_type))
            (trainvalX,trainvalY) = load_data(DPATH,data_type,label_type,True,
                                              hosp_data,curr_feat,DEBUG=DEBUG)
            print("[Progress] trainvalX.shape {}".format(trainvalX.shape))
            # Standardize data: every column for raw/ema inputs, otherwise
            # only the last six columns.
            if "raw" in data_type or "ema" in data_type:
                print("[DEBUG] Scaling all features")
                scaler = StandardScaler()
                scaler.fit(trainvalX)
                trainvalX = scaler.transform(trainvalX,copy=True)
            else:
                print("[DEBUG] Scaling static features")
                scaler = StandardScaler()
                scaler.fit(trainvalX[:,-6:])
                trainvalX[:,-6:] = scaler.transform(trainvalX[:,-6:],copy=True)

            # Training and evaluation are both skipped when DEBUG is set.
            if not DEBUG:
                MODDIR = train_mlp_model(RESDIR,trainvalX,trainvalY,
                                         data_type,label_type,hosp_data)

            (test1X,test1Y)       = load_data(DPATH,data_type,label_type,False,
                                              hosp_data,curr_feat,DEBUG=DEBUG)
            print("[Progress] test1X.shape    {}".format(test1X.shape))
            # Standardize the test data with the scaler fitted on the
            # train/validation data above.
            if "raw" in data_type or "ema" in data_type:
                print("[DEBUG] Scaling all features")
                test1X = scaler.transform(test1X,copy=True)
            else:
                print("[DEBUG] Scaling static features")
                test1X[:,-6:] = scaler.transform(test1X[:,-6:],copy=True)

            if not DEBUG:
                load_mlp_model_and_test(RESDIR,MODDIR,test1X,test1Y,
                                        data_type,label_type,hosp_data)

### Run raw MLPs for nibpm and etco2

In [3]:
import sys
sys.path.append("..")
from xgb_setup import *
from sklearn.preprocessing import StandardScaler
# Lower this process's scheduling priority.
os.nice(5)

# Module-level configuration; PATH and DEBUG are also read by the functions
# defined in the first cell.
PATH = "/projects/leelab2/hughchen/RELIC/repr_learning/"
DPATH = "/homes/gws/hughchen/phase/downstream_prediction/"
RESULTPATH = PATH+"/results/"
MODELPATH = PATH+"/models/"
lookback = 60
DEBUG = False

# Each entry is (label_type, eta, curr_feat); eta is ignored by this cell.
# label_type_eta_currfeat_lst = [("desat_bool92_5_nodesat",0.02,"SAO2"),
#                                ("nibpm60",0.1,"NIBPM"), 
#                                ("etco235",0.1,"ETCO2")]

label_type_eta_currfeat_lst = [("etco235",0.1,"ETCO2")]

for label_type, _, curr_feat in label_type_eta_currfeat_lst:
    print("\n[Progress] label_type: {}, eta: {}, curr_feat {}".format(label_type, "NA", curr_feat))

    # Results for this label go under RESULTPATH/mlp_<label_type>_top15/.
    xgb_type = "mlp_{}_top15".format(label_type)
    RESDIR = '{}{}/'.format(RESULTPATH, xgb_type)
    if not os.path.exists(RESDIR):
        os.makedirs(RESDIR)

    for hosp_data in [1]:
        print("\n[Progress] hosp_data {}".format(hosp_data))

        # Only the raw-signal input is run in this cell.
        dt_lst = ["raw[top15]+nonsignal"]
        print(dt_lst)
        for data_type in dt_lst:
            print("\n[Progress] data_type {}".format(data_type))

            # ---- train/validation data ----
            trainvalX, trainvalY = load_data(
                DPATH, data_type, label_type, True,
                hosp_data, curr_feat, DEBUG=DEBUG)
            print("[Progress] trainvalX.shape {}".format(trainvalX.shape))

            # Standardize: only the last six columns unless the input is
            # raw/ema, in which case every column is scaled.
            if "raw" not in data_type and "ema" not in data_type:
                print("[DEBUG] Scaling static features")
                scaler = StandardScaler().fit(trainvalX[:,-6:])
                trainvalX[:,-6:] = scaler.transform(trainvalX[:,-6:],copy=True)
            else:
                print("[DEBUG] Scaling all features")
                scaler = StandardScaler().fit(trainvalX)
                trainvalX = scaler.transform(trainvalX,copy=True)

            # Training is skipped when DEBUG is set.
            if not DEBUG:
                MODDIR = train_mlp_model(
                    RESDIR, trainvalX, trainvalY,
                    data_type, label_type, hosp_data)

            # ---- test data ----
            test1X, test1Y = load_data(
                DPATH, data_type, label_type, False,
                hosp_data, curr_feat, DEBUG=DEBUG)
            print("[Progress] test1X.shape    {}".format(test1X.shape))

            # Reuse the scaler fitted on the train/validation data.
            if "raw" not in data_type and "ema" not in data_type:
                print("[DEBUG] Scaling static features")
                test1X[:,-6:] = scaler.transform(test1X[:,-6:],copy=True)
            else:
                print("[DEBUG] Scaling all features")
                test1X = scaler.transform(test1X,copy=True)

            # Evaluation is skipped when DEBUG is set.
            if not DEBUG:
                load_mlp_model_and_test(
                    RESDIR, MODDIR, test1X, test1Y,
                    data_type, label_type, hosp_data)


[Progress] label_type: etco235, eta: NA, curr_feat ETCO2

[Progress] hosp_data 1
['raw[top15]+nonsignal']

[Progress] data_type raw[top15]+nonsignal
[DEBUG] Starting load_raw_data
[DEBUG] DPATH /homes/gws/hughchen/phase/downstream_prediction//data/etco235/hospital_1/
[Progress] trainvalX.shape (1754091, 906)
[DEBUG] Scaling all features
[PROGRESS] Starting create_model()
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 1578681 samples, validate on 175410 samples
Epoch 1/1
Train on 1578681 samples, validate on 175410 samples
Epoch 1/1
Train on 1578681 samples, validate on 175410 samples
Epoch 1/1
Train on 1578681 samples, validate on 175410 samples
Epoch 1/1
Train on 1578681 samples, validate on 175410 samples
Epoch 1/1
Train on 1578681 samples, validate on 175410 samples
Epoch 1/1
Train on 157