In [None]:
import numpy as np
import os
import pandas as pd

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import seaborn as sns

import tensorflow as tf
from tensorflow import keras



from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dropout, InputLayer, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

from tensorflow.compat.v1.keras.layers import CuDNNLSTM

from sklearn import metrics 

import warnings
warnings.filterwarnings(action='ignore')

# Korean font setup
import matplotlib.font_manager as fm

# Collect every font file under the Nanum directory and register each one
# with matplotlib's font manager.
font_dirs = ['/usr/share/fonts/truetype/nanum']
font_files = fm.findSystemFonts(fontpaths=font_dirs)
for font_file in font_files:
    fm.fontManager.addfont(font_file)

# Select the Korean-capable font for all plots.
# "axes.unicode_minus": False keeps the minus sign from rendering as a broken glyph.
sns.set(
    style='darkgrid',
    font="NanumBarunGothic",
    rc={"axes.unicode_minus": False},
)

# GPU selection: expose devices 0 and 1 (use "-1" to run on the CPU instead).
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"

gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
if len(gpus) > 0:
    # When a GPU is present, cap the first one at 5 GB (memory_limit is 5*1024).
    memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5*1024)
    tf.config.experimental.set_virtual_device_configuration(gpus[0], [memory_cap])
    # Only gpus[0] is capped; the equivalent call for gpus[1] was left commented out.

<img src='./data/Method_allfit_Sequential.jpg' width=600px>

# Training - all fit

In [None]:
import random

# Seed Python's built-in RNG (the one later used by random.shuffle).
seed_num = 42
random.seed(seed_num)

# Feature tensor and labels; the file names suggest shapes (7727, 10, 4068)
# and (7727, 1) -- TODO confirm against the actual arrays.
X, y = (
    np.load(npy_path)
    for npy_path in ('/project/LSH/x_(7727,10,4068).npy', '/project/LSH/y_(7727,1).npy')
)

In [None]:
def get_model(gpu_mode=False, input_shape=None):
    """Build and compile the 4-layer stacked LSTM binary classifier.

    Parameters
    ----------
    gpu_mode : bool, default False
        If True, each recurrent layer is a ``CuDNNLSTM`` followed by a separate
        ``Activation('hard_sigmoid')`` layer. Otherwise each recurrent layer is
        an ``LSTM`` built with ``activation='hard_sigmoid'``.
        NOTE(review): applying the activation as a separate layer on the
        CuDNNLSTM output is not the same computation as passing it to LSTM;
        the two modes presumably do not yield interchangeable weights -- confirm.
    input_shape : tuple of int, optional
        Shape of one sample, ``(timesteps, features)``. When omitted, it is
        taken from the notebook-level array ``X`` as ``(X.shape[1], X.shape[2])``.

    Returns
    -------
    A compiled ``Sequential`` model (Adam with learning rate 0.001,
    binary cross-entropy loss, ``'acc'`` metric).
    """
    seed_num = 42
    tf.random.set_seed(seed_num)

    if input_shape is None:
        # The original CPU branch read a lowercase ``x`` that is not defined
        # when this cell runs in order; both modes now use ``X``.
        input_shape = (X.shape[1], X.shape[2])

    # (units, return_sequences, dropout_after) for each recurrent layer, in order.
    layer_specs = [
        (128, True, False),
        (64, True, True),
        (64, True, False),
        (32, False, True),
    ]

    lstm = Sequential()
    lstm.add(InputLayer(input_shape=input_shape))
    for units, return_sequences, dropout_after in layer_specs:
        if gpu_mode:
            lstm.add(CuDNNLSTM(units=units, return_sequences=return_sequences))
            lstm.add(Activation('hard_sigmoid'))
        else:
            lstm.add(LSTM(units=units, activation='hard_sigmoid',
                          return_sequences=return_sequences))
        if dropout_after:
            lstm.add(Dropout(0.2))
    lstm.add(Dense(units=1, activation='sigmoid'))

    optimizer = Adam(learning_rate = 0.001)
    lstm.compile(optimizer=optimizer, loss = "binary_crossentropy", metrics=['acc'])
    return lstm

In [None]:
# MODEL_SAVE_FOLDER_PATH = './models/'
# filepath = MODEL_SAVE_FOLDER_PATH + 'ALLFIT_{epoch:02d}-{val_acc:.4f}.hdf5'
# ckpt = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_acc', save_best_only=True, save_weights_only=False, save_freq='epoch')

# model = get_model(gpu_mode=False)
# early_stop = EarlyStopping(monitor='val_acc', patience=50, verbose=1, restore_best_weights=True)
# model.fit(X, y, validation_split=0.25, batch_size=128, epochs=500,  callbacks=[early_stop, ckpt], shuffle=False)

In [None]:
# Load the saved all-fit checkpoint (file name: epoch 17, val_acc 0.7645).
# Alternative checkpoint kept for reference: './models/ALLFIT_01-0.4865.hdf5'
model = keras.models.load_model('./models/ALLFIT_17-0.7645.hdf5')

## Check accuracy

In [None]:
# Accuracy of the loaded model on ten different random 20% hold-outs.
# NOTE(review): if the loaded checkpoint was trained with the commented-out
# `model.fit(X, y, validation_split=0.25, ...)` call, these random hold-outs
# overlap the training rows, so the numbers are optimistic -- confirm.

# The arrays do not change between seeds, so load them once instead of per iteration.
x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')
split_at = round(x.shape[0]*0.8)  # first 80% of the shuffled order is skipped, last 20% is scored

dic = {}
for seed in range(0, 50, 5):
    random.seed(seed)

    idx = list(range(len(x)))
    random.shuffle(idx)

    # Only the hold-out part is used, so the unused 80% copy is no longer materialised.
    test_idx = idx[split_at:]
    X_test, y_test = x[test_idx,:,:], y[test_idx]

    # Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
    pred = model.predict(X_test)
    pred = (pred > 0.5).astype(pred.dtype)

    acc = metrics.accuracy_score(y_test, pred)
    dic[seed] = acc
    print(f'정확도 :{acc}, seed_num = {seed}')

df = pd.DataFrame.from_dict(dic, orient='index')
print(f'seed = {seed_num}의 정확도 df 만들고 평균 확인 : {df.mean().values}')

# Entropy dict

In [None]:
def entropy(ratio_list):
    """Binary Shannon entropy (base 2) of a two-outcome distribution.

    Parameters
    ----------
    ratio_list : sequence
        ``ratio_list[0]`` is the proportion of ones and ``ratio_list[1]`` the
        proportion of zeros. Only these first two entries are read.

    Returns
    -------
    ``-(p1*log2(p1) + p0*log2(p0))``, with the convention ``0*log2(0) == 0``.
    A constant feature (ratio 0 or 1) therefore yields 0.0 instead of NaN.
    """
    one_ratio, zero_ratio = ratio_list[0], ratio_list[1]

    def _p_log2_p(p):
        """Return p*log2(p), treating the p == 0 case as 0."""
        p = np.asarray(p, dtype=float)
        # Substitute 1 where p == 0 so log2 gives 0 there (0 * 0 == 0);
        # negative inputs are left alone and still produce NaN.
        return p * np.log2(np.where(p == 0, 1.0, p))

    # "+ 0.0" normalises a possible -0.0 result to 0.0.
    return -(_p_log2_p(one_ratio) + _p_log2_p(zero_ratio)) + 0.0

X = np.load('/project/LSH/x_(7727,10,4068).npy')

# Column names are needed here, but the original notebook only defined COLS in a
# later cell; load them in this cell so it works on a fresh kernel run top to bottom.
COLS = list(pd.read_csv('/project/LSH/total_data_7727.csv')['ITEMID'].sort_values().unique())
assert len(COLS) == X.shape[2], "number of ITEMIDs must match the feature axis of X"

# Share of ones per feature over every (sample, timestep) cell, computed in one pass.
one_ratios = X.sum(axis=(0, 1)) / (X.shape[0]*X.shape[1])

# Entropy of each feature's one/zero split, keyed by ITEMID.
entropy_dict = {
    col: entropy([one_ratio, 1 - one_ratio])
    for col, one_ratio in zip(COLS, one_ratios)
}

In [None]:
# Display the full ITEMID -> entropy mapping (one entry per feature column).
entropy_dict

# 2. {E(0to1) - E(1to0)} * Entropy * Lambda (0 or 1)

In [None]:
# Sorted unique ITEMID values from the source table; position i is used as the
# name of feature column i of X (assumes both share the same ordering -- TODO confirm).
COLS = list(pd.read_csv('/project/LSH/total_data_7727.csv')['ITEMID'].sort_values().unique())

In [None]:
# Reload the all-fit checkpoint used for the perturbation analysis below.
model = keras.models.load_model('./models/ALLFIT_17-0.7645.hdf5')

In [None]:
# Per-feature perturbation effect: force one feature column to all ones, then to
# all zeros, and compare the mean model output under each setting.
# NOTE(review): 'lambda0' is computed as one2zero - zero2one, which is the opposite
# sign of the "E(0to1) - E(1to0)" formula in the section heading -- confirm which is intended.
result = []
for i in tqdm(range(X.shape[2])):
    save_cols = X[:,:,i].copy()
    try:
        #-----zero2one-----
        X[:,:,i] = 1
        pred1 = model.predict(X)
        mean_pred1 = np.mean(pred1)
        #-----one2zero-----
        X[:,:,i] = 0
        pred2 = model.predict(X)
        mean_pred2 = np.mean(pred2)
    finally:
        # X is edited in place; always restore the column so an interrupt or
        # error in this long loop cannot leave the data corrupted.
        X[:,:,i] = save_cols

    effect = mean_pred2 - mean_pred1
    result.append({'feature' : str(COLS[i]), 'one2zero' : mean_pred2,'zero2one' : mean_pred1,
                   'lambda0' : effect, 'lambda1' : effect * entropy_dict[COLS[i]]})

In [None]:
# Tabulate the per-feature results, ordered by 'lambda0' from largest to smallest.
df = pd.DataFrame(result)
df = df.sort_values(by='lambda0', ascending=False)
df

# 3. Feature Importance - Sequential Method

In [None]:
# Sequential (greedy forward) selection of 10 features.
# Each round scores every remaining feature together with the features already
# selected: that set of columns is forced to all ones, then to all zeros, and the
# candidate with the largest 'lambda0' (one2zero - zero2one) is added to `top10`.
#
# Fixes relative to the previous version of this cell:
#   * the first-round code ran twice (`len(top10) > 1`), producing a duplicate;
#   * later rounds perturbed only `top10` and ignored the candidate `i`, so every
#     candidate received the same score;
#   * later rounds never restored X after overwriting columns;
#   * `df.feature_index[0]` / `[n]` are label lookups on the sorted frame and so
#     returned feature 0 / n rather than the top-ranked row (`.iloc[0]` is used now).
top10 = []
for n in range(10):
    if top10:
        print(top10)

    result = []
    for i in tqdm(range(X.shape[2])):
        if i in top10:
            continue  # already selected; never pick a feature twice

        cols = top10 + [i]
        save_cols = X[:,:,cols].copy()
        try:
            #-----zero2one-----
            X[:,:,cols] = 1
            pred1 = model.predict(X, batch_size=10000, workers=-1, use_multiprocessing=True)
            mean_pred1 = np.mean(pred1)
            #-----one2zero-----
            X[:,:,cols] = 0
            pred2 = model.predict(X, batch_size=10000, workers=-1, use_multiprocessing=True)
            mean_pred2 = np.mean(pred2)
        finally:
            # X is edited in place; restore it even if prediction is interrupted.
            X[:,:,cols] = save_cols

        effect = mean_pred2 - mean_pred1
        result.append({'feature_index' : i, 'one2zero' : mean_pred2,'zero2one' : mean_pred1,
                       'lambda0' : effect, 'lambda1' : effect * entropy_dict[COLS[i]]})

    df = pd.DataFrame(result).sort_values('lambda0', ascending=False)
    # Positional access: the first row of the sorted frame is the best candidate.
    top10.append(int(df['feature_index'].iloc[0]))

In [None]:
# Persist the selected feature indices, one per line.
# file.write() accepts only str, so the list is serialised explicitly
# (passing the list itself raises TypeError).
with open('./data/Sequential_top10.txt', 'w') as file:
    file.write('\n'.join(str(feature_index) for feature_index in top10))

# Visualization

In [None]:
def visualization(top10_list):
    """Plot, per item, the daily share of ones for the y==1 and y==0 patient groups.

    Parameters
    ----------
    top10_list : iterable
        Item identifiers to plot; each is converted with ``int()`` and matched
        against the ITEMID values read from the CSV. Panels are placed on a
        4x3 subplot grid in iteration order.

    The function reloads the data from disk, draws the figure, and returns None.
    """
    # ----- load data and column names -----
    X = np.load('/project/LSH/x_(7727,10,4068).npy')
    y = np.load('/project/LSH/y_(7727,1).npy')
    COLS = list(pd.read_csv('/project/LSH/total_data_7727.csv')['ITEMID'].sort_values().unique())

    # ----- split patients: y == 1 (plotted as '사망', deceased) vs y == 0 ('퇴원', discharged) -----
    d_X = X[np.where(y==1)[0]]
    s_X = X[np.where(y==0)[0]]

    def _daily_share(group):
        """One record per (day index, item): column sum divided by the group's patient count."""
        n_patients = group.shape[0]
        return [
            # Day index d is relabelled as 10 - d, i.e. labels run from 10 down to 1.
            {'cols': COLS[f], 'day': 10-d, 'per': group[:,d,f].sum()/n_patients}
            for d in range(10)
            for f in range(group.shape[-1])
        ]

    # ----- long-format tables, ordered by item then day -----
    d_df = pd.DataFrame(_daily_share(d_X)).sort_values(['cols','day']).reset_index(drop=True)
    s_df = pd.DataFrame(_daily_share(s_X)).sort_values(['cols','day']).reset_index(drop=True)

    # ----- one panel per requested item -----
    plt.figure(figsize = (13,12), dpi=150)
    for position, item in enumerate(top10_list, start=1):
        plt.subplot(4,3,position)
        plt.title(item)
        item_id = int(item)
        ax = sns.lineplot(data = d_df[d_df['cols']==item_id], x = 'day', y='per', label='사망')
        ax = sns.lineplot(data = s_df[s_df['cols']==item_id], x = 'day', y='per', label='퇴원')
        ax.invert_xaxis()
        ax.legend(loc='upper left')
    plt.tight_layout()