## header

In [1]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os
from os.path import exists

# The working directory is switched to the project folder here, so the
# relative paths used further down (CSV file, ./model, ./score) resolve
# against this location.
project_path = '/content/drive/MyDrive/Gproject/MIT_glyco'
os.chdir(project_path)

os.getcwd() # current working directory

Mounted at /content/drive


'/content/drive/MyDrive/Gproject/MIT_glyco'

In [2]:
import pandas as pd
import numpy as np

import time

# Prefix of every model/score file name written by the training cells below.
version = 'v4'
# When True, the training cells retrain even if a saved model file already exists.
update = False

In [3]:
# Read the per-residue table (path is relative to the current working
# directory) and report how many rows carry each label.
load_name = "v4_data_all_sites.csv"
dataset = pd.read_csv(load_name)

label_column = dataset['positivity']
positive_sites = dataset[label_column == 1]
negative_sites = dataset[label_column == 0]

for caption, count in [
    ("total number of proteins:      ", len(dataset['protein'].unique())),
    ("total number of samples:       ", len(dataset)),
    ("total number of positive sites:", len(positive_sites)),
    ("total number of negative sites:", len(negative_sites)),
]:
    print(caption, count)
display(dataset)

total number of proteins:       272
total number of samples:        257578
total number of positive sites: 529
total number of negative sites: 257049


Unnamed: 0,#,SEQ,SS,ASA,Phi,Psi,Theta(i-1=>i+1),Tau(i-2=>i+2),HSE_alpha_up,HSE_alpha_down,...,side_3,side_4,side_5,nAli,nPos,nS/nT,Proline,phi_psi,positivity,protein
0,1,M,C,112.7,-100.9,139.3,119.5,165.0,8.5,9.1,...,pro,cycle,small,0,0,3,0,alpha,0,A2ABU4
1,2,T,C,103.3,-102.0,132.1,117.6,-150.0,3.8,13.9,...,cycle,small,pro,0,0,3,0,alpha,0,A2ABU4
2,3,L,C,50.9,-97.8,134.7,118.5,-149.2,16.4,11.6,...,small,pro,gly,0,0,3,1,alpha,0,A2ABU4
3,4,P,C,77.2,-69.2,144.0,111.0,-105.3,7.5,16.7,...,pro,gly,small,1,0,3,0,alpha,0,A2ABU4
4,5,H,C,80.3,-95.4,141.5,118.6,-135.6,13.3,13.3,...,gly,small,very_small,2,0,3,0,alpha,0,A2ABU4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257573,2892,E,C,110.1,-95.9,129.5,114.1,-151.5,8.1,12.4,...,long,small,,1,1,3,0,alpha,0,Q9Y520
257574,2893,E,C,129.9,-89.6,124.2,111.0,-140.1,5.4,11.0,...,small,,,0,0,3,0,alpha,0,Q9Y520
257575,2894,T,C,106.7,-91.4,100.3,109.1,-163.0,3.1,9.1,...,,,,0,0,3,0,alpha,0,Q9Y520
257576,2895,K,C,160.4,-89.3,66.8,107.1,136.5,2.7,7.7,...,,,,0,1,3,0,other,0,Q9Y520


In [4]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257578 entries, 0 to 257577
Data columns (total 27 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   #                257578 non-null  int64  
 1   SEQ              257578 non-null  object 
 2   SS               257578 non-null  object 
 3   ASA              257578 non-null  float64
 4   Phi              257578 non-null  float64
 5   Psi              257578 non-null  float64
 6   Theta(i-1=>i+1)  257578 non-null  float64
 7   Tau(i-2=>i+2)    257578 non-null  float64
 8   HSE_alpha_up     257578 non-null  float64
 9   HSE_alpha_down   257578 non-null  float64
 10  P(C)             257578 non-null  float64
 11  P(H)             257578 non-null  float64
 12  P(E)             257578 non-null  float64
 13  flexibility      257578 non-null  float64
 14  side_-1          257578 non-null  object 
 15  side_1           257578 non-null  object 
 16  side_2           257578 non-null  obje

None

## case 1: without window

In [5]:
# Case-1 feature set: continuous structural descriptors plus one-hot encoded
# residue type (SEQ) and secondary structure (SS). get_dummies leaves the
# continuous columns first and appends the encoded columns after them.
y_label = ['positivity']
x_cts = [
    'ASA', 'Phi', 'Psi', 'Theta(i-1=>i+1)', 'Tau(i-2=>i+2)', 'HSE_alpha_up', 'HSE_alpha_down',
    'P(C)', 'P(H)', 'P(E)', 'flexibility',
]
x_cat = ['SEQ', 'SS']

selected_columns = x_cts + x_cat
data_x = pd.get_dummies(dataset[selected_columns], columns=x_cat)
data_y = dataset[y_label]

for frame in (data_x, data_y):
    print(frame.shape)

print("\nx columns:")
display(pd.Series(data_x.columns))

(257578, 34)
(257578, 1)

x columns:


0                 ASA
1                 Phi
2                 Psi
3     Theta(i-1=>i+1)
4       Tau(i-2=>i+2)
5        HSE_alpha_up
6      HSE_alpha_down
7                P(C)
8                P(H)
9                P(E)
10        flexibility
11              SEQ_A
12              SEQ_C
13              SEQ_D
14              SEQ_E
15              SEQ_F
16              SEQ_G
17              SEQ_H
18              SEQ_I
19              SEQ_K
20              SEQ_L
21              SEQ_M
22              SEQ_N
23              SEQ_P
24              SEQ_Q
25              SEQ_R
26              SEQ_S
27              SEQ_T
28              SEQ_V
29              SEQ_W
30              SEQ_Y
31               SS_C
32               SS_E
33               SS_H
dtype: object

In [6]:
# Build one fixed-length window of per-residue features around every Ser/Thr
# residue. Positions that fall outside the residue's own protein are zero-padded.
input_data = data_x
output_data = data_y
window_size = 10                    # residues taken on each side of the centre
window_len  = 2 * window_size + 1   # total number of rows in every window

# Convert once to plain arrays instead of slicing the DataFrames inside the loop.
feature_matrix = input_data.to_numpy(dtype=float)
label_matrix = output_data.to_numpy()
n_features = feature_matrix.shape[1]

protein_list = list(dataset.protein.unique())
rnn_input = []
rnn_output = []
for name in protein_list:
    data = dataset[dataset['protein']==name]
    # Row labels are used as positions below, which relies on `dataset`
    # keeping its default 0..n-1 index with each protein stored contiguously.
    low_bound = data.index[0]
    up_bound  = data.index[-1]
    ST_idx = np.where((data['SEQ']=='S')|(data['SEQ']=='T'))[0] + low_bound

    for idx in ST_idx:
        start_idx = idx - window_size
        end_idx   = idx + window_size + 1

        # Clip the window to this protein on BOTH sides. A protein shorter
        # than the window overruns at the start and the end at once; padding
        # only one side would pull in rows of the neighbouring protein.
        first_row = max(start_idx, low_bound)
        last_row  = min(end_idx, up_bound + 1)
        offset    = first_row - start_idx

        temp = np.zeros((window_len, n_features))
        temp[offset:offset + (last_row - first_row)] = feature_matrix[first_row:last_row]

        rnn_input.append(temp)
        rnn_output.append(label_matrix[idx])

rnn_input = np.array(rnn_input)
rnn_output = np.array(rnn_output)

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_split(data_x, data_y, test_size=0.2, n_splits=1, random_state=1, dtype='arr'):
    """Split features and labels into stratified train/test sets.

    Parameters
    ----------
    data_x, data_y : array-like or pandas DataFrame/Series
        Features and labels, indexed along the first axis by sample.
        pandas objects are converted to their underlying arrays automatically.
    test_size : float
        Passed to StratifiedShuffleSplit as the test fraction.
    n_splits : int
        Number of independent shuffled splits to generate.
    random_state : int
        Seed passed to StratifiedShuffleSplit.
    dtype : str
        'df' forces the use of ``.values`` on both inputs (kept for backward
        compatibility); any other value leaves non-pandas inputs untouched.

    Returns
    -------
    train_x, train_y, test_x, test_y
        Arrays when ``n_splits == 1``, otherwise lists with one array per split.
    """
    split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)

    # Positional indexing with the split indices only works on arrays, so
    # pandas inputs are unwrapped even when the caller left dtype at 'arr'.
    if dtype == 'df' or isinstance(data_x, (pd.DataFrame, pd.Series)):
        data_x = data_x.values
    if dtype == 'df' or isinstance(data_y, (pd.DataFrame, pd.Series)):
        data_y = data_y.values

    train_x, train_y, test_x, test_y = [], [], [], []
    for train_index, test_index in split.split(data_x, data_y):
        train_x.append(data_x[train_index])
        train_y.append(data_y[train_index])

        test_x.append(data_x[test_index])
        test_y.append(data_y[test_index])

    # Shapes of the first split are printed as a quick sanity check.
    print("train/test dataset")
    print("train:", train_x[0].shape, train_y[0].shape)
    print("test:", test_x[0].shape, test_y[0].shape)

    if n_splits == 1:
        return train_x[0],train_y[0], test_x[0], test_y[0]
    else:
        return train_x, train_y, test_x, test_y

In [8]:
### split data into train/test dataset ###
test_size = 0.2
n_splits = 10
random_state = 1

split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
arr_x, arr_y = rnn_input, rnn_output # windowed feature / label arrays built above

# The continuous features occupy the first len(x_cts) columns of the last axis.
n_cts = len(x_cts)

train_idx_list, train_x_list, train_y_list, test_idx_list, test_x_list, test_y_list = [], [], [], [], [], []
for i, (train_index, test_index) in enumerate(split.split(arr_x, arr_y), start=1):
    # Indexing with an index array copies, so the in-place scaling below
    # never modifies arr_x itself.
    train_x = arr_x[train_index]
    train_y = arr_y[train_index]
    test_x = arr_x[test_index]
    test_y = arr_y[test_index]

    train_cts = train_x[:,:,:n_cts]
    test_cts  = test_x[:,:,:n_cts]

    # Min/max are taken over samples and window positions of the TRAINING
    # data only; the test data is scaled with the same statistics, so test
    # values may fall slightly outside [0, 1].
    # NOTE(review): zero-padded window rows take part in these statistics and
    # are rescaled as well - confirm this is intended.
    x_min = train_cts.min(0).min(0)
    x_max = train_cts.max(0).max(0)

    # A feature that is constant in the training data has zero range;
    # divide by 1 there instead of producing NaN/inf.
    x_range = np.where(x_max > x_min, x_max - x_min, 1.0)

    train_x[:,:,:n_cts] = (train_cts-x_min)/x_range
    test_x[:,:,:n_cts] = (test_cts-x_min)/x_range

    print(f"{i}th iteration")
    print("train:", train_x.shape, train_y.shape, "check scale:", train_x.min(), train_x.max())
    print("test: ", test_x.shape, test_y.shape, "check scale:", test_x.min(), test_x.max())

    train_idx_list.append(train_index)
    train_x_list.append(train_x)
    train_y_list.append(train_y)

    test_idx_list.append(test_index)
    test_x_list.append(test_x)
    test_y_list.append(test_y)

1th iteration
train: (33011, 21, 34) (33011, 1) check scale: 0.0 1.0
test:  (8253, 21, 34) (8253, 1) check scale: 0.0 1.0
2th iteration
train: (33011, 21, 34) (33011, 1) check scale: 0.0 1.0
test:  (8253, 21, 34) (8253, 1) check scale: 0.0 1.001001001001001
3th iteration
train: (33011, 21, 34) (33011, 1) check scale: 0.0 1.0
test:  (8253, 21, 34) (8253, 1) check scale: 0.0 1.0090634441087611
4th iteration
train: (33011, 21, 34) (33011, 1) check scale: 0.0 1.0
test:  (8253, 21, 34) (8253, 1) check scale: 0.0 1.0
5th iteration
train: (33011, 21, 34) (33011, 1) check scale: 0.0 1.0
test:  (8253, 21, 34) (8253, 1) check scale: 0.0 1.0006653359946773
6th iteration
train: (33011, 21, 34) (33011, 1) check scale: 0.0 1.0
test:  (8253, 21, 34) (8253, 1) check scale: 0.0 1.0
7th iteration
train: (33011, 21, 34) (33011, 1) check scale: 0.0 1.0
test:  (8253, 21, 34) (8253, 1) check scale: 0.0 1.0090634441087611
8th iteration
train: (33011, 21, 34) (33011, 1) check scale: 0.0 1.0
test:  (8253, 21, 

In [9]:
## upsampling dataset 
import random

# For every split, positives are drawn with replacement until they match the
# number of negatives, then the balanced set is shuffled. Both steps are
# re-seeded with random_state on each split.
upsample_x_list, upsample_y_list = [], []
for train_x, train_y in zip(train_x_list, train_y_list):
    index_neg = np.where(train_y == 0)[0]
    index_pos = np.where(train_y == 1)[0]

    # get samples from positive sites as much as the number of negative sites
    random.seed(random_state)
    up_index = []
    for _ in range(len(index_neg)):
        up_index.append(random.choice(index_pos))

    # Up-sampled positives first, untouched negatives second.
    sample_x = np.concatenate([train_x[up_index], train_x[index_neg]], axis=0)
    sample_y = np.concatenate([train_y[up_index], train_y[index_neg]], axis=0)

    # One shared permutation keeps features and labels aligned.
    np.random.seed(random_state)
    shuffle_index = np.arange(len(sample_x))
    np.random.shuffle(shuffle_index)
    sample_x, sample_y = sample_x[shuffle_index], sample_y[shuffle_index]

    upsample_x_list.append(sample_x)
    upsample_y_list.append(sample_y)

# Shapes of the last split only.
print("up-sampled train dataset:", sample_x.shape, sample_y.shape)
print("test dataset:", test_x.shape, test_y.shape)

up-sampled train dataset: (65188, 21, 34) (65188, 1)
test dataset: (8253, 21, 34) (8253, 1)


In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

from tensorflow.keras.optimizers import Adam

def ClassifierLSTM(
    history_size, history_dim, class_num,
    rnn_layers = 1, rnn_neurons = 100,
    dense_layers = 1, dense_neurons = 100,
    optimizer = None, loss="binary_crossentropy", metrics = ['accuracy']
):
    """Build and compile a stacked-LSTM classifier.

    Parameters
    ----------
    history_size, history_dim : int
        Length and feature dimension of each input sequence.
    class_num : int
        Number of units of the final sigmoid layer.
    rnn_layers, rnn_neurons : int
        Number of LSTM layers (>= 1) and units per LSTM layer.
    dense_layers, dense_neurons : int
        Number of hidden Dense layers (>= 1) and units per hidden layer.
    optimizer : optimizer instance or None
        When None, a fresh Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
        is created for this model.
    loss, metrics :
        Passed to ``model.compile``.

    Returns
    -------
    The compiled Keras model.

    Raises
    ------
    ValueError
        If ``rnn_layers`` or ``dense_layers`` is smaller than 1.
    """
    # Hyper-parameters may arrive as numpy or float values (e.g. read back
    # from a results table); layer counts and sizes must be plain ints.
    rnn_layers, rnn_neurons = int(rnn_layers), int(rnn_neurons)
    dense_layers, dense_neurons = int(dense_layers), int(dense_neurons)
    if rnn_layers < 1 or dense_layers < 1:
        raise ValueError("rnn_layers and dense_layers must both be at least 1")

    # A default optimizer written in the signature would be created once and
    # shared by every model built with this function; create one per model.
    if optimizer is None:
        optimizer = Adam(learning_rate = 0.001, beta_1=0.9, beta_2=0.999)

    encoder_input = Input(shape=(history_size, history_dim), name='input_encoder')

    # encoder module: every layer except the last returns the full sequence
    encoder_output = encoder_input
    for i in range(rnn_layers - 1):
        encoder_output = LSTM(rnn_neurons, return_sequences=True, name=f"encoder_{i+1}")(encoder_output)
    # the final states are returned by the layer but not used further
    encoder_output, _state_h, _state_c = LSTM(rnn_neurons, return_state=True, name='encoder_last')(encoder_output)

    # dense module
    # NOTE(review): no activation is passed to the hidden Dense layers -
    # confirm that hidden layers without a non-linearity are intended.
    dense_output = encoder_output
    for i in range(dense_layers):
        dense_output = Dense(dense_neurons, name=f'dense_{i+1}')(dense_output)
    dense_output = Dense(class_num, activation='sigmoid', name='dense_last')(dense_output)

    # model compile
    model = Model(encoder_input, dense_output)
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    return model

In [11]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from seaborn import heatmap
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def scores(y_real, y_pred, rounding=4):
    """Return accuracy, precision, recall and F1 of the positive class in percent.

    Parameters
    ----------
    y_real, y_pred : array-like of 0/1 labels
        True and predicted labels.
    rounding : int
        Number of decimals kept on each score before it is multiplied by 100.

    Returns
    -------
    (accuracy, precision, recall, f1)
        precision, recall and f1 refer to label 1.
    """
    # labels=[0, 1] keeps an entry for both classes in the per-class arrays
    # even when one class is absent from the data, so index 1 is always the
    # positive class. zero_division=0 reports an undefined score as 0.
    per_class = dict(labels=[0, 1], average=None, zero_division=0)

    accuracy  = 100*np.array(accuracy_score(y_real, y_pred)).round(rounding)
    precision = 100*np.array(precision_score(y_real, y_pred, **per_class)).round(rounding)[1]
    recall    = 100*np.array(recall_score(y_real, y_pred, **per_class)).round(rounding)[1]
    f1        = 100*np.array(f1_score(y_real, y_pred, **per_class)).round(rounding)[1]

    return accuracy, precision, recall, f1

In [12]:
## hyper-parameter optimization
model_type = 'RNN_without_window'

# Fraction of the (up-sampled) training data that model.fit holds out for validation.
valid_size = test_size/(1-test_size)
patience = 30
monitor = 'val_loss'
early_stopping_cb = EarlyStopping(patience=patience, restore_best_weights=True, monitor=monitor)

# Search space; one value per entry is drawn at random for every trial.
parameter_config = {
    "rnn_layers" : range(1,5),
    "rnn_neurons" : [16, 32, 64, 128, 256],
    "dnn_layers" : range(1,11),
    "dnn_neurons" : [16, 32, 64, 128, 256],
}

method = "random"
counts = 30   # number of random configurations to try
metrics = ['time', 'rnn_layers', 'rnn_neurons', 'dnn_layers', 'dnn_neurons', 
           'loss', 'val_loss', 'test_loss', 'accuracy', 'precision', 'recall', 'f1']

# Only the first train/test split is used for the search.
train_x = upsample_x_list[0]
train_y = upsample_y_list[0]
test_x = test_x_list[0]
test_y = test_y_list[0]

# Make sure the output folders exist before a long training run tries to
# write into them.
os.makedirs('./model', exist_ok=True)
os.makedirs('./score', exist_ok=True)

hpo_result = pd.DataFrame([], columns=metrics)
for i in range(counts):
    # Seeding with the trial number makes the drawn configuration repeatable.
    random.seed(i+1)
    rnn_layers = random.choice(parameter_config["rnn_layers"])
    rnn_neurons = random.choice(parameter_config["rnn_neurons"])
    dnn_layers = random.choice(parameter_config["dnn_layers"])
    dnn_neurons = random.choice(parameter_config["dnn_neurons"])
    print(f"random, {i+1} of {counts}: {rnn_layers} layers, {rnn_neurons} neurons, {dnn_layers} layers, {dnn_neurons} neurons")
    
    model_name = f'{version}_{model_type}_hpo_{i+1}of{counts}'
    save_path  = f'./model/{model_name}.h5'
    score_path = f"./score/{model_name}.csv"
    
    history_size = train_x.shape[1]
    history_dim = train_x.shape[2]
    y_dim = train_y.shape[1]
    
    # Seed TensorFlow BEFORE the model is built: the initial weights are
    # drawn while the layers are created, so seeding afterwards would leave
    # the initialisation uncontrolled.
    tf.random.set_seed(i+1)
    model = ClassifierLSTM(history_size, history_dim, y_dim,
                           rnn_layers = rnn_layers, rnn_neurons = rnn_neurons,
                           dense_layers = dnn_layers, dense_neurons = dnn_neurons) 

    # Retrain when forced, or when either cached artefact is missing; the
    # score file is what gets read back, so it must exist as well.
    needs_training = update or not (exists(save_path) and exists(score_path))
    if needs_training:
        # NOTE(review): validation_split is applied to the up-sampled set, so
        # copies of one positive site can end up in both the fitted and the
        # held-out part - val_loss may be optimistic; confirm this is acceptable.
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=0,
                            epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
        time_end = time.time()
        time_elapse = round((time_end - time_start)/60, 3)   # minutes
        
        model.save_weights(save_path)
        print(f"model is saved to: {save_path}")
        
        # Report the losses of the epoch with the best monitored value.
        idx = np.array(history.history[monitor]).argmin()
        val_loss = history.history['val_loss'][idx]
        loss = history.history['loss'][idx]
        test_loss = model.evaluate(test_x, test_y, verbose=0)[0]
        prediction = model.predict(test_x, verbose=0)
        prediction = prediction.round(0).astype(int)   # threshold at 0.5
        y_real = test_y
        y_pred = prediction
        accuracy, precision, recall, f1 = scores(y_real, y_pred)
        scores_df = pd.DataFrame([[time_elapse, rnn_layers, rnn_neurons, dnn_layers, dnn_neurons, 
                                   loss, val_loss, test_loss, accuracy, precision, recall, f1]], 
                                  columns=metrics)

        scores_df.to_csv(score_path)
        print(f"history is saved to: {score_path}")

    else:
        scores_df = pd.read_csv(score_path, index_col=0, header=0)
        print(f"history is loaded from: {score_path}")
        
    hpo_result = pd.concat([hpo_result, scores_df], axis=0)

hpo_result = hpo_result.reset_index(drop=True)

random, 1 of 30: 2 layers, 256 neurons, 2 layers, 64 neurons
history is loaded from: ./score/v4_RNN_without_window_hpo_1of30.csv
random, 2 of 30: 1 layers, 16 neurons, 2 layers, 64 neurons
history is loaded from: ./score/v4_RNN_without_window_hpo_2of30.csv
random, 3 of 30: 2 layers, 256 neurons, 9 layers, 32 neurons
history is loaded from: ./score/v4_RNN_without_window_hpo_3of30.csv
random, 4 of 30: 2 layers, 64 neurons, 2 layers, 128 neurons
history is loaded from: ./score/v4_RNN_without_window_hpo_4of30.csv
random, 5 of 30: 3 layers, 64 neurons, 9 layers, 16 neurons
history is loaded from: ./score/v4_RNN_without_window_hpo_5of30.csv
random, 6 of 30: 1 layers, 128 neurons, 5 layers, 16 neurons
history is loaded from: ./score/v4_RNN_without_window_hpo_6of30.csv
random, 7 of 30: 3 layers, 32 neurons, 7 layers, 16 neurons
history is loaded from: ./score/v4_RNN_without_window_hpo_7of30.csv
random, 8 of 30: 2 layers, 64 neurons, 7 layers, 32 neurons
history is loaded from: ./score/v4_RNN_w

In [13]:
# show the HPO result
# The trial with the highest value of the target metric is taken as the best one.
target_metric = 'f1'
metric_values = hpo_result[target_metric]
best_idx = metric_values.argmax()
best_parameters = hpo_result.iloc[best_idx]

print(f'best hyperparamerter: index {best_idx}')
display(best_parameters)

hpo_summary = hpo_result.describe()
display(hpo_summary)

best hyperparamerter: index 26


time             28.641
rnn_layers            4
rnn_neurons          64
dnn_layers            5
dnn_neurons          32
loss           0.002482
val_loss       0.016598
test_loss      0.150824
accuracy          98.32
precision         26.03
recall            18.27
f1                21.47
Name: 26, dtype: object

Unnamed: 0,time,loss,val_loss,test_loss,accuracy,precision,recall,f1
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,18.248633,0.03002,0.040725,0.265071,98.102333,20.167333,13.622333,15.297
std,7.805233,0.125914,0.123697,0.161986,0.453551,9.063924,4.18042,4.270977
min,6.981,0.000457,0.00668,0.106001,97.15,0.0,0.0,0.0
25%,12.4155,0.00134,0.010482,0.156384,97.78,14.0875,11.54,12.8075
50%,16.7435,0.004353,0.015007,0.199327,98.275,20.33,13.46,15.68
75%,22.81575,0.007651,0.023507,0.348742,98.4375,25.7775,16.35,18.4225
max,35.32,0.693187,0.693131,0.688893,98.74,44.0,22.12,21.47


In [None]:
# build model
# The selected row may hold numpy or float values; plain ints are needed both
# for the layer construction and for stable file names.
rnn_layers = int(best_parameters['rnn_layers'])
rnn_neurons = int(best_parameters['rnn_neurons'])
dnn_layers = int(best_parameters['dnn_layers'])
dnn_neurons = int(best_parameters['dnn_neurons'])

os.makedirs('./model', exist_ok=True)

# One model per train/test split, each stored under its split number.
for i, (train_x, train_y) in enumerate(zip(upsample_x_list, upsample_y_list), start=1):
    print(f"{i}th iteration")
    model_name = f'{version}_{model_type}_{rnn_layers}_{rnn_neurons}_{dnn_layers}_{dnn_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    if exists(save_path) and not update:
        print(f"model already exists at: {save_path}")
        continue

    history_size = train_x.shape[1]
    history_dim = train_x.shape[2]
    y_dim = train_y.shape[1]

    # Seed before building so that the weight initialisation is covered too.
    tf.random.set_seed(random_state)
    model = ClassifierLSTM(history_size, history_dim, y_dim,
                           rnn_layers = rnn_layers, rnn_neurons = rnn_neurons,
                           dense_layers = dnn_layers, dense_neurons = dnn_neurons) 

    history = model.fit(train_x, train_y, verbose=0,
                        epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
    model.save_weights(save_path)
    print(f"model is saved to: {save_path}")

# summary() prints the table itself and returns None, so it is not wrapped in
# display(). When every split was already cached, `model` is whatever model
# was built last by an earlier cell.
model.summary()

1th iteration


In [None]:
# evaluate the trained model
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
# The split number comes from enumerate so that each test set is scored with
# the model that was trained on the matching split.
for i, (test_x, test_y) in enumerate(zip(test_x_list, test_y_list), start=1):
    model_name = f'{version}_{model_type}_{rnn_layers}_{rnn_neurons}_{dnn_layers}_{dnn_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    # Take the model dimensions from the data being evaluated instead of
    # relying on values left behind by an earlier cell.
    history_size = test_x.shape[1]
    history_dim = test_x.shape[2]
    y_dim = test_y.shape[1]

    model = ClassifierLSTM(history_size, history_dim, y_dim,
                               rnn_layers = rnn_layers, rnn_neurons = rnn_neurons,
                               dense_layers = dnn_layers, dense_neurons = dnn_neurons) 
    model.load_weights(save_path)
    
    prediction = model.predict(test_x, verbose=0)
    prediction = prediction.round(0).astype(int)   # threshold at 0.5

    y_real = test_y
    y_pred = prediction
    
    # Same metrics (in percent, 4-decimal rounding) as in the HPO cell.
    accuracy, precision, recall, f1 = scores(y_real, y_pred)
    
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    
accuracies = np.array(accuracy_list)
precisions = np.array(precision_list)
recalls = np.array(recall_list)
f1s = np.array(f1_list)

# One row per split; describe() summarises the spread across splits.
results = pd.DataFrame(np.array([accuracies, precisions, recalls, f1s]).T, columns=['accuracy', 'precision', 'recall', 'f1-score'])
results.describe()

## case 2: with window

In [None]:
# Case-2 feature set: two numeric columns plus one-hot encoded categorical
# descriptors. get_dummies leaves the numeric columns first and appends the
# encoded columns after them.
y_label = ['positivity']
x_cts = ['Proline', 'flexibility']
x_cat = [
    'SEQ', 'nS/nT', 'nAli', 'nPos', 'phi_psi', 'SS',
    'side_-1', 'side_1', 'side_2', 'side_3', 'side_4', 'side_5',
]

selected_columns = x_cts + x_cat
data_x = pd.get_dummies(dataset[selected_columns], columns=x_cat)
data_y = dataset[y_label]

for frame in (data_x, data_y):
    print(frame.shape)

print("\nx columns:")
display(pd.Series(data_x.columns))

In [None]:
# Build one fixed-length window of per-residue features around every Ser/Thr
# residue. Positions that fall outside the residue's own protein are zero-padded.
input_data = data_x
output_data = data_y

# Convert once to plain arrays instead of slicing the DataFrames inside the loop.
feature_matrix = input_data.to_numpy(dtype=float)
label_matrix = output_data.to_numpy()
n_features = feature_matrix.shape[1]
n_rows = 2 * window_size + 1   # total number of rows in every window

protein_list = list(dataset.protein.unique())
rnn_input = []
rnn_output = []
for name in protein_list:
    data = dataset[dataset['protein']==name]
    # Row labels are used as positions below, which relies on `dataset`
    # keeping its default 0..n-1 index with each protein stored contiguously.
    low_bound = data.index[0]
    up_bound  = data.index[-1]
    ST_idx = np.where((data['SEQ']=='S')|(data['SEQ']=='T'))[0] + low_bound

    for idx in ST_idx:
        start_idx = idx - window_size
        end_idx   = idx + window_size + 1

        # Clip the window to this protein on BOTH sides. A protein shorter
        # than the window overruns at the start and the end at once; padding
        # only one side would pull in rows of the neighbouring protein.
        first_row = max(start_idx, low_bound)
        last_row  = min(end_idx, up_bound + 1)
        offset    = first_row - start_idx

        temp = np.zeros((n_rows, n_features))
        temp[offset:offset + (last_row - first_row)] = feature_matrix[first_row:last_row]

        rnn_input.append(temp)
        rnn_output.append(label_matrix[idx])

rnn_input = np.array(rnn_input)
rnn_output = np.array(rnn_output)

In [None]:
### split data into train/test dataset ###
split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
arr_x, arr_y = rnn_input, rnn_output # windowed feature / label arrays built above

# The numeric features occupy the first len(x_cts) columns of the last axis.
n_cts = len(x_cts)

train_idx_list, train_x_list, train_y_list, test_idx_list, test_x_list, test_y_list = [], [], [], [], [], []
for i, (train_index, test_index) in enumerate(split.split(arr_x, arr_y), start=1):
    # Indexing with an index array copies, so the in-place scaling below
    # never modifies arr_x itself.
    train_x = arr_x[train_index]
    train_y = arr_y[train_index]
    test_x = arr_x[test_index]
    test_y = arr_y[test_index]

    train_cts = train_x[:,:,:n_cts]
    test_cts  = test_x[:,:,:n_cts]

    # Min/max are taken over samples and window positions of the TRAINING
    # data only; the test data is scaled with the same statistics, so test
    # values may fall slightly outside [0, 1].
    # NOTE(review): zero-padded window rows take part in these statistics and
    # are rescaled as well - confirm this is intended.
    x_min = train_cts.min(0).min(0)
    x_max = train_cts.max(0).max(0)

    # A feature that is constant in the training data has zero range;
    # divide by 1 there instead of producing NaN/inf.
    x_range = np.where(x_max > x_min, x_max - x_min, 1.0)

    train_x[:,:,:n_cts] = (train_cts-x_min)/x_range
    test_x[:,:,:n_cts] = (test_cts-x_min)/x_range

    print(f"{i}th iteration")
    print("train:", train_x.shape, train_y.shape, "check scale:", train_x.min(), train_x.max())
    print("test: ", test_x.shape, test_y.shape, "check scale:", test_x.min(), test_x.max())

    train_idx_list.append(train_index)
    train_x_list.append(train_x)
    train_y_list.append(train_y)

    test_idx_list.append(test_index)
    test_x_list.append(test_x)
    test_y_list.append(test_y)

In [None]:
## upsampling dataset 
# For every split, positives are drawn with replacement until they match the
# number of negatives, then the balanced set is shuffled. Both steps are
# re-seeded with random_state on each split.
upsample_x_list, upsample_y_list = [], []
for train_x, train_y in zip(train_x_list, train_y_list):
    index_neg = np.where(train_y == 0)[0]
    index_pos = np.where(train_y == 1)[0]

    # get samples from positive sites as much as the number of negative sites
    random.seed(random_state)
    up_index = []
    for _ in range(len(index_neg)):
        up_index.append(random.choice(index_pos))

    # Up-sampled positives first, untouched negatives second.
    sample_x = np.concatenate([train_x[up_index], train_x[index_neg]], axis=0)
    sample_y = np.concatenate([train_y[up_index], train_y[index_neg]], axis=0)

    # One shared permutation keeps features and labels aligned.
    np.random.seed(random_state)
    shuffle_index = np.arange(len(sample_x))
    np.random.shuffle(shuffle_index)
    sample_x, sample_y = sample_x[shuffle_index], sample_y[shuffle_index]

    upsample_x_list.append(sample_x)
    upsample_y_list.append(sample_y)

# Shapes of the last split only.
print("up-sampled train dataset:", sample_x.shape, sample_y.shape)
print("test dataset:", test_x.shape, test_y.shape)

In [None]:
## hyper-parameter optimization
# This case needs its own tag: model and score file names are derived from
# model_type, and reusing the tag of the previous case would make this cell
# load that case's cached results instead of training on the new features.
model_type = 'RNN_with_window'

# Only the first train/test split is used for the search.
train_x = upsample_x_list[0]
train_y = upsample_y_list[0]
test_x = test_x_list[0]
test_y = test_y_list[0]

# Make sure the output folders exist before a long training run tries to
# write into them.
os.makedirs('./model', exist_ok=True)
os.makedirs('./score', exist_ok=True)

hpo_result = pd.DataFrame([], columns=metrics)
for i in range(counts):
    # Seeding with the trial number makes the drawn configuration repeatable.
    random.seed(i+1)
    rnn_layers = random.choice(parameter_config["rnn_layers"])
    rnn_neurons = random.choice(parameter_config["rnn_neurons"])
    dnn_layers = random.choice(parameter_config["dnn_layers"])
    dnn_neurons = random.choice(parameter_config["dnn_neurons"])
    print(f"random, {i+1} of {counts}: {rnn_layers} layers, {rnn_neurons} neurons, {dnn_layers} layers, {dnn_neurons} neurons")
    
    model_name = f'{version}_{model_type}_hpo_{i+1}of{counts}'
    save_path  = f'./model/{model_name}.h5'
    score_path = f"./score/{model_name}.csv"
    
    history_size = train_x.shape[1]
    history_dim = train_x.shape[2]
    y_dim = train_y.shape[1]
    
    # Seed TensorFlow BEFORE the model is built: the initial weights are
    # drawn while the layers are created, so seeding afterwards would leave
    # the initialisation uncontrolled.
    tf.random.set_seed(i+1)
    model = ClassifierLSTM(history_size, history_dim, y_dim,
                           rnn_layers = rnn_layers, rnn_neurons = rnn_neurons,
                           dense_layers = dnn_layers, dense_neurons = dnn_neurons) 

    # Retrain when forced, or when either cached artefact is missing; the
    # score file is what gets read back, so it must exist as well.
    needs_training = update or not (exists(save_path) and exists(score_path))
    if needs_training:
        # NOTE(review): validation_split is applied to the up-sampled set, so
        # copies of one positive site can end up in both the fitted and the
        # held-out part - val_loss may be optimistic; confirm this is acceptable.
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=0,
                            epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
        time_end = time.time()
        time_elapse = round((time_end - time_start)/60, 3)   # minutes
        
        model.save_weights(save_path)
        print(f"model is saved to: {save_path}")
        
        # Report the losses of the epoch with the best monitored value.
        idx = np.array(history.history[monitor]).argmin()
        val_loss = history.history['val_loss'][idx]
        loss = history.history['loss'][idx]
        test_loss = model.evaluate(test_x, test_y, verbose=0)[0]
        prediction = model.predict(test_x, verbose=0)
        prediction = prediction.round(0).astype(int)   # threshold at 0.5
        y_real = test_y
        y_pred = prediction
        accuracy, precision, recall, f1 = scores(y_real, y_pred)
        scores_df = pd.DataFrame([[time_elapse, rnn_layers, rnn_neurons, dnn_layers, dnn_neurons, 
                                   loss, val_loss, test_loss, accuracy, precision, recall, f1]], 
                                  columns=metrics)

        scores_df.to_csv(score_path)
        print(f"history is saved to: {score_path}")

    else:
        scores_df = pd.read_csv(score_path, index_col=0, header=0)
        print(f"history is loaded from: {score_path}")
        
    hpo_result = pd.concat([hpo_result, scores_df], axis=0)

hpo_result = hpo_result.reset_index(drop=True)

In [None]:
# show the HPO result
# The trial with the highest value of the target metric is taken as the best one.
target_metric = 'f1'
metric_values = hpo_result[target_metric]
best_idx = metric_values.argmax()
best_parameters = hpo_result.iloc[best_idx]

print(f'best hyperparamerter: index {best_idx}')
display(best_parameters)

hpo_summary = hpo_result.describe()
display(hpo_summary)

In [None]:
# build model
# The selected row may hold numpy or float values; plain ints are needed both
# for the layer construction and for stable file names.
rnn_layers = int(best_parameters['rnn_layers'])
rnn_neurons = int(best_parameters['rnn_neurons'])
dnn_layers = int(best_parameters['dnn_layers'])
dnn_neurons = int(best_parameters['dnn_neurons'])

os.makedirs('./model', exist_ok=True)

# One model per train/test split, each stored under its split number.
for i, (train_x, train_y) in enumerate(zip(upsample_x_list, upsample_y_list), start=1):
    print(f"{i}th iteration")
    model_name = f'{version}_{model_type}_{rnn_layers}_{rnn_neurons}_{dnn_layers}_{dnn_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    if exists(save_path) and not update:
        print(f"model already exists at: {save_path}")
        continue

    history_size = train_x.shape[1]
    history_dim = train_x.shape[2]
    y_dim = train_y.shape[1]

    # Seed before building so that the weight initialisation is covered too.
    tf.random.set_seed(random_state)
    model = ClassifierLSTM(history_size, history_dim, y_dim,
                           rnn_layers = rnn_layers, rnn_neurons = rnn_neurons,
                           dense_layers = dnn_layers, dense_neurons = dnn_neurons) 

    history = model.fit(train_x, train_y, verbose=0,
                        epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
    model.save_weights(save_path)
    print(f"model is saved to: {save_path}")

# summary() prints the table itself and returns None, so it is not wrapped in
# display(). When every split was already cached, `model` is whatever model
# was built last by an earlier cell.
model.summary()

In [None]:
# evaluate the trained model
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
# The split number comes from enumerate so that each test set is scored with
# the model that was trained on the matching split.
for i, (test_x, test_y) in enumerate(zip(test_x_list, test_y_list), start=1):
    model_name = f'{version}_{model_type}_{rnn_layers}_{rnn_neurons}_{dnn_layers}_{dnn_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    # Take the model dimensions from the data being evaluated instead of
    # relying on values left behind by an earlier cell.
    history_size = test_x.shape[1]
    history_dim = test_x.shape[2]
    y_dim = test_y.shape[1]

    model = ClassifierLSTM(history_size, history_dim, y_dim,
                               rnn_layers = rnn_layers, rnn_neurons = rnn_neurons,
                               dense_layers = dnn_layers, dense_neurons = dnn_neurons) 
    model.load_weights(save_path)
    
    prediction = model.predict(test_x, verbose=0)
    prediction = prediction.round(0).astype(int)   # threshold at 0.5

    y_real = test_y
    y_pred = prediction
    
    # Same metrics (in percent, 4-decimal rounding) as in the HPO cell.
    accuracy, precision, recall, f1 = scores(y_real, y_pred)
    
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    
accuracies = np.array(accuracy_list)
precisions = np.array(precision_list)
recalls = np.array(recall_list)
f1s = np.array(f1_list)

# One row per split; describe() summarises the spread across splits.
results = pd.DataFrame(np.array([accuracies, precisions, recalls, f1s]).T, columns=['accuracy', 'precision', 'recall', 'f1-score'])
results.describe()