## header

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
from os.path import exists

project_path = '/content/drive/MyDrive/Gproject/MIT_glyco'
os.chdir(project_path)

os.getcwd() # current working directory

Mounted at /content/drive


'/content/drive/MyDrive/Gproject/MIT_glyco'

In [2]:
import pandas as pd
import numpy as np

import time

version = 'v4'
update = False

In [3]:
load_name = "v4_data_all_sites.csv"
dataset = pd.read_csv(load_name)

ST_dataset = dataset[(dataset['SEQ']=='S') | (dataset['SEQ']=='T')].reset_index(drop=True)
ST_positive = ST_dataset[ST_dataset['positivity']==1]
ST_negative = ST_dataset[ST_dataset['positivity']==0]

print("total number of proteins:      ", len(ST_dataset.protein.unique()))
print("total number of samples:       ", len(ST_dataset))
print("total number of positive sites:", len(ST_positive))
print("total number of negative sites:", len(ST_negative))
display(ST_dataset)

total number of proteins:       272
total number of samples:        41264
total number of positive sites: 521
total number of negative sites: 40743


Unnamed: 0,#,SEQ,SS,ASA,Phi,Psi,Theta(i-1=>i+1),Tau(i-2=>i+2),HSE_alpha_up,HSE_alpha_down,...,side_3,side_4,side_5,nAli,nPos,nS/nT,Proline,phi_psi,positivity,protein
0,2,T,C,103.3,-102.0,132.1,117.6,-150.0,3.8,13.9,...,cycle,small,pro,0,0,3,0,alpha,0,A2ABU4
1,6,S,C,60.0,-87.4,138.5,115.2,-125.9,7.8,16.7,...,small,very_small,gly,2,0,4,1,alpha,0,A2ABU4
2,9,S,C,56.1,-89.9,142.4,116.8,121.2,8.2,13.9,...,normal,pro,normal,1,0,5,0,alpha,0,A2ABU4
3,16,S,C,75.5,-82.7,22.5,104.9,-107.4,5.9,14.2,...,very_small,normal,very_small,2,0,4,0,other,0,A2ABU4
4,18,T,C,78.2,-96.3,112.1,112.0,84.6,5.8,13.7,...,very_small,cycle,long,1,0,3,0,alpha,0,A2ABU4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41259,2876,T,C,76.2,-95.6,138.6,117.8,-135.9,8.8,13.9,...,very_small,long,small,2,0,3,0,alpha,0,Q9Y520
41260,2881,T,C,58.2,-99.5,90.7,111.0,-161.6,11.6,16.2,...,small,long,pro,2,0,4,0,other,0,Q9Y520
41261,2891,T,C,80.3,-102.2,131.1,116.5,-164.4,7.0,14.3,...,small,long,small,2,1,4,0,alpha,0,Q9Y520
41262,2894,T,C,106.7,-91.4,100.3,109.1,-163.0,3.1,9.1,...,,,,0,0,3,0,alpha,0,Q9Y520


In [4]:
display(ST_dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41264 entries, 0 to 41263
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   #                41264 non-null  int64  
 1   SEQ              41264 non-null  object 
 2   SS               41264 non-null  object 
 3   ASA              41264 non-null  float64
 4   Phi              41264 non-null  float64
 5   Psi              41264 non-null  float64
 6   Theta(i-1=>i+1)  41264 non-null  float64
 7   Tau(i-2=>i+2)    41264 non-null  float64
 8   HSE_alpha_up     41264 non-null  float64
 9   HSE_alpha_down   41264 non-null  float64
 10  P(C)             41264 non-null  float64
 11  P(H)             41264 non-null  float64
 12  P(E)             41264 non-null  float64
 13  flexibility      41264 non-null  float64
 14  side_-1          41264 non-null  object 
 15  side_1           41264 non-null  object 
 16  side_2           41264 non-null  object 
 17  side_3      

None

# Case 1: without window

In [5]:
x_cat = ['SEQ', 'SS']
x_cts = ['ASA', 'Phi', 'Psi', 'Theta(i-1=>i+1)', 'Tau(i-2=>i+2)', 'HSE_alpha_up', 'HSE_alpha_down', 
         'P(C)', 'P(H)', 'P(E)', 'flexibility']
y_label = ['positivity']

data_x = pd.get_dummies(ST_dataset[x_cts+x_cat], columns=x_cat)
data_y = ST_dataset[y_label]

print(data_x.shape)
print(data_y.shape)

print("\nx columns:")
display(pd.Series(data_x.columns))

(41264, 16)
(41264, 1)

x columns:


0                 ASA
1                 Phi
2                 Psi
3     Theta(i-1=>i+1)
4       Tau(i-2=>i+2)
5        HSE_alpha_up
6      HSE_alpha_down
7                P(C)
8                P(H)
9                P(E)
10        flexibility
11              SEQ_S
12              SEQ_T
13               SS_C
14               SS_E
15               SS_H
dtype: object

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_split(data_x, data_y, test_size=0.2, n_splits=1, random_state=1, dtype='arr'):

    split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    
    train_x, train_y, test_x, test_y  = [], [], [], []
    if dtype=='df':
        data_x = data_x.values
        data_y = data_y.values
    
    for train_index, test_index in split.split(data_x, data_y):
        train_x.append(data_x[train_index])
        train_y.append(data_y[train_index])

        test_x.append(data_x[test_index])
        test_y.append(data_y[test_index])
        
    print("train/test dataset")
    print("train:", train_x[0].shape, train_y[0].shape)
    print("test:", test_x[0].shape, test_y[0].shape)
    
    if n_splits == 1:
        return train_x[0],train_y[0], test_x[0], test_y[0]
    else:
        return train_x, train_y, test_x, test_y

In [7]:
### split data into train/test dataset ###
test_size = 0.2
n_splits = 10
random_state = 1

split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
arr_x, arr_y = data_x.values, data_y.values # convert dataframe to nd-array 

i=1
train_idx_list, train_x_list, train_y_list, test_idx_list, test_x_list, test_y_list = [], [], [], [], [], []
for train_index, test_index in split.split(arr_x, arr_y):
    train_x = arr_x[train_index]
    train_y = arr_y[train_index]
    test_x = arr_x[test_index]
    test_y = arr_y[test_index]
    
    train_cts = train_x[:,:len(x_cts)]
    test_cts  = test_x[:,:len(x_cts)]
    
    x_min = train_cts.min(axis=0)
    x_max = train_cts.max(axis=0)
    
    train_x[:,:len(x_cts)] = (train_cts-x_min)/(x_max-x_min)
    test_x[:,:len(x_cts)] = (test_cts-x_min)/(x_max-x_min)
    
    print(f"{i}th iteration")
    print("train:", train_x.shape, train_y.shape, "check scale:", train_x.min(), train_x.max())
    print("test: ", test_x.shape, test_y.shape, "check scale:", test_x.min(), test_x.max())
    
    train_idx_list.append(train_index)
    train_x_list.append(train_x)
    train_y_list.append(train_y)
    
    test_idx_list.append(test_index)
    test_x_list.append(test_x)
    test_y_list.append(test_y)
    
    i += 1

1th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.012024048096192367 1.124248496993988
2th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: 0.0 1.0173697270471465
3th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.01669449081803005 1.0068027210884352
4th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.010695187165775383 1.0
5th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.004077471967380227 1.0
6th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: 0.0 1.1227722772277227
7th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.01839464882943134 1.0068027210884352
8th iteration
train: (33011, 16) (33011,

In [8]:
## upsampling dataset 
import random
random_state = random_state

upsample_x_list, upsample_y_list = [], []
for train_x, train_y in zip(train_x_list, train_y_list):
    index_pos = np.where(train_y == 1)[0]
    index_neg = np.where(train_y == 0)[0]

    random.seed(random_state)
    up_index = [random.choice(index_pos) for _ in range(len(index_neg))] # get samples from positive sites as much as the number of negative sites

    upsample_pos_x = train_x[up_index]
    upsample_pos_y = train_y[up_index]
    sample_neg_x = train_x[index_neg]
    sample_neg_y = train_y[index_neg]

    sample_x = np.concatenate([upsample_pos_x, sample_neg_x], axis=0)
    sample_y = np.concatenate([upsample_pos_y, sample_neg_y], axis=0)

    shuffle_index = np.arange(len(sample_x))
    np.random.seed(random_state)
    np.random.shuffle(shuffle_index)
    sample_x = sample_x[shuffle_index]
    sample_y = sample_y[shuffle_index]
    
    upsample_x_list.append(sample_x)
    upsample_y_list.append(sample_y)

print("up-sampled train dataset:", sample_x.shape, sample_y.shape)
print("test dataset:", test_x.shape, test_y.shape)

up-sampled train dataset: (65188, 16) (65188, 1)
test dataset: (8253, 16) (8253, 1)


In [9]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

def basicMLP(x_dim, y_dim, n_layers, n_neurons):
    mlp_input = Input(shape=(x_dim,), name='dense_input')
    
    # MLP module
    for i in range(n_layers):
        if i==0:
            dense_output = Dense(n_neurons, name=f"dense_{i+1}")(mlp_input)
        else: 
            dense_output = Dense(n_neurons, name=f"dense_{i+1}")(dense_output)
    mlp_output = Dense(y_dim, name=f"dense_output", activation='sigmoid')(dense_output)
    
    model = Model(mlp_input, mlp_output)
    optimizer = keras.optimizers.Adam(learning_rate = 0.001, beta_1=0.9, beta_2=0.999)
    model.compile(loss='binary_crossentropy',optimizer = optimizer, metrics=['accuracy'])
    
    return model

In [10]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from seaborn import heatmap
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def scores(y_real, y_pred, rounding=4):
    
    accuracy  = 100*np.array(accuracy_score(y_real, y_pred)).round(rounding)
    precision = 100*np.array(precision_score(y_real, y_pred, average=None)).round(rounding)[1]
    recall    = 100*np.array(recall_score(y_real, y_pred, average=None)).round(rounding)[1]
    f1        = 100*np.array(f1_score(y_real, y_pred, average=None)).round(rounding)[1]
    
    return accuracy, precision, recall, f1

In [11]:
## hyper-parameter optimization
model_type = 'upsampled_MLP_without_window'

valid_size = test_size/(1-test_size)
patience = 30
monitor = 'val_loss'
random_state = random_state
early_stopping_cb = EarlyStopping(patience=patience, restore_best_weights=True, monitor=monitor)

parameter_config = {
    "n_layers" : range(1,11),
    "n_neurons" : [16, 32, 64, 128, 256]
}

method = "random"
counts = 30
metrics = ['time', 'n_layers', 'n_neurons', 'loss', 'val_loss', 'test_loss', 'accuracy', 'precision', 'recall', 'f1']


train_x = upsample_x_list[0]
train_y = upsample_y_list[0]
test_x = test_x_list[0]
test_y = test_y_list[0]

hpo_result = pd.DataFrame([], columns=metrics)
for i in range(counts):
    random.seed(i+1)
    n_layers = random.choice(parameter_config["n_layers"])
    n_neurons = random.choice(parameter_config["n_neurons"])
    print(f"random, {i+1} of {counts}: {n_layers} layers, {n_neurons} neurons")
    
    model_name = f'{version}_{model_type}_hpo_{i+1}of{counts}'
    save_path  = f'./model/{model_name}.h5'
    score_path = f"./score/{model_name}.csv"
    
    x_dim = train_x.shape[1]
    y_dim = train_y.shape[1]
    model = basicMLP(x_dim, y_dim, n_layers, n_neurons)
    
    if not exists(save_path) or update:
        tf.random.set_seed(i+1)
        
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=0,
                            epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
        time_end = time.time()
        time_elapse = round((time_end - time_start)/60, 3)
        
        model.save_weights(save_path)
        print(f"model is saved to: {save_path}")
        
        idx = np.array(history.history[monitor]).argmin()
        val_loss = history.history['val_loss'][idx]
        loss = history.history['loss'][idx]
        test_loss = model.evaluate(test_x, test_y, verbose=0)[0]
        prediction = model.predict(test_x, verbose=0)
        prediction = prediction.round(0).astype(int)
        y_real = test_y
        y_pred = prediction
        accuracy, precision, recall, f1 = scores(y_real, y_pred)
        scores_df = pd.DataFrame([[time_elapse, n_layers, n_neurons, loss, val_loss, test_loss, accuracy, precision, recall, f1]], 
                                  columns=metrics)

        scores_df.to_csv(score_path)
        print(f"history is saved to: {score_path}")

    else:
        scores_df = pd.read_csv(score_path, index_col=0, header=0)
        print(f"history is loaded from: {score_path}")
        
    hpo_result = pd.concat([hpo_result, scores_df], axis=0)

hpo_result = hpo_result.reset_index(drop=True)

random, 1 of 30: 3 layers, 256 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_1of30.csv
random, 2 of 30: 1 layers, 16 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_2of30.csv
random, 3 of 30: 4 layers, 256 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_3of30.csv
random, 4 of 30: 4 layers, 64 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_4of30.csv
random, 5 of 30: 10 layers, 64 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_5of30.csv
random, 6 of 30: 10 layers, 16 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_6of30.csv
random, 7 of 30: 6 layers, 32 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_7of30.csv
random, 8 of 30: 4 layers, 64 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_8of30.csv
random, 9 of 30: 8 layers, 256 neurons
history is loaded from: ./sco

In [12]:
# show the HPO result
target_metric = 'f1'
best_idx = hpo_result[target_metric].argmax()
best_parameters = hpo_result.iloc[best_idx]
print(f'best hyperparamerter: index {best_idx}')
display(best_parameters)

display(hpo_result.describe())

best hyperparamerter: index 10


time           16.463
n_layers            8
n_neurons         256
loss         0.661889
val_loss     0.659731
test_loss    0.666503
accuracy        58.85
precision        1.73
recall          56.73
f1               3.36
Name: 10, dtype: object

Unnamed: 0,time,loss,val_loss,test_loss,accuracy,precision,recall,f1
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,7.762433,0.661457,0.659707,0.654345,60.717,1.655667,51.665667,3.207667
std,3.76369,0.00055,0.000134,0.007295,1.175037,0.030926,1.986929,0.060211
min,3.246,0.660834,0.659497,0.639914,58.49,1.6,48.08,3.09
25%,5.27075,0.66108,0.659616,0.64959,60.07,1.6325,50.24,3.165
50%,7.0975,0.661408,0.659681,0.655113,60.73,1.66,51.44,3.21
75%,8.97,0.661626,0.659782,0.657686,61.5275,1.67,52.88,3.2375
max,19.578,0.663504,0.660098,0.668332,63.12,1.73,56.73,3.36


In [13]:
# train the models for cross validation
n_layers = best_parameters['n_layers']
n_neurons = best_parameters['n_neurons']

i=1
for train_x, train_y in zip(upsample_x_list, upsample_y_list):
    print(f"{i}th iteration")
    model_name = f'{version}_{model_type}_{n_layers}_{n_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    if not exists(save_path) or update:
        x_dim = train_x.shape[1]
        y_dim = train_y.shape[1]
        model = basicMLP(x_dim, y_dim, n_layers, n_neurons)

        if not exists(save_path) or update:
            tf.random.set_seed(random_state)
            history = model.fit(train_x, train_y, verbose=0,
                                epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
            model.save_weights(save_path)
            print(f"model is saved to: {save_path}")
    else:
        print(f"model already exists at: {save_path}")
    i += 1
display(model.summary())

1th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_1of10.h5
2th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_2of10.h5
3th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_3of10.h5
4th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_4of10.h5
5th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_5of10.h5
6th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_6of10.h5
7th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_7of10.h5
8th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_8of10.h5
9th iteration
model already exists at: ./model/v4_upsampled_MLP_without_window_8_256_cv_9of10.h5
10th iteration
model is saved to: ./model/v4_upsampled_MLP_without_window_8_256_cv_10of10.h5
Model: "model_30"
________________

None

In [14]:
# get the result of cross validation
i=1
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
for test_x, test_y in zip(test_x_list, test_y_list):
    model_name = f'{version}_{model_type}_{n_layers}_{n_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    model = basicMLP(x_dim, y_dim, n_layers, n_neurons)
    model.load_weights(save_path)
    
    prediction = model.predict(test_x, verbose=0)
    prediction = prediction.round(0).astype(int)

    y_real = test_y
    y_pred = prediction
    
    accuracy  = 100*np.array(accuracy_score(y_real, y_pred)).round(4)
    precision = 100*np.array(precision_score(y_real, y_pred, average=None)).round(4)[1]
    recall    = 100*np.array(recall_score(y_real, y_pred, average=None)).round(4)[1]
    f1        = 100*np.array(f1_score(y_real, y_pred, average=None)).round(4)[1]
    
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    
accuracies = np.array(accuracy_list)
precisions = np.array(precision_list)
recalls = np.array(recall_list)
f1s = np.array(f1_list)

results = pd.DataFrame(np.array([accuracies, precisions, recalls, f1s]).T, columns=['accuracy', 'precision', 'recall', 'f1-score'])
results.describe()

Unnamed: 0,accuracy,precision,recall,f1-score
count,10.0,10.0,10.0,10.0
mean,59.675,1.831,58.846,3.551
std,1.566995,0.148133,4.505886,0.287613
min,57.63,1.63,52.88,3.15
25%,58.485,1.7225,56.97,3.345
50%,59.62,1.83,58.65,3.555
75%,60.315,1.8925,59.62,3.67
max,63.04,2.1,69.23,4.07


# Case 2: with window

In [15]:
x_cat = ['SEQ', 'nS/nT', 'nAli', 'nPos', 'phi_psi', 'SS', 
         'side_-1', 'side_1', 'side_2', 'side_3','side_4', 'side_5']
x_cts = ['Proline', 'flexibility']
y_label = ['positivity']

data_x = pd.get_dummies(ST_dataset[x_cts+x_cat], columns=x_cat)
data_y = ST_dataset[y_label]

print(data_x.shape)
print(data_y.shape)

print("\nx columns:")
display(pd.Series(data_x.columns))

(41264, 87)
(41264, 1)

x columns:


0               Proline
1           flexibility
2                 SEQ_S
3                 SEQ_T
4               nS/nT_1
            ...        
82          side_5_long
83        side_5_normal
84           side_5_pro
85         side_5_small
86    side_5_very_small
Length: 87, dtype: object

In [16]:
### split data into train/test dataset ###
split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
arr_x, arr_y = data_x.values, data_y.values # convert dataframe to nd-array 

i=1
train_idx_list, train_x_list, train_y_list, test_idx_list, test_x_list, test_y_list = [], [], [], [], [], []
for train_index, test_index in split.split(arr_x, arr_y):
    train_x = arr_x[train_index]
    train_y = arr_y[train_index]
    test_x = arr_x[test_index]
    test_y = arr_y[test_index]
    
    train_cts = train_x[:,:len(x_cts)]
    test_cts  = test_x[:,:len(x_cts)]
    
    x_min = train_cts.min(axis=0)
    x_max = train_cts.max(axis=0)
    
    train_x[:,:len(x_cts)] = (train_cts-x_min)/(x_max-x_min)
    test_x[:,:len(x_cts)] = (test_cts-x_min)/(x_max-x_min)
    
    print(f"{i}th iteration")
    print("train:", train_x.shape, train_y.shape, "check scale:", train_x.min(), train_x.max())
    print("test: ", test_x.shape, test_y.shape, "check scale:", test_x.min(), test_x.max())
    
    train_idx_list.append(train_index)
    train_x_list.append(train_x)
    train_y_list.append(train_y)
    
    test_idx_list.append(test_index)
    test_x_list.append(test_x)
    test_y_list.append(test_y)
    
    i += 1

1th iteration
train: (33011, 87) (33011, 1) check scale: 0.0 1.0
test:  (8253, 87) (8253, 1) check scale: 0.0 1.0
2th iteration
train: (33011, 87) (33011, 1) check scale: 0.0 1.0
test:  (8253, 87) (8253, 1) check scale: 0.0 1.0
3th iteration
train: (33011, 87) (33011, 1) check scale: 0.0 1.0
test:  (8253, 87) (8253, 1) check scale: 0.0 1.0
4th iteration
train: (33011, 87) (33011, 1) check scale: 0.0 1.0
test:  (8253, 87) (8253, 1) check scale: -0.004077471967380227 1.0
5th iteration
train: (33011, 87) (33011, 1) check scale: 0.0 1.0
test:  (8253, 87) (8253, 1) check scale: -0.004077471967380227 1.0
6th iteration
train: (33011, 87) (33011, 1) check scale: 0.0 1.0
test:  (8253, 87) (8253, 1) check scale: 0.0 1.0
7th iteration
train: (33011, 87) (33011, 1) check scale: 0.0 1.0
test:  (8253, 87) (8253, 1) check scale: -0.012332990750256947 1.0
8th iteration
train: (33011, 87) (33011, 1) check scale: 0.0 1.0
test:  (8253, 87) (8253, 1) check scale: 0.0 1.0765027322404372
9th iteration
train

In [17]:
## upsampling dataset 
upsample_x_list, upsample_y_list = [], []
for train_x, train_y in zip(train_x_list, train_y_list):
    index_pos = np.where(train_y == 1)[0]
    index_neg = np.where(train_y == 0)[0]

    random.seed(random_state)
    up_index = [random.choice(index_pos) for _ in range(len(index_neg))] # get samples from positive sites as much as the number of negative sites

    upsample_pos_x = train_x[up_index]
    upsample_pos_y = train_y[up_index]
    sample_neg_x = train_x[index_neg]
    sample_neg_y = train_y[index_neg]

    sample_x = np.concatenate([upsample_pos_x, sample_neg_x], axis=0)
    sample_y = np.concatenate([upsample_pos_y, sample_neg_y], axis=0)

    shuffle_index = np.arange(len(sample_x))
    np.random.seed(random_state)
    np.random.shuffle(shuffle_index)
    sample_x = sample_x[shuffle_index]
    sample_y = sample_y[shuffle_index]
    
    upsample_x_list.append(sample_x)
    upsample_y_list.append(sample_y)

print("up-sampled train dataset:", sample_x.shape, sample_y.shape)
print("test dataset:", test_x.shape, test_y.shape)

up-sampled train dataset: (65188, 87) (65188, 1)
test dataset: (8253, 87) (8253, 1)


In [18]:
## hyper-parameter optimization
model_type = 'upsampled_MLP_original'

train_x = upsample_x_list[0]
train_y = upsample_y_list[0]
test_x = test_x_list[0]
test_y = test_y_list[0]

hpo_result = pd.DataFrame([], columns=metrics)
for i in range(counts):
    random.seed(i+1)
    n_layers = random.choice(parameter_config["n_layers"])
    n_neurons = random.choice(parameter_config["n_neurons"])
    print(f"random, {i+1} of {counts}: {n_layers} layers, {n_neurons} neurons")
    
    model_name = f'{version}_{model_type}_hpo_{i+1}of{counts}'
    save_path  = f'./model/{model_name}.h5'
    score_path = f"./score/{model_name}.csv"
    
    x_dim = train_x.shape[1]
    y_dim = train_y.shape[1]

    model = basicMLP(x_dim, y_dim, n_layers, n_neurons)
    if not exists(save_path) or update:
        tf.random.set_seed(i+1)
        
        time_start = time.time()
         history = model.fit(train_x, train_y, verbose=0,
                            epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
        time_end = time.time()
        time_elapse = round((time_end - time_start)/60, 3)
        
        model.save_weights(save_path)
        print(f"model is saved to: {save_path}")
        
        idx = np.array(history.history[monitor]).argmin()
        val_loss = history.history['val_loss'][idx]
        loss = history.history['loss'][idx]
        test_loss = model.evaluate(test_x, test_y, verbose=0)[0]
        prediction = model.predict(test_x, verbose=0)
        prediction = prediction.round(0).astype(int)
        y_real = test_y
        y_pred = prediction
        accuracy, precision, recall, f1 = scores(y_real, y_pred)
        scores_df = pd.DataFrame([[time_elapse, n_layers, n_neurons, loss, val_loss, test_loss, accuracy, precision, recall, f1]], 
                                  columns=metrics)

        scores_df.to_csv(score_path)
        print(f"history is saved to: {score_path}")

    else:
        scores_df = pd.read_csv(score_path, index_col=0, header=0)
        print(f"history is loaded from: {score_path}")
        
    hpo_result = pd.concat([hpo_result, scores_df], axis=0)

hpo_result = hpo_result.reset_index(drop=True)

random, 1 of 30: 3 layers, 256 neurons
model is saved to: ./model/v4_upsampled_MLP_original_hpo_1of30.h5
history is saved to: ./score/v4_upsampled_MLP_original_hpo_1of30.csv
random, 2 of 30: 1 layers, 16 neurons
model is saved to: ./model/v4_upsampled_MLP_original_hpo_2of30.h5
history is saved to: ./score/v4_upsampled_MLP_original_hpo_2of30.csv
random, 3 of 30: 4 layers, 256 neurons
model is saved to: ./model/v4_upsampled_MLP_original_hpo_3of30.h5
history is saved to: ./score/v4_upsampled_MLP_original_hpo_3of30.csv
random, 4 of 30: 4 layers, 64 neurons
model is saved to: ./model/v4_upsampled_MLP_original_hpo_4of30.h5
history is saved to: ./score/v4_upsampled_MLP_original_hpo_4of30.csv
random, 5 of 30: 10 layers, 64 neurons
model is saved to: ./model/v4_upsampled_MLP_original_hpo_5of30.h5
history is saved to: ./score/v4_upsampled_MLP_original_hpo_5of30.csv
random, 6 of 30: 10 layers, 16 neurons
model is saved to: ./model/v4_upsampled_MLP_original_hpo_6of30.h5
history is saved to: ./scor

In [19]:
# show the HPO result
target_metric = 'f1'
best_idx = hpo_result[target_metric].argmax()
best_parameters = hpo_result.iloc[best_idx]
print(f'best hyperparamerter: index {best_idx}')
display(best_parameters)

display(hpo_result.describe())

best hyperparamerter: index 4


time            3.547
n_layers           10
n_neurons          64
loss         0.601009
val_loss      0.59527
test_loss    0.579639
accuracy        69.66
precision        2.19
recall          52.88
f1               4.21
Name: 4, dtype: object

Unnamed: 0,time,loss,val_loss,test_loss,accuracy,precision,recall,f1
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,7.2842,0.598798,0.594807,0.593279,68.275333,2.112333,53.268333,4.061667
std,4.666805,0.000979,0.000183,0.006746,0.682161,0.041662,0.967364,0.076478
min,2.816,0.597796,0.594545,0.573693,67.27,2.04,50.96,3.93
25%,4.3015,0.598097,0.594689,0.591254,67.825,2.09,52.88,4.0225
50%,5.847,0.598546,0.594762,0.59425,68.175,2.11,52.88,4.05
75%,8.24975,0.599227,0.594916,0.597948,68.6225,2.145,53.85,4.13
max,25.64,0.601957,0.59527,0.603813,70.25,2.19,55.77,4.21


In [20]:
# bulid model
n_layers = best_parameters['n_layers']
n_neurons = best_parameters['n_neurons']

i=1
for train_x, train_y in zip(upsample_x_list, upsample_y_list):
    print(f"{i}th iteration")
    model_name = f'{version}_{model_type}_{n_layers}_{n_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    if not exists(save_path) or update:
        x_dim = train_x.shape[1]
        y_dim = train_y.shape[1]
        model = basicMLP(x_dim, y_dim, n_layers, n_neurons)

        if not exists(save_path) or update:
            tf.random.set_seed(random_state)
            history = model.fit(train_x, train_y, verbose=0,
                                epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
            model.save_weights(save_path)
            print(f"model is saved to: {save_path}")
    else:
        print(f"model already exists at: {save_path}")
    i += 1
display(model.summary())

1th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_1of10.h5
2th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_2of10.h5
3th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_3of10.h5
4th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_4of10.h5
5th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_5of10.h5
6th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_6of10.h5
7th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_7of10.h5
8th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_8of10.h5
9th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_9of10.h5
10th iteration
model is saved to: ./model/v4_upsampled_MLP_original_10_64_cv_10of10.h5
Model: "model_80"
_________________________________________________________________
 Layer (type)                Output Shape              Param #  

None

In [21]:
# evaluate the trained model
i=1
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
for test_x, test_y in zip(test_x_list, test_y_list):
    model_name = f'{version}_{model_type}_{n_layers}_{n_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    model = basicMLP(x_dim, y_dim, n_layers, n_neurons)
    model.load_weights(save_path)
    
    prediction = model.predict(test_x, verbose=0)
    prediction = prediction.round(0).astype(int)

    y_real = test_y
    y_pred = prediction
    
    accuracy  = 100*np.array(accuracy_score(y_real, y_pred)).round(4)
    precision = 100*np.array(precision_score(y_real, y_pred, average=None)).round(4)[1]
    recall    = 100*np.array(recall_score(y_real, y_pred, average=None)).round(4)[1]
    f1        = 100*np.array(f1_score(y_real, y_pred, average=None)).round(4)[1]
    
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    
accuracies = np.array(accuracy_list)
precisions = np.array(precision_list)
recalls = np.array(recall_list)
f1s = np.array(f1_list)

results = pd.DataFrame(np.array([accuracies, precisions, recalls, f1s]).T, columns=['accuracy', 'precision', 'recall', 'f1-score'])
results.describe()

Unnamed: 0,accuracy,precision,recall,f1-score
count,10.0,10.0,10.0,10.0
mean,67.367,2.418,63.173,4.658
std,0.816402,0.23729,5.587247,0.459971
min,66.5,2.1,53.85,4.04
25%,66.73,2.27,60.58,4.37
50%,67.29,2.37,62.98,4.565
75%,67.6175,2.56,66.8275,4.9375
max,69.28,2.83,71.15,5.45
