In [1]:
import pandas as pd
import numpy as np

import time
from os import getcwd 
from os.path import exists

getcwd() # current working directory

version = 'v4'
update = False

In [2]:
load_name = "v4_data_all_sites.csv"
dataset = pd.read_csv(load_name)

ST_dataset = dataset[(dataset['SEQ']=='S') | (dataset['SEQ']=='T')].reset_index(drop=True)
ST_positive = ST_dataset[ST_dataset['positivity']==1]
ST_negative = ST_dataset[ST_dataset['positivity']==0]

print("total number of proteins:      ", len(ST_dataset.protein.unique()))
print("total number of samples:       ", len(ST_dataset))
print("total number of positive sites:", len(ST_positive))
print("total number of negative sites:", len(ST_negative))
display(ST_dataset)

total number of proteins:       272
total number of samples:        41264
total number of positive sites: 521
total number of negative sites: 40743


Unnamed: 0,#,SEQ,SS,ASA,Phi,Psi,Theta(i-1=>i+1),Tau(i-2=>i+2),HSE_alpha_up,HSE_alpha_down,...,side_3,side_4,side_5,nAli,nPos,nS/nT,Proline,phi_psi,positivity,protein
0,2,T,C,103.3,-102.0,132.1,117.6,-150.0,3.8,13.9,...,cycle,small,pro,0,0,3,0,alpha,0,A2ABU4
1,6,S,C,60.0,-87.4,138.5,115.2,-125.9,7.8,16.7,...,small,very_small,gly,2,0,4,1,alpha,0,A2ABU4
2,9,S,C,56.1,-89.9,142.4,116.8,121.2,8.2,13.9,...,normal,pro,normal,1,0,5,0,alpha,0,A2ABU4
3,16,S,C,75.5,-82.7,22.5,104.9,-107.4,5.9,14.2,...,very_small,normal,very_small,2,0,4,0,other,0,A2ABU4
4,18,T,C,78.2,-96.3,112.1,112.0,84.6,5.8,13.7,...,very_small,cycle,long,1,0,3,0,alpha,0,A2ABU4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41259,2876,T,C,76.2,-95.6,138.6,117.8,-135.9,8.8,13.9,...,very_small,long,small,2,0,3,0,alpha,0,Q9Y520
41260,2881,T,C,58.2,-99.5,90.7,111.0,-161.6,11.6,16.2,...,small,long,pro,2,0,4,0,other,0,Q9Y520
41261,2891,T,C,80.3,-102.2,131.1,116.5,-164.4,7.0,14.3,...,small,long,small,2,1,4,0,alpha,0,Q9Y520
41262,2894,T,C,106.7,-91.4,100.3,109.1,-163.0,3.1,9.1,...,,,,0,0,3,0,alpha,0,Q9Y520


In [3]:
display(ST_dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41264 entries, 0 to 41263
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   #                41264 non-null  int64  
 1   SEQ              41264 non-null  object 
 2   SS               41264 non-null  object 
 3   ASA              41264 non-null  float64
 4   Phi              41264 non-null  float64
 5   Psi              41264 non-null  float64
 6   Theta(i-1=>i+1)  41264 non-null  float64
 7   Tau(i-2=>i+2)    41264 non-null  float64
 8   HSE_alpha_up     41264 non-null  float64
 9   HSE_alpha_down   41264 non-null  float64
 10  P(C)             41264 non-null  float64
 11  P(H)             41264 non-null  float64
 12  P(E)             41264 non-null  float64
 13  flexibility      41264 non-null  float64
 14  side_-1          41264 non-null  object 
 15  side_1           41264 non-null  object 
 16  side_2           41264 non-null  object 
 17  side_3      

None

# Case 1: without window

In [4]:
x_cat = ['SEQ', 'SS']
x_cts = ['ASA', 'Phi', 'Psi', 'Theta(i-1=>i+1)', 'Tau(i-2=>i+2)', 'HSE_alpha_up', 'HSE_alpha_down', 
         'P(C)', 'P(H)', 'P(E)', 'flexibility']
y_label = ['positivity']

data_x = pd.get_dummies(ST_dataset[x_cts+x_cat], columns=x_cat)
data_y = ST_dataset[y_label]

print(data_x.shape)
print(data_y.shape)

print("\nx columns:")
display(pd.Series(data_x.columns))

(41264, 16)
(41264, 1)

x columns:


0                 ASA
1                 Phi
2                 Psi
3     Theta(i-1=>i+1)
4       Tau(i-2=>i+2)
5        HSE_alpha_up
6      HSE_alpha_down
7                P(C)
8                P(H)
9                P(E)
10        flexibility
11              SEQ_S
12              SEQ_T
13               SS_C
14               SS_E
15               SS_H
dtype: object

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

def stratified_split(data_x, data_y, test_size=0.2, n_splits=1, random_state=1, dtype='arr'):

    split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    
    train_x, train_y, test_x, test_y  = [], [], [], []
    if dtype=='df':
        data_x = data_x.values
        data_y = data_y.values
    
    for train_index, test_index in split.split(data_x, data_y):
        train_x.append(data_x[train_index])
        train_y.append(data_y[train_index])

        test_x.append(data_x[test_index])
        test_y.append(data_y[test_index])
        
    print("train/test dataset")
    print("train:", train_x[0].shape, train_y[0].shape)
    print("test:", test_x[0].shape, test_y[0].shape)
    
    if n_splits == 1:
        return train_x[0],train_y[0], test_x[0], test_y[0]
    else:
        return train_x, train_y, test_x, test_y

In [6]:
### split data into train/test dataset ###
test_size = 0.2
n_splits = 10
random_state = 1

split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
arr_x, arr_y = data_x.values, data_y.values # convert dataframe to nd-array 

i=1
train_idx_list, train_x_list, train_y_list, test_idx_list, test_x_list, test_y_list = [], [], [], [], [], []
for train_index, test_index in split.split(arr_x, arr_y):
    train_x = arr_x[train_index]
    train_y = arr_y[train_index]
    test_x = arr_x[test_index]
    test_y = arr_y[test_index]
    
    train_cts = train_x[:,:len(x_cts)]
    test_cts  = test_x[:,:len(x_cts)]
    
    x_min = train_cts.min(axis=0)
    x_max = train_cts.max(axis=0)
    
    train_x[:,:len(x_cts)] = (train_cts-x_min)/(x_max-x_min)
    test_x[:,:len(x_cts)] = (test_cts-x_min)/(x_max-x_min)
    
    print(f"{i}th iteration")
    print("train:", train_x.shape, train_y.shape, "check scale:", train_x.min(), train_x.max())
    print("test: ", test_x.shape, test_y.shape, "check scale:", test_x.min(), test_x.max())
    
    train_idx_list.append(train_index)
    train_x_list.append(train_x)
    train_y_list.append(train_y)
    
    test_idx_list.append(test_index)
    test_x_list.append(test_x)
    test_y_list.append(test_y)
    
    i += 1

1th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.012024048096192367 1.124248496993988
2th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: 0.0 1.0173697270471465
3th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.01669449081803005 1.0068027210884352
4th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.010695187165775383 1.0
5th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.004077471967380227 1.0
6th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: 0.0 1.1227722772277227
7th iteration
train: (33011, 16) (33011, 1) check scale: 0.0 1.0
test:  (8253, 16) (8253, 1) check scale: -0.01839464882943134 1.0068027210884352
8th iteration
train: (33011, 16) (33011,

In [7]:
## upsampling dataset 
import random
random_state = random_state

upsample_x_list, upsample_y_list = [], []
for train_x, train_y in zip(train_x_list, train_y_list):
    index_pos = np.where(train_y == 1)[0]
    index_neg = np.where(train_y == 0)[0]

    random.seed(random_state)
    up_index = [random.choice(index_pos) for _ in range(len(index_neg))] # get samples from positive sites as much as the number of negative sites

    upsample_pos_x = train_x[up_index]
    upsample_pos_y = train_y[up_index]
    sample_neg_x = train_x[index_neg]
    sample_neg_y = train_y[index_neg]

    sample_x = np.concatenate([upsample_pos_x, sample_neg_x], axis=0)
    sample_y = np.concatenate([upsample_pos_y, sample_neg_y], axis=0)

    shuffle_index = np.arange(len(sample_x))
    np.random.seed(random_state)
    np.random.shuffle(shuffle_index)
    sample_x = sample_x[shuffle_index]
    sample_y = sample_y[shuffle_index]
    
    upsample_x_list.append(sample_x)
    upsample_y_list.append(sample_y)

print("up-sampled train dataset:", sample_x.shape, sample_y.shape)
print("test dataset:", test_x.shape, test_y.shape)

up-sampled train dataset: (65188, 16) (65188, 1)
test dataset: (8253, 16) (8253, 1)


In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

def basicMLP(x_dim, y_dim, n_layers, n_neurons):
    mlp_input = Input(shape=(x_dim,), name='dense_input')
    
    # MLP module
    for i in range(n_layers):
        if i==0:
            dense_output = Dense(n_neurons, name=f"dense_{i+1}")(mlp_input)
        else: 
            dense_output = Dense(n_neurons, name=f"dense_{i+1}")(dense_output)
    mlp_output = Dense(y_dim, name=f"dense_output", activation='sigmoid')(dense_output)
    
    model = Model(mlp_input, mlp_output)
    optimizer = keras.optimizers.Adam(learning_rate = 0.001, beta_1=0.9, beta_2=0.999)
    model.compile(loss='binary_crossentropy',optimizer = optimizer, metrics=['accuracy'])
    
    return model

In [9]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from seaborn import heatmap
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def scores(y_real, y_pred, rounding=4):
    
    accuracy  = 100*np.array(accuracy_score(y_real, y_pred)).round(rounding)
    precision = 100*np.array(precision_score(y_real, y_pred, average=None)).round(rounding)[1]
    recall    = 100*np.array(recall_score(y_real, y_pred, average=None)).round(rounding)[1]
    f1        = 100*np.array(f1_score(y_real, y_pred, average=None)).round(rounding)[1]
    
    return accuracy, precision, recall, f1

In [None]:
## hyper-parameter optimization
model_type = 'upsampled_MLP_without_window'

valid_size = test_size/(1-test_size)
patience = 30
monitor = 'val_loss'
random_state = random_state
early_stopping_cb = EarlyStopping(patience=patience, restore_best_weights=True, monitor=monitor)

parameter_config = {
    "n_layers" : range(1,20),
    "n_neurons" : range(1, 501)
}

method = "random"
counts = 100
metrics = ['time', 'n_layers', 'n_neurons', 'loss', 'val_loss', 'test_loss', 'accuracy', 'precision', 'recall', 'f1']


train_x = upsample_x_list[0]
train_y = upsample_y_list[0]
test_x = test_x_list[0]
test_y = test_y_list[0]

hpo_result = pd.DataFrame([], columns=metrics)
for i in range(counts):
    random.seed(i+1)
    n_layers = random.choice(parameter_config["n_layers"])
    n_neurons = random.choice(parameter_config["n_neurons"])
    print(f"random, {i+1} of {counts}: {n_layers} layers, {n_neurons} neurons")
    
    model_name = f'{version}_{model_type}_hpo_{i+1}of{counts}'
    save_path  = f'./model/{model_name}.h5'
    score_path = f"./score/{model_name}.csv"
    
    x_dim = train_x.shape[1]
    y_dim = train_y.shape[1]
    model = basicMLP(x_dim, y_dim, n_layers, n_neurons)
    
    if not exists(save_path) or update:
        tf.random.set_seed(i+1)
        
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=0,
                            epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
        time_end = time.time()
        time_elapse = round((time_end - time_start)/60, 3)
        
        model.save_weights(save_path)
        print(f"model is saved to: {save_path}")
        
        idx = np.array(history.history[monitor]).argmin()
        val_loss = history.history['val_loss'][idx]
        loss = history.history['loss'][idx]
        test_loss = model.evaluate(test_x, test_y, verbose=0)[0]
        prediction = model.predict(test_x, verbose=0)
        prediction = prediction.round(0).astype(int)
        y_real = test_y
        y_pred = prediction
        accuracy, precision, recall, f1 = scores(y_real, y_pred)
        scores_df = pd.DataFrame([[time_elapse, n_layers, n_neurons, loss, val_loss, test_loss, accuracy, precision, recall, f1]], 
                                  columns=metrics)

        scores_df.to_csv(score_path)
        print(f"history is saved to: {score_path}")

    else:
        scores_df = pd.read_csv(score_path, index_col=0, header=0)
        print(f"history is loaded from: {score_path}")
        
    hpo_result = pd.concat([hpo_result, scores_df], axis=0)

hpo_result = hpo_result.reset_index(drop=True)

random, 1 of 100: 5 layers, 292 neurons
history is loaded from: ./score/v4_upsampled_MLP_without_window_hpo_1of100.csv
random, 2 of 100: 2 layers, 47 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_2of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_2of100.csv
random, 3 of 100: 8 layers, 304 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_3of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_3of100.csv
random, 4 of 100: 8 layers, 156 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_4of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_4of100.csv
random, 5 of 100: 9 layers, 380 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_5of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_5of100.csv
random, 6 of 100: 19 layers, 421 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_6of100.h5
history is s

model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_44of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_44of100.csv
random, 45 of 100: 9 layers, 214 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_45of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_45of100.csv
random, 46 of 100: 3 layers, 205 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_46of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_46of100.csv
random, 47 of 100: 12 layers, 33 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_47of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_47of100.csv
random, 48 of 100: 18 layers, 162 neurons
model is saved to: ./model/v4_upsampled_MLP_without_window_hpo_48of100.h5
history is saved to: ./score/v4_upsampled_MLP_without_window_hpo_48of100.csv
random, 49 of 100: 3 layers, 177 neurons
model is saved to: ./model/v4_upsampled

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "D:\anaconda3\envs\dualattn\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\SEOKYOUNG\AppData\Local\Temp\ipykernel_34496\782584765.py", line 45, in <module>
    epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\engine\training.py", line 1409, in fit
    tmp_logs = self.train_function(iterator)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\tensorflow\python\util\traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\tensorflow\python\eager\def_function.py", line 915, in __call__
    result = self._call(*args, **kwds)
  File "D:\anaconda3\envs\

Traceback (most recent call last):
  File "D:\anaconda3\envs\dualattn\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\SEOKYOUNG\AppData\Local\Temp\ipykernel_34496\782584765.py", line 45, in <module>
    epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\engine\training.py", line 1409, in fit
    tmp_logs = self.train_function(iterator)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\tensorflow\python\util\traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\tensorflow\python\eager\def_function.py", line 915, in __call__
    result = self._call(*args, **kwds)
  File "D:\anaconda3\envs\

In [10]:
# show the HPO result
target_metric = 'f1'
best_idx = hpo_result[target_metric].argmax()
best_parameters = hpo_result.iloc[best_idx]
print(f'best hyperparamerter: index {best_idx}')
display(best_parameters)

display(hpo_result.describe())

best hyperparamerter: index 20


time            6.845
n_layers            6
n_neurons         215
loss         0.663332
val_loss     0.660195
test_loss    0.639638
accuracy         63.3
precision        1.78
recall          51.92
f1               3.44
Name: 0, dtype: object

Unnamed: 0,time,loss,val_loss,test_loss,accuracy,precision,recall,f1
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,14.488219,0.668633,0.660753,0.653729,60.687969,1.661719,51.847656,3.219375
std,5.589315,0.025304,0.003013,0.015569,3.273064,0.048519,3.873572,0.090797
min,6.845,0.660879,0.659464,0.601813,43.2,1.56,41.35,3.03
25%,9.766,0.661477,0.659684,0.643853,59.4175,1.63,50.0,3.1575
50%,13.835,0.662095,0.659808,0.653765,60.905,1.67,51.92,3.225
75%,18.46375,0.664659,0.660076,0.662754,62.185,1.69,53.85,3.275
max,29.273,0.851449,0.674532,0.70412,68.34,1.78,71.15,3.44


ERROR! Session/line number was not unique in database. History logging moved to new session 841


In [None]:
# train the models for cross validation
n_layers = best_parameters['n_layers']
n_neurons = best_parameters['n_neurons']

i=1
for train_x, train_y in zip(upsample_x_list, upsample_y_list):
    print(f"{i}th iteration")
    model_name = f'{version}_{model_type}_{n_layers}_{n_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    if not exists(save_path) or update:
        x_dim = train_x.shape[1]
        y_dim = train_y.shape[1]
        model = basicMLP(x_dim, y_dim, n_layers, n_neurons)

        if not exists(save_path) or update:
            tf.random.set_seed(random_state)
            history = model.fit(train_x, train_y, verbose=0,
                                epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
            model.save_weights(save_path)
            print(f"model is saved to: {save_path}")
    else:
        print(f"model already exists at: {save_path}")
    i += 1
display(model.summary())

1th iteration


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "D:\anaconda3\envs\dualattn\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\SEOKYOUNG\AppData\Local\Temp\ipykernel_34496\3092089089.py", line 19, in <module>
    epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\engine\training.py", line 1456, in fit
    _use_cached_eval_dataset=True)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\engine\training.py", line 1756, in evaluate
    tmp_logs = self.test_function(iterator)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\ten

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "D:\anaconda3\envs\dualattn\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\SEOKYOUNG\AppData\Local\Temp\ipykernel_34496\3092089089.py", line 19, in <module>
    epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\engine\training.py", line 1456, in fit
    _use_cached_eval_dataset=True)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\keras\engine\training.py", line 1756, in evaluate
    tmp_logs = self.test_function(iterator)
  File "D:\anaconda3\envs\dualattn\lib\site-packages\ten

In [11]:
# get the result of cross validation
i=1
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
for test_x, test_y in zip(test_x_list, test_y_list):
    model_name = f'{version}_{model_type}_{n_layers}_{n_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    model = basicMLP(x_dim, y_dim, n_layers, n_neurons)
    model.load_weights(save_path)
    
    prediction = model.predict(test_x, verbose=0)
    prediction = prediction.round(0).astype(int)

    y_real = test_y
    y_pred = prediction
    
    accuracy  = 100*np.array(accuracy_score(y_real, y_pred)).round(4)
    precision = 100*np.array(precision_score(y_real, y_pred, average=None)).round(4)[1]
    recall    = 100*np.array(recall_score(y_real, y_pred, average=None)).round(4)[1]
    f1        = 100*np.array(f1_score(y_real, y_pred, average=None)).round(4)[1]
    
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    
accuracies = np.array(accuracy_list)
precisions = np.array(precision_list)
recalls = np.array(recall_list)
f1s = np.array(f1_list)

results = pd.DataFrame(np.array([accuracies, precisions, recalls, f1s]).T, columns=['accuracy', 'precision', 'recall', 'f1-score'])
results.describe()

ERROR! Session/line number was not unique in database. History logging moved to new session 842


FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = './model/v4_upsampled_MLP_without_window_6_215_cv_1of10.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

# Case 2: with window

In [None]:
x_cat = ['SEQ', 'nS/nT', 'nAli', 'nPos', 'phi_psi', 'SS', 
         'side_-1', 'side_1', 'side_2', 'side_3','side_4', 'side_5']
x_cts = ['Proline', 'flexibility']
y_label = ['positivity']

data_x = pd.get_dummies(ST_dataset[x_cts+x_cat], columns=x_cat)
data_y = ST_dataset[y_label]

print(data_x.shape)
print(data_y.shape)

print("\nx columns:")
display(pd.Series(data_x.columns))

In [None]:
### split data into train/test dataset ###
split = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
arr_x, arr_y = data_x.values, data_y.values # convert dataframe to nd-array 

i=1
train_idx_list, train_x_list, train_y_list, test_idx_list, test_x_list, test_y_list = [], [], [], [], [], []
for train_index, test_index in split.split(arr_x, arr_y):
    train_x = arr_x[train_index]
    train_y = arr_y[train_index]
    test_x = arr_x[test_index]
    test_y = arr_y[test_index]
    
    train_cts = train_x[:,:len(x_cts)]
    test_cts  = test_x[:,:len(x_cts)]
    
    x_min = train_cts.min(axis=0)
    x_max = train_cts.max(axis=0)
    
    train_x[:,:len(x_cts)] = (train_cts-x_min)/(x_max-x_min)
    test_x[:,:len(x_cts)] = (test_cts-x_min)/(x_max-x_min)
    
    print(f"{i}th iteration")
    print("train:", train_x.shape, train_y.shape, "check scale:", train_x.min(), train_x.max())
    print("test: ", test_x.shape, test_y.shape, "check scale:", test_x.min(), test_x.max())
    
    train_idx_list.append(train_index)
    train_x_list.append(train_x)
    train_y_list.append(train_y)
    
    test_idx_list.append(test_index)
    test_x_list.append(test_x)
    test_y_list.append(test_y)
    
    i += 1

In [None]:
## upsampling dataset 
upsample_x_list, upsample_y_list = [], []
for train_x, train_y in zip(train_x_list, train_y_list):
    index_pos = np.where(train_y == 1)[0]
    index_neg = np.where(train_y == 0)[0]

    random.seed(random_state)
    up_index = [random.choice(index_pos) for _ in range(len(index_neg))] # get samples from positive sites as much as the number of negative sites

    upsample_pos_x = train_x[up_index]
    upsample_pos_y = train_y[up_index]
    sample_neg_x = train_x[index_neg]
    sample_neg_y = train_y[index_neg]

    sample_x = np.concatenate([upsample_pos_x, sample_neg_x], axis=0)
    sample_y = np.concatenate([upsample_pos_y, sample_neg_y], axis=0)

    shuffle_index = np.arange(len(sample_x))
    np.random.seed(random_state)
    np.random.shuffle(shuffle_index)
    sample_x = sample_x[shuffle_index]
    sample_y = sample_y[shuffle_index]
    
    upsample_x_list.append(sample_x)
    upsample_y_list.append(sample_y)

print("up-sampled train dataset:", sample_x.shape, sample_y.shape)
print("test dataset:", test_x.shape, test_y.shape)

In [None]:
## hyper-parameter optimization
model_type = 'upsampled_MLP_original'

train_x = upsample_x_list[0]
train_y = upsample_y_list[0]
test_x = test_x_list[0]
test_y = test_y_list[0]

hpo_result = pd.DataFrame([], columns=metrics)
for i in range(counts):
    random.seed(i+1)
    n_layers = random.choice(parameter_config["n_layers"])
    n_neurons = random.choice(parameter_config["n_neurons"])
    print(f"random, {i+1} of {counts}: {n_layers} layers, {n_neurons} neurons")
    
    model_name = f'{version}_{model_type}_hpo_{i+1}of{counts}'
    save_path  = f'./model/{model_name}.h5'
    score_path = f"./score/{model_name}.csv"
    
    x_dim = train_x.shape[1]
    y_dim = train_y.shape[1]

    model = basicMLP(x_dim, y_dim, n_layers, n_neurons)
    if not exists(save_path) or update:
        tf.random.set_seed(i+1)
        
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=0,
                            epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
        time_end = time.time()
        time_elapse = round((time_end - time_start)/60, 3)
        
        model.save_weights(save_path)
        print(f"model is saved to: {save_path}")
        
        idx = np.array(history.history[monitor]).argmin()
        val_loss = history.history['val_loss'][idx]
        loss = history.history['loss'][idx]
        test_loss = model.evaluate(test_x, test_y, verbose=0)[0]
        prediction = model.predict(test_x, verbose=0)
        prediction = prediction.round(0).astype(int)
        y_real = test_y
        y_pred = prediction
        accuracy, precision, recall, f1 = scores(y_real, y_pred)
        scores_df = pd.DataFrame([[time_elapse, n_layers, n_neurons, loss, val_loss, test_loss, accuracy, precision, recall, f1]], 
                                  columns=metrics)

        scores_df.to_csv(score_path)
        print(f"history is saved to: {score_path}")

    else:
        scores_df = pd.read_csv(score_path, index_col=0, header=0)
        print(f"history is loaded from: {score_path}")
        
    hpo_result = pd.concat([hpo_result, scores_df], axis=0)

hpo_result = hpo_result.reset_index(drop=True)

In [None]:
# show the HPO result
target_metric = 'f1'
best_idx = hpo_result[target_metric].argmax()
best_parameters = hpo_result.iloc[best_idx]
print(f'best hyperparamerter: index {best_idx}')
display(best_parameters)

display(hpo_result.describe())

In [None]:
# bulid model
n_layers = best_parameters['n_layers']
n_neurons = best_parameters['n_neurons']

i=1
for train_x, train_y in zip(upsample_x_list, upsample_y_list):
    print(f"{i}th iteration")
    model_name = f'{version}_{model_type}_{n_layers}_{n_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    if not exists(save_path) or update:
        x_dim = train_x.shape[1]
        y_dim = train_y.shape[1]
        model = basicMLP(x_dim, y_dim, n_layers, n_neurons)

        if not exists(save_path) or update:
            tf.random.set_seed(random_state)
            history = model.fit(train_x, train_y, verbose=0,
                                epochs=10000, callbacks=[early_stopping_cb], validation_split= valid_size)
            model.save_weights(save_path)
            print(f"model is saved to: {save_path}")
    else:
        print(f"model already exists at: {save_path}")
    i += 1
display(model.summary())

In [None]:
# evaluate the trained model
i=1
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []
for test_x, test_y in zip(test_x_list, test_y_list):
    model_name = f'{version}_{model_type}_{n_layers}_{n_neurons}_cv_{i}of{n_splits}'
    save_path  = f'./model/{model_name}.h5'
    
    model = basicMLP(x_dim, y_dim, n_layers, n_neurons)
    model.load_weights(save_path)
    
    prediction = model.predict(test_x, verbose=0)
    prediction = prediction.round(0).astype(int)

    y_real = test_y
    y_pred = prediction
    
    accuracy  = 100*np.array(accuracy_score(y_real, y_pred)).round(4)
    precision = 100*np.array(precision_score(y_real, y_pred, average=None)).round(4)[1]
    recall    = 100*np.array(recall_score(y_real, y_pred, average=None)).round(4)[1]
    f1        = 100*np.array(f1_score(y_real, y_pred, average=None)).round(4)[1]
    
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    
accuracies = np.array(accuracy_list)
precisions = np.array(precision_list)
recalls = np.array(recall_list)
f1s = np.array(f1_list)

results = pd.DataFrame(np.array([accuracies, precisions, recalls, f1s]).T, columns=['accuracy', 'precision', 'recall', 'f1-score'])
results.describe()