In [13]:
# '''
# function ConnectButton(){
#     console.log("Connect pushed");
#     document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
# }

# setInterval(ConnectButton,60000);
# '''

# from google.colab import drive
# from os import chdir

# drive.mount('/content/drive')
# project_path = '/content/drive/MyDrive/Gproject/o-linked-site-prediction-feature-augment'
# chdir(project_path)

In [14]:
import os
import numpy as np
import pandas as pd

# set seed for the reproducible result
SEED = 42

In [15]:
data_dir = './data/integrated_features' # we will get names from the augmented proteins
protein_names = [x.split('.')[0] for x in os.listdir(data_dir) if x.split('.')[1] == 'csv'] # get protein name list to be processed for building machine learning models
print('the number of initial proteins:', len(protein_names))
print(protein_names[:10])

the number of initial proteins: 104
['A2ABU4', 'A2AHJ4', 'A2AKB9', 'A2AQ25', 'E9Q1P8', 'E9Q5G3', 'O08537', 'O09061', 'O35303', 'O70263']


## hyper parameter optimization by K-fold cross-validation

In [16]:
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

import time
from IPython.display import clear_output

from ml_models import *

epochs = 1000
from keras.callbacks import EarlyStopping
patience = 30
callbacks = [EarlyStopping(patience=patience, restore_best_weights=True, monitor='val_loss')]

test_size = 0.2


### set initial parameters

In [17]:
from functions import *

In [18]:
initial_params = default_params.copy()
initial_params.update({
    'window_size'    : 10
    })

print('initial parameters')
for key, value in initial_params.items():
    print(f'{key:<14} : {value}')

search_space = {
    'dnn_layers'    : [1, 2, 3, 4, 5],
    'dnn_neurons'   : [32, 64, 128, 256],
    'learning_rate' : [0.0001, 0.001, 0.01]
}

initial parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 3
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


## set variables

In [19]:
model_type = 'MLP'

# set continuous input features
x_cts = []
print('continuous features:')
display(dict(zip(range(len(x_cts)), x_cts)))

# set categorical input features
x_cat = ['residue']
print('categorical features:')
print(dict(zip(range(len(x_cat)), x_cat)))

# input features
x_var = x_cts + x_cat

# set continuos output feature
y_cts = []

# set categorical output feature
y_cat = ['positivity']

# output features
y_var = y_cts + y_cat

continuous features:


{}

categorical features:
{0: 'residue'}


# build amino acid sequence dataset

### model training with K-fold

In [20]:
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update  = False

params = initial_params.copy()
MODELs = []
METRICs = []
METRIC_MEAN = []
model_id = 1
verbose = 0
for param_name, space in search_space.items():
    for point in space:
        clear_output(wait=True)
        display(METRIC_MEAN)
        params[param_name] = point
        for key, value in search_space.items():
            print(f'{key:<14} : {value}')
        
        data_x = []
        data_y = []
        for name in protein_names:
            data = pd.read_csv(f'./data/integrated_features/{name}.csv')
            ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]
            
            # get X dataset
            x_onehot = get_onehots(data[x_var], columns = x_cat)
            x_features = list(x_onehot.columns)
            
            # get Y dataset
            y_onehot = get_onehots(data[y_var], columns = y_cat)
            y_labels = list(y_onehot.columns)
            
            for idx in ST_idx:
                window_x = np.array(get_window(x_onehot, idx, params['window_size']))
                window_x = window_x.reshape(-1)
                label_y  = np.array(y_onehot.iloc[idx])
                
                data_x.append(window_x)
                data_y.append(label_y)
                
        data_x = np.array(data_x)
        data_y = np.array(data_y)
        model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params)  # I don't know why, but this row is helping producing the same training result of a neural network

        print('data x shape:', data_x.shape)
        print('data y shape:', data_y.shape)
        print('class y counts:', data_y.sum(0))
        print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')
        
        splitter = StratifiedShuffleSplit(n_splits = 1, test_size = test_size, random_state = SEED)
        train_idx, test_idx = list(splitter.split(data_x, data_y))[0]
        
        train_x = data_x[train_idx]
        train_y = data_y[train_idx]
        
        test_x = data_x[test_idx]
        test_y = data_y[test_idx]
        
        train_x, test_x = data_scaling(train_x, test_x)
        
        splitter_kf = KFold(n_splits = 5)
        for cv_idx, (train_idx_kf, test_idx_kf) in enumerate(splitter_kf.split(train_x, train_y)):
            train_x_kf, train_y_kf = train_x[train_idx_kf], train_y[train_idx_kf]
            test_x_kf, test_y_kf   = train_x[test_idx_kf],  train_y[test_idx_kf]
            
            model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params) 
            model_name = name_model(f'{model_type}_KFOLD', params)
            
            model_folder  = f'./models/{model_name}_{train_x.shape}_{test_x.shape}'
            if not os.path.exists(model_folder):
                os.makedirs(model_folder)
            model_path    = f'{model_folder}/{cv_idx}.h5'
            metric_path   = f'{model_folder}/{cv_idx}.csv'
            
            
            if not os.path.exists(model_path) or model_update:
                time_start = time.time()
                history = model.fit(train_x_kf, train_y_kf, verbose=verbose, 
                                    epochs = 10000, callbacks = callbacks,
                                    validation_data = (test_x_kf, test_y_kf))
                time_end = time.time()
                training_time = round((time_end - time_start)/60, 3)
                
                model.save_weights(model_path)
                
                test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                model_metrics = {
                    'model_id' : model_id,
                    'cv_idx'   : cv_idx,
                    **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                    'train_y'     : train_y.shape[-1],
                    'test_size'   : test_x.shape[0],
                    **params,
                    'regularizer_input' : params['regularizer']['input'],
                    'regularizer_hidden' : params['regularizer']['hidden'],
                    'regularizer_bias' : params['regularizer']['bias'],
                    'training_time': training_time,
                    'test_loss': test_loss,
                    'accuracy': accuracy,
                    **{f'precision_{x}': precision[x] for x in range(len(precision))},
                    **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                    **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
                
                model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                model_metrics.to_csv(metric_path, index=False)
                
            else:
                model.load_weights(model_path)
                model_metrics = pd.read_csv(metric_path, header=0)
                
            print(f'f1 score: {model_metrics.f1_1[0]}')
            
            model_metrics['model_id'] = model_id
            METRICs.append(model_metrics)
            MODELs.append(model)
        
        METRIC_MEAN = pd.concat(METRICs).groupby('model_id').mean()
        f1_best = METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0].f1_1
        print(f'best f1 score: {f1_best}')
        model_id += 1
    if param_name in ['learning_rate']: # for float-type parameters
        best_value = float(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
        params[param_name] = best_value
        search_space[param_name][search_space[param_name].index(best_value)] = f"{best_value}"
    else: # for int-type parameters
        best_value = int(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
        params[param_name] = best_value
        search_space[param_name][search_space[param_name].index(best_value)] = f"{best_value}"

clear_output(wait=True)
display(METRIC_MEAN)
for key, value in search_space.items():
    print(f'{key:<14} : {value}')


Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,1.0,64.0,0.001,...,,0.2278,0.158818,96.278,96.278,20.0,100.0,0.232,98.104,0.46
2,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,2.0,64.0,0.001,...,,0.2486,0.160254,96.27,96.278,10.0,99.99,0.232,98.1,0.454
3,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,3.0,64.0,0.001,...,,0.2218,0.165711,96.252,96.278,5.0,99.972,0.232,98.09,0.444
4,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,4.0,64.0,0.001,...,,0.2232,0.162925,96.27,96.286,20.0,99.982,0.466,98.098,0.91
5,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,5.0,64.0,0.001,...,,0.2372,0.160589,96.262,96.27,0.0,99.99,0.0,98.096,0.0
6,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,4.0,32.0,0.001,...,,0.1994,0.160905,96.278,96.278,20.0,100.0,0.232,98.104,0.46
7,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,4.0,64.0,0.001,...,,0.2232,0.162925,96.27,96.286,20.0,99.982,0.466,98.098,0.91
8,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,4.0,128.0,0.001,...,,0.307,0.167437,96.26,96.294,46.666,99.964,0.696,98.094,1.37
9,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,4.0,256.0,0.001,...,,0.4298,0.174028,96.176,96.29,21.666,99.87,0.696,98.05,1.316
10,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,4.0,128.0,0.0001,...,,0.3406,0.161321,96.278,96.278,20.0,100.0,0.232,98.104,0.46


dnn_layers     : [1, 2, 3, '4', 5]
dnn_neurons    : [32, 64, '128', 256]
learning_rate  : [0.0001, '0.001', 0.01]


In [21]:
print('data x shape: ', data_x.shape)
print('data y shape: ', data_y.shape)
print('train x shape:', train_x.shape)
print('test  x shape:', test_y.shape)

data x shape:  (11528, 420)
data y shape:  (11528, 2)
train x shape: (9222, 420)
test  x shape: (2306, 2)


In [22]:
print('optimal parameters')
for key, value in params.items():
    print(f'{key:<14} : {value}')

optimal parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 4
dnn_neurons    : 128
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


## evaluate model's general performance through Monte-Carlo cross-validation

In [23]:
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update = False
            
MODELs = []
METRICs = []
METRIC_MEAN = []
verbose = 0
        
data_x = []
data_y = []

for name in protein_names:
    data = pd.read_csv(f'./data/integrated_features/{name}.csv')
    ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]
    
    # get X dataset
    x_onehot = get_onehots(data[x_var], columns = x_cat)
    x_features = list(x_onehot.columns)
    
    # get Y dataset
    y_onehot = get_onehots(data[y_var], columns = y_cat)
    y_labels = list(y_onehot.columns)
    
    for idx in ST_idx:
        window_x = np.array(get_window(x_onehot, idx, params['window_size']))
        window_x = window_x.reshape(-1)
        label_y  = np.array(y_onehot.iloc[idx])
        
        data_x.append(window_x)
        data_y.append(label_y)
        
data_x = np.array(data_x)
data_y = np.array(data_y)

print('data x shape:', data_x.shape)
print('data y shape:', data_y.shape)
print('class y counts:', data_y.sum(0))
print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')

splitter = StratifiedShuffleSplit(n_splits = 5, test_size = test_size, random_state = SEED)
for cv_idx, (train_idx, test_idx) in enumerate(splitter.split(data_x, data_y)):
    train_x, train_y = data_x[train_idx], data_y[train_idx]
    test_x , test_y  = data_x[test_idx],  data_y[test_idx]
    
    train_x, test_x = data_scaling(train_x, test_x)
    
    model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params)
    model_name = name_model(f'{model_type}', params)
    
    
    model_folder  = f'./models/{model_name}_{train_x.shape}_{test_x.shape}'
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    model_path    = f'{model_folder}/{cv_idx}.h5'
    metric_path   = f'{model_folder}/{cv_idx}.csv'
    
    
    if not os.path.exists(model_path) or model_update:
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=verbose, 
                            epochs = 10000, callbacks = callbacks,
                            validation_split = test_size/(1-test_size))
        time_end = time.time()
        training_time = round((time_end - time_start)/60, 3)
        
        model.save_weights(model_path)
        
        test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
        model_metrics = {
            'model_id' : model_id,
            'cv_idx'   : cv_idx,
            **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
            'train_y'     : train_y.shape[-1],
            'test_size'   : test_x.shape[0],
            **params,
            'regularizer_input' : params['regularizer']['input'],
            'regularizer_hidden' : params['regularizer']['hidden'],
            'regularizer_bias' : params['regularizer']['bias'],
            'training_time': training_time,
            'test_loss': test_loss,
            'accuracy': accuracy,
            **{f'precision_{x}': precision[x] for x in range(len(precision))},
            **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
            **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
        model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
        model_metrics.to_csv(metric_path, index=False)
        
    else:
        model.load_weights(model_path)
        model_metrics = pd.read_csv(metric_path, header=0)
        
    print(f'f1 score: {model_metrics.f1_1[0]}')
    
    METRICs.append(model_metrics)
    MODELs.append(model)

METRICs = pd.concat(METRICs)
METRIC_MEAN = METRICs.groupby('model_id').mean()
METRIC_STD = METRICs.groupby('model_id').std()

data x shape: (11528, 420)
data y shape: (11528, 2)
class y counts: [11100   428]
class y ratio: [0.9629 0.0371]


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)
  _warn_prf(average, modifier, msg_start, len(result))
  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 0.0
f1 score: 0.0


In [24]:
METRICs

Unnamed: 0,model_id,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
0,13,0,9222,420,2,2306,1,64,4,128,...,,0.341,0.160427,96.27,96.27,0.0,100.0,0.0,98.1,0.0
0,13,1,9222,420,2,2306,1,64,4,128,...,,0.347,0.15989,96.27,96.27,0.0,100.0,0.0,98.1,0.0
0,13,2,9222,420,2,2306,1,64,4,128,...,,0.316,0.16223,96.27,96.27,0.0,100.0,0.0,98.1,0.0
0,13,3,9222,420,2,2306,1,64,4,128,...,,0.324,0.14829,96.27,96.27,0.0,100.0,0.0,98.1,0.0
0,13,4,9222,420,2,2306,1,64,4,128,...,,0.327,0.160771,96.05,96.26,0.0,99.77,0.0,97.99,0.0


In [25]:
METRIC_MEAN

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,window_size,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
13,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,4.0,128.0,0.001,10.0,0.331,0.158322,96.226,96.268,0.0,99.954,0.0,98.078,0.0


In [26]:
METRIC_STD

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,window_size,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
13,1.581139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012708,0.005674,0.098387,0.004472,0.0,0.102859,0.0,0.049193,0.0


In [27]:
# '''
# function ConnectButton(){
#     console.log("Connect pushed");
#     document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
# }

# setInterval(ConnectButton,60000);
# '''

# from google.colab import drive
# from os import chdir

# drive.mount('/content/drive')
# project_path = '/content/drive/MyDrive/Gproject/o-linked-site-prediction-feature-augment'
# chdir(project_path)

In [28]:
import os
import numpy as np
import pandas as pd

# set seed for the reproducible result
SEED = 42

In [29]:
data_dir = './data/integrated_features' # we will get names from the augmented proteins
protein_names = [x.split('.')[0] for x in os.listdir(data_dir) if x.split('.')[1] == 'csv'] # get protein name list to be processed for building machine learning models
print('the number of initial proteins:', len(protein_names))
print(protein_names[:10])

the number of initial proteins: 104
['A2ABU4', 'A2AHJ4', 'A2AKB9', 'A2AQ25', 'E9Q1P8', 'E9Q5G3', 'O08537', 'O09061', 'O35303', 'O70263']


## hyper parameter optimization by K-fold cross-validation

In [30]:
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

import time
from IPython.display import clear_output

from ml_models import *

epochs = 1000
from keras.callbacks import EarlyStopping
patience = 30
callbacks = [EarlyStopping(patience=patience, restore_best_weights=True, monitor='val_loss')]

test_size = 0.2


### set initial parameters

In [31]:
from functions import *

In [32]:
initial_params = default_params.copy()
initial_params.update({
    'window_size'    : 10
    })

print('initial parameters')
for key, value in initial_params.items():
    print(f'{key:<14} : {value}')

search_space = {
    'dnn_layers'    : [1, 2, 3, 4, 5],
    'dnn_neurons'   : [32, 64, 128, 256],
    'learning_rate' : [0.0001, 0.001, 0.01]
}

initial parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 3
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


## set variables

In [33]:
model_type = 'MLP_UP'

# set continuous input features
x_cts = []
print('continuous features:')
display(dict(zip(range(len(x_cts)), x_cts)))

# set categorical input features
x_cat = ['residue']
print('categorical features:')
print(dict(zip(range(len(x_cat)), x_cat)))

# input features
x_var = x_cts + x_cat

# set continuos output feature
y_cts = []

# set categorical output feature
y_cat = ['positivity']

# output features
y_var = y_cts + y_cat

continuous features:


{}

categorical features:
{0: 'residue'}


# build amino acid sequence dataset

### model training with K-fold

In [34]:
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update  = False

params = initial_params.copy()
MODELs = []
METRICs = []
METRIC_MEAN = []
model_id = 1
verbose = 0
for param_name, space in search_space.items():
    for point in space:
        clear_output(wait=True)
        display(METRIC_MEAN)
        params[param_name] = point
        for key, value in search_space.items():
            print(f'{key:<14} : {value}')
        
        data_x = []
        data_y = []
        for name in protein_names:
            data = pd.read_csv(f'./data/integrated_features/{name}.csv')
            ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]
            
            # get X dataset
            x_onehot = get_onehots(data[x_var], columns = x_cat)
            x_features = list(x_onehot.columns)
            
            # get Y dataset
            y_onehot = get_onehots(data[y_var], columns = y_cat)
            y_labels = list(y_onehot.columns)
            
            for idx in ST_idx:
                window_x = np.array(get_window(x_onehot, idx, params['window_size']))
                window_x = window_x.reshape(-1)
                label_y  = np.array(y_onehot.iloc[idx])
                
                data_x.append(window_x)
                data_y.append(label_y)
                
        data_x = np.array(data_x)
        data_y = np.array(data_y)
        model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params)  # I don't know why, but this row is helping producing the same training result of a neural network

        print('data x shape:', data_x.shape)
        print('data y shape:', data_y.shape)
        print('class y counts:', data_y.sum(0))
        print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')
        
        splitter = StratifiedShuffleSplit(n_splits = 1, test_size = test_size, random_state = SEED)
        train_idx, test_idx = list(splitter.split(data_x, data_y))[0]
        
        train_x = data_x[train_idx]
        train_y = data_y[train_idx]
        
        test_x = data_x[test_idx]
        test_y = data_y[test_idx]
        
        train_x, test_x = data_scaling(train_x, test_x)
        
        splitter_kf = KFold(n_splits = 5)
        for cv_idx, (train_idx_kf, test_idx_kf) in enumerate(splitter_kf.split(train_x, train_y)):
            train_x_kf, train_y_kf = train_x[train_idx_kf], train_y[train_idx_kf]
            test_x_kf, test_y_kf   = train_x[test_idx_kf],  train_y[test_idx_kf]
            
            train_x_kf, train_y_kf = upsample_data(train_x_kf, train_y_kf) # up-sample the training dataset
            
            model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params) 
            model_name = name_model(f'{model_type}_KFOLD', params)
            
            model_folder  = f'./models/{model_name}_{train_x.shape}_{test_x.shape}'
            if not os.path.exists(model_folder):
                os.makedirs(model_folder)
            model_path    = f'{model_folder}/{cv_idx}.h5'
            metric_path   = f'{model_folder}/{cv_idx}.csv'
            
            
            if not os.path.exists(model_path) or model_update:
                time_start = time.time()
                history = model.fit(train_x_kf, train_y_kf, verbose=verbose, 
                                    epochs = 10000, callbacks = callbacks,
                                    validation_data = (test_x_kf, test_y_kf))
                time_end = time.time()
                training_time = round((time_end - time_start)/60, 3)
                
                model.save_weights(model_path)
                
                test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                model_metrics = {
                    'model_id' : model_id,
                    'cv_idx'   : cv_idx,
                    **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                    'train_y'     : train_y.shape[-1],
                    'test_size'   : test_x.shape[0],
                    **params,
                    'regularizer_input' : params['regularizer']['input'],
                    'regularizer_hidden' : params['regularizer']['hidden'],
                    'regularizer_bias' : params['regularizer']['bias'],
                    'training_time': training_time,
                    'test_loss': test_loss,
                    'accuracy': accuracy,
                    **{f'precision_{x}': precision[x] for x in range(len(precision))},
                    **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                    **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
                
                model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                model_metrics.to_csv(metric_path, index=False)
                
            else:
                model.load_weights(model_path)
                model_metrics = pd.read_csv(metric_path, header=0)
                
            print(f'f1 score: {model_metrics.f1_1[0]}')
            
            model_metrics['model_id'] = model_id
            METRICs.append(model_metrics)
            MODELs.append(model)
        
        METRIC_MEAN = pd.concat(METRICs).groupby('model_id').mean()
        f1_best = METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0].f1_1
        print(f'best f1 score: {f1_best}')
        model_id += 1
    if param_name in ['learning_rate']: # for float-type parameters
        best_value = float(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
        params[param_name] = best_value
        search_space[param_name][search_space[param_name].index(best_value)] = f"{best_value}"
    else: # for int-type parameters
        best_value = int(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
        params[param_name] = best_value
        search_space[param_name][search_space[param_name].index(best_value)] = f"{best_value}"

clear_output(wait=True)
display(METRIC_MEAN)
for key, value in search_space.items():
    print(f'{key:<14} : {value}')


Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,1.0,64.0,0.001,...,,0.392,0.448702,80.418,96.942,6.764,82.252,33.024,88.988,11.218
2,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,2.0,64.0,0.001,...,,0.4258,0.447681,80.746,96.896,6.62,82.648,31.628,89.198,10.936
3,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,3.0,64.0,0.001,...,,0.3914,0.448122,80.392,96.89,6.524,82.272,31.86,88.976,10.82
4,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,4.0,64.0,0.001,...,,0.6952,0.445132,80.244,96.918,6.592,82.09,32.556,88.884,10.954
5,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,5.0,64.0,0.001,...,,0.6646,0.441334,80.548,96.916,6.684,82.414,32.326,89.068,11.062
6,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,1.0,32.0,0.001,...,,0.3496,0.445958,80.33,97.0,6.936,82.108,34.418,88.93,11.538
7,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,1.0,64.0,0.001,...,,0.392,0.448702,80.418,96.942,6.764,82.252,33.024,88.988,11.218
8,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,1.0,128.0,0.001,...,,0.584,0.450252,80.858,96.87,6.54,82.792,30.928,89.274,10.784
9,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,1.0,256.0,0.001,...,,0.778,0.449549,80.918,96.862,6.506,82.866,30.696,89.312,10.726
10,2.0,9222.0,420.0,2.0,2306.0,1.0,64.0,1.0,32.0,0.0001,...,,0.4128,0.474191,77.822,97.07,6.692,79.36,38.14,87.324,11.382


dnn_layers     : ['1', 2, 3, 4, 5]
dnn_neurons    : ['32', 64, 128, 256]
learning_rate  : [0.0001, '0.001', 0.01]


In [35]:
print('data x shape: ', data_x.shape)
print('data y shape: ', data_y.shape)
print('train x shape:', train_x.shape)
print('test  x shape:', test_y.shape)

data x shape:  (11528, 420)
data y shape:  (11528, 2)
train x shape: (9222, 420)
test  x shape: (2306, 2)


In [36]:
print('optimal parameters')
for key, value in params.items():
    print(f'{key:<14} : {value}')

optimal parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 1
dnn_neurons    : 32
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


## evaluate model's general performance through Monte-Carlo cross-validation

In [37]:
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update = False
            
MODELs = []
METRICs = []
METRIC_MEAN = []
verbose = 0
        
data_x = []
data_y = []

for name in protein_names:
    data = pd.read_csv(f'./data/integrated_features/{name}.csv')
    ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]
    
    # get X dataset
    x_onehot = get_onehots(data[x_var], columns = x_cat)
    x_features = list(x_onehot.columns)
    
    # get Y dataset
    y_onehot = get_onehots(data[y_var], columns = y_cat)
    y_labels = list(y_onehot.columns)
    
    for idx in ST_idx:
        window_x = np.array(get_window(x_onehot, idx, params['window_size']))
        window_x = window_x.reshape(-1)
        label_y  = np.array(y_onehot.iloc[idx])
        
        data_x.append(window_x)
        data_y.append(label_y)
        
data_x = np.array(data_x)
data_y = np.array(data_y)

print('data x shape:', data_x.shape)
print('data y shape:', data_y.shape)
print('class y counts:', data_y.sum(0))
print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')

splitter = StratifiedShuffleSplit(n_splits = 5, test_size = test_size, random_state = SEED)
for cv_idx, (train_idx, test_idx) in enumerate(splitter.split(data_x, data_y)):
    train_x, train_y = data_x[train_idx], data_y[train_idx]
    test_x , test_y  = data_x[test_idx],  data_y[test_idx]
    
    train_x, train_y = upsample_data(train_x, train_y) # up-sample the training dataset
    
    train_x, test_x = data_scaling(train_x, test_x)
    
    model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params)
    model_name = name_model(f'{model_type}', params)
    
    
    model_folder  = f'./models/{model_name}_{train_x.shape}_{test_x.shape}'
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    model_path    = f'{model_folder}/{cv_idx}.h5'
    metric_path   = f'{model_folder}/{cv_idx}.csv'
    
    
    if not os.path.exists(model_path) or model_update:
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=verbose, 
                            epochs = 10000, callbacks = callbacks,
                            validation_split = test_size/(1-test_size))
        time_end = time.time()
        training_time = round((time_end - time_start)/60, 3)
        
        model.save_weights(model_path)
        
        test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
        model_metrics = {
            'model_id' : model_id,
            'cv_idx'   : cv_idx,
            **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
            'train_y'     : train_y.shape[-1],
            'test_size'   : test_x.shape[0],
            **params,
            'regularizer_input' : params['regularizer']['input'],
            'regularizer_hidden' : params['regularizer']['hidden'],
            'regularizer_bias' : params['regularizer']['bias'],
            'training_time': training_time,
            'test_loss': test_loss,
            'accuracy': accuracy,
            **{f'precision_{x}': precision[x] for x in range(len(precision))},
            **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
            **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
        model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
        model_metrics.to_csv(metric_path, index=False)
        
    else:
        model.load_weights(model_path)
        model_metrics = pd.read_csv(metric_path, header=0)
        
    print(f'f1 score: {model_metrics.f1_1[0]}')
    
    METRICs.append(model_metrics)
    MODELs.append(model)

METRICs = pd.concat(METRICs)
METRIC_MEAN = METRICs.groupby('model_id').mean()
METRIC_STD = METRICs.groupby('model_id').std()

data x shape: (11528, 420)
data y shape: (11528, 2)
class y counts: [11100   428]
class y ratio: [0.9629 0.0371]


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 11.18


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 12.590000000000002


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 14.09


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 13.889999999999999


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 11.940000000000001


In [38]:
METRICs

Unnamed: 0,model_id,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
0,13,0,17760,420,2,2306,1,64,1,32,...,,0.921,0.526099,75.89,97.11,6.48,77.25,40.7,86.05,11.18
0,13,1,17760,420,2,2306,1,64,1,32,...,,0.858,0.58833,73.5,97.52,7.18,74.37,51.16,84.39,12.59
0,13,2,17760,420,2,2306,1,64,1,32,...,,1.232,0.511851,75.67,97.7,8.11,76.53,53.49,85.83,14.09
0,13,3,17760,420,2,2306,1,64,1,32,...,,0.646,0.546221,73.11,97.85,7.89,73.69,58.14,84.07,13.89
0,13,4,17760,420,2,2306,1,64,1,32,...,,0.879,0.500908,76.32,97.23,6.93,77.61,43.02,86.32,11.94


In [39]:
METRIC_MEAN

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,window_size,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
13,2.0,17760.0,420.0,2.0,2306.0,1.0,64.0,1.0,32.0,0.001,10.0,0.9072,0.534682,74.898,97.482,7.318,75.89,49.302,85.332,12.738


In [40]:
METRIC_STD

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,window_size,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
13,1.581139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210432,0.034449,1.479314,0.310757,0.6757,1.758408,7.289713,1.027093,1.249108


In [41]:
# '''
# function ConnectButton(){
#     console.log("Connect pushed");
#     document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
# }

# setInterval(ConnectButton,60000);
# '''

# from google.colab import drive
# from os import chdir

# drive.mount('/content/drive')
# project_path = '/content/drive/MyDrive/Gproject/o-linked-site-prediction-feature-augment'
# chdir(project_path)

In [42]:
import os
import numpy as np
import pandas as pd

# set seed for the reproducible result
SEED = 42

In [43]:
data_dir = './data/integrated_features' # we will get names from the augmented proteins
protein_names = [x.split('.')[0] for x in os.listdir(data_dir) if x.split('.')[1] == 'csv'] # get protein name list to be processed for building machine learning models
print('the number of initial proteins:', len(protein_names))
print(protein_names[:10])

the number of initial proteins: 104
['A2ABU4', 'A2AHJ4', 'A2AKB9', 'A2AQ25', 'E9Q1P8', 'E9Q5G3', 'O08537', 'O09061', 'O35303', 'O70263']


## hyper parameter optimization by K-fold cross-validation

In [44]:
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

import time
from IPython.display import clear_output

from ml_models import *

epochs = 1000
from keras.callbacks import EarlyStopping
patience = 30
callbacks = [EarlyStopping(patience=patience, restore_best_weights=True, monitor='val_loss')]

test_size = 0.2


### set initial parameters

In [45]:
from functions import *

In [46]:
initial_params = default_params.copy()
initial_params.update({
    'window_size'    : 10
    })

print('initial parameters')
for key, value in initial_params.items():
    print(f'{key:<14} : {value}')

search_space = {
    'dnn_layers'    : [1, 2, 3, 4, 5],
    'dnn_neurons'   : [32, 64, 128, 256],
    'learning_rate' : [0.0001, 0.001, 0.01]
}

initial parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 3
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


## set variables

In [47]:
model_type = 'MLP_UP_BASIC'

basic_columns = dict(pd.read_csv('./data/basic_columns.csv', header=0).values.squeeze())
print('# of augmented features:', len(basic_columns))

# set continuous input features
x_cts = [x for x in basic_columns.keys() if basic_columns.get(x) != 'object']
print('continuous features:')
display(dict(zip(range(len(x_cts)), x_cts)))

# set categorical input features
x_cat = ['residue'] + \
        [x for x in basic_columns.keys() if basic_columns.get(x) == 'object']
print('categorical features:')
print(dict(zip(range(len(x_cat)), x_cat)))

# input features
x_var = x_cts + x_cat

# set continuos output feature
y_cts = []

# set categorical output feature
y_cat = ['positivity']

# output features
y_var = y_cts + y_cat

# of augmented features: 13
continuous features:


{0: 'npa(-3,-1)', 1: 'ppo(-7,-5)', 2: 'n(S/T)', 3: 'flexibility', 4: 'p(1)'}

categorical features:
{0: 'residue', 1: 'side_-1', 2: 'side_1', 3: 'side_2', 4: 'side_3', 5: 'side_4', 6: 'side_5', 7: 'ss', 8: 'ss_angle'}


# build amino acid sequence dataset

### model training with K-fold

In [48]:
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update  = False

params = initial_params.copy()
MODELs = []
METRICs = []
METRIC_MEAN = []
model_id = 1
verbose = 0
for param_name, space in search_space.items():
    for point in space:
        clear_output(wait=True)
        display(METRIC_MEAN)
        params[param_name] = point
        for key, value in search_space.items():
            print(f'{key:<14} : {value}')
        
        data_x = []
        data_y = []
        for name in protein_names:
            data = pd.read_csv(f'./data/integrated_features/{name}.csv')
            ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]
            
            # get X dataset
            x_onehot = get_onehots(data[x_var], columns = x_cat)
            x_features = list(x_onehot.columns)
            
            # get Y dataset
            y_onehot = get_onehots(data[y_var], columns = y_cat)
            y_labels = list(y_onehot.columns)
            
            for idx in ST_idx:
                window_x = x_onehot.iloc[idx]
                label_y  = np.array(y_onehot.iloc[idx])
                
                data_x.append(window_x)
                data_y.append(label_y)
                
        data_x = np.array(data_x)
        data_y = np.array(data_y)
        model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params) # I don't know why, but this row is helping producing the same training result of a neural network
        
        print('data x shape:', data_x.shape)
        print('data y shape:', data_y.shape)
        print('class y counts:', data_y.sum(0))
        print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')
        
        splitter = StratifiedShuffleSplit(n_splits = 1, test_size = test_size, random_state = SEED)
        train_idx, test_idx = list(splitter.split(data_x, data_y))[0]
        
        train_x = data_x[train_idx]
        train_y = data_y[train_idx]
        
        test_x = data_x[test_idx]
        test_y = data_y[test_idx]
        
        train_x, test_x = data_scaling(train_x, test_x)
        
        splitter_kf = KFold(n_splits = 5)
        for cv_idx, (train_idx_kf, test_idx_kf) in enumerate(splitter_kf.split(train_x, train_y)):
            train_x_kf, train_y_kf = train_x[train_idx_kf], train_y[train_idx_kf]
            test_x_kf, test_y_kf   = train_x[test_idx_kf],  train_y[test_idx_kf]
            
            train_x_kf, train_y_kf = upsample_data(train_x_kf, train_y_kf) # up-sample the training dataset
            
            model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params) 
            model_name = name_model(f'{model_type}_KFOLD', params)
            
            model_folder  = f'./models/{model_name}_{train_x.shape}_{test_x.shape}'
            if not os.path.exists(model_folder):
                os.makedirs(model_folder)
            model_path    = f'{model_folder}/{cv_idx}.h5'
            metric_path   = f'{model_folder}/{cv_idx}.csv'
            
            
            if not os.path.exists(model_path) or model_update:
                time_start = time.time()
                history = model.fit(train_x_kf, train_y_kf, verbose=verbose, 
                                    epochs = 10000, callbacks = callbacks,
                                    validation_data = (test_x_kf, test_y_kf))
                time_end = time.time()
                training_time = round((time_end - time_start)/60, 3)
                
                model.save_weights(model_path)
                
                test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                model_metrics = {
                    'model_id' : model_id,
                    'cv_idx'   : cv_idx,
                    **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                    'train_y'     : train_y.shape[-1],
                    'test_size'   : test_x.shape[0],
                    **params,
                    'regularizer_input' : params['regularizer']['input'],
                    'regularizer_hidden' : params['regularizer']['hidden'],
                    'regularizer_bias' : params['regularizer']['bias'],
                    'training_time': training_time,
                    'test_loss': test_loss,
                    'accuracy': accuracy,
                    **{f'precision_{x}': precision[x] for x in range(len(precision))},
                    **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                    **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
                
                model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                model_metrics.to_csv(metric_path, index=False)
                
            else:
                model.load_weights(model_path)
                model_metrics = pd.read_csv(metric_path, header=0)
                
            print(f'f1 score: {model_metrics.f1_1[0]}')
            
            model_metrics['model_id'] = model_id
            METRICs.append(model_metrics)
            MODELs.append(model)
        
        METRIC_MEAN = pd.concat(METRICs).groupby('model_id').mean()
        f1_best = METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0].f1_1
        print(f'best f1 score: {f1_best}')
        model_id += 1
    if param_name in ['learning_rate']: # for float-type parameters
        best_value = float(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
        params[param_name] = best_value
        search_space[param_name][search_space[param_name].index(best_value)] = f"{best_value}"
    else: # for int-type parameters
        best_value = int(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
        params[param_name] = best_value
        search_space[param_name][search_space[param_name].index(best_value)] = f"{best_value}"

clear_output(wait=True)
display(METRIC_MEAN)
for key, value in search_space.items():
    print(f'{key:<14} : {value}')


Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,1.0,64.0,0.001,...,,0.376,0.534175,73.156,97.286,6.518,74.188,46.51,84.17,11.428
2,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,2.0,64.0,0.001,...,,0.355,0.533415,73.686,97.264,6.518,74.774,45.582,84.528,11.392
3,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,3.0,64.0,0.001,...,,0.3608,0.53667,73.208,97.312,6.572,74.228,46.976,84.202,11.522
4,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,4.0,64.0,0.001,...,,0.4836,0.540062,72.958,97.254,6.406,73.998,46.046,84.04,11.242
5,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,5.0,64.0,0.001,...,,0.4982,0.538522,73.252,97.444,6.954,74.162,49.764,84.214,12.2
6,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,5.0,32.0,0.001,...,,0.4292,0.541445,72.774,97.34,6.59,73.74,47.906,83.9,11.584
7,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,5.0,64.0,0.001,...,,0.4982,0.538522,73.252,97.444,6.954,74.162,49.764,84.214,12.2
8,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,5.0,128.0,0.001,...,,0.6702,0.539587,73.174,97.252,6.436,74.238,45.814,84.19,11.278
9,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,5.0,256.0,0.001,...,,0.9046,0.533426,74.302,97.324,6.82,75.38,46.512,84.95,11.894
10,2.0,9222.0,73.0,2.0,2306.0,1.0,64.0,5.0,64.0,0.0001,...,,0.535,0.551029,71.838,97.34,6.456,72.74,48.604,83.25,11.392


dnn_layers     : [1, 2, 3, 4, '5']
dnn_neurons    : [32, '64', 128, 256]
learning_rate  : [0.0001, '0.001', 0.01]


In [49]:
print('data x shape: ', data_x.shape)
print('data y shape: ', data_y.shape)
print('train x shape:', train_x.shape)
print('test  x shape:', test_y.shape)

data x shape:  (11528, 73)
data y shape:  (11528, 2)
train x shape: (9222, 73)
test  x shape: (2306, 2)


In [50]:
print('optimal parameters')
for key, value in params.items():
    print(f'{key:<14} : {value}')

optimal parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 5
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


## evaluate model's general performance through Monte-Carlo cross-validation

In [51]:
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update = False
            
MODELs = []
METRICs = []
METRIC_MEAN = []
verbose = 0
        
data_x = []
data_y = []

for name in protein_names:
    data = pd.read_csv(f'./data/integrated_features/{name}.csv')
    ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]
    
    # get X dataset
    x_onehot = get_onehots(data[x_var], columns = x_cat)
    x_features = list(x_onehot.columns)
    
    # get Y dataset
    y_onehot = get_onehots(data[y_var], columns = y_cat)
    y_labels = list(y_onehot.columns)
    
    for idx in ST_idx:
        window_x = x_onehot.iloc[idx]
        label_y  = np.array(y_onehot.iloc[idx])
        
        data_x.append(window_x)
        data_y.append(label_y)
        
data_x = np.array(data_x)
data_y = np.array(data_y)

print('data x shape:', data_x.shape)
print('data y shape:', data_y.shape)
print('class y counts:', data_y.sum(0))
print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')

splitter = StratifiedShuffleSplit(n_splits = 5, test_size = test_size, random_state = SEED)
for cv_idx, (train_idx, test_idx) in enumerate(splitter.split(data_x, data_y)):
    train_x, train_y = data_x[train_idx], data_y[train_idx]
    test_x , test_y  = data_x[test_idx],  data_y[test_idx]
    
    train_x, train_y = upsample_data(train_x, train_y) # up-sample the training dataset
    
    train_x, test_x = data_scaling(train_x, test_x)
    
    model = MLP_CLS(data_x.shape[-1], data_y.shape[-1], params)
    model_name = name_model(f'{model_type}', params)
    
    
    model_folder  = f'./models/{model_name}_{train_x.shape}_{test_x.shape}'
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    model_path    = f'{model_folder}/{cv_idx}.h5'
    metric_path   = f'{model_folder}/{cv_idx}.csv'
    
    
    if not os.path.exists(model_path) or model_update:
        time_start = time.time()
        history = model.fit(train_x, train_y, verbose=verbose, 
                            epochs = 10000, callbacks = callbacks,
                            validation_split = test_size/(1-test_size))
        time_end = time.time()
        training_time = round((time_end - time_start)/60, 3)
        
        model.save_weights(model_path)
        
        test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
        model_metrics = {
            'model_id' : model_id,
            'cv_idx'   : cv_idx,
            **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
            'train_y'     : train_y.shape[-1],
            'test_size'   : test_x.shape[0],
            **params,
            'regularizer_input' : params['regularizer']['input'],
            'regularizer_hidden' : params['regularizer']['hidden'],
            'regularizer_bias' : params['regularizer']['bias'],
            'training_time': training_time,
            'test_loss': test_loss,
            'accuracy': accuracy,
            **{f'precision_{x}': precision[x] for x in range(len(precision))},
            **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
            **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
        model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
        model_metrics.to_csv(metric_path, index=False)
        
    else:
        model.load_weights(model_path)
        model_metrics = pd.read_csv(metric_path, header=0)
        
    print(f'f1 score: {model_metrics.f1_1[0]}')
    
    METRICs.append(model_metrics)
    MODELs.append(model)

METRICs = pd.concat(METRICs)
METRIC_MEAN = METRICs.groupby('model_id').mean()
METRIC_STD = METRICs.groupby('model_id').std()

data x shape: (11528, 73)
data y shape: (11528, 2)
class y counts: [11100   428]
class y ratio: [0.9629 0.0371]


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 11.24


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 11.33


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 10.96


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 12.45


  train_data_sc = (train_data - x_min) / (x_max - x_min)
  test_data_sc  = (test_data - x_min)  / (x_max - x_min)


f1 score: 12.4


In [52]:
METRICs

Unnamed: 0,model_id,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
0,13,0,17760,73,2,2306,1,64,5,64,...,,0.552,0.587849,69.17,97.42,6.29,69.82,52.33,81.34,11.24
0,13,1,17760,73,2,2306,1,64,5,64,...,,0.679,0.666035,60.62,97.95,6.18,60.36,67.44,74.69,11.33
0,13,2,17760,73,2,2306,1,64,5,64,...,,1.39,0.613147,65.48,97.53,6.06,65.81,56.98,78.59,10.96
0,13,3,17760,73,2,2306,1,64,5,64,...,,1.161,0.620496,65.22,98.04,6.87,65.18,66.28,78.3,12.45
0,13,4,17760,73,2,2306,1,64,5,64,...,,0.864,0.591603,68.13,97.81,6.91,68.42,60.47,80.52,12.4


In [53]:
METRIC_MEAN

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,window_size,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
13,2.0,17760.0,73.0,2.0,2306.0,1.0,64.0,5.0,64.0,0.001,10.0,0.9292,0.615826,65.724,97.75,6.462,65.918,60.7,78.688,11.676


In [54]:
METRIC_STD

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,learning_rate,window_size,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
13,1.581139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.344673,0.031303,3.317684,0.266927,0.399337,3.637763,6.33467,2.575533,0.697445
