In [1]:
# '''
# function ConnectButton(){
#     console.log("Connect pushed");
#     document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
# }

# setInterval(ConnectButton,60000);
# '''

# from google.colab import drive
# from os import chdir

# drive.mount('/content/drive')
# project_path = '/content/drive/MyDrive/Gproject/o-linked-site-prediction-feature-augment'
# chdir(project_path)

In [2]:
import os
import numpy as np
import pandas as pd

# Single random seed reused by the seeded steps below (Keras seeding, train/test split)
SEED: int = 42

In [3]:
data_dir = './data/integrated_features' # protein names are taken from the augmented feature files

# Collect the proteins to model: one entry per '<name>.csv' file found in data_dir.
# os.path.splitext replaces the former x.split('.')[1] test, which raised IndexError
# for any directory entry without a '.' in its name. Note that a name containing
# extra dots (e.g. 'P1.v2.csv') is now kept as 'P1.v2' instead of being skipped.
# The result is sorted because os.listdir returns entries in arbitrary order, and
# the order of proteins determines the order of samples fed to the seeded split.
protein_names = sorted(
    stem
    for stem, extension in map(os.path.splitext, os.listdir(data_dir))
    if extension == '.csv'
)
print('the number of initial proteins:', len(protein_names))
print(protein_names[:10])

the number of initial proteins: 104
['A2ABU4', 'A2AHJ4', 'A2AKB9', 'A2AQ25', 'E9Q1P8', 'E9Q5G3', 'O08537', 'O09061', 'O35303', 'O70263']


## Hyperparameter optimization by K-fold cross-validation

In [4]:
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

import time
from IPython.display import clear_output

from ml_models import *

from keras.callbacks import EarlyStopping

# Training budget.
# NOTE(review): the search cell passes a literal `epochs = 10000` to model.fit
# instead of this variable -- confirm which value is intended.
epochs = 1000

# Early stopping: watch the validation loss, give up after `patience` epochs
# and keep the best weights rather than the last ones.
patience = 30
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True,
    )
]

# Fraction of the samples held out as the test split.
test_size = 0.2


### set initial parameters

In [5]:
from functions import *

In [27]:
# Baseline hyperparameters: the project defaults with the window size overridden.
# (assumes default_params is a plain dict -- TODO confirm)
initial_params = {**default_params, 'window_size': 10}

print('initial parameters')
for key, value in initial_params.items():
    print(f'{key:<14} : {value}')

# Candidate values for each hyperparameter explored by the search cell.
search_space = dict(
    rnn_layers = [2, 3, 4, 5],
    dnn_layers = [2, 3, 4, 5],
)

initial parameters
rnn_layers     : 1
rnn_neurons    : 64
dnn_layers     : 3
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


In [28]:
# Fractions of the weight-ranked features to keep during feature selection.
ratio_list = [0.05, 0.10, 0.20, 0.40, 0.80]

# Regularizer tags identifying the saved weight files: for each penalty, one
# variant applied to the input only and one applied to input and hidden layers.
regularizer_list = [
    {'input': penalty, 'hidden': hidden, 'bias': None}
    for penalty in ('L21_0.00001', 'L1_0.0001')
    for hidden in (None, penalty)
]

# 'WITH' adds the 'residue' column to the model inputs, 'NO' leaves it out.
yes_no = ['WITH', 'NO']

In [37]:
# Hyperparameter search (layer counts) for LSTM classifiers on weight-selected features.
# For every combination of sequence usage (yes_no), selection ratio (ratio_list) and
# regularizer tag (regularizer_list) this cell:
#   1. picks the top-`ratio` features from a saved weight file,
#   2. builds one windowed sample per S/T residue of every protein,
#   3. holds out a stratified test split,
#   4. sweeps each entry of search_space, one parameter at a time, with 5-fold CV,
#   5. appends the configuration with the highest mean f1_1 to ALL_METRICS.
# Trained weights and their metrics are cached under ./models and reused on re-runs.
model_update  = False  # True forces retraining even when cached results exist

keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()
verbose = 0

# Only features whose name contains one of these substrings are kept.
search_terms = ['0A', '5A', '10A', '15A', '20A', '25A']
model_id = 1
ALL_METRICS = []

for yn in yes_no:
    for ratio in ratio_list:
        for regularizer in regularizer_list:
            METRICs = []
            METRIC_MEAN = []

            # The regularizer tag only selects which saved weight file ranks the
            # features; params['regularizer'] itself is left at its initial value.
            weights = pd.read_csv(f'./weights/SLSTM_UP_AUGMENT_KFOLD_4_64_3_64_Adam_0.001_{regularizer.get("input")}_{regularizer.get("hidden")}_None_10_(11528, 21, 518).csv', 
                                index_col=0, names=['weights'])
            # Keep the top `ratio` fraction of features by weight, then filter by name.
            selected_features = weights.sort_values(by = 'weights', ascending = False).iloc[ : int(len(weights) * ratio)]
            selected_features = [x for x in selected_features.weights.index if any(term in x for term in search_terms)]

            model_type = f'SELECTED_{yn}_SEQ_{regularizer.get("input")}_{regularizer.get("hidden")}'

            # set input features
            x_cts = selected_features
            x_cat = ['residue'] if yn == 'WITH' else []
            x_var = x_cts + x_cat

            # set output feature
            y_cts = []
            y_cat = ['positivity']
            y_var = y_cts + y_cat

            params = initial_params.copy()

            data_x = []
            data_y = []
            for name in protein_names:
                data = pd.read_csv(f'./data/integrated_features/{name}.csv')
                # Row positions of serine/threonine residues: one sample per such residue.
                ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]

                # get X dataset
                x_onehot = get_onehots(data[x_var], columns = x_cat)
                x_features = list(x_onehot.columns)

                # get Y dataset
                y_onehot = get_onehots(data[y_var], columns = y_cat)
                y_labels = list(y_onehot.columns)

                for idx in ST_idx:
                    window_x = np.array(get_window(x_onehot, idx, params['window_size']))
                    label_y  = np.array(y_onehot.iloc[idx])

                    data_x.append(window_x)
                    data_y.append(label_y)

            # Stack the collected samples into arrays. Everything below relies on
            # `.shape`, index-array selection and `.sum(0)`, none of which work on
            # the plain Python lists built above.
            # (assumes get_window returns equally shaped windows -- TODO confirm)
            data_x = np.array(data_x)
            data_y = np.array(data_y)

            model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params) # I don't know why, but this row is helping producing the same training result of a neural network

            # Stratified hold-out split; the K-fold search below only sees the training part.
            splitter = StratifiedShuffleSplit(n_splits = 1, test_size = test_size, random_state = SEED)
            train_idx, test_idx = list(splitter.split(data_x, data_y))[0]

            train_x = data_x[train_idx]
            train_y = data_y[train_idx]

            test_x = data_x[test_idx]
            test_y = data_y[test_idx]

            train_x, test_x = data_scaling(train_x, test_x)

            for param_name, space in search_space.items():
                for point in space:
                    # Refresh the progress display: best rows so far plus the running means.
                    clear_output(wait=True)
                    if ALL_METRICS:
                        display(pd.concat(ALL_METRICS, axis=0))
                    else:
                        display(ALL_METRICS)
                    display(METRIC_MEAN)
                    params[param_name] = point

                    print('data x shape:', data_x.shape)
                    print('data y shape:', data_y.shape)
                    print('class y counts:', data_y.sum(0))
                    print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')

                    splitter_kf = KFold(n_splits = 5)

                    for cv_idx, (train_idx_kf, test_idx_kf) in enumerate(splitter_kf.split(train_x, train_y)):
                        train_x_kf, train_y_kf = train_x[train_idx_kf], train_y[train_idx_kf]
                        test_x_kf, test_y_kf   = train_x[test_idx_kf],  train_y[test_idx_kf]

                        # up-sample the training dataset
                        train_x_kf, train_y_kf = upsample_data(train_x_kf, train_y_kf)

                        model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)
                        model_name = name_model(f'{model_type}_KFOLD', params)

                        model_folder  = f'./models/{model_name}_{data_x.shape}'
                        os.makedirs(model_folder, exist_ok=True)
                        model_path    = f'{model_folder}/{cv_idx}.h5'
                        metric_path   = f'{model_folder}/{cv_idx}.csv'

                        # Reuse the cache only when BOTH the weights and the metrics file
                        # exist: a run interrupted after save_weights but before to_csv
                        # leaves weights without metrics, which previously made the
                        # cached branch fail in pd.read_csv.
                        cache_complete = os.path.exists(model_path) and os.path.exists(metric_path)

                        if model_update or not cache_complete:
                            time_start = time.time()
                            # NOTE(review): the epoch cap is the literal 10000, not the
                            # `epochs` variable defined with the callbacks -- confirm.
                            history = model.fit(train_x_kf, train_y_kf, verbose=verbose, 
                                                epochs = 10000, callbacks = callbacks,
                                                validation_data = (test_x_kf, test_y_kf))
                            time_end = time.time()
                            training_time = round((time_end - time_start)/60, 3)  # minutes

                            model.save_weights(model_path)

                            # NOTE(review): metrics are computed on the held-out split
                            # (test_x, test_y), not on this fold's validation part, and
                            # these scores drive the hyperparameter choice below --
                            # confirm that this is intended.
                            test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                            model_metrics = {
                                'model_id' : model_id,
                                'cv_idx'   : cv_idx,
                                **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                                'train_y'     : train_y.shape[-1],
                                'test_size'   : test_x.shape[0],
                                **params,
                                'regularizer_input' : params['regularizer']['input'],
                                'regularizer_hidden' : params['regularizer']['hidden'],
                                'regularizer_bias' : params['regularizer']['bias'],
                                'training_time': training_time,
                                'test_loss': test_loss,
                                'accuracy': accuracy,
                                **{f'precision_{x}': precision[x] for x in range(len(precision))},
                                **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                                **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}

                            model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                            model_metrics.to_csv(metric_path, index=False)

                        else:
                            model.load_weights(model_path)
                            model_metrics = pd.read_csv(metric_path, header=0)

                        print(f'f1 score: {model_metrics.f1_1[0]}')

                        # Cached metrics may carry a model_id from an earlier run; overwrite it
                        # so the groupby below averages the folds of the current configuration.
                        model_metrics['model_id'] = model_id
                        METRICs.append(model_metrics)

                    # Mean over folds for every configuration tried so far in this combination.
                    METRIC_MEAN = pd.concat(METRICs).groupby('model_id').mean()
                    model_id += 1

                # Fix this parameter at the value of the best configuration so far
                # (highest mean f1_1) before sweeping the next parameter.
                if param_name in ['learning_rate']: # for float-type parameters
                    best_value = float(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
                    params[param_name] = best_value

                else: # for int-type parameters
                    best_value = int(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[0][param_name])
                    params[param_name] = best_value

            # Keep the single best configuration of this (yn, ratio, regularizer) combination.
            ALL_METRICS.append(METRIC_MEAN.sort_values('f1_1', ascending=False).iloc[[0]])

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_2,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,...,regularizer_bias,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,2.0,400.0,21.0,38.0,2.0,100.0,5.0,64.0,5.0,64.0,...,,0.1736,0.223039,95.6,99.38,7.5,96.162,40.0,97.722,12.444
13,2.0,400.0,21.0,41.0,2.0,100.0,5.0,64.0,2.0,64.0,...,,0.1816,0.386105,86.6,99.794,6.044,86.668,80.0,92.544,11.034


Unnamed: 0_level_0,cv_idx,train_0,train_1,train_2,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,...,window_size,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,2.0,400.0,21.0,43.0,2.0,100.0,2.0,64.0,3.0,64.0,...,10.0,0.0856,0.487073,78.2,98.982,0.87,78.79,20.0,87.572,1.666


data x shape: (500, 21, 43)
data y shape: (500, 2)
class y counts: [494   6]
class y ratio: [0.988 0.012]
f1 score: 0.0
f1 score: 0.0


KeyboardInterrupt: 

In [None]:
# Stack the best row kept for each (yn, ratio, regularizer) combination into one table.
pd.concat(ALL_METRICS, axis=0)

cv_idx                  2.000000
train_0               400.000000
train_1                21.000000
train_2                38.000000
train_y                 2.000000
test_size             100.000000
rnn_layers              5.000000
rnn_neurons            64.000000
dnn_layers              5.000000
dnn_neurons            64.000000
learning_rate           0.001000
window_size            10.000000
regularizer_input            NaN
regularizer_hidden           NaN
regularizer_bias             NaN
training_time           0.173600
test_loss               0.223039
accuracy               95.600000
precision_0            99.380000
precision_1             7.500000
recall_0               96.162000
recall_1               40.000000
f1_0                   97.722000
f1_1                   12.444000
Name: 8, dtype: float64

In [36]:
# Debug check: weights path assigned for the last fold the search loop reached.
model_path

'./models/SELECTED_WITH_SEQ_L21_0.00001_L21_0.00001_KFOLD_3_64_3_64_Adam_0.001_None_None_None_10_(500, 21, 41)/1.h5'

In [35]:
# Debug check: whether the weights file at model_path is already on disk.
os.path.exists(model_path)

True