In [1]:
# '''
# function ConnectButton(){
#     console.log("Connect pushed");
#     document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
# }

# setInterval(ConnectButton,60000);
# '''

# from google.colab import drive
# from os import chdir

# drive.mount('/content/drive')
# project_path = '/content/drive/MyDrive/Gproject/o-linked-site-prediction-feature-augment'
# chdir(project_path)

In [2]:
import os
import numpy as np
import pandas as pd

# Random seed shared across the notebook so results are reproducible: it is passed to
# keras.utils.set_random_seed(...) and used as random_state for StratifiedShuffleSplit
# in the training cells.
SEED = 42

In [3]:
data_dir = './data/integrated_features' # we will get names from the augmented proteins

# Build the list of proteins used for the machine learning models from the CSV files in data_dir.
# - os.path.splitext is used instead of x.split('.')[1], which raised IndexError for any entry
#   without a dot (e.g. a sub-folder) and mis-handled names containing more than one dot.
# - os.listdir returns entries in arbitrary order, so the names are sorted to keep the sample
#   order (and therefore the seeded train/test splits) identical across machines and runs.
protein_names = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(data_dir)
    if os.path.splitext(file_name)[1] == '.csv'
)
print('the number of initial proteins:', len(protein_names))
print(protein_names[:10])

the number of initial proteins: 104
['A2ABU4', 'A2AHJ4', 'A2AKB9', 'A2AQ25', 'E9Q1P8', 'E9Q5G3', 'O08537', 'O09061', 'O35303', 'O70263']


## Hyperparameter optimization by cross-validation (repeated stratified shuffle splits)

In [4]:
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

import time
from IPython.display import clear_output

from ml_models import *

from keras.callbacks import EarlyStopping

# Fraction of the samples held out for testing in every split.
test_size = 0.2

# Upper bound on training epochs.
# NOTE(review): the training cells visible in this notebook pass a literal `epochs` value to
# model.fit() instead of this variable - confirm which one is intended.
epochs = 1000

# Early stopping: stop once val_loss has not improved for `patience` epochs and roll the model
# back to the weights of the best epoch.
patience = 30
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True,
    )
]


### set initial parameters

In [5]:
from functions import *

In [6]:
# Start from the project defaults and override the window size and the network architecture.
# .copy() is shallow, so nested values (such as the 'regularizer' dict) remain shared with
# default_params.
initial_params = default_params.copy()
initial_params.update(
    window_size=10,
    rnn_layers=5,
    rnn_neurons=64,
    dnn_layers=3,
    dnn_neurons=64,
)

# Print the resulting configuration, one "name : value" line per parameter.
print('initial parameters')
for param_name in initial_params:
    print(f'{param_name:<14} : {initial_params[param_name]}')

# Number of train/test splits evaluated per model configuration.
n_splits = 5

initial parameters
rnn_layers     : 5
rnn_neurons    : 64
dnn_layers     : 3
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


In [19]:
# Number of train/test splits evaluated per model configuration.
n_splits = 5

# Regularizer configurations whose saved input weights are loaded below:
#   1. no regularization at all, then
#   2. for each strength (0.00001, 0.0001) and each type (L21, L1):
#      regularizer on the input only, followed by regularizer on input and hidden.
# The bias is never regularized.
regularizer_list = [{'input': None, 'hidden': None, 'bias': None}]
for strength in ('0.00001', '0.0001'):
    for kind in ('L21', 'L1'):
        tag = f'{kind}_{strength}'
        regularizer_list.append({'input': tag, 'hidden': None, 'bias': None})
        regularizer_list.append({'input': tag, 'hidden': tag,  'bias': None})

# One single-column ('weights') table per regularizer configuration, indexed by the first CSV
# column; header=0 discards the header line stored in the file in favour of `names`.
# NOTE(review): the shape suffix here is written as '(11528 21 498)' whereas other cells of this
# notebook read '(11528, 21, 498)' (with commas) - confirm which spelling matches the files on disk.
weights_list = [
    pd.read_csv(f'./weights/SLSTM_UP_AUGMENT_ONLY_5_64_3_64_Adam_0.001_{regularizer.get("input")}_{regularizer.get("hidden")}_None_10_(11528 21 498).csv',
                index_col=0, names=['weights'], header = 0)
    for regularizer in regularizer_list
]

# Quick sanity check: number of entries, then the first and last ten index labels.
feature_names = list(weights_list[0].index)
print(len(feature_names))
display(feature_names[:10])
display(feature_names[-10:])

['number_hydrophobic_0A',
 'number_hydrophilic_0A',
 'number_polar_0A',
 'number_aromatic_0A',
 'number_aliphatic_0A',
 'number_charged_0A',
 'number_positive_0A',
 'number_negative_0A',
 'number_gly_0A',
 'number_very_small_0A']

In [None]:
# Train / evaluate LSTM classifiers on the "TOP" features (features that are voted into the top
# `ratio` of input weights by at least `min_score` of the tables in weights_list) and on the
# complementary "BOTTOM" features, for every ratio in ratio_list and every train/test split.
#
# Fixes relative to the previous version of this cell:
#   * the outer `for i, regularizer in enumerate(regularizer_list)` loop was removed: its index was
#     shadowed by the ratio loop and `regularizer` was never used, so it only repeated identical
#     work and appended duplicated metric rows (the per-model means are unchanged);
#   * `min_score` is now defined in the cell (it was referenced but never assigned);
#   * the 'input' / 'hidden' columns selected for METRICs are now actually created;
#   * loop-invariant work (CSV reads, feature vote, split generation) is hoisted out of inner loops.
ratio_list = [0.05, 0.10, 0.20, 0.40]

model_update  = False
pd.set_option('display.max_rows', None)

keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()
verbose = 0

# Minimum number of weight tables that must vote for a feature.
# NOTE(review): no definition of `min_score` is visible in this notebook; a value already present
# in the kernel namespace is respected, otherwise 1 is used - TODO confirm the intended threshold.
min_score = globals().get('min_score', 1)

# Columns kept from each per-split metrics row.
metric_columns = ['model_id', 'cv_idx', 'input', 'hidden',
                  'train_2', 'rnn_layers', 'dnn_layers',
                  'f1_1', 'precision_1', 'recall_1', 'training_time', 'test_loss', 'accuracy']

# Read every protein table once instead of once per model configuration.
protein_tables = [pd.read_csv(f'./data/integrated_features/{name}.csv') for name in protein_names]

all_features = list(weights_list[0].index)

METRICs = []

for i, ratio in enumerate(ratio_list):
    # Vote: for each weight table keep the top `ratio` rows, then count per feature in how many
    # tables it was kept with a positive weight (features absent from a table's top slice are NaN
    # after the concat and therefore do not count).
    top_weights = pd.concat([weights.sort_values(by = 'weights', ascending = False)[:int(len(weights)*ratio)] for weights in weights_list], axis=1)
    weight_scores = (top_weights > 0).sum(axis=1).sort_values(ascending=False)

    best_features = list(weight_scores[weight_scores >= min_score].index)
    best_feature_set = set(best_features)
    not_in_best   = [x for x in all_features if x not in best_feature_set]

    for j, top_bottom in enumerate(['TOP', 'BOTTOM']):
        model_id = i * 10 + j + 1

        display(weight_scores[weight_scores >= min_score])

        model_type = f'{top_bottom}_{ratio}'

        # set input features
        x_cts = best_features if top_bottom == 'TOP' else not_in_best
        x_cat = []
        x_var = x_cts + x_cat

        # set output feature
        y_cts = []
        y_cat = ['positivity']
        y_var = y_cts + y_cat

        if not x_var:
            # Nothing to train on (e.g. no feature reached min_score): skip instead of failing later.
            print(f'skip {model_type}: no input features selected')
            continue

        params = initial_params.copy()

        # Build one sample per S/T residue: a window of input features around the residue and
        # the one-hot label of that residue.
        data_x = []
        data_y = []
        for data in protein_tables:
            ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]

            # get X dataset
            x_onehot = get_onehots(data[x_var], columns = x_cat)
            x_features = list(x_onehot.columns)

            # get Y dataset
            y_onehot = get_onehots(data[y_var], columns = y_cat)
            y_labels = list(y_onehot.columns)

            for idx in ST_idx:
                window_x = np.array(get_window(x_onehot, idx, params['window_size']))
                label_y  = np.array(y_onehot.iloc[idx])

                data_x.append(window_x)
                data_y.append(label_y)

        data_x = np.array(data_x)
        data_y = np.array(data_y)

        # The splitter is seeded, so the splits only depend on the data set: generate them once.
        splitter = StratifiedShuffleSplit(n_splits = n_splits, test_size = test_size, random_state = SEED)
        splits = list(splitter.split(data_x, data_y))

        for cv_idx, (train_idx, test_idx) in enumerate(splits):
            clear_output(wait=True)
            if METRICs:
                display(pd.concat(METRICs).drop(['input', 'hidden'], axis=1).groupby('model_id').mean())
            else:
                display(METRICs)

            print('data x shape:', data_x.shape)
            print(params)

            # Throwaway model build kept on purpose: the original author observed that it helps
            # to reproduce identical training results (reason unknown).
            model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)

            train_x = data_x[train_idx]
            train_y = data_y[train_idx]

            test_x = data_x[test_idx]
            test_y = data_y[test_idx]

            train_x, train_y = upsample_data(train_x, train_y) # up-sample the training dataset
            train_x, test_x = data_scaling(train_x, test_x)

            model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)
            model_name = name_model(f'{model_type}', params)
            print('model_name:', model_name)

            model_folder  = f'./models/{model_name}_{data_x.shape}'
            if not os.path.exists(model_folder):
                os.makedirs(model_folder)
            model_path    = f'{model_folder}/{cv_idx}.h5'
            metric_path   = f'{model_folder}/{cv_idx}.csv'

            if not os.path.exists(model_path) or model_update:
                # No cached model (or retraining requested): train, save weights and metrics.
                # NOTE(review): the literal 10000 differs from the notebook-level `epochs` setting;
                # confirm which is intended.
                # validation_split is test_size/(1-test_size) so that the validation part is
                # test_size of the full (pre-upsampling) data set size.
                time_start = time.time()
                history = model.fit(train_x, train_y, verbose=verbose,
                                    epochs = 10000, callbacks = callbacks,
                                    validation_split = test_size/(1-test_size))
                time_end = time.time()
                training_time = round((time_end - time_start)/60, 3)  # minutes

                model.save_weights(model_path)

                test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                model_metrics = {
                    'model_id' : model_id,
                    'cv_idx'   : cv_idx,
                    **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                    'train_y'     : train_y.shape[-1],
                    'test_size'   : test_x.shape[0],
                    **params,
                    'regularizer_input' : params['regularizer']['input'],
                    'regularizer_hidden' : params['regularizer']['hidden'],
                    'regularizer_bias' : params['regularizer']['bias'],
                    'training_time': training_time,
                    'test_loss': test_loss,
                    'accuracy': accuracy,
                    **{f'precision_{x}': precision[x] for x in range(len(precision))},
                    **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                    **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
                model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                model_metrics.to_csv(metric_path, index=False)

            else:
                # Cached run: reuse the stored weights and metrics, refreshing only the id.
                model.load_weights(model_path)
                model_metrics = pd.read_csv(metric_path, header=0)
                model_metrics['model_id'] = model_id

            print(f'f1 score: {model_metrics.f1_1[0]}')

            model_metrics['ratio'] = ratio
            # Regularizer actually used by the trained model (taken from params); these columns
            # are required by the column selection below.
            model_metrics['input']  = params['regularizer']['input']
            model_metrics['hidden'] = params['regularizer']['hidden']
            METRICs.append(model_metrics[metric_columns])
            

In [None]:
# Train / evaluate LSTM classifiers using, for each regularizer configuration, only the features
# whose saved input weights are in the top `ratio` (largest weights first).
#
# Fixes relative to the previous version of this cell:
#   * the weight tables are read with header=0, consistent with the cell that builds weights_list;
#     without it the header line of the CSV was parsed as a data row;
#   * the running summary drops the non-numeric 'input' / 'hidden' columns before averaging,
#     consistent with the TOP/BOTTOM training cell (mean() over string/None columns is not
#     well defined);
#   * protein tables and weight tables are read once instead of on every loop iteration.
model_update  = False
pd.set_option('display.max_rows', None)

keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()
verbose = 0

# Columns kept from each per-split metrics row.
metric_columns = ['model_id', 'cv_idx', 'input', 'hidden',
                  'train_2', 'rnn_layers', 'dnn_layers',
                  'f1_1', 'precision_1', 'recall_1', 'training_time', 'test_loss', 'accuracy']

# Loop-invariant inputs: every protein table and one weight table per regularizer configuration.
protein_tables = [pd.read_csv(f'./data/integrated_features/{name}.csv') for name in protein_names]

# NOTE(review): the cell that builds weights_list spells the shape suffix '(11528 21 498)'
# (no commas) - confirm which spelling matches the files on disk.
weights_per_regularizer = [
    pd.read_csv(f'./weights/SLSTM_UP_AUGMENT_ONLY_5_64_3_64_Adam_0.001_{regularizer.get("input")}_{regularizer.get("hidden")}_None_10_(11528, 21, 498).csv',
                index_col=0, names=['weights'], header=0)
    for regularizer in regularizer_list
]

METRICs = []
# Loop order (split -> regularizer -> ratio) is kept as-is: models are created in this order and
# the global random state is consumed accordingly.
for cv_idx in range(n_splits):
    for i, regularizer in enumerate(regularizer_list):
        for j, ratio in enumerate(ratio_list):
            clear_output(wait=True)
            if METRICs:
                display(pd.concat(METRICs, axis=0).drop(['input', 'hidden'], axis=1).groupby('model_id').mean())
            else:
                display(METRICs)

            model_id = 10 * i + j + 1

            # Keep the `ratio` fraction of features with the largest weights.
            weights = weights_per_regularizer[i]
            selected_features = weights.sort_values(by = 'weights', ascending = False).iloc[ : int(len(weights) * ratio)]
            selected_features = list(selected_features.index)

            model_type = f'SELECTED_{ratio}_{regularizer.get("input")}_{regularizer.get("hidden")}'

            # set input features
            x_cts = selected_features
            x_cat = []
            x_var = x_cts + x_cat

            # set output feature
            y_cts = []
            y_cat = ['positivity']
            y_var = y_cts + y_cat

            # NOTE(review): `regularizer` only selects the weight file and names the model; the
            # model itself is built from initial_params - confirm this is intended.
            params = initial_params.copy()

            # Build one sample per S/T residue: a window of input features around the residue
            # and the one-hot label of that residue.
            data_x = []
            data_y = []
            for data in protein_tables:
                ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]

                # get X dataset
                x_onehot = get_onehots(data[x_var], columns = x_cat)
                x_features = list(x_onehot.columns)

                # get Y dataset
                y_onehot = get_onehots(data[y_var], columns = y_cat)
                y_labels = list(y_onehot.columns)

                for idx in ST_idx:
                    window_x = np.array(get_window(x_onehot, idx, params['window_size']))
                    label_y  = np.array(y_onehot.iloc[idx])

                    data_x.append(window_x)
                    data_y.append(label_y)

            data_x = np.array(data_x)
            data_y = np.array(data_y)

            print('data x shape:', data_x.shape)
            print(params)

            # Throwaway model build kept on purpose: the original author observed that it helps
            # to reproduce identical training results (reason unknown).
            model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)

            splitter = StratifiedShuffleSplit(n_splits = n_splits, test_size = test_size, random_state = SEED)
            train_idx, test_idx = list(splitter.split(data_x, data_y))[cv_idx]

            train_x = data_x[train_idx]
            train_y = data_y[train_idx]

            test_x = data_x[test_idx]
            test_y = data_y[test_idx]

            train_x, test_x = data_scaling(train_x, test_x)

            model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)
            model_name = name_model(f'{model_type}', params)
            print('model_name:', model_name)

            model_folder  = f'./models/{model_name}_{data_x.shape}'
            if not os.path.exists(model_folder):
                os.makedirs(model_folder)
            model_path    = f'{model_folder}/{cv_idx}.h5'
            metric_path   = f'{model_folder}/{cv_idx}.csv'

            if not os.path.exists(model_path) or model_update:
                # No cached model (or retraining requested): train, save weights and metrics.
                # NOTE(review): the literal 10000 differs from the notebook-level `epochs` setting;
                # confirm which is intended.
                time_start = time.time()
                history = model.fit(train_x, train_y, verbose=verbose,
                                    epochs = 10000, callbacks = callbacks,
                                    validation_split = test_size/(1-test_size))
                time_end = time.time()
                training_time = round((time_end - time_start)/60, 3)  # minutes

                model.save_weights(model_path)

                test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                model_metrics = {
                    'model_id' : model_id,
                    'cv_idx'   : cv_idx,
                    **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                    'train_y'     : train_y.shape[-1],
                    'test_size'   : test_x.shape[0],
                    **params,
                    'regularizer_input' : params['regularizer']['input'],
                    'regularizer_hidden' : params['regularizer']['hidden'],
                    'regularizer_bias' : params['regularizer']['bias'],
                    'training_time': training_time,
                    'test_loss': test_loss,
                    'accuracy': accuracy,
                    **{f'precision_{x}': precision[x] for x in range(len(precision))},
                    **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                    **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
                model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                model_metrics.to_csv(metric_path, index=False)

            else:
                # Cached run: reuse the stored weights and metrics, refreshing only the id.
                model.load_weights(model_path)
                model_metrics = pd.read_csv(metric_path, header=0)
                model_metrics['model_id'] = model_id

            print(f'f1 score: {model_metrics.f1_1[0]}')

            # Tag the row with the configuration that selected the features.
            model_metrics['ratio'] = ratio
            model_metrics['input']  = regularizer.get("input", None)
            model_metrics['hidden']  = regularizer.get("hidden", None)
            METRICs.append(model_metrics[metric_columns])

[]

Unnamed: 0_level_0,cv_idx,train_0,train_1,train_2,train_y,test_size,rnn_layers,rnn_neurons,dnn_layers,dnn_neurons,...,window_size,training_time,test_loss,accuracy,precision_0,precision_1,recall_0,recall_1,f1_0,f1_1
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,9222.0,21.0,18.0,2.0,2306.0,2.0,64.0,3.0,64.0,...,10.0,3.951,0.37875,87.442,97.156,10.802,89.586,32.094,93.178,15.878
2,2.0,9222.0,21.0,18.0,2.0,2306.0,3.0,64.0,3.0,64.0,...,10.0,12.6398,0.317314,90.4,97.13,16.892,92.776,29.07,94.804,20.052
3,2.0,9222.0,21.0,18.0,2.0,2306.0,4.0,64.0,3.0,64.0,...,10.0,18.0984,0.273853,94.006,96.984,22.05,96.784,22.326,96.88,21.662


data x shape: (11528, 21, 18)
data y shape: (11528, 2)
class y counts: [11100   428]
class y ratio: [0.9629 0.0371]
f1 score: 22.07
f1 score: 22.75
f1 score: 22.34
f1 score: 25.14


KeyboardInterrupt: 

In [None]:
# Control experiment for the top-weight feature selection: train / evaluate LSTM classifiers
# using, for each regularizer configuration, only the features whose saved input weights are in
# the bottom `ratio` (smallest weights first). Models are named with the 'SELECTED_C_' prefix.
#
# Fixes relative to the previous version of this cell:
#   * the weight tables are read with header=0, consistent with the cell that builds weights_list;
#     without it the header line of the CSV was parsed as a data row;
#   * the running summary drops the non-numeric 'input' / 'hidden' columns before averaging,
#     consistent with the TOP/BOTTOM training cell (mean() over string/None columns is not
#     well defined);
#   * protein tables and weight tables are read once instead of on every loop iteration.
model_update  = False
pd.set_option('display.max_rows', None)

keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()
verbose = 0

# Columns kept from each per-split metrics row.
metric_columns = ['model_id', 'cv_idx', 'input', 'hidden',
                  'train_2', 'rnn_layers', 'dnn_layers',
                  'f1_1', 'precision_1', 'recall_1', 'training_time', 'test_loss', 'accuracy']

# Loop-invariant inputs: every protein table and one weight table per regularizer configuration.
protein_tables = [pd.read_csv(f'./data/integrated_features/{name}.csv') for name in protein_names]

# NOTE(review): the cell that builds weights_list spells the shape suffix '(11528 21 498)'
# (no commas) - confirm which spelling matches the files on disk.
weights_per_regularizer = [
    pd.read_csv(f'./weights/SLSTM_UP_AUGMENT_ONLY_5_64_3_64_Adam_0.001_{regularizer.get("input")}_{regularizer.get("hidden")}_None_10_(11528, 21, 498).csv',
                index_col=0, names=['weights'], header=0)
    for regularizer in regularizer_list
]

METRICs = []
# Loop order (split -> regularizer -> ratio) is kept as-is: models are created in this order and
# the global random state is consumed accordingly.
for cv_idx in range(n_splits):
    for i, regularizer in enumerate(regularizer_list):
        for j, ratio in enumerate(ratio_list):
            clear_output(wait=True)
            if METRICs:
                display(pd.concat(METRICs, axis=0).drop(['input', 'hidden'], axis=1).groupby('model_id').mean())
            else:
                display(METRICs)

            model_id = 10 * i + j + 1

            # Keep the `ratio` fraction of features with the smallest weights.
            weights = weights_per_regularizer[i]
            selected_features = weights.sort_values(by = 'weights', ascending = True).iloc[ : int(len(weights) * ratio)]
            selected_features = list(selected_features.index)

            model_type = f'SELECTED_C_{ratio}_{regularizer.get("input")}_{regularizer.get("hidden")}'

            # set input features
            x_cts = selected_features
            x_cat = []
            x_var = x_cts + x_cat

            # set output feature
            y_cts = []
            y_cat = ['positivity']
            y_var = y_cts + y_cat

            # NOTE(review): `regularizer` only selects the weight file and names the model; the
            # model itself is built from initial_params - confirm this is intended.
            params = initial_params.copy()

            # Build one sample per S/T residue: a window of input features around the residue
            # and the one-hot label of that residue.
            data_x = []
            data_y = []
            for data in protein_tables:
                ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]

                # get X dataset
                x_onehot = get_onehots(data[x_var], columns = x_cat)
                x_features = list(x_onehot.columns)

                # get Y dataset
                y_onehot = get_onehots(data[y_var], columns = y_cat)
                y_labels = list(y_onehot.columns)

                for idx in ST_idx:
                    window_x = np.array(get_window(x_onehot, idx, params['window_size']))
                    label_y  = np.array(y_onehot.iloc[idx])

                    data_x.append(window_x)
                    data_y.append(label_y)

            data_x = np.array(data_x)
            data_y = np.array(data_y)

            print('data x shape:', data_x.shape)
            print(params)

            # Throwaway model build kept on purpose: the original author observed that it helps
            # to reproduce identical training results (reason unknown).
            model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)

            splitter = StratifiedShuffleSplit(n_splits = n_splits, test_size = test_size, random_state = SEED)
            train_idx, test_idx = list(splitter.split(data_x, data_y))[cv_idx]

            train_x = data_x[train_idx]
            train_y = data_y[train_idx]

            test_x = data_x[test_idx]
            test_y = data_y[test_idx]

            train_x, test_x = data_scaling(train_x, test_x)

            model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)
            model_name = name_model(f'{model_type}', params)

            model_folder  = f'./models/{model_name}_{data_x.shape}'
            if not os.path.exists(model_folder):
                os.makedirs(model_folder)
            model_path    = f'{model_folder}/{cv_idx}.h5'
            metric_path   = f'{model_folder}/{cv_idx}.csv'

            if not os.path.exists(model_path) or model_update:
                # No cached model (or retraining requested): train, save weights and metrics.
                # NOTE(review): the literal 10000 differs from the notebook-level `epochs` setting;
                # confirm which is intended.
                time_start = time.time()
                history = model.fit(train_x, train_y, verbose=verbose,
                                    epochs = 10000, callbacks = callbacks,
                                    validation_split = test_size/(1-test_size))
                time_end = time.time()
                training_time = round((time_end - time_start)/60, 3)  # minutes

                model.save_weights(model_path)

                test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                model_metrics = {
                    'model_id' : model_id,
                    'cv_idx'   : cv_idx,
                    **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                    'train_y'     : train_y.shape[-1],
                    'test_size'   : test_x.shape[0],
                    **params,
                    'regularizer_input' : params['regularizer']['input'],
                    'regularizer_hidden' : params['regularizer']['hidden'],
                    'regularizer_bias' : params['regularizer']['bias'],
                    'training_time': training_time,
                    'test_loss': test_loss,
                    'accuracy': accuracy,
                    **{f'precision_{x}': precision[x] for x in range(len(precision))},
                    **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                    **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
                model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                model_metrics.to_csv(metric_path, index=False)

            else:
                # Cached run: reuse the stored weights and metrics, refreshing only the id.
                model.load_weights(model_path)
                model_metrics = pd.read_csv(metric_path, header=0)
                model_metrics['model_id'] = model_id

            print(f'f1 score: {model_metrics.f1_1[0]}')

            # Tag the row with the configuration that selected the features.
            model_metrics['ratio'] = ratio
            model_metrics['input']  = regularizer.get("input", None)
            model_metrics['hidden']  = regularizer.get("hidden", None)
            METRICs.append(model_metrics[metric_columns])