In [1]:
# Disabled Google Colab helpers, kept for reference only (nothing in this cell executes):
#   1. a browser-console JavaScript snippet that clicks the Colab connect button every 60 s
#   2. mounting Google Drive and changing the working directory to the project folder
# '''
# function ConnectButton(){
#     console.log("Connect pushed");
#     document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
# }

# setInterval(ConnectButton,60000);
# '''

# from google.colab import drive
# from os import chdir

# drive.mount('/content/drive')
# project_path = '/content/drive/MyDrive/Gproject/o-linked-site-prediction-feature-augment'
# chdir(project_path)

In [2]:
# Print the local devices (CPU / GPU) that TensorFlow reports.
# NOTE(review): device_lib is imported from tensorflow.python, i.e. outside the public
# `tf.*` namespace; tf.config.list_physical_devices() is the documented alternative.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2722095239187598168
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 9973006336
locality {
  bus_id: 1
  links {
  }
}
incarnation: 3649729354498515290
physical_device_desc: "device: 0, name: NVIDIA RTX 4000 Ada Generation Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9"
xla_global_id: 416903419
]


In [3]:
import os
import numpy as np
import pandas as pd

# Single global seed, reused wherever a seed / random_state is needed so that results are reproducible
SEED = 42

In [4]:
data_dir = './data/integrated_features' # we will get names from the augmented proteins

# Protein names are the stems of the CSV files found in data_dir; they are used
# later to locate each protein's feature table.
#  - os.path.splitext replaces x.split('.'): entries without a dot (for example a
#    sub-directory) no longer raise IndexError, and stems that themselves contain
#    dots are kept intact instead of being truncated at the first dot.
#  - sorted() is applied because os.listdir returns entries in arbitrary order;
#    without it the sample order (and thus every train/test split) would depend
#    on the file system.
protein_names = sorted(
    stem
    for stem, extension in map(os.path.splitext, os.listdir(data_dir))
    if extension == '.csv'
)
print('the number of initial proteins:', len(protein_names))
print(protein_names[:10])

the number of initial proteins: 104
['A2ABU4', 'A2AHJ4', 'A2AKB9', 'A2AQ25', 'E9Q1P8', 'E9Q5G3', 'O08537', 'O09061', 'O35303', 'O70263']


## Hyperparameter optimization by K-fold cross-validation

In [5]:
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

import time
from IPython.display import clear_output

from ml_models import *

# NOTE(review): `epochs` is not referenced by the training loop visible in this notebook;
# the model.fit call there passes the literal epochs = 10000 instead.
epochs = 1000
from keras.callbacks import EarlyStopping
# Number of epochs without improvement of the monitored value before training is stopped
patience = 30
# Stop when val_loss stalls and restore the weights of the best epoch
callbacks = [EarlyStopping(patience=patience, restore_best_weights=True, monitor='val_loss')]

# Fraction of the samples held out as the test set in every split
test_size = 0.2


### set initial parameters

In [6]:
from functions import *

In [7]:
# Baseline hyper-parameters: a copy of the shared defaults in which the window
# size and the layer / neuron counts are overridden for this experiment.
initial_params = {
    **default_params,
    'window_size': 10,
    'rnn_layers': 5,
    'rnn_neurons': 64,
    'dnn_layers': 3,
    'dnn_neurons': 64,
}

# Echo the effective settings, one "name : value" pair per line
print('initial parameters')
for key, value in initial_params.items():
    print(f'{key:<14} : {value}')

# Candidate values for the two layer-count parameters (each key gets its own list)
search_space = {
    layer_key: list(range(2, 6))
    for layer_key in ('rnn_layers', 'dnn_layers')
}

initial parameters
rnn_layers     : 5
rnn_neurons    : 64
dnn_layers     : 3
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


In [8]:
# Number of train/test resampling rounds evaluated for every configuration
n_splits = 5

# Regularizer settings to iterate over. Each entry names the regularizer applied
# to the input / hidden / bias weights as '<norm>_<lambda>' (None = no regularizer).
# Layout (the order matters because list positions are turned into model ids):
#   - entry 0: no regularizer at all
#   - then, for lambda = 0.001, 0.0001, ..., 0.00000001 (written in plain decimal
#     notation, never scientific, because the tags end up in file names):
#       L21 on input only, L21 on input + hidden,
#       L1  on input only, L1  on input + hidden
regularizer_list = [{'input': None, 'hidden': None, 'bias': None}] + [
    {
        'input': f'{norm}_{strength}',
        'hidden': f'{norm}_{strength}' if on_hidden else None,
        'bias': None,
    }
    for strength in ['0.' + '0' * zeros + '1' for zeros in range(2, 8)]
    for norm in ('L21', 'L1')
    for on_hidden in (False, True)
]

# Which end of the weight ranking the features are taken from
top_bottom = ['TOP', 'BOTTOM']
# Fractions of the ranked feature list that are kept
ratio_list = [0.05, 0.10, 0.20, 0.40, 0.60]
# ratio_list = [0.05, 0.10, 0.20, 0.40, 0.60, 0.80, 1.00]

In [9]:
# Experiment: for every combination of
#   where       (TOP / BOTTOM end of the feature-weight ranking),
#   regularizer (selects which pre-computed weights CSV provides the ranking),
#   ratio       (fraction of the ranked features that is kept),
# build the windowed S/T-residue dataset restricted to the selected features and
# train/evaluate one classifier per resampling round. Trained weights and metrics
# are cached on disk, so re-running the cell only trains what is missing.

model_update  = False  # True forces retraining even when cached weights + metrics exist
pd.set_option('display.max_rows', None)

keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()
verbose = 0

# Create the results folder up front: a missing folder would otherwise only be
# noticed at the first to_csv call, after the first batch of models was trained.
os.makedirs('./figures', exist_ok=True)

# The raw per-protein tables do not depend on the configuration, so they are read
# once here instead of once per (where, regularizer, ratio) combination.
protein_tables = {
    protein: pd.read_csv(f'./data/integrated_features/{protein}.csv')
    for protein in protein_names
}

METRICs = []
for k, where in enumerate(top_bottom):
    # TOP keeps the largest weights (descending sort), BOTTOM the smallest
    ascending = where != 'TOP'

    for i, regularizer in enumerate(regularizer_list):
        # Named *_tag rather than `input` / `hidden` so the builtin input() is not shadowed
        input_tag  = regularizer.get('input')
        hidden_tag = regularizer.get('hidden')
        # tags look like '<norm>_<lambda>'; no regularizer means lambda 0
        lambda_value = float(input_tag.split('_')[1]) if input_tag else 0

        # The ranking depends on the regularizer and on `where`, not on the ratio
        weights = pd.read_csv(f'./weights/SLSTM_UP_AUGMENT_ONLY_5_64_3_64_Adam_0.001_{input_tag}_{hidden_tag}_None_10_(11528, 21, 498).csv',
                            index_col=0, names=['weights'], header=0)
        ranked_weights = weights.sort_values(by = 'weights', ascending = ascending)

        for j, ratio in enumerate(ratio_list):
            # Unique per (where, regularizer, ratio): thousands digit = where, tens = regularizer, units = ratio
            model_id = 1000*k + 10 * i + j + 1

            # Keep the leading `ratio` share of the ranking (fraction truncated towards zero)
            selected_features = list(ranked_weights.iloc[ : int(len(weights) * ratio)].index)

            # set input features
            x_cts = selected_features
            x_cat = []
            x_var = x_cts + x_cat

            # set output feature
            y_cts = []
            y_cat = ['positivity']
            y_var = y_cts + y_cat

            params = initial_params.copy()

            # One sample per S/T residue: a window of feature rows around it and its label
            data_x = []
            data_y = []
            for name, data in protein_tables.items():
                ST_idx = np.flatnonzero(data['residue'].isin(['S', 'T']))

                # get X dataset
                x_onehot = get_onehots(data[x_var], columns = x_cat)
                x_features = list(x_onehot.columns)

                # get Y dataset
                y_onehot = get_onehots(data[y_var], columns = y_cat)
                y_labels = list(y_onehot.columns)

                for idx in ST_idx:
                    window_x = np.array(get_window(x_onehot, idx, params['window_size']))
                    label_y  = np.array(y_onehot.iloc[idx])

                    data_x.append(window_x)
                    data_y.append(label_y)

            data_x = np.array(data_x)
            data_y = np.array(data_y)

            # The splitter is seeded with a fixed random_state, so the splits are the
            # same no matter when they are generated; build them once per dataset.
            splitter = StratifiedShuffleSplit(n_splits = n_splits, test_size = test_size, random_state = SEED)
            cv_splits = list(splitter.split(data_x, data_y))

            # Naming / cache location are identical for all rounds of this configuration
            model_type = f'{where}_{input_tag}_{hidden_tag}_{lambda_value}_{ratio}'
            model_name = name_model(f'{model_type}', params)
            model_folder  = f'./models/{model_name}_{data_x.shape}'
            os.makedirs(model_folder, exist_ok=True)

            for cv_idx, (train_idx, test_idx) in enumerate(cv_splits):
                # Live progress: replace the previous output with the running per-model means
                clear_output(wait=True)
                if METRICs:
                    display(pd.concat(METRICs).groupby('model_id').mean())
                else:
                    display(METRICs)

                print('data x shape:', data_x.shape)
                print(params)

                # Throwaway model build kept from the original notebook: the author found it
                # necessary for reproducible training results. Do not remove or reorder it.
                model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params) # I don't know why, but this row is helping producing the same training result of a neural network

                train_x = data_x[train_idx]
                train_y = data_y[train_idx]

                test_x = data_x[test_idx]
                test_y = data_y[test_idx]

                train_x, train_y = upsample_data(train_x, train_y) # up-sample the training dataset
                train_x, test_x = data_scaling(train_x, test_x)

                model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)

                print('model_name:', model_name)

                model_path    = f'{model_folder}/{cv_idx}.h5'
                metric_path   = f'{model_folder}/{cv_idx}.csv'

                # A round counts as cached only when BOTH files exist. An interrupted run can
                # leave the weights file without its metrics CSV (weights are written first);
                # checking the weights alone would then crash in pd.read_csv below.
                is_cached = os.path.exists(model_path) and os.path.exists(metric_path)

                if model_update or not is_cached:
                    time_start = time.time()
                    # NOTE(review): epochs is the literal 10000 here; the `epochs` variable defined
                    # earlier in the notebook is not used. Training ends through EarlyStopping.
                    # NOTE(review): Keras takes the validation fraction from the END of
                    # train_x / train_y before shuffling. The data was just up-sampled, so
                    # confirm that upsample_data shuffles its output; otherwise the validation
                    # set may consist mostly of duplicated samples.
                    history = model.fit(train_x, train_y, verbose=verbose,
                                        epochs = 10000, callbacks = callbacks,
                                        validation_split = test_size/(1-test_size))
                    time_end = time.time()
                    training_time = round((time_end - time_start)/60, 3)  # minutes

                    model.save_weights(model_path)

                    test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
                    model_metrics = {
                        'model_id' : model_id,
                        'cv_idx'   : cv_idx,
                        **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                        'train_y'     : train_y.shape[-1],
                        'test_size'   : test_x.shape[0],
                        **params,
                        'regularizer_input' : params['regularizer']['input'],
                        'regularizer_hidden' : params['regularizer']['hidden'],
                        'regularizer_bias' : params['regularizer']['bias'],
                        'training_time': training_time,
                        'test_loss': test_loss,
                        'accuracy': accuracy,
                        **{f'precision_{x}': precision[x] for x in range(len(precision))},
                        **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                        **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
                    model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
                    model_metrics.to_csv(metric_path, index=False)

                else:
                    model.load_weights(model_path)
                    model_metrics = pd.read_csv(metric_path, header=0)
                    # the stored id may stem from a run with a different numbering scheme
                    model_metrics['model_id'] = model_id

                print(f'f1 score: {model_metrics.f1_1[0]}')

                # Numeric encodings of the configuration so that groupby(...).mean() keeps them:
                # 'L21_x' -> 21, 'L1_x' -> 1, no regularizer -> 0
                model_metrics['input']  = int(input_tag.split('_')[0][1:]) if input_tag else 0
                model_metrics['hidden']  = int(hidden_tag.split('_')[0][1:]) if hidden_tag else 0
                model_metrics['lambda']  = lambda_value
                model_metrics['where'] = 1 if where == 'TOP' else 0
                model_metrics['ratio'] = ratio

                METRICs.append(model_metrics[['model_id', 'cv_idx', 'input', 'hidden', 'lambda', 'where', 'ratio', 'f1_1',
                                            'train_2', 'precision_1', 'recall_1', 'training_time', 'test_loss', 'accuracy']])

        # Checkpoint the collected results after every regularizer setting
        collected_metrics = pd.concat(METRICs)
        collected_metrics.to_csv('./figures/top vs bottom.csv')
        collected_metrics.groupby('model_id').mean().to_csv('./figures/top vs bottom mean.csv')

clear_output(wait=True)
display(pd.concat(METRICs).drop(['input', 'hidden'], axis=1).groupby('model_id').mean())

Unnamed: 0_level_0,cv_idx,input,hidden,lambda,where,ratio,f1_1,train_2,precision_1,recall_1,training_time,test_loss,accuracy
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2.0,0.0,0.0,0.0,1.0,0.05,21.888,24.0,23.05,21.396,11.3732,0.369023,94.312
2,2.0,0.0,0.0,0.0,1.0,0.1,25.76,49.0,29.176,23.492,7.0996,0.364519,95.108
3,2.0,0.0,0.0,0.0,1.0,0.2,29.638,99.0,31.818,28.14,6.2234,0.32273,95.046
4,2.0,0.0,0.0,0.0,1.0,0.4,33.032,199.0,36.75,30.932,6.5042,0.338587,95.308
5,2.0,0.0,0.0,0.0,1.0,0.6,32.178,298.0,33.93,30.698,5.8574,0.353926,95.162
11,2.0,21.0,0.0,0.001,1.0,0.05,25.39,24.0,26.168,25.118,7.1138,0.336032,94.504
12,2.0,21.0,0.0,0.001,1.0,0.1,27.432,49.0,29.402,26.512,7.9956,0.327818,94.86
13,2.0,21.0,0.0,0.001,1.0,0.2,29.052,99.0,32.314,26.744,7.4676,0.325094,95.222
14,2.0,21.0,0.0,0.001,1.0,0.4,30.998,199.0,35.25,28.14,8.5666,0.338733,95.36
15,2.0,21.0,0.0,0.001,1.0,0.6,28.27,298.0,31.546,26.046,7.4486,0.333292,95.106


data x shape: (11528, 21, 199)
{'rnn_layers': 5, 'rnn_neurons': 64, 'dnn_layers': 3, 'dnn_neurons': 64, 'activation': 'softmax', 'loss': 'categorical_crossentropy', 'metrics': 'accuracy', 'optimizer_type': 'Adam', 'learning_rate': 0.001, 'regularizer': {'input': None, 'hidden': None, 'bias': None}, 'window_size': 10}
model_name: TOP_L1_0.00000001_L1_0.00000001_1e-08_0.4_5_64_3_64_Adam_0.001_None_None_None_10


KeyboardInterrupt: 

In [None]:
# Variability across the cross-validation rounds: per-model standard deviation of every collected column
display(pd.concat(METRICs).groupby('model_id').std())

Unnamed: 0_level_0,cv_idx,train_2,rnn_layers,dnn_layers,f1_1,precision_1,recall_1,training_time,test_loss,accuracy
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.581139,0.0,0.0,0.0,2.621942,5.225,2.518974,2.784406,0.078662,0.737747
2,1.581139,0.0,0.0,0.0,2.938838,4.723281,2.325001,3.195959,0.048952,0.586234
3,1.581139,0.0,0.0,0.0,3.921843,2.181726,5.779985,2.450426,0.034522,0.302704
4,1.581139,0.0,0.0,0.0,2.76557,2.264911,3.890171,0.731913,0.060665,0.255676
5,1.581139,0.0,0.0,0.0,1.633518,2.970712,2.521739,3.486623,0.043998,0.385785
6,1.581139,0.0,0.0,0.0,3.242134,5.79299,5.303241,2.508655,0.043182,0.681894
7,1.581139,0.0,0.0,0.0,2.575529,4.115112,3.805466,1.776679,0.086357,0.402741
8,1.581139,0.0,0.0,0.0,3.965574,3.938667,4.397422,2.54657,0.049177,0.303513
11,1.581139,0.0,0.0,0.0,1.800075,3.010136,2.112475,0.889996,0.033995,0.353596
12,1.581139,0.0,0.0,0.0,4.373732,2.78084,6.392377,2.883071,0.059725,0.219818
