In [1]:
# '''
# function ConnectButton(){
#     console.log("Connect pushed");
#     document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
# }

# setInterval(ConnectButton,60000);
# '''

# from google.colab import drive
# from os import chdir

# drive.mount('/content/drive')
# project_path = '/content/drive/MyDrive/Gproject/o-linked-site-prediction-feature-augment'
# chdir(project_path)

In [2]:
from tensorflow.python.client import device_lib

# Print every compute device TensorFlow reports for this machine (CPU/GPU entries),
# so the hardware used for the run is recorded in the notebook output.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14638260538527346589
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 9973006336
locality {
  bus_id: 1
  links {
  }
}
incarnation: 9689848030570155448
physical_device_desc: "device: 0, name: NVIDIA RTX 4000 Ada Generation Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9"
xla_global_id: 416903419
]


In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# fix the random seed so that results are reproducible
SEED = 42

In [4]:
data_dir = './data/integrated_features' # we will get names from the augmented proteins

# Protein names = file stems of every '*.csv' entry in data_dir; these are the proteins
# processed when building the machine learning models.
#  - os.path.splitext replaces x.split('.'): entries without an extension (sub-directories,
#    README, ...) are skipped instead of raising IndexError, 'name.csv.bak' is no longer
#    mistaken for a CSV, and a bare '.csv' file no longer yields an empty protein name.
#  - The listing is sorted because os.listdir returns entries in arbitrary order, and the
#    order of the proteins determines the order of the samples fed to the seeded splitter.
protein_names = sorted(
    stem
    for stem, extension in map(os.path.splitext, os.listdir(data_dir))
    if extension == '.csv'
)
print('the number of initial proteins:', len(protein_names))
print(protein_names[:10])

the number of initial proteins: 104
['A2ABU4', 'A2AHJ4', 'A2AKB9', 'A2AQ25', 'E9Q1P8', 'E9Q5G3', 'O08537', 'O09061', 'O35303', 'O70263']


## Hyperparameter optimization by K-fold cross-validation

In [5]:
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold

import time
from IPython.display import clear_output

from ml_models import *

from keras.callbacks import EarlyStopping

# Training configuration shared by the cells below.
epochs    = 1000   # epoch setting
patience  = 30     # epochs without val_loss improvement tolerated by early stopping
test_size = 0.2    # fraction of the samples held out as the test split

# Stop on stalled validation loss and roll the model back to its best weights.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True),
]


### set initial parameters

In [6]:
from functions import *

In [7]:
# Start from a copy of the shared defaults and override the window and layer settings
# used in this experiment (the defaults object itself is left untouched).
initial_params = default_params.copy()
initial_params.update(
    window_size=10,
    rnn_layers=5,
    rnn_neurons=64,
    dnn_layers=3,
    dnn_neurons=64,
)

# Echo the effective parameter set, one "name : value" line per entry.
print('initial parameters')
for key, value in initial_params.items():
    print('{:<14} : {}'.format(key, value))

# Regularizer grid explored by the sweep.
# Entry 0 is the unregularized baseline. It is followed, for every lambda from 0.001 down to
# 0.000000001 (one decade per step), by four settings in this order:
#   L21 on the input only, L21 on input + hidden, L1 on the input only, L1 on input + hidden.
# The bias is never regularized. Specs are strings of the form '<norm>_<lambda>'.
_unregularized = {'input': None, 'hidden': None, 'bias': None}

# '0.001', '0.0001', ..., '0.000000001' built as text so the spelling matches exactly.
_lambda_texts = ['0.' + '0' * zero_count + '1' for zero_count in range(2, 9)]

regularizers = [_unregularized] + [
    {'input': f'{norm}_{lam}', 'hidden': hidden_spec, 'bias': None}
    for lam in _lambda_texts
    for norm in ('L21', 'L1')
    for hidden_spec in (None, f'{norm}_{lam}')
]


initial parameters
rnn_layers     : 5
rnn_neurons    : 64
dnn_layers     : 3
dnn_neurons    : 64
activation     : softmax
loss           : categorical_crossentropy
metrics        : accuracy
optimizer_type : Adam
learning_rate  : 0.001
regularizer    : {'input': None, 'hidden': None, 'bias': None}
window_size    : 10


## set variables

In [8]:
model_type = 'SLSTM_UP_AUGMENT_ONLY'

# Load the augmented-feature table as a {feature name: dtype string} mapping.
# assumes the CSV has exactly two columns (feature name, dtype) — TODO confirm against the file.
# The former `.squeeze()` was dropped: on an (n, 2) array it does nothing unless n == 1, where it
# flattens the single row to shape (2,) so dict() no longer receives (key, value) pairs.
augmented_columns = dict(pd.read_csv('./data/augmented_columns.csv', header=0).values)
print('# of augmented features:', len(augmented_columns))

# set continuous input features (every feature whose dtype string is not 'object')
x_cts = [feature for feature, dtype in augmented_columns.items() if dtype != 'object']
print('continuous features:')
display(dict(zip(range(len(x_cts)), x_cts)))

# set categorical input features (features whose dtype string is 'object')
x_cat = [feature for feature, dtype in augmented_columns.items() if dtype == 'object']
print('categorical features:')
print(dict(zip(range(len(x_cat)), x_cat)))

# input features
x_var = x_cts + x_cat

# set continuous output feature
y_cts = []

# set categorical output feature
y_cat = ['positivity']

# output features
y_var = y_cts + y_cat

# of augmented features: 498
continuous features:


{0: 'number_hydrophobic_0A',
 1: 'number_hydrophilic_0A',
 2: 'number_polar_0A',
 3: 'number_aromatic_0A',
 4: 'number_aliphatic_0A',
 5: 'number_charged_0A',
 6: 'number_positive_0A',
 7: 'number_negative_0A',
 8: 'number_gly_0A',
 9: 'number_very_small_0A',
 10: 'number_small_0A',
 11: 'number_normal_0A',
 12: 'number_long_0A',
 13: 'number_pro_0A',
 14: 'number_A_polar_uncharged_with_hydroxyl_group_0A',
 15: 'number_b_polar_uncharged_with_amide_0A',
 16: 'number_d_negatively_charged_polar_0A',
 17: 'number_e_non_polar_suffered_0A',
 18: 'number_f_non_polar_aromatic_0A',
 19: 'number_ala_0A',
 20: 'number_cys_0A',
 21: 'number_asp_0A',
 22: 'number_glu_0A',
 23: 'number_phe_0A',
 24: 'number_his_0A',
 25: 'number_ile_0A',
 26: 'number_lys_0A',
 27: 'number_leu_0A',
 28: 'number_met_0A',
 29: 'number_asn_0A',
 30: 'number_gln_0A',
 31: 'number_arg_0A',
 32: 'number_ser_0A',
 33: 'number_thr_0A',
 34: 'number_val_0A',
 35: 'number_trp_0A',
 36: 'number_tyr_0A',
 37: 'sasa_hydrophobic_0

categorical features:
{}


# build amino acid sequence dataset

### model training with K-fold

In [9]:
# Regularizer sweep: build one windowed dataset from all proteins, then for every entry of
# `regularizers` train (or reload) one model per cross-validation split and collect its metrics.
# get_onehots, get_window, upsample_data, data_scaling, LSTM_CLS, name_model and
# metrics_classification come from the star imports (ml_models / functions); their exact
# behaviour is not visible in this notebook.
params = initial_params.copy()
keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

model_update = False  # True forces re-training even when cached weights/metrics exist

MODELs = []
METRICs = []          # one single-row metrics frame per (regularizer, split)
METRIC_MEAN = []
verbose = 0
model_id = 1          # incremented once per regularizer setting

data_x = []
data_y = []

# ---- dataset: one window per serine/threonine residue of every protein ----
for name in protein_names:
    data = pd.read_csv(f'./data/integrated_features/{name}.csv')
    ST_idx = np.where((data['residue'] == 'S') | (data['residue'] == 'T'))[0]

    # get X dataset
    x_onehot = get_onehots(data[x_var], columns = x_cat)
    x_features = list(x_onehot.columns)

    # get Y dataset
    y_onehot = get_onehots(data[y_var], columns = y_cat)
    y_labels = list(y_onehot.columns)

    for idx in ST_idx:
        window_x = np.array(get_window(x_onehot, idx, params['window_size']))
        label_y  = np.array(y_onehot.iloc[idx])

        data_x.append(window_x)
        data_y.append(label_y)

data_x = np.array(data_x)
data_y = np.array(data_y)

# Throwaway model built before the sweep. The original author observed that this line makes the
# training results reproducible without knowing why (presumably it changes the global RNG /
# layer-naming state) — do not remove without re-checking reproducibility.
model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)

print('data x shape:', data_x.shape)
print('data y shape:', data_y.shape)
print('class y counts:', data_y.sum(0))
print(f'class y ratio: {(data_y.sum(0)/len(data_y)).round(4)}')

# The sweep writes into these folders after every regularizer setting; create them up front so a
# missing folder cannot abort the run after the first five trainings have already finished.
os.makedirs('./weights', exist_ok=True)
os.makedirs('./figures', exist_ok=True)

# 5 random stratified train/test splits; the same splits are reused for every regularizer
# because random_state is fixed.
splitter = StratifiedShuffleSplit(n_splits = 5, test_size = test_size, random_state = SEED)
for regularizer in regularizers:
    params['regularizer'] = regularizer
    feature_weight = []

    # refresh the progress table: per-model mean of everything collected so far
    clear_output(wait=True)
    if METRICs:
        display(pd.concat(METRICs, axis=0).groupby('model_id').mean())
        # display(pd.concat(METRICs))
    else:
        display(METRICs)

    for cv_idx, (train_idx, test_idx) in enumerate(splitter.split(data_x, data_y)):
        train_x, train_y = data_x[train_idx], data_y[train_idx]
        train_x, train_y = upsample_data(train_x, train_y) # up-sample the training dataset
        test_x , test_y  = data_x[test_idx],  data_y[test_idx]

        train_x, test_x = data_scaling(train_x, test_x)

        model = LSTM_CLS(data_x.shape[1], data_x.shape[-1], data_y.shape[-1], params)
        model_name = name_model(f'{model_type}', params)

        # one folder per (model name, dataset shape), one weight file + one metrics file per split
        model_folder  = f'./models/{model_name}_{data_x.shape}'
        os.makedirs(model_folder, exist_ok=True)
        model_path    = f'{model_folder}/{cv_idx}.h5'
        metric_path   = f'{model_folder}/{cv_idx}.csv'

        # The cache is usable only when BOTH files exist: a run interrupted between
        # save_weights and to_csv leaves weights without metrics, which previously made the
        # read_csv in the else-branch fail. Such a split is now simply re-trained.
        cache_complete = os.path.exists(model_path) and os.path.exists(metric_path)

        if model_update or not cache_complete:
            time_start = time.time()
            # NOTE: the epoch limit is hard-coded to 10000 here (not the `epochs` variable);
            # training normally ends through the EarlyStopping callback.
            # NOTE(review): Keras takes the validation_split fraction from the END of the arrays
            # before shuffling — confirm upsample_data does not leave only duplicated samples there.
            history = model.fit(train_x, train_y, verbose=verbose,
                                epochs = 10000, callbacks = callbacks,
                                validation_split = test_size/(1-test_size))
            time_end = time.time()
            training_time = round((time_end - time_start)/60, 3)  # minutes

            model.save_weights(model_path)

            test_loss, accuracy, precision, recall, f1 = metrics_classification(model, test_x, test_y)
            model_metrics = {
                'model_id' : model_id,
                'cv_idx'   : cv_idx,
                **{f'train_{x}': train_x.shape[x] for x in range(len(train_x.shape))},
                'train_y'     : train_y.shape[-1],
                'test_size'   : test_x.shape[0],
                **params,
                'regularizer_input' : params['regularizer']['input'],
                'regularizer_hidden' : params['regularizer']['hidden'],
                'regularizer_bias' : params['regularizer']['bias'],
                'training_time': training_time,
                'test_loss': test_loss,
                'accuracy': accuracy,
                **{f'precision_{x}': precision[x] for x in range(len(precision))},
                **{f'recall_{x}'   : recall[x] for x in range(len(recall))},
                **{f'f1_{x}'       : f1[x] for x in range(len(f1))}}
            # the non-scalar / constant entries of params are dropped before saving
            model_metrics = pd.DataFrame([model_metrics]).drop(['activation', 'loss', 'metrics', 'optimizer_type', 'regularizer'], axis=1)
            model_metrics.to_csv(metric_path, index=False)

        else:
            model.load_weights(model_path)
            model_metrics = pd.read_csv(metric_path, header=0)
            model_metrics['model_id'] = model_id  # cached id may stem from a different sweep order

        # mean absolute value of the first weight array along axis 1, one value per row
        # (presumably the input kernel of the first layer, i.e. one value per input feature)
        weights = model.get_weights()
        feature_weight.append(np.abs(weights[0]).mean(1).reshape(1,-1))

        print(f'f1 score: {model_metrics.f1_1[0]}')

        # Decode the regularizer spec '<L1|L21>_<lambda>' into numeric columns:
        # norm id (1 or 21, 0 = none) for input and hidden, lambda taken from the input spec.
        reg_input  = regularizer.get("input", None)
        reg_hidden = regularizer.get("hidden", None)
        model_metrics['input']   = int(reg_input.split('_')[0][1:]) if reg_input else 0
        model_metrics['hidden']  = int(reg_hidden.split('_')[0][1:]) if reg_hidden else 0
        model_metrics['lambda']  = float(reg_input.split('_')[1]) if reg_input else 0
        METRICs.append(model_metrics[['model_id', 'cv_idx', 'input', 'hidden', 'lambda',
                                     'train_2', 'rnn_layers', 'dnn_layers',
                                     'f1_1', 'precision_1', 'recall_1', 'training_time', 'test_loss', 'accuracy']])

    # average the per-split weight magnitudes and store them per input feature
    feature_weight = np.concatenate(feature_weight, axis=0)
    feature_weight = feature_weight.mean(0)
    feature_weight = pd.Series(feature_weight, index=x_features)
    feature_weight.to_csv(f'./weights/{model_name}_{data_x.shape}.csv')

    # checkpoint the metrics collected so far (overwritten after every regularizer setting)
    metrics_so_far = pd.concat(METRICs, axis=0)
    metrics_so_far.groupby('model_id').mean().to_csv('./figures/lambda and regularizer mean.csv')
    metrics_so_far.to_csv('./figures/lambda and regularizer.csv')
    model_id += 1

# from here on METRICs is a single DataFrame instead of a list of frames
METRICs = pd.concat(METRICs, axis = 0)

Unnamed: 0_level_0,cv_idx,input,hidden,lambda,train_2,rnn_layers,dnn_layers,f1_1,precision_1,recall_1,training_time,test_loss,accuracy
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2.0,0.0,0.0,0.0,498.0,5.0,3.0,28.322,29.786,27.442,6.8262,0.35122,94.892
2,2.0,21.0,0.0,0.001,498.0,5.0,3.0,12.476,16.872,9.998,145.1884,0.569917,94.728
3,2.0,21.0,21.0,0.001,498.0,5.0,3.0,4.314,2.238,60.0,21.9834,0.724752,40.746
4,2.0,1.0,0.0,0.001,498.0,5.0,3.0,20.542,22.492,19.07,65.8448,0.484889,94.562
5,2.0,1.0,1.0,0.001,498.0,5.0,3.0,10.926,7.734,68.604,15.1434,0.636073,39.254
6,2.0,21.0,0.0,0.0001,498.0,5.0,3.0,21.92,23.504,20.93,12.6094,0.493676,94.502
7,2.0,21.0,21.0,0.0001,498.0,5.0,3.0,18.388,14.502,27.21,14.9144,0.513725,91.136
8,2.0,1.0,0.0,0.0001,498.0,5.0,3.0,27.004,28.418,25.816,10.7458,0.472798,94.796
9,2.0,1.0,1.0,0.0001,498.0,5.0,3.0,26.488,24.7,29.07,9.6366,0.421065,93.982
10,2.0,21.0,0.0,1e-05,498.0,5.0,3.0,26.922,29.434,25.116,10.8158,0.412967,94.928


f1 score: 36.36


KeyboardInterrupt: 

In [None]:
# Per-model mean of every metric over the cross-validation splits.
# If the training cell was interrupted before its final concat, METRICs is still a list of
# per-split frames; concatenate it here so the summary works in that case too.
metrics_table = pd.concat(METRICs, axis=0) if isinstance(METRICs, list) else METRICs
display(metrics_table.groupby('model_id').mean())

Unnamed: 0_level_0,cv_idx,train_2,rnn_layers,dnn_layers,f1_1,precision_1,recall_1,training_time,test_loss,accuracy
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2.0,498.0,5.0,3.0,28.322,29.786,27.442,6.8262,0.35122,94.892
2,2.0,498.0,5.0,3.0,12.476,16.872,9.998,145.1884,0.569917,94.728
3,2.0,498.0,5.0,3.0,4.314,2.238,60.0,21.9834,0.724752,40.746
4,2.0,498.0,5.0,3.0,20.542,22.492,19.07,65.8448,0.484889,94.562
5,2.0,498.0,5.0,3.0,10.926,7.734,68.604,15.1434,0.636073,39.254
6,2.0,498.0,5.0,3.0,21.92,23.504,20.93,12.6094,0.493676,94.502
7,2.0,498.0,5.0,3.0,18.388,14.502,27.21,14.9144,0.513725,91.136
8,2.0,498.0,5.0,3.0,27.004,28.418,25.816,10.7458,0.472798,94.796
9,2.0,498.0,5.0,3.0,26.488,24.7,29.07,9.6366,0.421065,93.982
10,2.0,498.0,5.0,3.0,26.922,29.434,25.116,10.8158,0.412967,94.928


In [None]:
# Per-model standard deviation of every metric over the cross-validation splits.
# If the training cell was interrupted before its final concat, METRICs is still a list of
# per-split frames; concatenate it here so the summary works in that case too.
metrics_table = pd.concat(METRICs, axis=0) if isinstance(METRICs, list) else METRICs
display(metrics_table.groupby('model_id').std())

Unnamed: 0_level_0,cv_idx,train_2,rnn_layers,dnn_layers,f1_1,precision_1,recall_1,training_time,test_loss,accuracy
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.581139,0.0,0.0,0.0,4.18442,2.540183,6.607683,1.569053,0.050139,0.289862
2,1.581139,0.0,0.0,0.0,3.162709,5.36533,2.265617,66.646721,0.126811,0.512903
3,1.581139,0.0,0.0,0.0,3.938125,2.043005,54.772256,3.548275,0.002148,50.686245
4,1.581139,0.0,0.0,0.0,4.748028,4.240957,5.368682,18.112177,0.066994,0.288392
5,1.581139,0.0,0.0,0.0,7.338084,6.466937,43.849129,8.905815,0.116909,48.64463
6,1.581139,0.0,0.0,0.0,4.799234,4.506471,6.04149,4.240482,0.040712,0.509431
7,1.581139,0.0,0.0,0.0,2.731935,2.104417,9.138441,6.770135,0.024801,2.112044
8,1.581139,0.0,0.0,0.0,2.394886,2.760158,2.650251,2.247436,0.058648,0.286583
9,1.581139,0.0,0.0,0.0,3.467805,4.157782,5.134311,2.887524,0.03569,0.695068
10,1.581139,0.0,0.0,0.0,2.260768,3.358613,3.351474,4.29564,0.034567,0.399337
