# header

In [1]:
'''
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}

setInterval(ConnectButton,60000);
'''

from google.colab import drive
from os import chdir

# Mount Google Drive inside the Colab VM and make the project folder the working
# directory, so every relative path used below ('./data', './result', './h5')
# resolves inside the project.
# NOTE(review): project_path is a hard-coded, user-specific Drive location — adjust
# it when running from another account or outside Colab.
drive.mount('/content/drive')
project_path = '/content/drive/MyDrive/Gproject/MIT_glyco'
chdir(project_path)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

import os
from os.path import exists
print(os.getcwd()) # current working directory

# Cache-override switch: the cross-validation cell re-trains models and rewrites its
# result CSV when this is True; when False it reuses whatever is already on disk.
update = False

/content/drive/MyDrive/Gproject/MIT_glyco


In [3]:
# Master table listing every protein; only proteins that also have an augmented
# per-protein CSV on disk are kept for the rest of the notebook.
load_name = "./data/data_for_ml.csv"
master_table = pd.read_csv(load_name)
protein_list = list(master_table.protein.unique())

protein_augmented = []
for protein in protein_list:
    if exists(f'./data/data_for_ml(augmented)/{protein}.csv'):
        protein_augmented.append(protein)

# Stack all augmented per-protein tables into one frame with a fresh 0..n-1 index.
augmented_frames = [
    pd.read_csv(f'./data/data_for_ml(augmented)/{protein}.csv')
    for protein in protein_augmented
]
dataset = pd.concat(augmented_frames, axis=0).reset_index(drop=True)

# Rows whose 'positivity' column equals 1.
is_positive = dataset['positivity'] == 1
positive = dataset[is_positive]

print("total number of proteins:          ", len(protein_list))
print("total number of augmented proteins:", len(protein_augmented))
print("total number of positive sites:    ", len(positive))

total number of proteins:           272
total number of augmented proteins: 89
total number of positive sites:     185


## parameter settings

In [4]:
from src.Data import *
from src.Preprocessing import *
from src.models import *

# Column-name groups (continuous / categorical features, label) come from the
# project's xy_variables() helper brought in by the star imports above.
variables = xy_variables()
# Fraction of samples held out as the test set in every split.
test_size = 0.2
# 0.2 / (1 - 0.2) = 0.25: the same absolute share as test_size, but expressed relative
# to the non-test remainder. Passed as the third argument to model.train() —
# presumably the validation fraction; confirm in src.models.
valid_size = test_size/(1-test_size)

import random
# NOTE(review): random_state is not referenced by the cells visible in this notebook;
# the cross-validation loop seeds each split with i+1 instead.
random_state = 1
# Number of repeated train/test splits evaluated in the cross-validation cell.
n_cv = 10

# Hyper-parameter-search settings.
# NOTE(review): hpo_counts, hpo_config and method are not used by the visible cells —
# presumably left over from a notebook that runs HPO; confirm before removing.
hpo_counts = 30
hpo_config = {
    "n_layers" : range(1,11),
    "n_neurons" : [16, 32, 64, 128, 256]
}
# Column order of the per-split result table built in the cross-validation cell.
metrics = ['epoch', 'time', 'loss', 'val_loss', 'test_loss', 'accuracy', 'precision', 'recall', 'f1']
method = "random"

# model

In [5]:
# Name used as a tag in every saved model / result file of this notebook.
model_name = 'cnn_basic(augmented)'

# Feature and label column groups for this model.
x_cts, x_cat, y_label = (
    variables.x_cts_original,
    variables.x_cat_original,
    variables.y_label,
)

# Report how many columns each group holds, followed by the column names.
for _group_tag, _group_cols in (("x_cts", x_cts), ("x_cat", x_cat), ("y_label", y_label)):
    print(f"{len(_group_cols)} {_group_tag}: {_group_cols}")

# Dummy-encode the full table and show a position -> column-name map of the
# encoded feature matrix.
data_x, data_y = df_to_dummy(dataset, x_cts, x_cat, y_label)
display(dict(enumerate(data_x.columns)))

11 x_cts: ['ASA', 'Phi', 'Psi', 'Theta(i-1=>i+1)', 'Tau(i-2=>i+2)', 'HSE_alpha_up', 'HSE_alpha_down', 'P(C)', 'P(H)', 'P(E)', 'flexibility']
2 x_cat: ['SEQ', 'SS']
1 y_label: ['positivity']
dummy x shape: (59027, 34)
dummy y shape: (59027, 1)


{0: 'ASA',
 1: 'Phi',
 2: 'Psi',
 3: 'Theta(i-1=>i+1)',
 4: 'Tau(i-2=>i+2)',
 5: 'HSE_alpha_up',
 6: 'HSE_alpha_down',
 7: 'P(C)',
 8: 'P(H)',
 9: 'P(E)',
 10: 'flexibility',
 11: 'SEQ_A',
 12: 'SEQ_C',
 13: 'SEQ_D',
 14: 'SEQ_E',
 15: 'SEQ_F',
 16: 'SEQ_G',
 17: 'SEQ_H',
 18: 'SEQ_I',
 19: 'SEQ_K',
 20: 'SEQ_L',
 21: 'SEQ_M',
 22: 'SEQ_N',
 23: 'SEQ_P',
 24: 'SEQ_Q',
 25: 'SEQ_R',
 26: 'SEQ_S',
 27: 'SEQ_T',
 28: 'SEQ_V',
 29: 'SEQ_W',
 30: 'SEQ_Y',
 31: 'SS_C',
 32: 'SS_E',
 33: 'SS_H'}

In [9]:
# Window parameter handed to data_to_sequence. The recorded output shows 21 steps per
# sample, i.e. 2 * window_size + 1 — presumably a window centred on each site; confirm
# in data_to_sequence.
window_size = 10

# Windows are built protein by protein and only concatenated afterwards.
input_chunks, output_chunks = [], []
for protein in protein_augmented:
    protein_frame = pd.read_csv(f'./data/data_for_ml(augmented)/{protein}.csv')
    features, labels = custom_dummy(protein_frame, x_cts, x_cat, y_label)
    windows, targets = data_to_sequence(features, labels, window_size)
    input_chunks.append(windows)
    output_chunks.append(targets)

# Stack the per-protein arrays along the sample axis.
seq_input = np.concatenate(input_chunks, axis=0)
seq_output = np.concatenate(output_chunks, axis=0)

print(f'seq input shape : {seq_input.shape}')
print(f'seq output shape: {seq_output.shape}')

seq input shape : (8910, 21, 34)
seq output shape: (8910, 1)


## cross validation

In [10]:
# Repeated stratified hold-out evaluation: n_cv independent train/test splits, one
# CNN1D fit per split. The per-split metrics are cached in cv_path and the fitted
# models under ./h5; set `update = True` to force re-training and re-evaluation.
cv_path = f'./result/cv_result_{model_name}_{n_cv}.csv'

if not exists(cv_path) or update:
    # Create the output folders up front so a long run cannot fail at save time.
    os.makedirs('./result', exist_ok=True)
    os.makedirs('./h5', exist_ok=True)

    # One record per split. The frame is assembled once after the loop instead of
    # being grown with pd.concat from an empty (object-dtype) frame, which was
    # quadratic and left the columns with non-numeric dtypes.
    cv_rows = []
    for i in range(n_cv):
        print(f"\n{i+1}th iteration")
        # Split index doubles as the seed, so each split is distinct but repeatable.
        random.seed(i+1)
        train_x, train_y, test_x, test_y, _, _ = stratified_split(seq_input, seq_output,
                                                              test_size=test_size, random_state=i+1,
                                                              scale_x=x_cts, scale_y=[])
        # Only the training part is up-sampled; the test part is left as split.
        train_x, train_y = up_sampling(train_x, train_y)

        history_size = train_x.shape[1]
        x_dim = train_x.shape[2]
        y_dim = train_y.shape[1]
        save_path  = f'./h5/{model_name}_cv_{i+1}of{n_cv}.h5'

        model = CNN1D(history_size, x_dim, y_dim)
        model.build()
        # Reuse a previously saved model for this split unless a refresh was requested.
        if not exists(save_path) or update:
            model.train(train_x, train_y, valid_size, save_path=save_path)
        else:
            model.load_model(save_path)

        test_loss, accuracy, precision, recall, f1 = model.evaluate(test_x, test_y)
        # Index 1 is taken from precision/recall/f1 — presumably the positive class;
        # confirm against CNN1D.evaluate. Order must match `metrics`.
        cv_rows.append([model.epoch, model.time, model.loss, model.val_loss,
                        test_loss, accuracy, precision[1], recall[1], f1[1]])

    cv_result = pd.DataFrame(cv_rows, columns=metrics)
    cv_result.to_csv(cv_path, index=False)

else:
    cv_result = pd.read_csv(cv_path)

display(cv_result)


1th iteration
train/test dataset: <class 'numpy.ndarray'>

train: (7128, 21, 34) (7128, 1)
check scale: 0.0 1.0

test: (1782, 21, 34) (1782, 1)
check scale: 0.0 1.0
up-sampled train dataset: (13960, 21, 34) (13960, 1)
model has been saved to ./h5/cnn_basic(augmented)_cv_1of10.h5

2th iteration
train/test dataset: <class 'numpy.ndarray'>

train: (7128, 21, 34) (7128, 1)
check scale: 0.0 1.0

test: (1782, 21, 34) (1782, 1)
check scale: 0.0 1.0
up-sampled train dataset: (13960, 21, 34) (13960, 1)
model has been saved to ./h5/cnn_basic(augmented)_cv_2of10.h5

3th iteration
train/test dataset: <class 'numpy.ndarray'>

train: (7128, 21, 34) (7128, 1)
check scale: 0.0 1.0

test: (1782, 21, 34) (1782, 1)
check scale: 0.0 1.0
up-sampled train dataset: (13960, 21, 34) (13960, 1)
model has been saved to ./h5/cnn_basic(augmented)_cv_3of10.h5

4th iteration
train/test dataset: <class 'numpy.ndarray'>

train: (7128, 21, 34) (7128, 1)
check scale: 0.0 1.0

test: (1782, 21, 34) (1782, 1)
check scale:

Unnamed: 0,epoch,time,loss,val_loss,test_loss,accuracy,precision,recall,f1
0,20,1.208,0.010032,0.008308,0.473742,97.87,33.33,2.7,5.0
1,20,1.207,0.002588,0.008691,0.24686,97.87,44.44,10.81,17.39
2,22,1.257,0.002318,0.011207,0.236911,97.64,33.33,13.51,19.23
3,17,1.122,0.003924,0.004309,0.286926,97.7,37.5,16.22,22.64
4,7,0.916,0.004136,0.01422,0.265409,97.87,47.06,21.62,29.63
5,4,0.82,0.018676,0.010159,0.282652,97.98,55.56,13.51,21.74
6,8,0.941,0.002291,0.003469,0.237134,97.81,33.33,5.41,9.3
7,2,0.771,0.009351,0.008495,0.351239,97.7,33.33,10.81,16.33
8,42,1.728,0.001485,0.003996,0.265243,97.98,53.85,18.92,28.0
9,13,1.056,0.002857,0.005436,0.275578,97.59,12.5,2.7,4.44


In [None]:
from src.graph_plot import *

# Confusion matrix on the held-out part of every split, using the CNN1D models saved
# by the cross-validation cell. The original cell referenced rnn_* variables and an
# RNN model that this notebook never defines; it now uses this notebook's data
# (seq_input / seq_output), model class and save-path pattern.
for i in range(n_cv):
    print(f"\n{i+1}th iteration")
    # Same seed and random_state as the cross-validation cell, intended to reproduce
    # the test split that the saved model was evaluated on.
    random.seed(i+1)
    # Only the test part is needed here, so the training part is discarded and not
    # up-sampled.
    _, _, test_x, test_y, _, _ = stratified_split(seq_input, seq_output,
                                                  test_size=test_size, random_state=i+1,
                                                  scale_x=x_cts, scale_y=[])

    # Input/output dimensions; train and test parts share every axis except the first.
    history_size = test_x.shape[1]
    x_dim = test_x.shape[2]
    y_dim = test_y.shape[1]
    # Must match the path written by the cross-validation cell.
    save_path  = f'./h5/{model_name}_cv_{i+1}of{n_cv}.h5'

    model = CNN1D(history_size, x_dim, y_dim)
    model.build()
    model.load_model(save_path)

    # Round the raw model outputs to hard 0/1 predictions.
    prediction = model.model.predict(test_x, verbose=0)
    y_pred = prediction.round(0).astype(int)
    y_real = test_y

    # NOTE(review): confirm that this label order matches the class order used by
    # plot_confusion (class 0 vs. class 1).
    plot_confusion(y_real, y_pred, title=model_name+f"_{i+1}", label=["Positive","Negative"])