# header

In [1]:
# The triple-quoted block below is a bare string expression, so Python never runs it.
# It holds a JavaScript snippet that clicks Colab's "connect" button every 60000 ms
# (presumably pasted by hand into the browser console to keep the session alive).
'''
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}

setInterval(ConnectButton,60000);
'''

from google.colab import drive
from os import chdir

# Mount Google Drive and switch into the project folder so that the relative
# paths used by later cells (./data, ./result, ./h5) resolve against it.
drive.mount('/content/drive')
project_path = '/content/drive/MyDrive/Gproject/MIT_glyco'
chdir(project_path)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

import os
from os.path import exists
print(os.getcwd()) # current working directory

# Cache override: the cross-validation cell recomputes results and retrains models
# when this is True, even if the cached CSV / .h5 files already exist.
update = False

/content/drive/MyDrive/Gproject/MIT_glyco


In [3]:
# Read the full dataset and report how many proteins / samples / labels it holds.
load_name = "./data/data_for_ml.csv"
dataset = pd.read_csv(load_name)
protein_list = list(dataset.protein.unique())

# Split rows by the binary `positivity` label.
positive = dataset.loc[dataset['positivity'].eq(1)]
negative = dataset.loc[dataset['positivity'].eq(0)]

# One print call, one line per statistic (labels are padded so the counts align).
print("\n".join(
    f"{label} {count}"
    for label, count in [
        ("total number of proteins:      ", len(protein_list)),
        ("total number of samples:       ", len(dataset)),
        ("total number of positive sites:", len(positive)),
        ("total number of negative sites:", len(negative)),
    ]
))

total number of proteins:       272
total number of samples:        257578
total number of positive sites: 529
total number of negative sites: 257049


## parameter settings

In [4]:
# NOTE(review): the wildcard imports hide which module provides xy_variables,
# df_to_dummy, custom_dummy, data_to_sequence, stratified_split, up_sampling and
# CNN1D — consider explicit imports once each module's exports are confirmed.
from src.Data import *
from src.Preprocessing import *
from src.models import *

variables = xy_variables()
test_size = 0.2
# With test_size = 0.2 this evaluates to 0.25. It is passed to model.train() in the
# cross-validation cell; presumably it is the validation share of the training
# split, which would make the validation set the same absolute size as the test set.
valid_size = test_size/(1-test_size)

import random
# Not referenced by the cells visible in this notebook (the splits seed with i+1).
random_state = 1
# Number of repeated train/test splits run by the cross-validation cell.
n_cv = 10

# Hyper-parameter search settings; hpo_counts, hpo_config and method are not
# referenced by the cells visible in this notebook.
hpo_counts = 30
hpo_config = {
    "n_layers" : range(1,11),
    "n_neurons" : [16, 32, 64, 128, 256]
}
# Column names (and order) of the cross-validation result table.
metrics = ['epoch', 'time', 'loss', 'val_loss', 'test_loss', 'accuracy', 'precision', 'recall', 'f1']
method = "random"

# model

In [9]:
# Tag used in the file names of saved models and result tables.
model_name = 'cnn_basic'

# Continuous inputs, categorical inputs and the target column(s), as defined by `variables`.
x_cts, x_cat, y_label = (
    variables.x_cts_original,
    variables.x_cat_original,
    variables.y_label,
)

# Report the size and content of each column group, one group per line.
print("\n".join(
    f"{len(cols)} {tag}: {cols}"
    for tag, cols in (("x_cts", x_cts), ("x_cat", x_cat), ("y_label", y_label))
))

# Encode the dataset and show the position of every resulting input column.
data_x, data_y = df_to_dummy(dataset, x_cts, x_cat, y_label)
display(dict(enumerate(data_x.columns)))

11 x_cts: ['ASA', 'Phi', 'Psi', 'Theta(i-1=>i+1)', 'Tau(i-2=>i+2)', 'HSE_alpha_up', 'HSE_alpha_down', 'P(C)', 'P(H)', 'P(E)', 'flexibility']
2 x_cat: ['SEQ', 'SS']
1 y_label: ['positivity']
dummy x shape: (257578, 34)
dummy y shape: (257578, 1)


{0: 'ASA',
 1: 'Phi',
 2: 'Psi',
 3: 'Theta(i-1=>i+1)',
 4: 'Tau(i-2=>i+2)',
 5: 'HSE_alpha_up',
 6: 'HSE_alpha_down',
 7: 'P(C)',
 8: 'P(H)',
 9: 'P(E)',
 10: 'flexibility',
 11: 'SEQ_A',
 12: 'SEQ_C',
 13: 'SEQ_D',
 14: 'SEQ_E',
 15: 'SEQ_F',
 16: 'SEQ_G',
 17: 'SEQ_H',
 18: 'SEQ_I',
 19: 'SEQ_K',
 20: 'SEQ_L',
 21: 'SEQ_M',
 22: 'SEQ_N',
 23: 'SEQ_P',
 24: 'SEQ_Q',
 25: 'SEQ_R',
 26: 'SEQ_S',
 27: 'SEQ_T',
 28: 'SEQ_V',
 29: 'SEQ_W',
 30: 'SEQ_Y',
 31: 'SS_C',
 32: 'SS_E',
 33: 'SS_H'}

In [6]:
# Window parameter handed to data_to_sequence (the recorded output has sequence
# length 21, i.e. presumably window_size positions on each side of the centre).
window_size = 10

# Build the windowed samples one protein at a time, then stack them.
seq_input, seq_output = [], []
for name in protein_list:
    protein_df = pd.read_csv(f'./data/data_for_ml/{name}.csv')
    protein_x, protein_y = custom_dummy(protein_df, x_cts, x_cat, y_label)

    windows, targets = data_to_sequence(protein_x, protein_y, window_size)
    seq_input.append(windows)
    seq_output.append(targets)

# Stack the per-protein arrays along the sample axis.
seq_input, seq_output = (
    np.concatenate(parts, axis=0) for parts in (seq_input, seq_output)
)

print(f'seq input shape : {seq_input.shape}')
print(f'seq output shape: {seq_output.shape}')

seq input shape : (41264, 21, 34)
seq output shape: (41264, 1)


## cross validation

In [10]:
# select n_layers and n_neurons as the best values of HPO
# Repeated hold-out evaluation: n_cv stratified train/test splits (seeded 1..n_cv),
# one model per split. Results are cached in cv_path and models in ./h5; set
# `update = True` to ignore both caches and recompute.
cv_path = f'./result/cv_result_{model_name}_{n_cv}.csv'

if not exists(cv_path) or update:
    # Create the output folders up front so a long run cannot fail at the final write.
    os.makedirs('./result', exist_ok=True)
    os.makedirs('./h5', exist_ok=True)

    # One row of metric values per split, in the same order as `metrics`.
    # Rows are collected in a list and turned into a DataFrame once, instead of
    # concatenating onto an (initially empty) DataFrame inside the loop.
    cv_rows = []
    for i in range(n_cv):
        print(f"\n{i+1}th iteration")
        random.seed(i+1)
        train_x, train_y, test_x, test_y, _, _ = stratified_split(seq_input, seq_output,
                                                              test_size=test_size, random_state=i+1,
                                                              scale_x=x_cts, scale_y=[])
        # Only the training split is up-sampled; the test split is left as returned.
        train_x, train_y = up_sampling(train_x, train_y)

        history_size = train_x.shape[1]
        x_dim = train_x.shape[2]
        y_dim = train_y.shape[1]
        save_path  = f'./h5/{model_name}_cv_{i+1}of{n_cv}.h5'

        model = CNN1D(history_size, x_dim, y_dim)
        model.build()
        if not exists(save_path) or update:
            model.train(train_x, train_y, valid_size, save_path=save_path)
        else:
            # NOTE(review): the metrics read below (model.epoch/time/loss/val_loss) are
            # assumed to be available after load_model as well — confirm in src.models.
            model.load_model(save_path)

        test_loss, accuracy, precision, recall, f1 = model.evaluate(test_x, test_y)
        # precision/recall/f1 are indexed with [1]; presumably the entry for the
        # positive class — verify against CNN1D.evaluate.
        cv_rows.append([model.epoch, model.time, model.loss, model.val_loss,
                        test_loss, accuracy, precision[1], recall[1], f1[1]])

    cv_result = pd.DataFrame(cv_rows, columns=metrics)
    cv_result.to_csv(cv_path, index=False)

else:
    cv_result = pd.read_csv(cv_path)

display(cv_result)


1th iteration
train/test dataset: <class 'numpy.ndarray'>

train: (33011, 21, 34) (33011, 1)
check scale: 0.0 1.0

test: (8253, 21, 34) (8253, 1)
check scale: 0.0 1.0
up-sampled train dataset: (65188, 21, 34) (65188, 1)
model has been saved to ./h5/cnn_basic_cv_1of10.h5

2th iteration
train/test dataset: <class 'numpy.ndarray'>

train: (33011, 21, 34) (33011, 1)
check scale: 0.0 1.0

test: (8253, 21, 34) (8253, 1)
check scale: 0.0 1.0
up-sampled train dataset: (65188, 21, 34) (65188, 1)
model has been saved to ./h5/cnn_basic_cv_2of10.h5

3th iteration
train/test dataset: <class 'numpy.ndarray'>

train: (33011, 21, 34) (33011, 1)
check scale: 0.0 1.0

test: (8253, 21, 34) (8253, 1)
check scale: 0.0 1.0263605442176873
up-sampled train dataset: (65188, 21, 34) (65188, 1)
model has been saved to ./h5/cnn_basic_cv_3of10.h5

4th iteration
train/test dataset: <class 'numpy.ndarray'>

train: (33011, 21, 34) (33011, 1)
check scale: 0.0 1.0

test: (8253, 21, 34) (8253, 1)
check scale: 0.0 1.0
u

Unnamed: 0,epoch,time,loss,val_loss,test_loss,accuracy,precision,recall,f1
0,33,6.862,0.006625,0.005409,0.235076,98.61,36.59,14.42,20.69
1,20,5.506,0.002713,0.004163,0.407258,98.73,47.62,9.62,16.0
2,25,5.953,0.001915,0.006682,0.283359,98.63,32.0,7.69,12.4
3,62,9.992,0.000331,0.004113,0.27971,98.49,8.0,1.92,3.1
4,34,6.979,0.003381,0.004651,0.361163,98.5,16.67,4.81,7.46
5,7,4.031,0.005506,0.00432,0.233386,98.64,34.62,8.65,13.85
6,19,5.314,0.004664,0.006505,0.209439,98.59,28.57,7.69,12.12
7,9,4.254,0.009671,0.005726,0.2224,98.64,36.67,10.58,16.42
8,56,9.23,0.000853,0.004207,0.310065,98.52,23.53,7.69,11.59
9,30,6.542,0.003332,0.005315,0.281237,98.72,46.43,12.5,19.7


In [None]:
from src.graph_plot import *

# Plot a confusion matrix for every cross-validation split, using the CNN models
# saved by the cross-validation cell. The splits are reproduced with the same
# seeds (i+1), so each model is evaluated on the test split it was scored on.
# This cell previously referenced rnn_input / rnn_output / RNN and the
# rnn_*/dnn_* layer settings, none of which are defined in the visible cells of
# this notebook, and a save path that the cross-validation cell never writes.
for i in range(n_cv):
    print(f"\n{i+1}th iteration")
    random.seed(i+1)
    # Only the test split is needed for evaluation, so the training split is
    # discarded and no up-sampling is performed here.
    _, _, test_x, test_y, _, _ = stratified_split(seq_input, seq_output,
                                                  test_size=test_size, random_state=i+1,
                                                  scale_x=x_cts, scale_y=[])

    # Model dimensions; train and test splits share every axis except the first.
    history_size = test_x.shape[1]
    x_dim = test_x.shape[2]
    y_dim = test_y.shape[1]
    # Must match the path written by the cross-validation cell.
    save_path  = f'./h5/{model_name}_cv_{i+1}of{n_cv}.h5'

    model = CNN1D(history_size, x_dim, y_dim)
    model.build()
    model.load_model(save_path)

    # Round the predicted scores to hard 0/1 labels.
    prediction = model.model.predict(test_x, verbose=0)
    y_pred = prediction.round(0).astype(int)
    y_real = test_y

    # NOTE(review): labels are given as ["Positive","Negative"] while the classes are
    # encoded 1/0 — confirm the order plot_confusion expects.
    plot_confusion(y_real, y_pred, title=model_name+f"_{i+1}", label=["Positive","Negative"])