# header

In [1]:
# The string literal below has no effect when this cell runs. It holds a
# JavaScript snippet that clicks the Colab "connect" button every 60000 ms;
# presumably it is meant to be pasted into the browser console to keep the
# session connected - TODO confirm.
'''
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}

setInterval(ConnectButton,60000);
'''

from google.colab import drive
from os import chdir

# Mount Google Drive and make the project folder the working directory, so the
# relative ./data, ./result and ./h5 paths used by later cells resolve against it.
drive.mount('/content/drive')
project_path = '/content/drive/MyDrive/Gproject/MIT_glyco'
chdir(project_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

import os
from os.path import exists
print(os.getcwd()) # current working directory

# Cache switch used by the cross-validation cell: when False, an existing CV
# result CSV and existing per-split .h5 files are reused; when True they are
# recomputed and overwritten.
update = False

/content/drive/MyDrive/Gproject/MIT_glyco


In [3]:
# Proteins are listed in the master table; only those that also have a
# per-protein CSV in the augmented folder are used from here on.
load_name = "./data/data_for_ml.csv"
protein_list = list(pd.read_csv(load_name).protein.unique())

protein_augmented = []
for name in protein_list:
    if exists(f'./data/data_for_ml(augmented)/{name}.csv'):
        protein_augmented.append(name)

# Read every augmented per-protein table and stack them row-wise into one frame
# with a fresh 0..n-1 index.
protein_frames = [
    pd.read_csv(f'./data/data_for_ml(augmented)/{name}.csv')
    for name in protein_augmented
]
dataset = pd.concat(protein_frames, axis=0).reset_index(drop=True)

# Rows whose `positivity` column equals 1.
is_positive = dataset['positivity'] == 1
positive = dataset[is_positive]

print("total number of proteins:          ", len(protein_list))
print("total number of augmented proteins:", len(protein_augmented))
print("total number of positive sites:    ", len(positive))

ParserError: ignored

## parameter settings

In [None]:
from src.Data import *
from src.Preprocessing import *
from src.models import *

# Project-provided container of variable names; the cells shown in this
# notebook only read `variables.y_label` from it.
variables = xy_variables()
# Passed to stratified_split as `test_size` in every cross-validation split.
test_size = 0.2
# Passed to model.train as the validation size. With test_size = 0.2 this is
# 0.25; presumably it is applied to the remaining 80 % of the data so that the
# validation set ends up the same size as the test set - TODO confirm in src.models.
valid_size = test_size/(1-test_size)

import random
# NOTE(review): random_state is not referenced by the cells shown here; the
# cross-validation loops seed with i+1 instead.
random_state = 1
# Number of repeated train/test splits in the cross-validation section; also
# part of the result CSV and .h5 file names.
n_cv = 20

# Hyper-parameter search settings.
# NOTE(review): hpo_counts, hpo_config and method are not referenced by the
# cells visible in this notebook.
hpo_counts = 30
hpo_config = {
    "n_layers" : range(1,11),
    "n_neurons" : [16, 32, 64, 128, 256]
}
# Column names of the cross-validation result table, in the order the values
# are written for each split.
metrics = ['epoch', 'time', 'loss', 'val_loss', 'test_loss', 'accuracy', 'precision', 'recall', 'f1']
method = "random"

# model

In [None]:
# Identifier embedded in the cross-validation result CSV name and in the
# per-split .h5 file names.
model_name = 'augmented_cnn_important_features_cts'
# NOTE(review): x_important is loaded here but not referenced by any cell shown
# in this notebook; the feature list actually used is the hard-coded x_cts below.
x_important = pd.read_csv('./data/features-important-names.csv').column_name.to_list()

# Input columns handed to df_to_dummy / custom_dummy as `x_cts` and to
# stratified_split as `scale_x` (presumably the continuous features that get
# scaled - TODO confirm in src.Preprocessing).
x_cts   = [
    'ASA',
    'P(E)',
    'P(H)',
    'HSE_alpha_down',
    'Tau(i-2=>i+2)',
    'Theta(i-1=>i+1)',
    'all_atom_positive_charge_all_around_target_ser_thr',
    'all_atom_positive_charge_all_backbone_around_target_ser_thr',
    'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
    'all_atom_positive_charge_all_sidechain_around_target_ser_thr',
    'all_atom_positive_charge_all_with_around_target_ser_thr',
    'all_sasa_cys',
    'all_sasa_leu',
    'all_sasa_lys',
    'all_sasa_met',
    'all_sasa_phe',
    'exposed_charge_all_sidechain_with_around_target_ser_thr',
    'exposed_charge_all_with_around_target_ser_thr',
    'exposed_positive_charge_all_with_around_target_ser_thr',
    'net_charge_all_backbone_around_target_ser_thr',
    'net_charge_all_backbone_with_around_target_ser_thr',
    'net_charge_all_sidechain_around_target_ser_thr',
    'net_charge_all_sidechain_with_around_target_ser_thr',
    'net_charge_all_with_around_target_ser_thr',
    'net_charge_all_around_target_ser_thr',
    'sasa_ala',
    'sasa_back',
    'sasa_cys',
    'sasa_e',
    'sasa_g',
    'sasa_ile',
    'sasa_lys',
    'sasa_negative',
    'sasa_polar',
    'sasa_s',
    'sasa_ser',
    'sasa_all_with_around_target_ser_thr',
    'nAli',
    'nS/nT',
    'number_of_arg',
    'number_of_g',
    'number_of_ser',
    'number_of_thr',
    'number_of_v',
    'number_of_aromatic',
    'number_of_e',
    'number_of_f',
    'number_of_hydrophilic',
    'number_of_hydrophobic',
    'number_of_l',
    'number_of_leu',
    'number_of_lys',
    'number_of_n',
    'number_of_p',
    'number_of_polar',
    'number_of_s',
    'residue_SER_THR'
]
# Columns handed to the dummy-encoding helpers as `x_cat` (presumably
# categorical columns that get one-hot encoded - TODO confirm).
x_cat   = ['SEQ', 'SS']
y_label = variables.y_label

print(f"{len(x_cts)} x_cts")
print(f"{len(x_cat)} x_cat: {x_cat}")
print(f"{len(y_label)} y_label: {y_label}")

# Encode the whole dataset once and show a position -> column-name mapping of
# the resulting feature table.
# NOTE(review): this cell calls df_to_dummy, while the window-building cell
# calls custom_dummy with the same arguments - confirm both helpers exist and
# yield the same column layout.
data_x, data_y = df_to_dummy(dataset, x_cts, x_cat, y_label)
display(dict(zip(range(len(data_x.columns)), data_x.columns)))

In [None]:
window_size = 10

# Each augmented protein file is encoded and windowed on its own, so every call
# to data_to_sequence sees the rows of a single protein only. The per-protein
# results are then stacked along the first axis.
per_protein_inputs, per_protein_outputs = [], []
for name in protein_augmented:
    protein_frame = pd.read_csv(f'./data/data_for_ml(augmented)/{name}.csv')
    protein_x, protein_y = custom_dummy(protein_frame, x_cts, x_cat, y_label)
    windows_x, windows_y = data_to_sequence(protein_x, protein_y, window_size)
    per_protein_inputs.append(windows_x)
    per_protein_outputs.append(windows_y)

seq_input = np.concatenate(per_protein_inputs, axis=0)
seq_output = np.concatenate(per_protein_outputs, axis=0)

print(f'rnn input shape : {seq_input.shape}')
print(f'rnn output shape: {seq_output.shape}')

## cross validation

In [None]:
# Repeated hold-out evaluation: n_cv stratified train/test splits, one CNN1D
# model per split, built with the defaults of CNN1D.build().
# The result table is cached in cv_path and each model in ./h5; both are reused
# on later runs unless `update` is True.
cv_path = f'./result/cv_result_{model_name}_{n_cv}.csv'

if not exists(cv_path) or update:
    # Create the output folders up front so a missing directory cannot make the
    # run fail only after the models have been trained.
    os.makedirs('./result', exist_ok=True)
    os.makedirs('./h5', exist_ok=True)

    # One row per split, collected in a list and turned into a DataFrame once,
    # instead of concatenating onto a growing frame inside the loop.
    cv_rows = []
    for i in range(n_cv):
        print(f"\n{i+1}th iteration")
        # Split i+1 uses seed i+1 both for `random` and for stratified_split.
        random.seed(i+1)
        train_x, train_y, test_x, test_y, _, _ = stratified_split(seq_input, seq_output,
                                                              test_size=test_size, random_state=i+1,
                                                              scale_x=x_cts, scale_y=[])
        # Only the training part is up-sampled; the test part is left untouched.
        train_x, train_y = up_sampling(train_x, train_y)

        history_size = train_x.shape[1]
        x_dim = train_x.shape[2]
        y_dim = train_y.shape[1]
        save_path  = f'./h5/{model_name}_cv_{i+1}of{n_cv}.h5'

        model = CNN1D(history_size, x_dim, y_dim)
        model.build()
        if not exists(save_path) or update:
            model.train(train_x, train_y, valid_size, save_path=save_path, verbose=2)
        else:
            # NOTE(review): the attributes read below (epoch, time, loss,
            # val_loss) must also be available on a loaded model - confirm that
            # load_model restores them.
            model.load_model(save_path)

        # Read the training statistics before evaluating. `train_time` avoids
        # rebinding the global name `time`.
        epoch, train_time = model.epoch, model.time
        loss, val_loss = model.loss, model.val_loss
        test_loss, accuracy, precision, recall, f1 = model.evaluate(test_x, test_y)
        # precision/recall/f1 are indexable; element 1 is recorded (presumably
        # the positive class - TODO confirm in src.models).
        cv_rows.append([epoch, train_time, loss, val_loss, test_loss, accuracy,
                        precision[1], recall[1], f1[1]])

    cv_result = pd.DataFrame(cv_rows, columns=metrics)
    cv_result.to_csv(cv_path, index=False)
else:
    cv_result = pd.read_csv(cv_path)

display(cv_result)

In [None]:
from src.graph_plot import *

# Confusion matrix for every cross-validation split, using the models that the
# cross-validation cell saved under ./h5.
for i in range(n_cv):
    print(f"\n{i+1}th iteration")
    # Same seed and split arguments as the cross-validation cell, so that
    # test_x/test_y are the hold-out part of split i+1.
    random.seed(i+1)
    train_x, train_y, test_x, test_y, _, _ = stratified_split(seq_input, seq_output,
                                                          test_size=test_size, random_state=i+1,
                                                          scale_x=x_cts, scale_y=[])
    train_x, train_y = up_sampling(train_x, train_y)

    history_size = train_x.shape[1]
    x_dim = train_x.shape[2]
    y_dim = train_y.shape[1]
    save_path  = f'./h5/{model_name}_cv_{i+1}of{n_cv}.h5'

    # The files at save_path were produced from CNN1D models built with
    # model.build(), so the same class and build call are used here. The cell
    # previously built an RNN from rnn_layers/rnn_neurons/dnn_layers/dnn_neurons,
    # which are not defined anywhere in this notebook.
    model = CNN1D(history_size, x_dim, y_dim)
    model.build()
    model.load_model(save_path)

    # Round the predicted values to 0/1 before comparing with the true labels.
    prediction = model.model.predict(test_x, verbose=0)
    y_pred = prediction.round(0).astype(int)
    y_real = test_y

    # NOTE(review): confirm that the label order matches what plot_confusion expects.
    plot_confusion(y_real, y_pred, title=model_name+f"_{i+1}", label=["Positive","Negative"])