# 1. Preparation

## Libraries

In [None]:
#Connect to Google Drive
#Mounts Drive at /gdrive and makes it the working directory, so the relative
#"./My Drive/..." paths used by the later cells resolve inside the mount.
from google.colab import drive 
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [None]:
#Ignore warnings
%%capture
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#Essentials
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import random
import itertools
from collections import Counter
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

#Preprocessing and Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score,confusion_matrix, accuracy_score, classification_report, precision_recall_fscore_support, roc_curve
from sklearn.model_selection import StratifiedKFold,KFold
from imblearn.under_sampling import RandomUnderSampler

#Deep learning
%tensorflow_version 2.x
!pip install tensorflow-determinism
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Activation, AlphaDropout, LeakyReLU
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.keras.backend import sigmoid
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.optimizers import Adam, Nadam, Adagrad, SGD, RMSprop, Adadelta

#Reproducibility: pin every random source used below to one shared seed
SEED = 0
os.environ.update({
    'TF_DETERMINISTIC_OPS': '1',
    'PYTHONHASHSEED': str(SEED),
})
# tf.keras.backend.clear_session()
# tf.compat.v1.reset_default_graph()
for seed_everything in (random.seed, np.random.seed, tf.random.set_seed):
  seed_everything(SEED)

#Report whether TensorFlow sees the first GPU
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  print('GPU device usage is not active')

#Swish activation function
def swish(x, beta = 1):
    """Swish activation: the input multiplied by ``sigmoid(beta * x)``."""
    gate = sigmoid(beta * x)
    return x * gate
#Register under the name 'swish' so layers can refer to it by string
get_custom_objects()['swish'] = swish

## Functions

In [None]:
#Function for evaluation
def evaluate_dnn(model,testX,testY):
  """Print evaluation metrics for a fitted binary classifier and plot its confusion matrix.

  Args:
    model: fitted model whose ``evaluate(testX, testY)`` returns a
      ``(loss, accuracy)`` pair and whose ``predict(testX)`` returns one
      score per sample.
    testX: features of the evaluation set.
    testY: true binary labels (0/1) of the evaluation set.

  Returns:
    None. Results are printed and the confusion matrix is shown with matplotlib.
  """
  # Metrics and classification reports
  print("[INFO] Model Performance {}".format(model))
  test_loss, test_acc = model.evaluate(testX, testY)
  predictions = model.predict(testX)
  roc_auc = roc_auc_score(testY, predictions)
  #Threshold once at 0.5 so the report and the matrix use the same labels
  predicted_labels = [1 if i >=0.5 else 0 for i in predictions]
  print()
  print("[INFO] Classification Report")
  print("Test Loss : {0:.3f} \t Test Accuracy : {1:.3f}".format(test_loss, test_acc))
  print("ROC AUC   : {:.3f}".format(roc_auc))
  print(classification_report(testY, predicted_labels, target_names = ["0","1"]))
  print()

  #Confusion matrix
  print("[INFO] Confusion Matrix")
  LABELS = ["Negative", "Positive"]
  conf_matrix = confusion_matrix(testY, predicted_labels)
  plt.figure(figsize=(6, 6))
  sns.set(font_scale=1.4)  #changes the global seaborn font scale
  sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
  plt.title("Confusion matrix")
  plt.ylabel('True class')
  plt.xlabel('Predicted class')
  plt.show()

def sae_model(xt, xv = None, EPOCHS = 50, BATCH_SIZE = 32, opt = "adam", fr_node = 0.5,
              hl_node = 1024, lr = 0.01,af = "relu",num_layers = 3, do = 0.1,
              verbose = 0,return_fe = False):
  """Greedily pre-train a stacked autoencoder, one layer at a time.

  Each iteration trains a single-hidden-layer autoencoder (dropout on the
  input, a dense encoder, a linear decoder) to reconstruct its own input with
  MSE loss, then replaces ``xt``/``xv`` by the encoder output so the next
  autoencoder is trained on the previous layer's codes.

  Args:
    xt: training features (2-D, samples x features).
    xv: validation features; when None a copy of ``xt`` is used.
    EPOCHS: maximum number of epochs per autoencoder.
    BATCH_SIZE: mini-batch size per autoencoder.
    opt: optimizer identifier accepted by ``tf.keras.optimizers.get``.
    fr_node: width multiplier applied per layer (layer ``k`` has
      ``int(hl_node * fr_node**k)`` units).
    hl_node: number of units in the first encoder layer.
    lr: learning rate written into the optimizer.
    af: activation of the encoder layers.
    num_layers: number of autoencoders to stack.
    do: dropout rate applied to each autoencoder's input.
    verbose: verbosity forwarded to Keras; truthy values also print progress.
    return_fe: when True, also return the final encoded validation features.

  Returns:
    A list with one ``[kernel, bias]`` entry per encoder layer, or the tuple
    ``(weights, encoded_xv)`` when ``return_fe`` is True.
  """
  #If validation set is not present, use train set as validation set
  if xv is None :
    xv = xt.copy()
  opt = tf.keras.optimizers.get(opt) #Set optimizer
  K.set_value(opt.learning_rate, lr) #Set learning rate
  # NOTE(review): the same optimizer instance is compiled into every
  # autoencoder below; confirm this is supported by the installed Keras version.

  w_ae = []
  #Stacked Autoencoder architecture
  for n_layers in range(num_layers):
    #Autoencoder
    n_features = xt.shape[1]
    inp = Input(shape=(n_features,))
    #Honour the caller's dropout rate (it used to be fixed at 0.1)
    hidden_layer = Dropout(do)(inp)
    encoder_layer = Dense(int(hl_node*(fr_node**n_layers)), activation = af)
    enc = encoder_layer(hidden_layer)
    dec = Dense(n_features,activation="linear")(enc)
    ae = Model(inp, dec)

    ae.compile(optimizer=opt, loss='mean_squared_error')
    es = EarlyStopping(monitor='val_loss', patience=15, verbose=verbose)
    ae.fit(xt, xt,
           epochs=EPOCHS,batch_size=BATCH_SIZE,
           shuffle=True, callbacks = [es] , verbose = verbose,
           validation_data = (xv,xv))

    #Encode both sets; they become the input of the next autoencoder
    fe = Model(ae.input, enc)
    xt = fe.predict(xt)
    xv = fe.predict(xv)
    #Read the weights from the encoder object itself rather than by layer name
    w_ae.append(encoder_layer.get_weights())
    if verbose:
      print("Layer {} trained".format(n_layers+1))

  return (w_ae,xv) if return_fe else w_ae


def dnn_model(xt, sae_weights = None, EPOCHS = 50,BATCH_SIZE = 32, opt = "adam",
              hl_node = 1024, lr = 0.01,af = "relu",num_layers = 3, do=0, fr_node = 0.5):
  """Build and compile a feed-forward binary classifier.

  The network is BatchNorm -> Dropout on the input, followed by ``num_layers``
  blocks of Dense -> BatchNorm -> Dropout (block ``k`` has
  ``int(hl_node * fr_node**k)`` units) and a single sigmoid output unit.
  When ``sae_weights`` is given, its entries are copied in order into the
  layers whose name contains "dense". ``EPOCHS`` and ``BATCH_SIZE`` are
  accepted but not read here; training is done by the caller.

  Returns:
    The compiled Keras model (binary cross-entropy loss, accuracy metric).
  """
  #Optimizer with the requested learning rate
  optimizer = tf.keras.optimizers.get(opt)
  K.set_value(optimizer.learning_rate, lr)

  #Input block
  net_input = Input(shape=(xt.shape[1],))
  tensor = BatchNormalization()(net_input)
  tensor = Dropout(do)(tensor)

  #Hidden blocks with geometrically scaled widths
  for depth in range(num_layers):
    width = int(hl_node*(fr_node**depth))
    tensor = Dense(width, activation = af)(tensor)
    tensor = BatchNormalization()(tensor)
    tensor = Dropout(do)(tensor)

  net_output = Dense(1, activation = "sigmoid")(tensor)
  dnn = Model(net_input, net_output)

  #Optional initialisation from pre-trained autoencoder weights
  if sae_weights is not None:
    dense_layers = [layer for layer in dnn.layers if "dense" in layer.name]
    for pretrained, target_layer in zip(sae_weights, dense_layers):
      target_layer.set_weights(pretrained)

  dnn.compile(optimizer=optimizer, loss='binary_crossentropy',metrics=['accuracy'])
  return dnn

# def sae_dnn_model(xt, xv = None, EPOCHS = 50,BATCH_SIZE = 32,
#                   hl_node = 1024, lr = 0.01, af = "relu", num_layers=3):
#   params = {
#       "hl_node" : hl_node,
#       "lr" : lr,
#       "af" : af,
#       "num_layers":
#       "EPOCHS" : EPOCHS,
#       "BATCH_SIZE" : BATCH_SIZE,
#   }
#   sae_weights = sae_model(xt, xv, **params)
#   return dnn_model(xt, sae_weights = sae_weights, **params)

## Generate Feature Vector

In [None]:
!pip install propy3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
len(list0)

881

In [None]:
from propy.GetProteinFromUniprot import GetProteinSequence
from propy.PyPro import GetProDes
#Fetch the sequence for accession P48039, then keep only the descriptor values as a list
proteinsequence = GetProteinSequence("P48039")

descriptor_builder = GetProDes(proteinsequence)
target = list(descriptor_builder.GetDPComp().values())

In [None]:
target

[0.29,
 0.57,
 0.0,
 0.57,
 0.29,
 0.0,
 0.29,
 0.57,
 0.0,
 1.15,
 0.57,
 0.0,
 0.0,
 0.0,
 0.29,
 1.43,
 0.0,
 0.0,
 0.57,
 1.15,
 0.29,
 0.29,
 0.86,
 0.0,
 0.0,
 0.0,
 0.29,
 0.29,
 0.0,
 1.15,
 0.0,
 0.57,
 0.0,
 0.0,
 0.29,
 0.0,
 0.0,
 0.0,
 0.29,
 0.86,
 0.86,
 0.29,
 0.57,
 0.29,
 0.0,
 0.0,
 0.29,
 0.57,
 0.0,
 0.57,
 0.86,
 0.29,
 0.0,
 0.86,
 0.0,
 0.57,
 0.0,
 0.0,
 0.0,
 0.29,
 0.0,
 0.57,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.29,
 0.0,
 0.29,
 0.29,
 0.29,
 0.0,
 0.29,
 0.57,
 0.57,
 0.0,
 0.0,
 0.0,
 0.29,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.29,
 0.0,
 0.29,
 0.0,
 0.29,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.57,
 0.29,
 0.86,
 0.29,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.29,
 0.29,
 0.0,
 0.0,
 0.29,
 0.29,
 0.29,
 0.0,
 0.0,
 0.0,
 0.29,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.29,
 0.29,
 0.0,
 0.0,
 0.29,
 0.57,
 0.29,
 0.0,
 0.86,
 0.29,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.29,
 0.86,
 0.0,
 0.0,
 0.29,
 0.0,
 0.5

In [None]:
import pubchempy as pcp
ligand = pcp.Compound.from_cid(2920)
#The hex fingerprint is a 4-byte length prefix (8 hex chars), the 881
#fingerprint bits, then 7 padding bits. Skip the prefix, cut the padding and
#left-pad with zeros (bin() drops leading zeros) so bit i lands at index i.
temp = bin(int(ligand.fingerprint[8:], 16))
fp = temp[2:-7].zfill(881)
list0 = [float(bit) for bit in fp]
list0

[1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [None]:
from propy.GetProteinFromUniprot import GetProteinSequence
from propy.PyPro import GetProDes

# fetch the sequence for UniProt accession P48039
proteinsequence = GetProteinSequence("P48039")

# descriptor mapping returned by GetDPComp (kept as returned, not flattened)
descriptor_builder = GetProDes(proteinsequence)
target = descriptor_builder.GetDPComp()



In [None]:
#Load the interaction table and the three feature tables from Drive
cov_dti = pd.read_csv("./My Drive/Skripsi/Data/coronadata/cov_dti.csv")
cov_com_fp = pd.read_csv("./My Drive/Skripsi/Data/coronadata/cov_com_fingerprint.csv")
cov_pro_pf = pd.read_csv("./My Drive/Skripsi/Data/coronadata/cov_pro_pf.csv")
cov_pssm = pd.read_csv("./My Drive/Skripsi/Data/coronadata/cov_pssm.csv")

# cov_dti["Senyawa"] = cov_dti["Senyawa"].apply(lambda x : x.capitalize())
# cov_dti.drop(columns = ["Senyawa"], inplace = True)
#Compound ids are used as merge keys, so store them as strings in both tables
for table_with_cid in (cov_dti, cov_com_fp):
  table_with_cid["CID_senyawa"] = table_with_cid["CID_senyawa"].astype("str")

In [None]:
#Filling missing values with median
#numeric_only=True restricts the median to numeric columns; these frames also
#hold string key columns (Protein, CID_senyawa), which have no median.
cov_com_fp = cov_com_fp.fillna(cov_com_fp.median(numeric_only=True))
cov_pro_pf = cov_pro_pf.fillna(cov_pro_pf.median(numeric_only=True))
cov_pssm = cov_pssm.fillna(cov_pssm.median(numeric_only=True))

In [None]:
#Get all interaction
com_uni = cov_dti["CID_senyawa"].unique()
pro_uni = cov_dti["Protein"].unique()
# selected_pairs = cov_dti[cov_dti["CID_senyawa"].isin(cov_com_feat["CID_senyawa"].unique())]
#Known (compound, protein) pairs, taken from the two key columns by name so the
#result does not depend on column order or on extra columns in cov_dti
known_pair = [tuple(x) for x in cov_dti[["CID_senyawa","Protein"]].to_numpy()]
known_pair_lookup = set(known_pair)  #constant-time membership tests

#Cartesian product compound x protein: every compound is repeated once per
#protein while the protein list is cycled once per compound
com2 = np.repeat(com_uni, len(pro_uni))
pro2 = np.tile(pro_uni, len(com_uni))

df = pd.DataFrame({"CID_senyawa": com2, "Protein": pro2})

#Label 1 when the pair is a known interaction, 0 otherwise
df["class"] = [1 if pair in known_pair_lookup else 0 for pair in zip(df["CID_senyawa"], df["Protein"])]

In [None]:
#Assemble the pair-level feature vector: compound features + selected protein features
cov_pro_combine = cov_pro_pf.merge(cov_pssm, how="inner", on="Protein")
select_com_feat = cov_com_fp
# select_pro_feat = cov_pro_combine.loc[:,["Protein"]+[i for i in cov_pro_combine.columns if any(z in i for z in ["G1","G2","G3","lag1"])]]
#Keep the key column plus every protein column whose name contains "lag1"
lag1_columns = [col for col in cov_pro_combine.columns if "lag1" in col]
select_pro_feat = cov_pro_combine.loc[:, ["Protein"] + lag1_columns]
pairs_with_compound = df.merge(select_com_feat, how="inner", on="CID_senyawa")
cov_feature_vector = pairs_with_compound.merge(select_pro_feat, how="inner", on="Protein").drop_duplicates()

#Print dataset information
known_count = len(cov_feature_vector[cov_feature_vector["class"]==1])
total_count = len(cov_feature_vector)
print("Known Interaction :", known_count)
print("All Interaction   :", total_count)
print("Minority Class    : {:.3f}%".format((known_count/total_count)*100))
print()
print("Available Protein :", cov_feature_vector["Protein"].nunique())
print("Protein Features  :", select_pro_feat.shape[1]-1)
print()
print("Available Compound:", cov_feature_vector["CID_senyawa"].nunique())
print("Compound Features :", select_com_feat.shape[1]-1)

Known Interaction : 712
All Interaction   : 39975
Minority Class    : 1.781%

Available Protein : 325
Protein Features  : 400

Available Compound: 123
Compound Features : 881


In [None]:
cov_feature_vector["class"]

0        1
1        1
2        1
3        1
4        1
        ..
39970    0
39971    0
39972    0
39973    0
39974    1
Name: class, Length: 39975, dtype: int64

In [None]:
#Labels and min-max scaled features for the complete pair table
y_all = cov_feature_vector["class"]
X_all = cov_feature_vector.drop(columns = ["Protein", "CID_senyawa", "class"])

scaler_all = MinMaxScaler()
scaled_all = scaler_all.fit_transform(X_all)
X_all = pd.DataFrame(data = scaled_all, columns = X_all.columns)

#Use all data, then split
# #Data splitting, labelling, and normalizing
# X = cov_feature_vector.drop(columns = ["Protein", "CID_senyawa", "class"])
# y = cov_feature_vector["class"]
# le = LabelEncoder()
# y = le.fit_transform(y)

# X_train, X_test, y_train, y_test = train_test_split(X,y, stratify = y, test_size = 0.1)

# scaler = MinMaxScaler()
# X_train_mm = pd.DataFrame(data = scaler.fit_transform(X_train), columns = X_train.columns)
# X_test_mm = pd.DataFrame(data = scaler.transform(X_test), columns = X_test.columns)

# del X,y, X_train, X_test

In [None]:
#Random Undersampling
X = cov_feature_vector.drop(columns = ["class"])
y = cov_feature_vector["class"]  #selected by name instead of by column position

## 1:5 Ratio
rus = RandomUnderSampler(random_state=42, sampling_strategy = 0.2)
X_res, y_res = rus.fit_resample(X, y)
rand_stratify = pd.concat([pd.DataFrame(X_res, columns = X.columns),pd.Series(y_res,name="class")], axis = 1)

#10% of unknown interaction
# rand_stratify = pd.concat([cov_feature_vector[cov_feature_vector["class"] == 1], cov_feature_vector[cov_feature_vector["class"] == 0].sample(frac=0.1, random_state = 42)])

X_train_rus = rand_stratify.drop(columns = ["Protein", "CID_senyawa", "class"])
y_train_rus = rand_stratify["class"]

#Held-out set = every row the sampler did NOT pick. sample_indices_ holds the
#positions of the selected rows in X, so mask them out directly; comparing the
#two frames with DataFrame.isin matches cells by aligned labels, not whole rows.
held_out = np.ones(len(X), dtype=bool)
held_out[rus.sample_indices_] = False
X_test_rus = X[held_out].reset_index(drop = True)
#Only the majority class is undersampled, so all held-out rows are class 0
y_test_rus = [0]*X_test_rus.shape[0]

rus_scaler = MinMaxScaler()
X_train_rus_mm = pd.DataFrame(data = rus_scaler.fit_transform(X_train_rus), columns = X_train_rus.columns)
X_test_rus_features = X_test_rus.drop(columns = ["CID_senyawa", "Protein"])
X_test_rus_mm = pd.DataFrame(data = rus_scaler.transform(X_test_rus_features), columns = X_test_rus_features.columns)

#Print dataset information
print("Known Interaction  :", len(rand_stratify[rand_stratify["class"]==1]))
print("Unknown Interaction:", len(rand_stratify[rand_stratify["class"]==0]))
print()
# print("Test data :", len(X_test_rus))

#Data splitting, labelling, and normalizing
X = rand_stratify.drop(columns = ["Protein", "CID_senyawa", "class"])
y = rand_stratify["class"]
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify = y, test_size = 0.2)

#The scaler is fitted on the training split only and then applied to the test split
scaler = MinMaxScaler(feature_range = (0,1))
X_train_mm = pd.DataFrame(data = scaler.fit_transform(X_train), columns = X_train.columns)
X_test_mm = pd.DataFrame(data = scaler.transform(X_test), columns = X_test.columns)

del X,y, X_train, X_test

#From here on cov_feature_vector refers to the undersampled table
cov_feature_vector = rand_stratify.copy()

Known Interaction  : 712
Unknown Interaction: 3560



In [None]:
# dti_faldi = cov_feature_vector[cov_feature_vector["class"]==1][["Protein","CID_senyawa"]]
# dti_nabila = pd.read_csv("./My Drive/Skripsi/Data/coronadata/dti_latih.csv")
# cov_dti_nabila = pd.read_excel("./My Drive/Skripsi/Data/coronadata/cov_dti_nabila.xlsx", header= None)
# dti_nabila.columns = ["CID_senyawa","Protein"]
# cov_dti_nabila.columns = ["CID_senyawa","Senyawa","Protein"]

In [None]:
# sorted([i for i in dti_nabila["Protein"].unique() if i not in dti_faldi["Protein"].values])

In [None]:
# [i for i in cov_dti_nabila["Protein"].unique() if str(i) not in cov_dti["Protein"].values]

# Cross Validation

In [None]:
|-activation: relu
|-batch_size: 32
|-learning_rate: 0.0001
|-num_layers: 2
|-units: 1024

|-activation: relu
|-batch_size: 32
|-learning_rate: 0.0001
|-num_layers: 2
|-units: 1024

|-activation: relu
|-batch_size: 32
|-dropout_rate: 0.5
|-learning_rate: 0.0001
|-num_layers: 3
|-optimizer: nadam
|-units: 1024

|-activation: relu
|-batch_size: 8
|-dropout_rate: 0.5
|-learning_rate: 0.01
|-num_layers: 2
|-optimizer: adam
|-units: 256

## DC

In [None]:
# |-activation: relu
# |-batch_size: 8
# |-dropout_rate: 0.5
# |-fraction_node: 0.5
# |-learning_rate: 0.01
# |-num_layers: 2
# |-units: 300

In [None]:
!pip install propy3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting propy3
  Downloading propy3-1.1.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 4.6 MB/s 
[?25hInstalling collected packages: propy3
Successfully installed propy3-1.1.1


In [None]:
import pubchempy as pcp
from propy.GetProteinFromUniprot import GetProteinSequence
from propy.PyPro import GetProDes
import pandas as pd

ligand = pcp.Compound.from_cid(323)
#The hex fingerprint is a 4-byte length prefix (8 hex chars), the 881
#fingerprint bits, then 7 padding bits. Skip the prefix, cut the padding and
#left-pad with zeros (bin() drops leading zeros) so bit i lands at index i.
temp = bin(int(ligand.fingerprint[8:], 16))
fp = temp[2:-7].zfill(881)
list0 = [float(bit) for bit in fp]

# download the protein sequence by uniprot id
proteinsequence = GetProteinSequence("P48039")

target = GetProDes(proteinsequence).GetDPComp()
target = [*target.values()]

#Single-row frame: ligand bits first, protein descriptor values after
# NOTE(review): the training table uses the "lag1" protein columns and min-max
# scaling; confirm these raw GetDPComp values are the intended model input.
ligand_protein = list0 + target

input0 = pd.DataFrame(ligand_protein).transpose()

In [None]:
input0

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280
0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,2.01,0.86,0.0,0.86,0.57,1.15,0.29,0.0,0.29,1.72


In [None]:
import pickle
import numpy as np
#Context manager closes the file even if unpickling fails.
# NOTE: unpickling runs arbitrary code; only load files you produced yourself.
# NOTE(review): this reads 'sae_dnn.pkl' from the working directory, while the
# dump cell writes to './My Drive/Skripsi/Data/coronadata/sae_dnn.pkl' - confirm the path.
with open('sae_dnn.pkl', 'rb') as model_file:
  model = pickle.load(model_file)
# app = Flask(__name__)

prediction = model.predict([[input0]])
output = np.round(prediction[0], 10)
output[0]



0.28095442

In [None]:
X_test_mm.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,...,K.Q.lag1,M.Q.lag1,F.Q.lag1,P.Q.lag1,S.Q.lag1,T.Q.lag1,W.Q.lag1,Y.Q.lag1,V.Q.lag1,G.E.lag1,H.E.lag1,I.E.lag1,L.E.lag1,K.E.lag1,M.E.lag1,F.E.lag1,P.E.lag1,S.E.lag1,T.E.lag1,W.E.lag1,Y.E.lag1,V.E.lag1,H.G.lag1,I.G.lag1,L.G.lag1,K.G.lag1,M.G.lag1,F.G.lag1,P.G.lag1,S.G.lag1,T.G.lag1,W.G.lag1,Y.G.lag1,V.G.lag1,I.H.lag1,L.H.lag1,K.H.lag1,M.H.lag1,F.H.lag1,P.H.lag1,S.H.lag1,T.H.lag1,W.H.lag1,Y.H.lag1,V.H.lag1,L.I.lag1,K.I.lag1,M.I.lag1,F.I.lag1,P.I.lag1,S.I.lag1,T.I.lag1,W.I.lag1,Y.I.lag1,V.I.lag1,K.L.lag1,M.L.lag1,F.L.lag1,P.L.lag1,S.L.lag1,T.L.lag1,W.L.lag1,Y.L.lag1,V.L.lag1,M.K.lag1,F.K.lag1,P.K.lag1,S.K.lag1,T.K.lag1,W.K.lag1,Y.K.lag1,V.K.lag1,F.M.lag1,P.M.lag1,S.M.lag1,T.M.lag1,W.M.lag1,Y.M.lag1,V.M.lag1,P.F.lag1,S.F.lag1,T.F.lag1,W.F.lag1,Y.F.lag1,V.F.lag1,S.P.lag1,T.P.lag1,W.P.lag1,Y.P.lag1,V.P.lag1,T.S.lag1,W.S.lag1,Y.S.lag1,V.S.lag1,W.T.lag1,Y.T.lag1,V.T.lag1,Y.W.lag1,V.W.lag1,V.Y.lag1
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.293024,0.259958,0.357832,0.30798,0.429858,0.441594,0.390912,0.310321,0.427482,0.335988,0.341895,0.349073,0.318503,0.295263,0.289157,0.412426,0.328731,0.431413,0.459299,0.471659,0.423459,0.429698,0.309307,0.504998,0.500149,0.35031,0.540071,0.404682,0.525549,0.351516,0.396578,0.356215,0.310021,0.415934,0.324061,0.376089,0.323041,0.397421,0.358677,0.305232,0.486508,0.501232,0.347306,0.33313,0.398452,0.403419,0.594172,0.395671,0.394843,0.535496,0.526429,0.407957,0.470391,0.492347,0.303987,0.612889,0.367244,0.408634,0.528212,0.587002,0.404216,0.493966,0.498415,0.293241,0.348787,0.532374,0.278396,0.383469,0.393527,0.472056,0.476658,0.521534,0.445078,0.431471,0.568222,0.424242,0.518344,0.559812,0.32238,0.714478,0.524807,0.514106,0.46766,0.447465,0.357204,0.458585,0.303262,0.730267,0.669139,0.318119,0.376834,0.514729,0.421592,0.431067,0.571747,0.519178,0.513422,0.421268,0.394813,0.429494
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.368258,0.440858,0.430362,0.382828,0.399812,0.482546,0.408355,0.359603,0.571978,0.160757,0.357573,0.456936,0.455544,0.360728,0.412219,0.484881,0.389193,0.384071,0.48357,0.504223,0.481531,0.519515,0.230882,0.423444,0.424797,0.253668,0.332688,0.485554,0.221924,0.148617,0.204649,0.444495,0.393819,0.244939,0.435198,0.491025,0.406147,0.486442,0.352008,0.48642,0.461834,0.580687,0.265604,0.326211,0.5713,0.512895,0.698993,0.471326,0.608416,0.505527,0.592855,0.477241,0.752372,0.685787,0.364605,0.737153,0.455916,0.62912,0.504889,0.671004,0.481935,0.770142,0.704314,0.361824,0.634365,0.683049,0.369158,0.368038,0.452183,0.49629,0.612162,0.724237,0.612552,0.415248,0.604294,0.493717,0.715668,0.722561,0.380826,0.847957,0.638372,0.702471,0.642224,0.576598,0.471055,0.363916,0.24898,0.587311,0.590625,0.285251,0.283089,0.503768,0.444269,0.384909,0.604877,0.575201,0.559276,0.508245,0.518983,0.574254
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.464665,0.40751,0.436038,0.577985,0.543909,0.564992,0.529706,0.498388,0.392961,0.323386,0.576413,0.306971,0.326416,0.470562,0.401028,0.492228,0.614773,0.547512,0.577924,0.629231,0.643751,0.373265,0.388905,0.275296,0.293144,0.421412,0.338386,0.310634,0.451831,0.276123,0.313358,0.365117,0.343622,0.2021,0.272859,0.346088,0.530349,0.476506,0.432241,0.77484,0.670124,0.719831,0.475481,0.539555,0.363182,0.288235,0.465235,0.287812,0.336948,0.364798,0.315423,0.246157,0.467656,0.478275,0.212325,0.483119,0.286514,0.381126,0.36098,0.356215,0.245867,0.512228,0.521883,0.219144,0.535568,0.622428,0.516997,0.472975,0.494683,0.53864,0.76187,0.507282,0.441643,0.36672,0.412923,0.307546,0.575143,0.647035,0.253934,0.657767,0.41472,0.422107,0.563587,0.546737,0.322804,0.514988,0.274087,0.704773,0.767502,0.195161,0.382056,0.524653,0.547338,0.249831,0.504118,0.550609,0.329275,0.586693,0.427759,0.44445
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.356151,0.323848,0.375526,0.435336,0.456549,0.488699,0.437241,0.362415,0.389957,0.323367,0.40123,0.307764,0.288919,0.344982,0.302485,0.405584,0.45735,0.448096,0.489344,0.507953,0.464159,0.342955,0.333624,0.42072,0.416769,0.372993,0.459656,0.386284,0.507646,0.292632,0.353496,0.388178,0.353803,0.302202,0.386928,0.420754,0.392978,0.469027,0.365261,0.518888,0.526753,0.595558,0.328163,0.372649,0.483625,0.365281,0.517668,0.359044,0.408024,0.389546,0.509852,0.385612,0.549306,0.502835,0.269076,0.552385,0.341959,0.426548,0.371546,0.570679,0.381531,0.568897,0.52119,0.26308,0.436828,0.559997,0.409266,0.410917,0.447181,0.496515,0.576722,0.482713,0.471254,0.34714,0.617217,0.434469,0.605296,0.607878,0.2967,0.572229,0.575193,0.564863,0.522754,0.482887,0.343052,0.425758,0.273873,0.762336,0.682424,0.234961,0.385792,0.472544,0.414248,0.334846,0.502924,0.487503,0.463331,0.480801,0.41395,0.46411
4,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.403063,0.449958,0.35077,0.520756,0.489287,0.509985,0.278006,0.316818,0.547041,0.369096,0.401864,0.461409,0.44892,0.388612,0.450627,0.397024,0.552608,0.47749,0.526173,0.341172,0.400439,0.535752,0.344968,0.513681,0.502919,0.37974,0.566506,0.353809,0.519482,0.265099,0.3229,0.272683,0.294028,0.368903,0.376384,0.41711,0.41287,0.46439,0.312255,0.59453,0.541771,0.576914,0.219503,0.314002,0.48653,0.353437,0.69545,0.352011,0.422756,0.422714,0.528473,0.338006,0.573526,0.525212,0.268139,0.744637,0.329811,0.431025,0.395698,0.598685,0.329974,0.577654,0.537698,0.261555,0.589275,0.480382,0.506023,0.446055,0.468953,0.314262,0.46279,0.674199,0.457365,0.368905,0.624881,0.361094,0.557365,0.600378,0.29739,0.523286,0.457028,0.381458,0.496923,0.454194,0.282179,0.492311,0.306488,0.408899,0.455587,0.258924,0.305683,0.331059,0.292795,0.394347,0.340689,0.365051,0.44611,0.402498,0.263965,0.356758


In [None]:
#10-fold stratified cross-validation of the DNN initialised with SAE weights
X = cov_feature_vector.drop(columns = ["Protein", "CID_senyawa", "class"]).copy()
y = cov_feature_vector["class"].copy()

le = LabelEncoder()
y = le.fit_transform(y)

#Per-fold results: accuracy, recall, precision, ROC-AUC, F1 (in this order)
res_all = [[],[],[],[],[]]
auc_plots = []
cv_count = 10
#Pre-train the stacked autoencoder once; its encoder weights seed every fold's DNN
sae_weights = sae_model(xt = X_all, xv = X.astype(float), EPOCHS = 100, af = "relu", lr=0.01, num_layers = 2, hl_node= 256, BATCH_SIZE=8, opt = "adam", do = 0.5)
#Initiate Cross-Validation
cv = StratifiedKFold(n_splits=cv_count, random_state=42,shuffle=True)
for train_ind, test_ind in cv.split(X,y):
  #Train the model
  X_train,y_train = X.iloc[train_ind,:],y[train_ind]
  X_test,y_test = X.iloc[test_ind,:],y[test_ind]

  #Normalizing: the scaler is fitted on the training fold only
  scaler = MinMaxScaler()
  X_train_mm = pd.DataFrame(data = scaler.fit_transform(X_train), columns = X_train.columns)
  X_test_mm = pd.DataFrame(data = scaler.transform(X_test), columns = X_test.columns)

  #Fitting model; sae_weights must be passed or the pre-training is never used
  # NOTE(review): dnn_model does not read EPOCHS/BATCH_SIZE; training runs with
  # the batch_size=32 given to fit() below, not 8 - confirm which is intended.
  sae_dnn = dnn_model(xt = X_train_mm, sae_weights = sae_weights, EPOCHS = 100, af = "relu", lr=0.01, num_layers = 2, hl_node= 256, BATCH_SIZE=8, opt = "adam", do = 0.5)
  sae_dnn.fit(X_train_mm,y_train,epochs=100,batch_size=32,verbose = False)

  #Predict
  y_pred_proba = sae_dnn.predict(X_test_mm)
  y_pred = [1 if elem >= 0.5 else 0 for elem in y_pred_proba]

  #Calculate metrics (local names, so the imported sklearn functions are not overwritten)
  accu = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred_proba)
  precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary',pos_label=1)

  for bucket, value in zip(res_all, (accu, recall, precision, auc, f1)):
    bucket.append(value)
  fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
  auc_plots.append([fpr,tpr,auc])

#Average and Stdv of k-fold CV
print('Average Result of {} CV'.format(cv_count))
print('Accuracy    : {0:.5f}±{1:.3f}'.format(np.mean(res_all[0]), np.std(res_all[0])))
print('Recall      : {0:.5f}±{1:.3f}'.format(np.mean(res_all[1]), np.std(res_all[1])))
print('Precision   : {0:.5f}±{1:.3f}'.format(np.mean(res_all[2]), np.std(res_all[2])))
print('ROC-AUC     : {0:.5f}±{1:.3f}'.format(np.mean(res_all[3]), np.std(res_all[3])))
print('F1 Score    : {0:.5f}±{1:.3f}'.format(np.mean(res_all[4]), np.std(res_all[4])))
print('===================================')

#Save CV result and choose auc plot with highest score
best_auc_dc = auc_plots[np.array(res_all[3]).argmax()]
res_all_dc = res_all

Average Result of 10 CV
Accuracy    : 0.92627±0.012
Recall      : 0.77392±0.039
Precision   : 0.78496±0.053
ROC-AUC     : 0.95469±0.009
F1 Score    : 0.77817±0.034


In [None]:
Average Result of 10 CV
Accuracy    : 0.92674±0.013
Recall      : 0.75855±0.042
Precision   : 0.79645±0.054
ROC-AUC     : 0.95293±0.012
F1 Score    : 0.77568±0.036

In [None]:
#Save whatever model `sae_dnn` currently refers to (the cross-validation loops
#rebind it on every fold, so this is the most recently trained fold's model)
sae_dnn.save('./My Drive/Skripsi/Data/coronadata/model_sae_dnn')

In [None]:
import pickle
#Context manager flushes and closes the file once the dump finishes (or fails)
with open('./My Drive/Skripsi/Data/coronadata/sae_dnn.pkl','wb') as file:
  pickle.dump(sae_dnn,file)

In [None]:
#Mounts Drive again, this time at /content/drive.
# NOTE(review): the other cells read and write through "./My Drive/..." after
# changing into /gdrive - confirm this second mount point is intended.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
Average Result of 10 CV
Accuracy    : 0.93984±0.009
Recall      : 0.83016±0.043
Precision   : 0.81733±0.050
ROC-AUC     : 0.96979±0.009
F1 Score    : 0.82162±0.022

In [None]:
#10-fold stratified cross-validation of the DNN without SAE pre-training
X = cov_feature_vector.drop(columns = ["Protein", "CID_senyawa", "class"]).copy()
y = cov_feature_vector["class"].copy()

le = LabelEncoder()
y = le.fit_transform(y)

#Per-fold results: accuracy, recall, precision, ROC-AUC, F1 (in this order)
res_all = [[],[],[],[],[]]
auc_plots = []
cv_count = 10
# sae_weights = sae_model(xt = X_all, xv = X.astype(float), EPOCHS = 100, af = "relu", lr=0.01, num_layers = 2, hl_node= 300, BATCH_SIZE=8, opt = "adam", do = 0.5)
#Initiate Cross-Validation
cv = StratifiedKFold(n_splits=cv_count, random_state=42,shuffle=True)
for train_ind, test_ind in cv.split(X,y):
  #Train the model
  X_train,y_train = X.iloc[train_ind,:],y[train_ind]
  X_test,y_test = X.iloc[test_ind,:],y[test_ind]

  #Normalizing: the scaler is fitted on the training fold only
  scaler = MinMaxScaler()
  X_train_mm = pd.DataFrame(data = scaler.fit_transform(X_train), columns = X_train.columns)
  X_test_mm = pd.DataFrame(data = scaler.transform(X_test), columns = X_test.columns)

  #Fitting model (randomly initialised: no sae_weights are passed)
  # NOTE(review): dnn_model does not read EPOCHS/BATCH_SIZE; training runs with
  # the batch_size=32 given to fit() below, not 8 - confirm which is intended.
  sae_dnn = dnn_model(xt = X_train_mm, EPOCHS = 100, af = "relu", lr=0.01, num_layers = 2, hl_node= 300, BATCH_SIZE=8, opt = "adam", do = 0.5)
  sae_dnn.fit(X_train_mm,y_train,epochs=100,batch_size=32,verbose = False)

  #Predict
  y_pred_proba = sae_dnn.predict(X_test_mm)
  y_pred = [1 if elem >= 0.5 else 0 for elem in y_pred_proba]

  #Calculate metrics (local names, so the imported sklearn functions are not overwritten)
  accu = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred_proba)
  precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary',pos_label=1)

  for bucket, value in zip(res_all, (accu, recall, precision, auc, f1)):
    bucket.append(value)
  fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
  auc_plots.append([fpr,tpr,auc])

#Average and Stdv of k-fold CV
print('Average Result of {} CV'.format(cv_count))
print('Accuracy    : {0:.5f}±{1:.3f}'.format(np.mean(res_all[0]), np.std(res_all[0])))
print('Recall      : {0:.5f}±{1:.3f}'.format(np.mean(res_all[1]), np.std(res_all[1])))
print('Precision   : {0:.5f}±{1:.3f}'.format(np.mean(res_all[2]), np.std(res_all[2])))
print('ROC-AUC     : {0:.5f}±{1:.3f}'.format(np.mean(res_all[3]), np.std(res_all[3])))
print('F1 Score    : {0:.5f}±{1:.3f}'.format(np.mean(res_all[4]), np.std(res_all[4])))
print('===================================')

#Choose auc plot with highest score
best_auc_dnn = auc_plots[np.array(res_all[3]).argmax()]
res_all_dnn = res_all

In [None]:
Average Result of 10 CV
Accuracy    : 0.93844±0.014
Recall      : 0.83009±0.054
Precision   : 0.80829±0.049
ROC-AUC     : 0.97021±0.008
F1 Score    : 0.81791±0.041
===================================

Average Result of 10 CV
Accuracy    : 0.93984±0.009
Recall      : 0.83016±0.043
Precision   : 0.81733±0.050
ROC-AUC     : 0.96979±0.009
F1 Score    : 0.82162±0.022

Average Result of 10 CV #Without SAE
Accuracy    : 0.93727±0.010
Recall      : 0.83856±0.051
Precision   : 0.79878±0.041
ROC-AUC     : 0.96783±0.009
F1 Score    : 0.81656±0.028
===================================

In [None]:
#Stratified 10-fold cross-validation of the DNN on cov_feature_vector.
#Features are every column except "Protein", "CID_senyawa" and "class"; the target is "class".
X = cov_feature_vector.drop(columns=["Protein", "CID_senyawa", "class"]).copy()
y = cov_feature_vector["class"].copy()

#Labels are encoded once here. Re-fitting the encoder inside the fold loop on the
#already-encoded y would replace le with one whose classes_ are the integer codes,
#losing the original class names.
le = LabelEncoder()
y = le.fit_transform(y)

#res_all rows: 0=accuracy, 1=recall, 2=precision, 3=ROC-AUC, 4=F1 (one value per fold)
res_all = [[], [], [], [], []]
auc_plots = []
y_pred_proba_all = 0
cv_count = 10
#sae_model and X_all are defined elsewhere in the notebook. sae_weights is not
#referenced again in this cell.
#NOTE(review): presumably dnn_model picks sae_weights up as a global - verify.
sae_weights = sae_model(xt=X_all, xv=X.astype(float), EPOCHS=100, af="relu", lr=0.01, num_layers=2, hl_node=300, BATCH_SIZE=32, opt="adam", do=0.5)

#Initiate Cross-Validation
cv = StratifiedKFold(n_splits=cv_count, random_state=42, shuffle=True)
for c, (train_ind, test_ind) in enumerate(cv.split(X, y), start=1):
  #Split features and labels for this fold
  X_train, y_train = X.iloc[train_ind, :], y[train_ind]
  X_test, y_test = X.iloc[test_ind, :], y[test_ind]

  #Normalizing: the scaler is fitted on the training fold only and then applied to the test fold
  scaler = MinMaxScaler()
  X_train_mm = pd.DataFrame(data=scaler.fit_transform(X_train), columns=X_train.columns)
  X_test_mm = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns)

  #Fitting model (a new model is built for every fold)
  sae_dnn = dnn_model(xt=X_train_mm, EPOCHS=100, af="relu", lr=0.01, num_layers=2, hl_node=300, BATCH_SIZE=32, opt="adam", do=0.5)
  # es = EarlyStopping(monitor='val_loss', patience=25)
  sae_dnn.fit(X_train_mm, y_train, epochs=100, batch_size=32, verbose=False)

  #Predict: scores of at least 0.5 are labelled 1, everything else 0
  y_pred_proba = sae_dnn.predict(X_test_mm)
  y_pred = (np.ravel(y_pred_proba) >= 0.5).astype(int)
  # y_pred_proba_all += y_pred_proba

  #Calculate metrics
  accu = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred_proba)
  #Short names on purpose: binding these results to precision_score / recall_score /
  #f1_score would overwrite the sklearn functions imported under those names.
  prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label=1)
  #Recall of class 0, i.e. specificity (computed but not aggregated below)
  _, speci, _, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label=0)

  res_all[0].append(accu)
  res_all[1].append(rec)
  res_all[2].append(prec)
  res_all[3].append(auc)
  res_all[4].append(f1)
  fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
  auc_plots.append([fpr, tpr, auc])

  #Show metrics (c is the 1-based fold number)
  print("CV : {}".format(c))
  print("Accuracy  : {:.3f}".format(accu))
  print("Recall    : {:.3f}".format(rec))
  print("Precision : {:.3f}".format(prec))
  print("ROC-AUC   : {:.3f}".format(auc))
  print("F1_Score  : {:.3f}".format(f1))
  print(confusion_matrix(y_test, y_pred))
  print("===================================")
  print("===================================")

#Average and Stdv of k-fold CV (names are listed in the same order as the rows of res_all)
print('Average Result of {} CV'.format(cv_count))
metric_names = ['Accuracy', 'Recall', 'Precision', 'ROC-AUC', 'F1 Score']
for metric_name, fold_scores in zip(metric_names, res_all):
  print('{0:<12}: {1:.5f}±{2:.3f}'.format(metric_name, np.mean(fold_scores), np.std(fold_scores)))
print('===================================')

#Save CV result and choose auc plot with highest score
best_auc_dc = auc_plots[np.array(res_all[3]).argmax()]
#res_all_dc refers to the same list object as res_all (no copy is made)
res_all_dc = res_all

In [None]:
#Stratified 10-fold cross-validation of the DNN on cov_feature_vector.
#Features are every column except "Protein", "CID_senyawa" and "class"; the target is "class".
#NOTE(review): this cell repeats the preceding cell and overwrites best_auc_dc /
#res_all_dc - confirm whether a different input or result name was intended.
X = cov_feature_vector.drop(columns=["Protein", "CID_senyawa", "class"]).copy()
y = cov_feature_vector["class"].copy()

#Labels are encoded once here. Re-fitting the encoder inside the fold loop on the
#already-encoded y would replace le with one whose classes_ are the integer codes,
#losing the original class names.
le = LabelEncoder()
y = le.fit_transform(y)

#res_all rows: 0=accuracy, 1=recall, 2=precision, 3=ROC-AUC, 4=F1 (one value per fold)
res_all = [[], [], [], [], []]
auc_plots = []
y_pred_proba_all = 0
cv_count = 10
#sae_model and X_all are defined elsewhere in the notebook. sae_weights is not
#referenced again in this cell.
#NOTE(review): presumably dnn_model picks sae_weights up as a global - verify.
sae_weights = sae_model(xt=X_all, xv=X.astype(float), EPOCHS=100, af="relu", lr=0.01, num_layers=2, hl_node=300, BATCH_SIZE=32, opt="adam", do=0.5)

#Initiate Cross-Validation
cv = StratifiedKFold(n_splits=cv_count, random_state=42, shuffle=True)
for c, (train_ind, test_ind) in enumerate(cv.split(X, y), start=1):
  #Split features and labels for this fold
  X_train, y_train = X.iloc[train_ind, :], y[train_ind]
  X_test, y_test = X.iloc[test_ind, :], y[test_ind]

  #Normalizing: the scaler is fitted on the training fold only and then applied to the test fold
  scaler = MinMaxScaler()
  X_train_mm = pd.DataFrame(data=scaler.fit_transform(X_train), columns=X_train.columns)
  X_test_mm = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns)

  #Fitting model (a new model is built for every fold)
  sae_dnn = dnn_model(xt=X_train_mm, EPOCHS=100, af="relu", lr=0.01, num_layers=2, hl_node=300, BATCH_SIZE=32, opt="adam", do=0.5)
  # es = EarlyStopping(monitor='val_loss', patience=25)
  sae_dnn.fit(X_train_mm, y_train, epochs=100, batch_size=32, verbose=False)

  #Predict: scores of at least 0.5 are labelled 1, everything else 0
  y_pred_proba = sae_dnn.predict(X_test_mm)
  y_pred = (np.ravel(y_pred_proba) >= 0.5).astype(int)
  # y_pred_proba_all += y_pred_proba

  #Calculate metrics
  accu = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred_proba)
  #Short names on purpose: binding these results to precision_score / recall_score /
  #f1_score would overwrite the sklearn functions imported under those names.
  prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label=1)
  #Recall of class 0, i.e. specificity (computed but not aggregated below)
  _, speci, _, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label=0)

  res_all[0].append(accu)
  res_all[1].append(rec)
  res_all[2].append(prec)
  res_all[3].append(auc)
  res_all[4].append(f1)
  fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
  auc_plots.append([fpr, tpr, auc])

  #Show metrics (c is the 1-based fold number)
  print("CV : {}".format(c))
  print("Accuracy  : {:.3f}".format(accu))
  print("Recall    : {:.3f}".format(rec))
  print("Precision : {:.3f}".format(prec))
  print("ROC-AUC   : {:.3f}".format(auc))
  print("F1_Score  : {:.3f}".format(f1))
  print(confusion_matrix(y_test, y_pred))
  print("===================================")
  print("===================================")

#Average and Stdv of k-fold CV (names are listed in the same order as the rows of res_all)
print('Average Result of {} CV'.format(cv_count))
metric_names = ['Accuracy', 'Recall', 'Precision', 'ROC-AUC', 'F1 Score']
for metric_name, fold_scores in zip(metric_names, res_all):
  print('{0:<12}: {1:.5f}±{2:.3f}'.format(metric_name, np.mean(fold_scores), np.std(fold_scores)))
print('===================================')

#Save CV result and choose auc plot with highest score
best_auc_dc = auc_plots[np.array(res_all[3]).argmax()]
#res_all_dc refers to the same list object as res_all (no copy is made)
res_all_dc = res_all