In [1]:
from google.colab import drive
drive.mount('/content/drive/')


Mounted at /content/drive/


In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [2]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import pandas as pd
import math

def categorical_probas_to_classes(p):
    return np.argmax(p, axis=1)

def to_categorical(y, nb_classes=None):
    '''Convert class vector (integers from 0 to nb_classes)
    to binary class matrix, for use with categorical_crossentropy.
    '''
    y = np.array(y, dtype='int')
    if not nb_classes:
        nb_classes = np.max(y) + 1
    Y = np.zeros((len(y), nb_classes))
    for i in range(len(y)):
        Y[i, y[i]] = 1.
    return Y

def calculate_performance(test_num, pred_y, labels, pred_probas=None):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for index in range(test_num):
        if labels[index] == 1:
            if labels[index] == pred_y[index]:
                tp = tp + 1
            else:
                fn = fn + 1
        else:
            if labels[index] == pred_y[index]:
                tn = tn + 1
            else:
                fp = fp + 1

    acc = float(tp + tn) / test_num
    precision = float(tp) / (tp + fp + 1e-06)
    npv = float(tn) / (tn + fn + 1e-06)
    sensitivity = float(tp) / (tp + fn + 1e-06)
    specificity = float(tn) / (tn + fp + 1e-06)
    mcc = float(tp * tn - fp * fn) / (math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + 1e-06)
    f1 = float(tp * 2) / (tp * 2 + fp + fn + 1e-06)
    # Calculate FPR and TPR
    fpr = fp / (fp + tn)
    tpr = tp / (tp + fn)
    # Calculate AUPR
    aupr = None
    if pred_probas is not None:
        precision_vals, recall_vals, _ = precision_recall_curve(labels, pred_probas)
        aupr = auc(recall_vals, precision_vals)

    # roc_auc = auc(fpr, tpr)
    return acc, sensitivity, specificity, mcc, f1, aupr


**Load input data**

In [30]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

path = '/content/drive/MyDrive/Watashara_Projects/TIP/'


# Load the data

w2vec_all = pd.read_csv(path + 'features/word2vec_TIP_all.csv')
esm2_all = pd.read_csv(path + 'features/Esm2_TIP_all.csv', header=None).iloc[:, :]
prot_t5_bfd = pd.read_csv(path + 'features/TIP_al_prot_t5_xl_bfd.csv', header=None).iloc[1:, 1:]
prot_t5_unief50 = pd.read_csv(path + 'features/TIP_all_prot_t5_xl_uniref50.csv', header=None).iloc[1:, 1:]
reducedACID_all = pd.read_csv(path + 'features/reducedACID_all.csv', header=None).iloc[1:, 1:]
reducedCHARGE_all = pd.read_csv(path + 'features/reducedCHARGE_all.csv', header=None).iloc[1:, 1:]
AAC_all = pd.read_csv(path + 'features/AAC_all.csv', header=None).iloc[1:, 1:]
PAAC_all = pd.read_csv(path + 'features/PAAC_all.csv', header=None).iloc[1:, 1:]

BiGRU_all = pd.read_csv(path + 'features/BiGRU_all.csv', header=None).iloc[:, :]
BiLSTM_all = pd.read_csv(path + 'features/BiLSTM_all.csv', header=None).iloc[:, :]

# esm1_2_w2vec = np.column_stack((data_, esm2, esmv1))
seq_feat = np.column_stack((reducedACID_all,reducedCHARGE_all, AAC_all, PAAC_all))
data_np_all = np.array(seq_feat)

seq_feat = np.column_stack((reducedACID_all,reducedCHARGE_all, AAC_all, PAAC_all))

set1_feat = np.column_stack((seq_feat,esm2_all,prot_t5_bfd,prot_t5_unief50,w2vec_all))
set2_feat = np.column_stack((w2vec_all,esm2_all,prot_t5_bfd,prot_t5_unief50,BiGRU_all, BiLSTM_all))
set3_feat = np.column_stack((w2vec_all, seq_feat,esm2_all,prot_t5_bfd,prot_t5_unief50))
# pd.DataFrame(set2_feat).to_csv('w2v_esm2_t5_bfd_unief50_BiGUR_BiLSTM.csv')
print(np.shape(w2vec_all))
print(np.shape(esm2_all))
print(np.shape(prot_t5_bfd))
print(np.shape(prot_t5_unief50))
print(np.shape(BiGRU_all))
print(np.shape(BiLSTM_all))
print(np.shape(seq_feat))
print(np.shape(AAC_all))
print(np.shape(reducedCHARGE_all))
print(np.shape(reducedACID_all))
print(np.shape(PAAC_all)))

set1 = np.array(set1_feat)
set2 = np.array(set2_feat)
set3 = np.array(set3_feat)


(708, 120)
(708, 320)
(708, 1024)
(708, 1024)
(708, 2048)
(708, 2048)
(708, 105)
(708, 20)
(708, 32)
(708, 32)
(708, 21)


**LSTM_Selfattention**

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau,ModelCheckpoint
from keras.layers import BatchNormalization
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from keras.regularizers import l2
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dropout, BatchNormalization, Flatten, Dense, Bidirectional
import os
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer

class SelfAttention(Layer):
    def __init__(self, attention_width=50, attention_type=2, **kwargs):
        super(SelfAttention, self).__init__(**kwargs)
        self.attention_width = attention_width
        self.attention_type = attention_type

    def build(self, input_shape):
        # Ensure that input_shape[-1] (features) is correct
        feature_dim = input_shape[-1]
        self.W = self.add_weight(name="att_weight", shape=(feature_dim, self.attention_width),
                                 initializer="glorot_uniform", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(self.attention_width,),
                                 initializer="zeros", trainable=True)
        self.u = self.add_weight(name="att_u", shape=(self.attention_width,),
                                 initializer="glorot_uniform", trainable=True)
        super(SelfAttention, self).build(input_shape)

    def call(self, inputs):
        # Ensure inputs are reshaped properly
        v = tf.nn.tanh(tf.matmul(inputs, self.W) + self.b)
        vu = tf.matmul(v, tf.expand_dims(self.u, -1))
        alphas = tf.nn.softmax(tf.squeeze(vu, -1), axis=1)
        alphas = tf.expand_dims(alphas, -1)
        output = tf.reduce_sum(inputs * alphas, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        config = super(SelfAttention, self).get_config()
        config.update({
            "attention_width": self.attention_width,
            "attention_type": self.attention_type,
        })
        return config


# Update the model architecture
model = Sequential()
model.add(Bidirectional(LSTM(132, return_sequences=True, recurrent_dropout=0.2))) # Add recurrent dropout
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(LSTM(32, return_sequences=True, recurrent_dropout=0.2))  # Add recurrent dropout
model.add(Dropout(0.5))
model.add(BatchNormalization())  # Add Batch Normalization
model.add(SelfAttention(attention_width=50, attention_type=2))  # Use SelfAttention layer
model.add(Flatten())

# def get_CNN_model(input_dim, out_dim):
#     model.add(Dense(int(input_dim / 2), activation='gelu', kernel_regularizer='l2'))  # Add L2 regularization
#     model.add(Dense(out_dim, activation='relu', name="Dense_2"))
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model

model.add(Dense(2, activation='sigmoid', name="Dense_2"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Introduce callbacks for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)

path = '/content/drive/MyDrive/Watashara_Projects/TIP/'

# data_ = pd.read_csv(path + 'BiGRU_all.csv',header=None)
data = np.array(PAAC_all)
label1 = np.ones((int(206), 1))
label2 = np.zeros((int(502), 1))
label = np.append(label1, label2)
scale_data = scale(data[:,:])

folder= 'PAAC_set2'
# Define the directory to save the best models
model_save_dir = path + f'Results/{folder}/Models/'
# Ensure the save directory exists
os.makedirs(model_save_dir, exist_ok=True)

# from imblearn.over_sampling import SMOTE, BorderlineSMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(scale_data, label)

# # y = labels
X_train, X_ind, y_train, y_ind = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert numpy arrays to DataFrames
X_train_df = pd.DataFrame(X_train)
X_ind_df = pd.DataFrame(X_ind)
y_train_df = pd.DataFrame(y_train)
y_ind_df = pd.DataFrame(y_ind)

# Save the training and test data
X_train_data = pd.concat([X_train_df, y_train_df], axis=1)
X_train_data.to_csv(path+f'Results/{folder}/XtrainData.csv', index=False)

X_test_data = pd.concat([X_ind_df, y_ind_df], axis=1)
X_test_data.to_csv(path+f'Results/{folder}/XtestData.csv', index=False)


# train_all_feat = pd.read_csv(path + 'Lasso_XtrainData.csv')

# X_train = train_all_feat.iloc[:, :-1].values
# y_train = train_all_feat.iloc[:, -1].values

y = y_train
X = np.reshape(X_train, (-1, 1, X_train.shape[1]))

sepscores = []
ytest = np.ones((1, 2)) * 0.5
yscore = np.ones((1, 2)) * 0.5

skf = StratifiedKFold(n_splits=10)

for fold, (train, test) in enumerate(skf.split(X, y_train)):
    y_train = to_categorical(y[train])
    cv_clf = model
    # Model checkpoint to save the best model for each fold
    model_checkpoint = ModelCheckpoint(
        model_save_dir + f'best_model_fold_{fold + 1}.keras',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,
        mode='auto')
    hist = cv_clf.fit(X[train],
                      y_train,
                      epochs=50,  # Increase the number of epochs
                      validation_split=0.1,  # Use part of the training data for validation
                      callbacks=[early_stopping, reduce_lr, model_checkpoint])  # Use callbacks, model_checkpoint

    y_score = cv_clf.predict(X[test])
    y_class = categorical_probas_to_classes(y_score)

    y_test = to_categorical(y[test])
    ytest = np.vstack((ytest, y_test))
    y_test_tmp = y[test]
    yscore = np.vstack((yscore, y_score))

    acc, sensitivity, specificity, mcc, f1, aupr = calculate_performance(
        len(y_class), y_class, y_test_tmp, pred_probas=y_score[:, 1]
    )

    fpr, tpr, _ = roc_curve(y_test[:, 1], y_score[:, 1])
    roc_auc = auc(fpr, tpr)
    precision, recall, _ = precision_recall_curve(y_test[:, 1], y_score[:, 1])
    pr_auc = auc(recall, precision)
    sepscores.append([acc, sensitivity, specificity, mcc, f1, roc_auc, pr_auc])

scores = np.array(sepscores)
result1 = np.mean(scores, axis=0)
H1 = result1.tolist()
sepscores.append(H1)
result = sepscores

row = yscore.shape[0]
yscore = yscore[np.array(range(1, row)), :]
yscore_sum = pd.DataFrame(data=yscore)
yscore_sum.to_csv(path + f'Results/{folder}/Set2_yscore.csv')

ytest = ytest[np.array(range(1, row)), :]
ytest_sum = pd.DataFrame(data=ytest)
ytest_sum.to_csv(path + f'Results/{folder}/Set2_ytest.csv')

data_csv = pd.DataFrame(data=result)
data_csv.to_csv(path + f'Results/{folder}/Set2_CV.csv')


Epoch 1/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 45ms/step - accuracy: 0.6070 - loss: 0.7508 - val_accuracy: 0.6438 - val_loss: 0.6531 - learning_rate: 0.0010
Epoch 2/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7945 - loss: 0.4589 - val_accuracy: 0.6575 - val_loss: 0.6345 - learning_rate: 0.0010
Epoch 3/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8081 - loss: 0.4240 - val_accuracy: 0.6438 - val_loss: 0.6305 - learning_rate: 0.0010
Epoch 4/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8554 - loss: 0.3405 - val_accuracy: 0.6575 - val_loss: 0.6237 - learning_rate: 0.0010
Epoch 5/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.8792 - loss: 0.3286 - val_accuracy: 0.6438 - val_loss: 0.6181 - learning_rate: 0.0010
Epoch 6/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 

**CNN-LSTM-SelfAttention indpendent test**

In [27]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Layer
from tensorflow.keras.saving import register_keras_serializable
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import roc_curve, auc, precision_recall_curve


# test_all_feat = pd.read_csv(path + 'Results/EN_set2/Lasso_XtestData.csv')

# X_ind = test_all_feat.iloc[:, :-1].values
# y_ind = test_all_feat.iloc[:, -1].values

# Define the directory where the models are saved
model_save_dir = f'/content/drive/MyDrive/Watashara_Projects/TIP/Results/{folder}/Models/'

# List of model filenames
model_files = [f'best_model_fold_{i + 1}.keras' for i in range(10)]

# Initialize lists to store metrics and predictions
all_metrics = []
fold_metrics = []
y_score_all_folds = []
y_test_all_folds = []

# Load the models with custom objects
for model_file in model_files:
    file_path = os.path.join(model_save_dir, model_file)

    if not os.path.isfile(file_path):
        print(f"Model file not found: {file_path}")
        continue

    try:
        model = load_model(file_path, custom_objects={'SelfAttention': SelfAttention})
        print(f"Loaded model from {file_path}")
    except Exception as e:
        print(f"Error loading model from {file_path}: {e}")
        continue

    # Assuming X_ind and y_ind are your independent test data and labels
    X_ind_reshaped = np.reshape(X_ind, (-1, 1, X_ind.shape[1]))
    y_ind_categorical = to_categorical(y_ind)

    # Predict and evaluate
    y_score = model.predict(X_ind_reshaped)
    y_class = categorical_probas_to_classes(y_score)

    # Save the predicted scores (y_score) and actual test labels (y_ind)
    y_score_all_folds.append(y_score)
    y_test_all_folds.append(y_ind)

    # Calculate performance metrics
    acc, sensitivity, specificity, mcc, f1, aupr = calculate_performance(
        len(y_ind), y_class, y_ind, pred_probas=y_score[:, 1]
    )
    fpr, tpr, _ = roc_curve(y_class, y_score[:, 1])
    roc_auc = auc(fpr, tpr)
    precision, recall, _ = precision_recall_curve(y_ind, y_score[:, 1])
    pr_auc = auc(recall, precision)

    metrics = {
        'Model': model_file,
        'Accuracy': acc,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'MCC': mcc,
        'F1': f1,
        'AUC': roc_auc,
        'AUPR': pr_auc
    }

    fold_metrics.append(metrics)

    print(f"Results for {model_file}:")
    print(f"Accuracy: {acc}")
    print(f"Sensitivity: {sensitivity}")
    print(f"Specificity: {specificity}")
    print(f"MCC: {mcc}")
    print(f"F1: {f1}")
    print(f"pr_auc: {pr_auc}")
    print(f"AUC: {roc_auc}")

# Convert fold metrics to DataFrame and save to CSV
fold_metrics_df = pd.DataFrame(fold_metrics)
fold_metrics_df.to_csv(os.path.join(model_save_dir, 'Fold_Metrics.csv'), index=False)

# Save the predicted scores and actual test labels to CSV
# Flatten y_score and y_test to ensure they are saved correctly
y_score_flat = np.concatenate(y_score_all_folds, axis=0)
y_test_flat = np.concatenate(y_test_all_folds, axis=0)

# Convert to DataFrames
y_score_df = pd.DataFrame(y_score_flat, columns=[f'Class_{i}' for i in range(y_score_flat.shape[1])])
y_test_df = pd.DataFrame(y_test_flat, columns=['True_Label'])

# Save to CSV
y_score_df.to_csv(os.path.join(model_save_dir, 'y_score_all_folds.csv'), index=False)
y_test_df.to_csv(os.path.join(model_save_dir, 'y_test_all_folds.csv'), index=False)

# Average the metrics across all folds (excluding non-numeric columns)
numeric_metrics_df = fold_metrics_df.drop(columns=['Model'])
mean_metrics = numeric_metrics_df.mean()
mean_metrics_df = pd.DataFrame([mean_metrics], columns=mean_metrics.index)
mean_metrics_df.to_csv(os.path.join(model_save_dir, 'Average_Metrics.csv'), index=False)

print("Results and predictions saved to CSV files.")


Loaded model from /content/drive/MyDrive/Watashara_Projects/TIP/Results/PAAC_set2/Models/best_model_fold_1.keras
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/step
Results for best_model_fold_1.keras:
Accuracy: 0.8606965174129353
Sensitivity: 0.8478260777410209
Specificity: 0.8715596250315631
MCC: 0.7193857199123069
F1: 0.8478260823487713
pr_auc: 0.8650192245530147
AUC: 1.0
Loaded model from /content/drive/MyDrive/Watashara_Projects/TIP/Results/PAAC_set2/Models/best_model_fold_2.keras
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step
Results for best_model_fold_2.keras:
Accuracy: 0.8656716417910447
Sensitivity: 0.8478260777410209
Specificity: 0.8807339368740006
MCC: 0.7292147866045113
F1: 0.8524590117351967
pr_auc: 0.8656815779301517
AUC: 0.9998001998001999
Loaded model from /content/drive/MyDrive/Watashara_Projects/TIP/Results/PAAC_set2/Models/best_model_fold_3.keras
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step
Resu

In [None]:
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE

path = '/content/drive/MyDrive/Watashara_Projects/TIP/'
# Define the directory to save the best models


data_ = pd.read_csv(path + 'update_LASSO_selected_feature_values_SET2.csv')
data = np.array(data_)
label1 = np.ones((int(206), 1))
label2 = np.zeros((int(502), 1))
label = np.append(label1, label2)
scale_data = scale(data[:,:])


# from imblearn.over_sampling import SMOTE, BorderlineSMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(scale_data, label)

# # y = labels
X_train, X_ind, y_train, y_ind = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert numpy arrays to DataFrames
X_train_df = pd.DataFrame(X_train)
X_ind_df = pd.DataFrame(X_ind)
y_train_df = pd.DataFrame(y_train)
y_ind_df = pd.DataFrame(y_ind)

# Save the training and test data
X_train_data = pd.concat([X_train_df, y_train_df], axis=1)
X_train_data.to_csv(path + '1_XtrainData.csv', index=False)

X_test_data = pd.concat([X_ind_df, y_ind_df], axis=1)
X_test_data.to_csv(path + '1_XtestData.csv', index=False)