In [None]:
import numpy as np
import torch
import tensorflow as tf
import time
from Bio import SeqIO
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef, confusion_matrix
from transformers import AutoModel, AutoTokenizer

# Pre-trained protein language model used as a fixed feature extractor.
# Both the tokenizer and the encoder weights are resolved from the same
# Hugging Face checkpoint identifier so they always stay in sync.
MODEL_NAME = "facebook/esm2_t6_8M_UR50D"  # small ESM-2 checkpoint
embedding_model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def load_fasta(file_path, label):
    """Read every record of a FASTA file and tag each one with ``label``.

    Args:
        file_path: Path (or handle) handed to ``SeqIO.parse`` with the
            ``"fasta"`` format.
        label: Value repeated once per record in the returned label list.

    Returns:
        A ``(sequences, labels)`` tuple of two equally long lists:
        ``sequences`` holds ``str(record.seq)`` for each record in file
        order, ``labels`` holds ``label`` once per sequence.
    """
    sequences = [str(entry.seq) for entry in SeqIO.parse(file_path, "fasta")]
    labels = [label] * len(sequences)
    return sequences, labels


def get_protein_embedding(sequence):
    """Embed one protein sequence as a fixed-length vector.

    The sequence is tokenized with the module-level ``tokenizer``, passed
    through the module-level ``embedding_model`` with gradients disabled,
    and the ``last_hidden_state`` is averaged over dimension 1. Every
    position the tokenizer emits (including any special tokens it adds)
    contributes to that average.

    Args:
        sequence: Amino-acid sequence string passed to the tokenizer.

    Returns:
        A NumPy array holding the mean-pooled hidden state with
        singleton dimensions squeezed away.
    """
    encoded = tokenizer(
        sequence,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        hidden_states = embedding_model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()


# FASTA inputs: one file per (split, class) combination.
train_pos_file, train_neg_file = "train_positive.fasta", "train_negative.fasta"
test_pos_file, test_neg_file = "test_positive.fasta", "test_negative.fasta"

# Positives are labelled 1, negatives 0.
train_pos_sequences, train_pos_labels = load_fasta(train_pos_file, 1)
train_neg_sequences, train_neg_labels = load_fasta(train_neg_file, 0)
test_pos_sequences, test_pos_labels = load_fasta(test_pos_file, 1)
test_neg_sequences, test_neg_labels = load_fasta(test_neg_file, 0)

# Concatenate the two classes of each split, positives first, keeping the
# sequences and their labels aligned.
train_sequences = [*train_pos_sequences, *train_neg_sequences]
train_labels = np.array([*train_pos_labels, *train_neg_labels])
test_sequences = [*test_pos_sequences, *test_neg_sequences]
test_labels = np.array([*test_pos_labels, *test_neg_labels])

# One embedding vector per sequence, stacked into a 2-D array per split.
train_embeddings = np.array(list(map(get_protein_embedding, train_sequences)))
test_embeddings = np.array(list(map(get_protein_embedding, test_sequences)))

# Hold out 20% of the training embeddings for validation (fixed seed);
# the test split is used as-is.
X_train, X_val, y_train, y_val = train_test_split(
    train_embeddings,
    train_labels,
    test_size=0.2,
    random_state=42,
)
X_test, y_test = test_embeddings, test_labels

# ---------------------- BiLSTM with Attention ---------------------- #
def build_bilstm_attention_model(input_shape):
    """Assemble a BiLSTM + self-attention binary classifier.

    Layer stack: bidirectional LSTM (128 units per direction, full
    sequence output) -> ``Attention`` with the LSTM output used as both
    query and value -> global average pooling -> Dense(128, relu) ->
    Dropout(0.4) -> Dense(1, sigmoid).

    Args:
        input_shape: Per-sample shape given to ``tf.keras.Input``.

    Returns:
        An uncompiled ``tf.keras.Model`` with a single sigmoid output.
    """
    layers = tf.keras.layers

    model_input = tf.keras.Input(shape=input_shape)

    bilstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))
    sequence_features = bilstm(model_input)

    # Self-attention: the recurrent features attend over themselves.
    attended = layers.Attention()([sequence_features, sequence_features])

    pooled = layers.GlobalAveragePooling1D()(attended)
    hidden = layers.Dense(128, activation="relu")(pooled)
    regularised = layers.Dropout(0.4)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(regularised)

    return tf.keras.Model(model_input, probability)

# ---------------------- CNN-BiLSTM Hybrid ---------------------- #
def build_cnn_bilstm_model(input_shape):
    """Assemble a Conv1D + BiLSTM binary classifier.

    Layer stack: Conv1D (64 filters, kernel 3, relu, ``same`` padding) ->
    MaxPooling1D (pool 2) -> bidirectional LSTM (128 units per direction,
    full sequence output) -> global average pooling -> Dense(128, relu)
    -> Dropout(0.4) -> Dense(1, sigmoid).

    Args:
        input_shape: Per-sample shape given to ``tf.keras.Input``.

    Returns:
        An uncompiled ``tf.keras.Model`` with a single sigmoid output.
    """
    layers = tf.keras.layers

    model_input = tf.keras.Input(shape=input_shape)

    # Convolutional front end followed by 2x temporal down-sampling.
    conv_features = layers.Conv1D(
        64, kernel_size=3, activation="relu", padding="same"
    )(model_input)
    downsampled = layers.MaxPooling1D(pool_size=2)(conv_features)

    bilstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))
    sequence_features = bilstm(downsampled)

    pooled = layers.GlobalAveragePooling1D()(sequence_features)
    hidden = layers.Dense(128, activation="relu")(pooled)
    regularised = layers.Dropout(0.4)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(regularised)

    return tf.keras.Model(model_input, probability)

# ---------------------- Transformer-based Model ---------------------- #
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.4):
    """Apply one attention + pointwise-convolution encoder stage.

    Steps: multi-head self-attention over ``inputs`` -> residual add with
    ``inputs`` -> layer normalisation -> kernel-size-1 ``Conv1D`` with
    ``ff_dim`` relu filters -> layer normalisation.

    NOTE: ``LayerNormalization`` normalises over the last axis by default,
    so ``inputs`` needs more than one feature on that axis; with a single
    feature the normalised value is always zero and only the layer's
    learned offset remains.

    Args:
        inputs: Symbolic Keras tensor used as both query and value.
        head_size: ``key_dim`` of each attention head.
        num_heads: Number of attention heads.
        ff_dim: Number of filters of the pointwise convolution; this is
            the size of the returned tensor's last axis.
        dropout: Dropout rate passed to ``MultiHeadAttention`` only.

    Returns:
        The normalised output tensor of the convolution.
    """
    layers = tf.keras.layers

    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=head_size,
        dropout=dropout,
    )
    attended = self_attention(inputs, inputs)

    # Residual connection around the attention, then normalise.
    post_attention_norm = layers.LayerNormalization(epsilon=1e-6)
    normalised = post_attention_norm(attended + inputs)

    # Position-wise transformation implemented as a 1x1 convolution.
    pointwise = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")
    transformed = pointwise(normalised)

    return layers.LayerNormalization(epsilon=1e-6)(transformed)


def build_transformer_model(input_shape, embed_dim=64):
    """Assemble a single-block Transformer binary classifier.

    Layer stack: Dense projection to ``embed_dim`` features per position
    -> ``transformer_encoder`` (4 heads, key size 256, ``ff_dim`` 4) ->
    global average pooling -> Dense(128, relu) -> Dropout(0.4) ->
    Dense(1, sigmoid).

    The projection is required for correctness, not just capacity: the
    encoder applies ``LayerNormalization`` over the last axis, and this
    script feeds inputs of shape ``(length, 1)``. Normalising an axis of
    size 1 yields zero variance, so the layer would emit only its learned
    offset and the network's prediction would no longer depend on the
    input at all.

    Args:
        input_shape: Per-sample shape given to ``tf.keras.Input``.
        embed_dim: Number of features each position is projected to
            before entering the encoder. Must be greater than 1.

    Returns:
        An uncompiled ``tf.keras.Model`` with a single sigmoid output.

    Raises:
        ValueError: If ``embed_dim`` is not greater than 1.
    """
    if embed_dim <= 1:
        raise ValueError("embed_dim must be greater than 1")

    inputs = tf.keras.Input(shape=input_shape)
    # Dense acts on the last axis, lifting every position to embed_dim
    # features so layer normalisation has something to normalise.
    x = tf.keras.layers.Dense(embed_dim)(inputs)
    x = transformer_encoder(x, head_size=256, num_heads=4, ff_dim=4)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    return tf.keras.Model(inputs, outputs)

# ---------------------- Model Selection & Training ---------------------- #
# Every model treats an embedding vector as a sequence of scalar steps:
# (embedding_dim,) -> (embedding_dim, 1).
input_shape = (train_embeddings.shape[1], 1)

# Add the trailing channel axis explicitly so the arrays match the declared
# model input shape instead of relying on Keras' implicit rank adjustment.
# New names are used so the original 2-D splits stay available unchanged.
X_train_seq, X_val_seq, X_test_seq = (
    np.asarray(split)[..., np.newaxis] for split in (X_train, X_val, X_test)
)

models = {
    "BiLSTM_Attention": build_bilstm_attention_model(input_shape),
    "CNN_BiLSTM": build_cnn_bilstm_model(input_shape),
    "Transformer": build_transformer_model(input_shape)
}

# Train each architecture on the same split and report test-set metrics.
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # Only the fit() call is timed; prediction and scoring are excluded.
    start_time = time.time()
    model.fit(
        X_train_seq,
        y_train,
        validation_data=(X_val_seq, y_val),
        epochs=10,
        batch_size=32,
        verbose=1,
    )
    training_time = time.time() - start_time

    # predict() returns an (n_samples, 1) column; flatten it so the metric
    # functions receive 1-D arrays shaped like y_test.
    y_pred_probs = model.predict(X_test_seq).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # AUC is computed from the probabilities, not the thresholded labels.
    auc = roc_auc_score(y_test, y_pred_probs)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Kappa: {kappa:.4f}, MCC: {mcc:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Execution Time: {training_time:.2f} seconds\n")
