In [None]:
import os
import numpy as np
import pandas as pd
import torch
import tensorflow as tf
import joblib
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
from concurrent.futures import ThreadPoolExecutor
import hashlib
# ✅ GPU setup: set the TF_GPU_ALLOCATOR environment variable to
# "cuda_malloc_async" and pick the PyTorch device (CUDA when available,
# otherwise CPU).
os.environ.update({"TF_GPU_ALLOCATOR": "cuda_malloc_async"})
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)
# ✅ Load the ProtBERT tokenizer and encoder; the encoder is moved to `device`.
protbert_tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert")
protbert_model = AutoModel.from_pretrained("Rostlab/prot_bert")
protbert_model = protbert_model.to(device)
# ✅ In-memory memo of embeddings that were already computed, keyed by the
# hash of the sequence, so repeated sequences are embedded only once.
embedding_cache = dict()

def hash_sequence(seq):
    """Return the hexadecimal MD5 digest of *seq*.

    The text is encoded with the ``str.encode()`` default (UTF-8) before
    hashing. The digest is used as a cache key, not as a security measure.
    """
    hasher = hashlib.md5()
    hasher.update(seq.encode())
    return hasher.hexdigest()
# ✅ Load Data: read every .csv file in the folder; the file name (without extension) is the label
def load_data(base_folder):
    """Collect sequences and labels from every ``.csv`` file in *base_folder*.

    Each file's name without the ``.csv`` extension is used as the label for
    all non-null values found in that file's ``Sequence`` column.

    Args:
        base_folder: Directory containing one CSV file per category.

    Returns:
        A ``(sequences, labels)`` tuple of two parallel lists.

    Raises:
        KeyError: If a CSV file has no ``Sequence`` column.
    """
    sequences, labels = [], []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order (and any seeded split built on it) reproducible.
    for filename in sorted(os.listdir(base_folder)):
        if not filename.endswith(".csv"):
            continue
        # Strip only the trailing extension; str.replace would also remove a
        # ".csv" that happens to occur in the middle of the file name.
        label = os.path.splitext(filename)[0]
        df = pd.read_csv(os.path.join(base_folder, filename))
        file_sequences = df["Sequence"].dropna().tolist()
        sequences.extend(file_sequences)
        labels.extend([label] * len(file_sequences))
    return sequences, labels
# ✅ Parallel feature extraction: one mean-pooled ProtBERT embedding per sequence, with caching
def extract_protbert_features(sequences):
    """Embed each sequence with ProtBERT and return one vector per sequence.

    Every sequence is split into space-separated residues, tokenised
    (truncated to at most 512 tokens), run through ``protbert_model`` without
    gradients, and reduced to a single vector by averaging the last hidden
    state over the token dimension. Results are memoised in
    ``embedding_cache`` under ``hash_sequence(seq)``.

    Args:
        sequences: Sized collection of amino-acid strings.

    Returns:
        A ``float32`` array with one row per input sequence. For empty input
        the result has shape ``(0, hidden_size)`` so downstream transformers
        still receive a 2-D array.
    """
    # The ProtBERT model card maps the rare residues U, Z, O and B to X
    # before tokenisation; follow the same preprocessing here.
    rare_to_unknown = str.maketrans("UZOB", "XXXX")

    def process_sequence(seq):
        seq_hash = hash_sequence(seq)
        cached = embedding_cache.get(seq_hash)
        if cached is not None:
            return cached

        spaced = " ".join(seq.translate(rare_to_unknown))
        # Calling the tokenizer directly is the supported replacement for the
        # legacy batch_encode_plus entry point.
        encoded = protbert_tokenizer(
            [spaced], padding=True, truncation=True, max_length=512, return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            hidden = protbert_model(**encoded).last_hidden_state
        output = hidden.mean(dim=1).cpu().numpy().flatten()

        embedding_cache[seq_hash] = output
        return output

    if len(sequences) == 0:
        return np.empty((0, protbert_model.config.hidden_size), dtype=np.float32)

    with ThreadPoolExecutor() as executor:
        embeddings = list(
            tqdm(
                executor.map(process_sequence, sequences),
                total=len(sequences),
                desc="Extracting ProtBERT Features",
            )
        )

    return np.array(embeddings, dtype=np.float32)
# ✅ Load & Preprocess Data
# os.path.join keeps the dataset path portable across operating systems.
base_folder = os.path.join("data", "Therapeutic Category Classification")
sequences, labels = load_data(base_folder)

protbert_features = extract_protbert_features(sequences)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
joblib.dump(label_encoder, "label_encoder.pkl")

# Split row indices BEFORE fitting any preprocessing so that the scaler and
# PCA statistics come from the training rows only (no test-set leakage).
# Splitting indices with the same seed yields the same partition as splitting
# the feature matrix directly.
sample_indices = np.arange(len(y))
train_idx, test_idx = train_test_split(sample_indices, test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(protbert_features[train_idx])
X = scaler.transform(protbert_features)
joblib.dump(scaler, "scaler.pkl")

# ✅ Dimensionality reduction using PCA (at most 50 principal components)
# PCA cannot extract more components than training rows or input features.
n_components = min(50, len(train_idx), X.shape[1])
# A fixed random_state keeps the fit reproducible if a randomized solver is used.
pca = PCA(n_components=n_components, random_state=42)
pca.fit(X[train_idx])
X_pca = pca.transform(X)
joblib.dump(pca, "pca_model.pkl")

X_train, X_test = X_pca[train_idx], X_pca[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
# Add a trailing channel axis: (samples, components) -> (samples, components, 1).
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
# ✅ CNN-LSTM classifier: Conv1D -> MaxPooling1D -> LSTM -> Dropout -> softmax
num_classes = len(set(labels))
# X_train already carries a trailing channel axis, so this is (components, 1).
input_shape = (X_train.shape[1], 1)

with tf.device('/GPU:0'):
    model = Sequential()
    model.add(Conv1D(32, 3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(2))
    model.add(LSTM(64))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    # NOTE: the held-out test split doubles as the validation data here.
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=40,
        batch_size=32,
    )

model.save("Therapeutic Peptide Classification.h5")
# ✅ Accuracy evaluation on the held-out split
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
# ✅ Confidence threshold: predictions whose top probability is too low are reported as uncertain
def confidence_adjustment(prediction, threshold=0.7):
    """Turn the class probabilities of ONE sample into a category name.

    Args:
        prediction: Array-like of class probabilities for a single sample,
            e.g. the ``(1, num_classes)`` output of ``model.predict``. It is
            flattened before use, so it must not contain several samples.
        threshold: Minimum top probability required to accept the predicted
            class. Defaults to 0.7.

    Returns:
        The label decoded by ``label_encoder`` for the most probable class,
        or ``"Uncertain Category"`` when the top probability is below
        *threshold*.
    """
    probabilities = np.asarray(prediction).ravel()
    best_index = int(np.argmax(probabilities))
    if probabilities[best_index] < threshold:
        return "Uncertain Category"
    return label_encoder.inverse_transform([best_index])[0]
# ✅ Classify User-Entered Sequence
def classify_peptide(sequence, model=None):
    """Predict the therapeutic category of a single peptide sequence.

    The sequence is embedded with ``extract_protbert_features``, passed
    through the fitted ``scaler`` and ``pca``, given a trailing channel axis,
    and classified; the result is post-processed by
    ``confidence_adjustment``.

    Args:
        sequence: Amino-acid string entered by the user. Whitespace is
            removed and letters are upper-cased before embedding.
        model: Optional already-loaded Keras model. When omitted, the model
            is loaded from "Therapeutic Peptide Classification.h5"; pass a
            loaded model to avoid re-reading the file on every call.

    Returns:
        The predicted category name, or "Uncertain Category" when the model
        is not confident enough.

    Raises:
        TypeError: If *sequence* is not a string.
        ValueError: If *sequence* contains no residues.
    """
    if not isinstance(sequence, str):
        raise TypeError("sequence must be a string of amino-acid letters")
    # ProtBERT expects upper-case residue letters; user input may contain
    # stray spaces/newlines or lower-case characters.
    cleaned = "".join(sequence.split()).upper()
    if not cleaned:
        raise ValueError("sequence must contain at least one residue")

    features = extract_protbert_features([cleaned])
    features = scaler.transform(features)
    features = pca.transform(features)
    features = np.expand_dims(features, axis=-1)

    if model is None:
        model = load_model("Therapeutic Peptide Classification.h5")
    prediction = model.predict(features)
    return confidence_adjustment(prediction)
# ✅ Extract Accuracy & Loss from Model History
def plot_training_history(history):
    """Show training vs. validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch lists ``accuracy``, ``val_accuracy``, ``loss`` and
            ``val_loss`` (as returned by ``model.fit``).
    """
    metrics = history.history
    epochs = range(1, len(metrics['accuracy']) + 1)

    # (subplot position, train key, validation key, train label,
    #  validation label, y-axis label, title) for each panel.
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Training Accuracy',
         'Validation Accuracy', 'Accuracy', 'Training & Validation Accuracy'),
        (2, 'loss', 'val_loss', 'Training Loss',
         'Validation Loss', 'Loss', 'Training & Validation Loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, train_key, val_key, train_label, val_label, y_label, title in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, metrics[train_key], 'bo-', label=train_label)
        plt.plot(epochs, metrics[val_key], 'r*-', label=val_label)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()
# ✅ Plot the accuracy/loss curves recorded in `history` by model.fit
plot_training_history(history)