<a href="https://colab.research.google.com/github/robinrb7/MolGen/blob/main/MolGen2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
import shap
import time
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, roc_auc_score, f1_score,
                             precision_recall_curve, confusion_matrix,
                             classification_report, roc_curve, auc)

# Wall-clock reference for the elapsed-time figure printed after training.
start_time = time.time()

# Read the CSV, shuffle the rows reproducibly, and discard the identifier column.
df = (
    pd.read_csv("/content/drive/MyDrive/1144.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(columns=["Sequence ID"])
)

# Add standardised copies of two descriptors under new column names; the raw
# columns are excluded from the feature matrix below. The single scaler is
# refit for each column, so afterwards it only holds the second column's fit.
scaler = StandardScaler()
for scaled_name, raw_name in (
    ("Z Scale Electronic", "Z-Scale Electronic"),
    ("Disulfide Bond Estimate", "Disulfide Bond Estimation"),
):
    df[scaled_name] = scaler.fit_transform(df[[raw_name]])

# Columns kept out of the feature matrix: the target, plus descriptors that
# were deselected by hand (including the two unscaled originals).
excluded_columns = [
    "Label", "Molecular Weight",
    'AAC_A', 'AAC_D', 'AAC_F', 'AAC_G', 'AAC_H',
    'Tiny Residue %', 'Large Residue %', 'Nonpolar Residue %',
    'Fraction Small Hydrophobic', 'Bulkiness', 'Isoelectric Point',
    'Aliphatic Index', 'GRAVY (Hydrophobicity)', 'Fraction Large Hydrophobic',
    'Cysteine Cluster Ratio', 'Alpha-Helix %', 'Beta-Sheet %', 'Coil %',
    'Z-Scale Electronic', 'Disulfide Bond Estimation',
]
X = df.drop(columns=excluded_columns)
y = df["Label"]

# Stratified 70/15/15 split: hold out 15% for testing, then take 17.65% of
# the remaining 85% (about 15% of the total) for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, stratify=y_train_val, random_state=42
)

# Native XGBoost containers for each partition.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

def objective(trial):
    """Optuna objective: 5-fold cross-validated AUC of one XGBoost configuration.

    Samples booster hyperparameters and a boosting-round count from ``trial``,
    runs ``xgb.cv`` on ``dtrain`` for exactly that many rounds, and records the
    mean train/validation AUC of the final round as trial user attributes
    (``train_auc`` / ``valid_auc``).

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean validation-fold AUC after ``num_boost_round`` rounds.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": "gbtree",
        "max_depth": trial.suggest_int("max_depth", 1, 2),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "gamma": trial.suggest_float("gamma", 0, 3),
        "min_child_weight": trial.suggest_int("min_child_weight", 10, 20),
        "subsample": trial.suggest_float("subsample", 0.5, 0.8),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.8),
        "reg_alpha": trial.suggest_float("reg_alpha", 5, 50),
        "reg_lambda": trial.suggest_float("reg_lambda", 5, 50),
        "tree_method": "gpu_hist" if torch.cuda.is_available() else "hist"
    }

    num_boost_round = trial.suggest_int("num_boost_round", 200, 1000, step=100)

    # Use the sampled round count. Previously this was hard-coded to 2000, so
    # the "num_boost_round" reported in best_params never influenced the score.
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        nfold=5,
        stratified=True,
        metrics="auc",
        as_pandas=True,
        seed=42
    )

    # Score the configuration at its final round, i.e. the model that would
    # actually be trained with these parameters; train and validation AUC are
    # read from the same round so they are comparable.
    final_valid_auc = cv_results["test-auc-mean"].iloc[-1]
    final_train_auc = cv_results["train-auc-mean"].iloc[-1]

    # Keep both values on the trial so later cells can plot them.
    trial.set_user_attr("train_auc", final_train_auc)
    trial.set_user_attr("valid_auc", final_valid_auc)

    return final_valid_auc  # Optimize for validation AUC

# Run Optuna hyperparameter tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Best sampled values; kept unchanged under this name for later cells.
xg_params = study.best_params

# XGBClassifier has no "num_boost_round" argument; its equivalent is
# "n_estimators". Passing the key through unchanged left the classifier at
# its default number of trees, so map it explicitly on a copy.
classifier_params = dict(xg_params)
n_estimators = classifier_params.pop("num_boost_round")
xg_model = xgb.XGBClassifier(**classifier_params, n_estimators=n_estimators, eval_metric='logloss')

# Fit on the training split; the validation split is the only eval_set entry,
# so its log-loss history is stored under "validation_0".
xg_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Elapsed time since start_time was taken at the top of the cell; this spans
# everything executed in between, not only the final fit.
end_time = time.time()
training_time = end_time - start_time

# Test-set outputs: positive-class probabilities and hard class labels.
y_pred_proba = xg_model.predict_proba(X_test)[:, 1]
y_pred = xg_model.predict(X_test)

# Headline metrics.
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)

# Per-class figures from the classification report: precision/recall of
# class "1", and recall of class "0" (which is the specificity).
report = classification_report(y_test, y_pred, output_dict=True)
positive_stats = report["1"]
negative_stats = report["0"]
precision = positive_stats["precision"]
recall = positive_stats["recall"]
specificity = negative_stats["recall"]

# Summary printout.
print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ AUC Score: {roc_auc:.4f}")
print(f"✅ F1 Score: {f1:.4f}")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall (Sensitivity): {recall:.4f}")
print(f"✅ Specificity: {specificity:.4f}")
print(f"⏳ Training Time: {training_time:.2f} seconds")

# SHAP attributions on the test set, shown as a bar summary.
explainer = shap.Explainer(xg_model)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
import xgboost as xgb
# Plot importances of the classifier that was actually tuned and evaluated.
# The previous xgb.train(xg_params, ...) call built a separate 100-round
# booster with no "objective" set and a stray "num_boost_round" entry among
# its parameters, so the chart did not describe xg_model.
model = xg_model.get_booster()
xgb.plot_importance(model)

In [None]:
# Heatmap of the test-set confusion matrix (rows: actual, columns: predicted).
class_names = ["Negative", "Positive"]
plt.figure(figsize=(5, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, cv=5, scoring='accuracy'):
    """Draw mean +/- one standard deviation learning curves for an estimator.

    Calls scikit-learn's ``learning_curve`` at ten training-set fractions
    from 10% to 100% and plots the training score (red) and the held-out
    fold score (green, labelled "Test Score") against the training-set size.

    Args:
        estimator: Estimator passed through to ``learning_curve``.
        X: Feature matrix.
        y: Target vector.
        cv: Cross-validation setting passed through to ``learning_curve``.
        scoring: Scoring name; also used, title-cased, as the y-axis label.
    """
    sizes, fit_scores, fold_scores = learning_curve(
        estimator, X, y, cv=cv, scoring=scoring, train_sizes=np.linspace(0.1, 1.0, 10)
    )

    plt.figure(figsize=(8, 5))
    # Same drawing recipe for both curves: line through the per-size mean,
    # shaded band of one standard deviation across folds.
    for scores, colour, legend_text in (
        (fit_scores, 'red', 'Training Score'),
        (fold_scores, 'green', 'Test Score'),
    ):
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(sizes, centre, 'o-', color=colour, label=legend_text)
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    plt.xlabel("Training Examples")
    plt.ylabel(scoring.title())
    plt.title("Learning Curve")
    plt.legend()
    plt.grid()
    plt.show()

# Learning curve of the tuned XGBoost classifier on the training split.
plot_learning_curve(xg_model, X_train, y_train, cv=5, scoring='accuracy')

In [None]:
# One x position per Optuna trial; the dataframe is only used to count trials.
trials = study.trials_dataframe()
epochs = range(1, len(trials) + 1)

# Per-trial AUCs stored by the objective (None if a trial recorded nothing).
train_auc_scores = [t.user_attrs.get("train_auc") for t in study.trials]
valid_auc_scores = [t.user_attrs.get("valid_auc") for t in study.trials]

plt.figure(figsize=(6, 4))

# Validation is drawn first (orange), training second (blue).
for series, colour, legend_text in (
    (valid_auc_scores, "orange", "Validation AUC"),
    (train_auc_scores, "blue", "Training AUC"),
):
    plt.plot(epochs, series, marker="o", linestyle="-", color=colour, label=legend_text)

plt.title("Optuna Trials: Training vs Validation AUC")
plt.xlabel("Trials (Epochs)")
plt.ylabel("AUC Score")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Log-loss history recorded during xg_model.fit. The fit call passes a single
# eval_set entry, (X_val, y_val), so "validation_0" holds the VALIDATION loss;
# no training-set history is recorded.
evals_result = xg_model.evals_result()

plt.figure(figsize=(8, 5))

# Validation loss per boosting round (previously mislabelled as training loss).
plt.plot(evals_result["validation_0"]["logloss"], label="Validation Loss", linestyle="-", marker="o", color="red")

plt.xlabel("Epochs")
plt.ylabel("Log Loss")
plt.title("Validation Loss")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Curves to draw: (model_name, true_labels, predicted_probabilities).
models = [
    ("XGBoost", y_test, xg_model.predict_proba(X_test)[:, 1]),
    # Add other models here in the same format
]

plt.figure(figsize=(7, 5))

# Loop variables are deliberately named so they do not rebind the notebook
# globals `y_pred_proba` and `roc_auc`, which later cells still read.
for model_name, y_true, model_proba in models:
    fpr, tpr, _ = roc_curve(y_true, model_proba)
    model_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f"{model_name} (AUC = {model_auc:.4f})", linewidth=2)

# Shade the area under the last curve drawn above.
plt.fill_between(fpr, tpr, color="pink", alpha=0.3)

# Diagonal reference line (chance level).
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", alpha=0.7)

# Labels and title
plt.xlabel("False Positive Rate", fontsize=12)
plt.ylabel("True Positive Rate", fontsize=12)
plt.title("ROC Curve (TPR vs FPR)", fontsize=14)

# Hide the top/right axis spines and set the remaining two to width 1.
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_linewidth(1)
plt.gca().spines["bottom"].set_linewidth(1)

# Legend
plt.legend(loc="lower right", fontsize=10)
plt.show()

In [None]:
# Precision-recall trade-off on the test set, from the probabilities held in
# y_pred_proba; the thresholds array is not needed.
pr_points = precision_recall_curve(y_test, y_pred_proba)
precision_vals, recall_vals, _ = pr_points

plt.figure(figsize=(6, 4))
plt.plot(recall_vals, precision_vals, marker='.')
plt.title("Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid()
plt.show()

In [None]:
import joblib
import pandas as pd

# Persist the tuned classifier and read it back from the same path. The
# earlier second load from a hard-coded absolute path was redundant and only
# worked when the working directory happened to be /content.
model_path = "xg_model.pkl"
joblib.dump(xg_model, model_path)
print("✅ Model saved successfully!")

loaded_model = joblib.load(model_path)
print("✅ Model loaded successfully!")

# Load new data for prediction
df_new = pd.read_csv("/content/drive/MyDrive/to_predict.csv")  # Replace with actual file path

# Keep the identifier and molecular weight aside so they can be re-attached.
sequence_ids = df_new["Sequence ID"] if "Sequence ID" in df_new.columns else None
mol_weights = df_new["Molecular Weight"] if "Molecular Weight" in df_new.columns else None

# Drop unnecessary columns for prediction
df_new = df_new.drop(columns=["Sequence ID", "Molecular Weight"], errors="ignore")

# Predict on exactly the columns the model was fitted with, in fit order.
# The training cell removes many descriptors and adds two standardised ones,
# so passing the raw frame does not match the model's feature set.
# NOTE(review): "Z Scale Electronic" and "Disulfide Bond Estimate" are
# StandardScaler outputs in the training cell; the new data must carry values
# produced by the same transformation - confirm how to_predict.csv is built.
expected_features = list(loaded_model.feature_names_in_)
missing_features = [name for name in expected_features if name not in df_new.columns]
if missing_features:
    raise ValueError(
        f"to_predict.csv is missing features the model was trained on: {missing_features}"
    )

# Predict labels
y_new_pred = loaded_model.predict(df_new[expected_features])

# Add predictions as a new column
df_new["Label"] = y_new_pred

# Reattach Sequence ID and Molecular Weight if they were originally present
if sequence_ids is not None:
    df_new.insert(0, "Sequence ID", sequence_ids)
if mol_weights is not None:
    df_new["Molecular Weight"] = mol_weights

# Save to a new CSV file
output_path = "/content/drive/MyDrive/xg_prediction.csv"
df_new.to_csv(output_path, index=False)

print(f"✅ Predictions saved to {output_path}")

# Number of rows predicted as class 1.
count_ones = df_new['Label'].sum()
print(count_ones)

# Filter rows where Label is 1
df_label_1 = df_new[df_new["Label"] == 1]

# Sequence IDs of the positives; None when the input had no such column
# (indexing unconditionally raised KeyError in that case).
sequence_ids_label_1 = df_label_1["Sequence ID"] if sequence_ids is not None else None
print(sequence_ids_label_1)

In [None]:
import pandas as pd
import numpy as np
import optuna
import shap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss


# Read the CSV, shuffle the rows reproducibly, and discard the identifier column.
df = (
    pd.read_csv("/content/drive/MyDrive/total_antibody.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(columns=["Sequence ID"])
)

# Target vector and candidate feature matrix.
y = df["Label"]
X = df.drop(columns=["Label", "Molecular Weight"])

# Correlation pruning: inspect only the strict upper triangle of the absolute
# correlation matrix, and drop any column whose correlation with an earlier
# column exceeds 0.85.
corr_matrix = X.corr().abs()
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_tri = corr_matrix.where(upper_mask)
high_corr_features = [
    column for column in upper_tri.columns if (upper_tri[column] > 0.85).any()
]
X = X.drop(columns=high_corr_features)

# Stratified 70/15/15 split: hold out 15% for testing, then take 17.65% of
# the remaining 85% (about 15% of the total) for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, stratify=y_train_val, random_state=42
)

def objective(trial):
    """Optuna objective: validation AUC of one Random Forest configuration.

    Samples forest hyperparameters from ``trial``, fits on the training split
    with balanced class weights, and stores train/validation AUC and log-loss
    on the trial as user attributes.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC on the validation split.
    """
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 250, step=50),
        "max_depth": trial.suggest_int("max_depth", 5, 12),
        "min_samples_split": trial.suggest_int("min_samples_split", 10, 30),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 20),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 20, 50, step=5),
        "min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 0.005, 0.05),
        "ccp_alpha": trial.suggest_float("ccp_alpha", 0.005, 0.1),
        "class_weight": "balanced",
    }

    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **search_space)
    forest.fit(X_train, y_train)

    # Positive-class probabilities on both splits.
    train_proba = forest.predict_proba(X_train)[:, 1]
    val_proba = forest.predict_proba(X_val)[:, 1]

    recorded = {
        "train_auc": roc_auc_score(y_train, train_proba),
        "valid_auc": roc_auc_score(y_val, val_proba),
        "train_loss": log_loss(y_train, train_proba),
        "valid_loss": log_loss(y_val, val_proba),
    }

    # Attach every metric to the trial for the plotting cells.
    for attr_name, attr_value in recorded.items():
        trial.set_user_attr(attr_name, attr_value)

    return recorded["valid_auc"]  # Optimize for validation AUC


# Run Optuna tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Train the best Random Forest model. Every trial was fitted with
# class_weight="balanced", but that fixed setting is not part of
# study.best_params, so it has to be supplied again here; otherwise the final
# model differs from the configuration that was tuned.
best_params = study.best_params
rf_model = RandomForestClassifier(
    **best_params, class_weight="balanced", random_state=42, n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Validation-set check used as a gate before touching the test set.
y_val_pred = rf_model.predict(X_val)
y_val_proba = rf_model.predict_proba(X_val)[:, 1]

val_accuracy = accuracy_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_proba)
val_f1 = f1_score(y_val, y_val_pred)

print(f"🔍 Validation Accuracy: {val_accuracy:.4f}")
print(f"🔍 Validation AUC Score: {val_roc_auc:.4f}")
print(f"🔍 Validation F1 Score: {val_f1:.4f}")

# If validation AUC is below the threshold, skip the test-set evaluation.
# Note that y_pred / y_pred_proba are only (re)assigned in the else branch.
if val_roc_auc < 0.7:
    print("⚠️ Early stopping: Model did not reach required AUC on validation set!")
else:
    # Final evaluation on test set
    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

    test_accuracy = accuracy_score(y_test, y_pred)
    test_roc_auc = roc_auc_score(y_test, y_pred_proba)
    test_f1 = f1_score(y_test, y_pred)

    print(f"✅ Test Accuracy: {test_accuracy:.4f}")
    print(f"✅ Test AUC Score: {test_roc_auc:.4f}")
    print(f"✅ Test F1 Score: {test_f1:.4f}")


In [None]:
# Confusion matrix of the Random Forest on the test set. Predictions are
# taken from rf_model directly: the notebook-level `y_pred` is only assigned
# when the validation gate in the training cell passes, so it can be stale
# (left over from another model) or undefined here.
conf_matrix = confusion_matrix(y_test, rf_model.predict(X_test))
plt.figure(figsize=(5, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
def plot_learning_curve(estimator, X, y, cv=5, scoring='accuracy'):
    """Draw mean +/- one standard deviation learning curves for an estimator.

    Calls scikit-learn's ``learning_curve`` at ten training-set fractions
    from 10% to 100% and plots the training score (red) and the held-out
    fold score (green, labelled "Test Score") against the training-set size.

    Args:
        estimator: Estimator passed through to ``learning_curve``.
        X: Feature matrix.
        y: Target vector.
        cv: Cross-validation setting passed through to ``learning_curve``.
        scoring: Scoring name; also used, title-cased, as the y-axis label.
    """
    sizes, fit_scores, fold_scores = learning_curve(
        estimator, X, y, cv=cv, scoring=scoring, train_sizes=np.linspace(0.1, 1.0, 10)
    )

    plt.figure(figsize=(8, 5))
    # Same drawing recipe for both curves: line through the per-size mean,
    # shaded band of one standard deviation across folds.
    for scores, colour, legend_text in (
        (fit_scores, 'red', 'Training Score'),
        (fold_scores, 'green', 'Test Score'),
    ):
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(sizes, centre, 'o-', color=colour, label=legend_text)
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    plt.xlabel("Training Examples")
    plt.ylabel(scoring.title())
    plt.title("Learning Curve")
    plt.legend()
    plt.grid()
    plt.show()


# Learning curve of the tuned Random Forest on the training split.
plot_learning_curve(rf_model, X_train, y_train, cv=5, scoring='accuracy')

In [None]:
# One x position per Optuna trial; the dataframe is only used to count trials.
trials = study.trials_dataframe()
epochs = range(1, len(trials) + 1)

# Per-trial AUCs stored by the objective (None if a trial recorded nothing).
train_auc_scores = [t.user_attrs.get("train_auc") for t in study.trials]
valid_auc_scores = [t.user_attrs.get("valid_auc") for t in study.trials]

plt.figure(figsize=(6, 4))

# Validation is drawn first (orange), training second (blue).
for series, colour, legend_text in (
    (valid_auc_scores, "orange", "Validation AUC"),
    (train_auc_scores, "blue", "Training AUC"),
):
    plt.plot(epochs, series, marker="o", linestyle="-", color=colour, label=legend_text)

plt.title("Optuna Trials: Training vs Validation AUC")
plt.xlabel("Trials (Epochs)")
plt.ylabel("AUC Score")
plt.legend()
plt.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-trial log-loss values stored by the objective, skipping trials that
# recorded nothing.
train_loss_scores = [
    loss
    for loss in (t.user_attrs.get("train_loss") for t in study.trials)
    if loss is not None
]
valid_loss_scores = [
    loss
    for loss in (t.user_attrs.get("valid_loss") for t in study.trials)
    if loss is not None
]

# The x axis is the trial index, sized from the validation series.
epochs = range(1, len(valid_loss_scores) + 1)

plt.figure(figsize=(8, 5))

# Training loss (red circles) first, then validation loss (blue squares).
for series, legend_text, marker_shape, colour in (
    (train_loss_scores, "Training Loss", "o", "red"),
    (valid_loss_scores, "Validation Loss", "s", "blue"),
):
    plt.plot(epochs, series, label=legend_text, linestyle="-", marker=marker_shape, color=colour)

plt.title("Random Forest: Training vs Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Log Loss")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points for the Random Forest, from test-set positive-class probabilities.
y_test_pred_proba = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(7, 5))

# Curve, shaded area beneath it, and the chance-level diagonal.
plt.plot(fpr, tpr, label=f"Random Forest (AUC = {roc_auc:.4f})", linewidth=2)
plt.fill_between(fpr, tpr, color="pink", alpha=0.3)
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", alpha=0.7)

# Title and axis labels.
plt.title("ROC Curve: Random Forest", fontsize=14)
plt.xlabel("False Positive Rate", fontsize=12)
plt.ylabel("True Positive Rate", fontsize=12)

plt.legend(loc="lower right", fontsize=10)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision-recall trade-off for the Random Forest on the test set; the
# thresholds array is not needed.
pr_points = precision_recall_curve(y_test, y_test_pred_proba)
precision_vals, recall_vals, _ = pr_points

plt.figure(figsize=(6, 4))
plt.plot(recall_vals, precision_vals, marker='.', label="Random Forest")
plt.title("Precision-Recall Curve: Random Forest")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.grid()
plt.show()

In [None]:
import pandas as pd
import joblib

# Persist the tuned forest and read it back from the same path. The earlier
# second load from a hard-coded absolute path was redundant and only worked
# when the working directory happened to be /content.
model_path = "rf_model.pkl"
joblib.dump(rf_model, model_path)
print("✅ Model saved successfully!")

loaded_model = joblib.load(model_path)
print("✅ Model loaded successfully!")

# Load new data for prediction
df_new = pd.read_csv("/content/drive/MyDrive/to_predict.csv")  # Replace with actual file path

# Keep the identifier and molecular weight aside so they can be re-attached.
sequence_ids = df_new["Sequence ID"] if "Sequence ID" in df_new.columns else None
mol_weights = df_new["Molecular Weight"] if "Molecular Weight" in df_new.columns else None

# Drop unnecessary columns for prediction
df_new = df_new.drop(columns=["Sequence ID", "Molecular Weight"], errors="ignore")

# Get the training data feature names from the loaded model
training_feature_names = loaded_model.feature_names_in_

# reindex() would silently create all-NaN columns for features the new file
# lacks, so fail loudly instead of predicting on fabricated values.
missing_features = [name for name in training_feature_names if name not in df_new.columns]
if missing_features:
    raise ValueError(
        f"to_predict.csv is missing features the model was trained on: {missing_features}"
    )

# Ensure the new data has the same columns as the training data, in the same order
df_new = df_new.reindex(columns=training_feature_names)


# Predict labels
y_new_pred = loaded_model.predict(df_new)

# Add predictions as a new column
df_new["Label"] = y_new_pred

# Reattach Sequence ID and Molecular Weight if they were originally present
if sequence_ids is not None:
    df_new.insert(0, "Sequence ID", sequence_ids)
if mol_weights is not None:
    df_new["Molecular Weight"] = mol_weights

# Save to a new CSV file
output_path = "/content/drive/MyDrive/prediction.csv"
df_new.to_csv(output_path, index=False)

print(f"✅ Predictions saved to {output_path}")

# Number of rows predicted as class 1.
count_ones = df_new['Label'].sum()
print(count_ones)
# Filter rows where Label is 1
df_label_1 = df_new[df_new["Label"] == 1]

# Sequence IDs of the positives; None when the input had no such column
# (indexing unconditionally raised KeyError in that case).
sequence_ids_label_1 = df_label_1["Sequence ID"] if sequence_ids is not None else None
sequence_ids_label_1

In [None]:
import pandas as pd
import numpy as np
import optuna
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, roc_curve, auc, precision_recall_curve, confusion_matrix, log_loss

# Read the CSV, shuffle the rows reproducibly, and discard the identifier column.
df = (
    pd.read_csv("/content/drive/MyDrive/1165.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .drop(columns=["Sequence ID"])
)

# Target vector and feature matrix.
y = df["Label"]
X = df.drop(columns=["Label", "Molecular Weight"])

# Stratified 70/15/15 split: hold out 15% for testing, then take 17.65% of
# the remaining 85% (about 15% of the total) for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, stratify=y_train_val, random_state=42
)

from sklearn.metrics import roc_auc_score, log_loss

def objective(trial):
    """Optuna objective: validation AUC of one Decision Tree configuration.

    Samples tree hyperparameters from ``trial``, fits on the training split,
    and stores train/validation AUC and log-loss on the trial as user
    attributes.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC on the validation split.
    """
    search_space = {
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 4, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "ccp_alpha": trial.suggest_float("ccp_alpha", 0.0001, 0.02)
    }

    tree = DecisionTreeClassifier(random_state=42, **search_space)
    tree.fit(X_train, y_train)

    # Full class-probability matrices; AUC uses the positive-class column,
    # log-loss receives the whole matrix.
    train_proba = tree.predict_proba(X_train)
    val_proba = tree.predict_proba(X_val)

    recorded = {
        "train_auc": roc_auc_score(y_train, train_proba[:, 1]),
        "valid_auc": roc_auc_score(y_val, val_proba[:, 1]),
        "train_loss": log_loss(y_train, train_proba),
        "valid_loss": log_loss(y_val, val_proba),
    }

    # Attach every metric to the trial for the plotting cells.
    for attr_name, attr_value in recorded.items():
        trial.set_user_attr(attr_name, attr_value)

    return recorded["valid_auc"]  # Optimize based on validation AUC

# Hyperparameter search: 70 trials maximising the objective's validation AUC.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=70)

# Refit a tree with the best sampled parameters on the training split.
dtc_params = study.best_params
dtc_model = DecisionTreeClassifier(random_state=42, **dtc_params)
dtc_model.fit(X_train, y_train)

# Test-set outputs: positive-class probabilities and hard class labels.
y_pred_proba = dtc_model.predict_proba(X_test)[:, 1]
y_pred = dtc_model.predict(X_test)

# Headline metrics.
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ AUC Score: {roc_auc:.4f}")
print(f"✅ F1 Score: {f1:.4f}")

In [None]:
# Heatmap of the Decision Tree's test-set confusion matrix
# (rows: actual, columns: predicted).
class_names = ["Negative", "Positive"]
plt.figure(figsize=(5, 4))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
def plot_learning_curve(estimator, X, y, cv=5, scoring='accuracy'):
    """Draw mean +/- one standard deviation learning curves for an estimator.

    Calls scikit-learn's ``learning_curve`` at ten training-set fractions
    from 10% to 100% and plots the training score (red) and the held-out
    fold score (green, labelled "Test Score") against the training-set size.

    Args:
        estimator: Estimator passed through to ``learning_curve``.
        X: Feature matrix.
        y: Target vector.
        cv: Cross-validation setting passed through to ``learning_curve``.
        scoring: Scoring name; also used, title-cased, as the y-axis label.
    """
    sizes, fit_scores, fold_scores = learning_curve(
        estimator, X, y, cv=cv, scoring=scoring, train_sizes=np.linspace(0.1, 1.0, 10)
    )

    plt.figure(figsize=(8, 5))
    # Same drawing recipe for both curves: line through the per-size mean,
    # shaded band of one standard deviation across folds.
    for scores, colour, legend_text in (
        (fit_scores, 'red', 'Training Score'),
        (fold_scores, 'green', 'Test Score'),
    ):
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(sizes, centre, 'o-', color=colour, label=legend_text)
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    plt.xlabel("Training Examples")
    plt.ylabel(scoring.title())
    plt.title("Learning Curve")
    plt.legend()
    plt.grid()
    plt.show()

# Learning curve of the tuned Decision Tree on the training split, scored by ROC AUC.
plot_learning_curve(dtc_model, X_train, y_train, cv=5, scoring='roc_auc')

In [None]:
# One x position per Optuna trial; the dataframe is only used to count trials.
trials = study.trials_dataframe()
epochs = range(1, len(trials) + 1)

# Per-trial AUCs stored by the objective (None if a trial recorded nothing).
train_auc_scores = [t.user_attrs.get("train_auc") for t in study.trials]
valid_auc_scores = [t.user_attrs.get("valid_auc") for t in study.trials]

plt.figure(figsize=(6, 4))

# Validation is drawn first (orange), training second (blue).
for series, colour, legend_text in (
    (valid_auc_scores, "orange", "Validation AUC"),
    (train_auc_scores, "blue", "Training AUC"),
):
    plt.plot(epochs, series, marker="o", linestyle="-", color=colour, label=legend_text)

plt.title("Optuna Trials: Training vs Validation AUC")
plt.xlabel("Trials (Epochs)")
plt.ylabel("AUC Score")
plt.legend()
plt.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-trial log-loss values stored by the Decision Tree objective.
train_loss_scores = [t.user_attrs.get("train_loss", None) for t in study.trials]
valid_loss_scores = [t.user_attrs.get("valid_loss", None) for t in study.trials]

# Skip trials that recorded nothing.
train_loss_scores = [loss for loss in train_loss_scores if loss is not None]
valid_loss_scores = [loss for loss in valid_loss_scores if loss is not None]

# The x axis is the trial index, sized from the validation series.
epochs = range(1, len(valid_loss_scores) + 1)

plt.figure(figsize=(8, 5))

# Plot training loss
plt.plot(epochs, train_loss_scores, label="Training Loss", linestyle="-", marker="o", color="red")

# Plot validation loss
plt.plot(epochs, valid_loss_scores, label="Validation Loss", linestyle="-", marker="s", color="blue")

plt.xlabel("Epochs")
plt.ylabel("Log Loss")
# The study in this section tunes a DecisionTreeClassifier; the title was a
# copy-paste leftover that named the Random Forest.
plt.title("Decision Tree: Training vs Validation Loss")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

plt.figure(figsize=(7, 5))

# Decision Tree ROC curve from test-set positive-class probabilities.
y_test_pred_proba = dtc_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
roc_auc = auc(fpr, tpr)

# The legend previously named the Random Forest (copy-paste leftover); the
# curve is computed from dtc_model.
plt.plot(fpr, tpr, label=f"Decision Tree (AUC = {roc_auc:.4f})", linewidth=2)

# Fill the area under the curve
plt.fill_between(fpr, tpr, color="pink", alpha=0.3)

# Diagonal reference line (chance level)
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", alpha=0.7)

# Labels and title
plt.xlabel("False Positive Rate", fontsize=12)
plt.ylabel("True Positive Rate", fontsize=12)
plt.title("ROC Curve: Decision Tree", fontsize=14)

plt.legend(loc="lower right", fontsize=10)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision-recall trade-off for the Decision Tree on the test set; the
# thresholds array is not needed.
pr_points = precision_recall_curve(y_test, y_test_pred_proba)
precision_vals, recall_vals, _ = pr_points

plt.figure(figsize=(6, 4))
plt.plot(recall_vals, precision_vals, marker='.', label="Decision Tree")
plt.title("Precision-Recall Curve: Decision Tree")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.grid()
plt.show()

In [None]:
import joblib
import pandas as pd

# Persist the tuned tree and read it back from the same path. The earlier
# second load from a hard-coded absolute path was redundant and only worked
# when the working directory happened to be /content.
model_path = "dtc_model.pkl"
joblib.dump(dtc_model, model_path)
print("✅ Model saved successfully!")

loaded_model = joblib.load(model_path)
print("✅ Model loaded successfully!")

# Load new data for prediction
df_new = pd.read_csv("/content/drive/MyDrive/to_predict.csv")  # Replace with actual file path

# Keep the identifier and molecular weight aside so they can be re-attached.
sequence_ids = df_new["Sequence ID"] if "Sequence ID" in df_new.columns else None
mol_weights = df_new["Molecular Weight"] if "Molecular Weight" in df_new.columns else None

# Drop unnecessary columns for prediction
df_new = df_new.drop(columns=["Sequence ID", "Molecular Weight"], errors="ignore")

# Predict on exactly the columns the model was fitted with, in fit order, so
# extra or reordered columns in the new file do not break the call. A clear
# error is raised when a trained-on feature is absent.
expected_features = list(loaded_model.feature_names_in_)
missing_features = [name for name in expected_features if name not in df_new.columns]
if missing_features:
    raise ValueError(
        f"to_predict.csv is missing features the model was trained on: {missing_features}"
    )

# Predict labels
y_new_pred = loaded_model.predict(df_new[expected_features])

# Add predictions as a new column
df_new["Label"] = y_new_pred

# Reattach Sequence ID and Molecular Weight if they were originally present
if sequence_ids is not None:
    df_new.insert(0, "Sequence ID", sequence_ids)
if mol_weights is not None:
    df_new["Molecular Weight"] = mol_weights

# Save to a new CSV file
output_path = "/content/drive/MyDrive/dtc_prediction.csv"
df_new.to_csv(output_path, index=False)

print(f"✅ Predictions saved to {output_path}")

# Number of rows predicted as class 1.
count_ones = df_new['Label'].sum()
print(count_ones)

# Filter rows where Label is 1
df_label_1 = df_new[df_new["Label"] == 1]

# Sequence IDs of the positives; None when the input had no such column
# (indexing unconditionally raised KeyError in that case).
sequence_ids_label_1 = df_label_1["Sequence ID"] if sequence_ids is not None else None
print(sequence_ids_label_1)

In [None]:
import pandas as pd
import numpy as np
import optuna
import shap
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/output2000.csv")

# Drop non-numeric columns
df = df.drop(columns=["Sequence ID"])

# Split features and labels
X = df.drop(columns=["Label", "Molecular Weight"])
y = df["Label"]

# Stratified 80/20 train-test split on the raw features. Splitting BEFORE
# scaling keeps test-set statistics out of the scaler; with the same
# random_state the rows land in the same partitions as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise using training-set mean/variance only, then apply that same
# transform to the test rows. X_train / X_test stay NumPy arrays as before.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix under the train-fitted transform, kept because the name
# X_scaled was defined by the previous version of this cell.
X_scaled = scaler.transform(X)

def objective(trial):
    """Optuna objective: cross-validated AUC of GaussianNB on the training split.

    Samples ``var_smoothing`` on a log scale and scores the model with 5-fold
    cross-validation on ``X_train`` / ``y_train``. The test split is not
    touched here: tuning against it (as the previous version did) makes the
    test metrics reported afterwards optimistically biased.

    Args:
        trial: The Optuna trial used to sample ``var_smoothing``.

    Returns:
        Mean ROC AUC across the cross-validation folds.
    """
    # Local import: this cell's import list does not include cross_val_score.
    from sklearn.model_selection import cross_val_score

    # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
    # equivalent call and matches the suggest_float usage elsewhere in this file.
    var_smoothing = trial.suggest_float("var_smoothing", 1e-9, 1e-3, log=True)

    model = GaussianNB(var_smoothing=var_smoothing)
    fold_aucs = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")
    return fold_aucs.mean()

# Hyperparameter search: 50 trials maximising the objective's AUC.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Refit Gaussian Naive Bayes with the best sampled parameters.
nb_params = study.best_params
nb_model = GaussianNB(**nb_params)
nb_model.fit(X_train, y_train)

# Test-set outputs: positive-class probabilities and hard class labels.
y_pred_proba = nb_model.predict_proba(X_test)[:, 1]
y_pred = nb_model.predict(X_test)

# Headline metrics.
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ AUC Score: {roc_auc:.4f}")
print(f"✅ F1 Score: {f1:.4f}")