In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
common=pd.read_csv("D:/Desktop/AD/AD_Dataset/Gene_expression/last_prepare_17.csv")

In [3]:
common.head()

Unnamed: 0,Sample,disease_status,SERPINA5,SLC14A1,GPRASP1,FGF13,ZCCHC12,FRMPD4,HPRT1,CNKSR2,RBM3,DDX3Y,BEX1,MS4A4A,BEX5,TMSB4Y,PAK3,RPS4Y1,SYTL4
0,GSM1423780,1,0.233161,0.127394,0.055005,0.019651,0.040549,-0.044839,0.136986,-0.020435,0.341209,-1.418092,0.081533,0.177249,-0.007898,-0.953641,-0.086508,-1.65894,-0.165616
1,GSM1423781,1,-0.295554,0.053273,0.002815,0.014815,-0.111576,-0.03392,0.047247,0.072077,0.146683,0.255052,0.142487,0.27019,-0.036374,0.232091,-0.069293,0.225559,-0.172584
2,GSM1423782,1,-0.598633,-0.582372,0.123408,0.150028,0.154966,0.145267,0.160844,0.11778,0.157258,0.254411,0.225043,-0.131901,0.135785,0.386145,0.142142,0.330395,-0.388533
3,GSM1423783,1,0.237143,0.245438,-0.047806,-0.055892,-0.13588,-0.176507,0.010585,-0.173845,0.162723,-1.470255,0.052789,0.101474,-0.010927,-1.215171,-0.25143,-1.563496,-0.087103
4,GSM1423784,1,-0.775671,0.014268,0.073264,-0.069487,-0.106902,0.087054,0.143043,0.05747,-0.196429,-1.513204,0.092603,-0.220523,0.018407,-1.320286,0.1161,-1.691422,-0.354625


In [4]:
print("\nAny missing values? ", common.isnull().values.any())


Any missing values?  True


In [5]:
y=common.iloc[:,1]
y.shape
x=common.iloc[:,2:19]

In [6]:
# Handle missing values before scaling
imputer = SimpleImputer(strategy="mean")   # or "median"
X= imputer.fit_transform(x)

In [7]:

from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Models
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf_probs = rf.predict_proba(X_test)[:, 1]

print("\nTest Set Accuracy_rf:", accuracy_score(y_test, y_pred))
print("\nClassification Report_rf:\n", classification_report(y_test, y_pred))


# MLP Classifier
mlp = MLPClassifier(random_state=42, max_iter=1000)
# Grid Search parameters
param_grid = {
    'hidden_layer_sizes': [ (200,100,100,50,50)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001],
    'learning_rate': ['constant']
}

# GridSearchCV with 5-fold CV
grid_search = GridSearchCV(estimator=mlp,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1,
                           verbose=2)
# Fit the model
grid_search.fit(X_train, y_train)
# Best parameters
print("Best Parameters from Grid Search:\n", grid_search.best_params_)


# Best Model
best_mlp = grid_search.best_estimator_
# Predictions
y_pred = best_mlp.predict(X_test)
mlp_probs = best_mlp.predict_proba(X_test)[:, 1]
# Accuracy
print("\nTest Set Accuracy_mlp:", accuracy_score(y_test, y_pred))

# Classification Report
print("\nClassification Report_mlp:\n", classification_report(y_test, y_pred))



# Compute AUCs
# ------------------
auc_rf = roc_auc_score(y_test, rf_probs)
auc_mlp = roc_auc_score(y_test, mlp_probs)

print(f"Random Forest AUC: {auc_rf:.3f}")
print(f"mlp AUC: {auc_mlp:.3f}")



Test Set Accuracy_rf: 0.9380530973451328

Classification Report_rf:
               precision    recall  f1-score   support

           0       0.96      0.93      0.94       127
           1       0.91      0.95      0.93        99

    accuracy                           0.94       226
   macro avg       0.94      0.94      0.94       226
weighted avg       0.94      0.94      0.94       226

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters from Grid Search:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (200, 100, 100, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}

Test Set Accuracy_mlp: 0.9380530973451328

Classification Report_mlp:
               precision    recall  f1-score   support

           0       0.94      0.95      0.95       127
           1       0.94      0.92      0.93        99

    accuracy                           0.94       226
   macro avg       0.94      0.94      0.94       226
weighted avg       0.94      0.

In [14]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve

# ✅ ROC Curve Plot-RF
fpr, tpr, thresholds = roc_curve(y_test, rf_probs)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % auc_rf, color='darkgreen')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Random Forest)')
plt.legend(loc="lower right")
plt.savefig('D:/Desktop/rf_roc_journal.png',dpi=300)
plt.close()

In [16]:

# ✅ ROC Curve Plot-MLP
fpr, tpr, thresholds = roc_curve(y_test, mlp_probs)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % auc_mlp, color='red')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (MLP)')
plt.legend(loc="lower right")
plt.savefig('d:/Desktop/AD/AD_Journal/figure_66',dpi=300)
plt.close()

In [8]:
import numpy as np
from scipy import stats

def compute_midrank(x):
    """Computes midranks."""
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=float)
    T2[J] = T + 1
    return T2

def fastDeLong(predictions_sorted_transposed, label_1_count):
    """Fast implementation of DeLong test for two correlated ROC AUCs."""
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        tx[r] = compute_midrank(positive_examples[r])
        ty[r] = compute_midrank(negative_examples[r])
        tz[r] = compute_midrank(predictions_sorted_transposed[r])
    aucs = tz[:, :m].sum(axis=1) / m / n - (m + 1.0) / (2.0 * n)

    v01 = (tz[:, :m] - tx[:, :m]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    s = sx / m + sy / n
    return aucs, s

def calc_pvalue(aucs, covariances):
    """Computes p-value for DeLong test."""
    diff = aucs[0] - aucs[1]
    var = covariances[0, 0] + covariances[1, 1] - 2 * covariances[0, 1]
    z = abs(diff) / np.sqrt(var)
    pval = 2 * (1 - stats.norm.cdf(z))
    return pval

def delong_roc_test(y_true, y_scores_A, y_scores_B):
    """Performs DeLong test between two ROC AUCs."""
    y_true = np.array(y_true)
    order = np.argsort(-y_scores_A)  # sort descending
    y_true = y_true[order]
    preds = np.array([y_scores_A, y_scores_B])[:, order]
    aucs, cov = fastDeLong(preds, np.sum(y_true))
    pval = calc_pvalue(aucs, cov)
    return pval


In [9]:

# DeLong test
p_value = delong_roc_test(y_test, rf_probs, mlp_probs)
print("DeLong test p-value:", p_value)

DeLong test p-value: 0.025710334623148423
