<a href="https://colab.research.google.com/github/rakshansingh12/disease-risk-prediction-ml/blob/main/07_model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import os
import numpy as np

# Google Drive must be mounted before anything under /content/drive is readable.
drive.mount('/content/drive')

PROJECT_PATH = "/content/drive/MyDrive/disease-risk-prediction-gene-expression"

# Both arrays are read from the same processed-data folder under the project root.
processed_dir = os.path.join(PROJECT_PATH, "data/processed")
X = np.load(os.path.join(processed_dir, "X_variance_selected.npy"))
y = np.load(os.path.join(processed_dir, "y_labels.npy"))

# Display the shapes as a quick sanity check that features and labels line up.
X.shape, y.shape

Mounted at /content/drive


((161, 1000), (161,))

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The split is seeded so it can be
# repeated, and stratified on y so both parts keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Random forest baseline: 300 trees, seeded; fit() returns the fitted estimator.
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

# Hard labels feed accuracy; column 1 of predict_proba (the second entry of
# rf.classes_, presumably the positive label) feeds ROC-AUC.
rf_pred = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

rf_acc, rf_auc = accuracy_score(y_test, rf_pred), roc_auc_score(y_test, rf_prob)

rf_acc, rf_auc

(0.9090909090909091, np.float64(0.9796296296296297))

In [5]:
from sklearn.svm import SVC

# RBF-kernel SVM; probability=True is required for predict_proba to be available.
# NOTE(review): no feature scaling is applied in this notebook before the SVM —
# confirm X was standardised upstream, since RBF kernels are scale-sensitive.
svm = SVC(kernel="rbf", probability=True, random_state=42).fit(X_train, y_train)

# Hard labels feed accuracy; column 1 of predict_proba (the second entry of
# svm.classes_, presumably the positive label) feeds ROC-AUC.
svm_pred = svm.predict(X_test)
svm_prob = svm.predict_proba(X_test)[:, 1]

svm_acc, svm_auc = accuracy_score(y_test, svm_pred), roc_auc_score(y_test, svm_prob)

svm_acc, svm_auc

(0.9393939393939394, np.float64(0.9851851851851852))

In [6]:
import pandas as pd

# Logistic-regression scores are entered by hand; they are not computed in this notebook.
# NOTE(review): presumably copied from an earlier notebook — confirm they were
# obtained on the same train/test split as the models above.
lr_acc, lr_auc = 0.939, 0.989

# One row per model, in the order they are discussed.
results = pd.DataFrame(
    [
        ("Logistic Regression", lr_acc, lr_auc),
        ("Random Forest", rf_acc, rf_auc),
        ("SVM", svm_acc, svm_auc),
    ],
    columns=["Model", "Accuracy", "ROC-AUC"],
)

results

Unnamed: 0,Model,Accuracy,ROC-AUC
0,Logistic Regression,0.939,0.989
1,Random Forest,0.909091,0.97963
2,SVM,0.939394,0.985185


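Although the SVM performed similarly (accuracy 0.939, ROC-AUC 0.985), logistic regression was selected: it matched the SVM's accuracy (0.939), achieved the highest ROC-AUC (0.989), offers greater interpretability, and is better suited to high-dimensional gene expression data. Note that the held-out test set contains only 33 samples, so the accuracy gap between the random forest (30/33) and the other two models (31/33) is a single sample and should be interpreted with caution.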