In [None]:
from google.colab import files
uploaded = files.upload()  # Manually upload ML_DEG_dataset.csv (read later with pd.read_csv as the training data)

Saving ML_DEG_dataset.csv to ML_DEG_dataset.csv


In [None]:
from google.colab import files
uploaded = files.upload()  # Manually upload GSE21815_gene_expression_renamed.csv (read later with pd.read_csv as the test data)

Saving GSE21815_gene_expression_renamed.csv to GSE21815_gene_expression_renamed.csv


In [None]:
# Install xgboost; the tuning cells below import XGBClassifier from it.
!pip install xgboost



# Feature Selection

# MI Feature Selection

## Grid Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training data: keep only the selected genes plus the class label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['ADH1B', 'CDH3', 'GUCA2B', 'PDE9A', 'ABCA8', 'GUCA2A', 'target']
df_train = df_train[columns_of_interest]  # 'target' is already the last column
X_train = df_train.drop(columns='target')
y_train = df_train['target']

# External test data: the first column is dropped by position
# (presumably a sample identifier -- TODO confirm against the CSV).
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.drop(columns='target')
y_test = df_test['target']

# Ensure common features in both train and test sets.
# The shared columns are taken in training-column order rather than from a set:
# set iteration order for strings changes between Python sessions, which would
# make the feature order (and anything depending on it) non-reproducible.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected genes are present in the test dataset.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter grid for each model
# Keys match the keys of the `models` dictionary; each value is passed as the
# `param_grid` argument of GridSearchCV.
param_grids = {
    # Logistic regression is searched as two sub-grids. scikit-learn requires
    # `l1_ratio` when penalty='elasticnet'; listing 'elasticnet' without it makes
    # every such candidate fail to fit, so elastic-net was never really evaluated.
    'Logistic Regression': [
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['saga'],
            'max_iter': [500]
        },
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['elasticnet'],
            'l1_ratio': [0.25, 0.5, 0.75],
            'solver': ['saga'],
            'max_iter': [500]
        }
    ],
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
# Every estimator that draws random numbers gets a fixed seed so that repeated
# runs of this cell select the same hyperparameters and report the same scores.
# K-Nearest Neighbors has no random component and takes no seed.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                             random_state=RANDOM_STATE)
}

# Dictionary to store best parameters (model name -> best hyperparameter dict)
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Exhaustive search over the model's grid, scored by 10-fold CV accuracy
    # on the training data only.
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # GridSearchCV (refit=True by default) has already retrained the best
    # configuration on the full training set, so no second fit is needed.
    best_model = grid_search.best_estimator_

    # Predict on the independent test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (support-weighted averages across classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():  # distinct name: do not shadow the estimator loop variable
    print(f"{model_name}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 98.58%
Precision: 0.99
Recall: 0.99
F1-Score: 0.98
MCC: 0.88
Confusion Matrix:
[[  7   2]
 [  0 132]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       0.99      1.00      0.99       132

    accuracy                           0.99       141
   macro avg       0.99      0.89      0.93       141
weighted avg       0.99      0.99      0.98       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

K-Nearest Neighbors Testing Accuracy: 97.87%
Precision: 0.98
Recall: 0.98
F1-Score: 0.98
MCC: 0.81
Confusion Matrix:
[[  6   3]
 [  0 132]]
Classification Report:
             

# Random Search CV


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier

import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training set restricted to the selected genes and the label column.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['ADH1B', 'CDH3', 'GUCA2B', 'PDE9A', 'ABCA8', 'GUCA2A', 'target']
df_train = df_train[columns_of_interest]  # label is already the final column
X_train = df_train.drop(columns='target')
y_train = df_train['target']

# Test set: drop the first column by position
# (presumably a sample identifier -- TODO confirm), then move the label last.
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.drop(columns='target')
y_test = df_test['target']

# Ensure common features in both train and test sets.
# Built as an ordered list (training-column order) instead of a set, because
# set iteration order for strings is not stable across Python sessions.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected genes are present in the test dataset.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter grid for each model
# Keys match the keys of the `models` dictionary; each value is passed as the
# `param_distributions` argument of RandomizedSearchCV.
param_grids = {
    # Two sub-grids: `l1_ratio` only has an effect with penalty='elasticnet'.
    # Sampling it together with 'l1'/'l2' spent search iterations on duplicate
    # models and produced misleading "best" parameters (e.g. penalty='l1' with
    # l1_ratio=0.5). The end points 0 and 1 are omitted because they duplicate
    # the pure l2 / l1 penalties. C is converted to plain floats so the printed
    # best parameters are readable.
    'Logistic Regression': [
        {
            'C': np.logspace(-3, 2, 10).tolist(),
            'penalty': ['l1', 'l2'],
            'solver': ['saga'],
            'max_iter': [500, 1000, 5000]
        },
        {
            'C': np.logspace(-3, 2, 10).tolist(),
            'penalty': ['elasticnet'],
            'solver': ['saga'],
            'max_iter': [500, 1000, 5000],
            'l1_ratio': [0.25, 0.5, 0.75]
        }
    ],
    'K-Nearest Neighbors': {
        'n_neighbors': range(3, 15),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': range(100, 501, 100),
        'learning_rate': np.linspace(0.01, 0.2, 5),
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [2, 5, 10]
    },
    'AdaBoost': {
        'n_estimators': range(50, 301, 50),
        'learning_rate': np.linspace(0.01, 1, 5)
    },
    'Bagging': {
        'n_estimators': range(50, 301, 50),
        'max_samples': np.linspace(0.5, 1.0, 5),
        'max_features': np.linspace(0.5, 1.0, 5),
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
# A fixed seed is given to every estimator with a random component so the
# search and the reported test metrics are reproducible between runs.
# K-Nearest Neighbors has no random component and takes no seed.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                             random_state=RANDOM_STATE)
}

# Dictionary to store best parameters (model name -> best hyperparameter dict)
best_params = {}

# Tune and evaluate models using RandomizedSearchCV
for name, model in models.items():
    print(f"Tuning {name}...")

    # Sample 20 candidate configurations and score each by 10-fold CV accuracy
    # on the training data; random_state fixes which candidates are drawn.
    random_search = RandomizedSearchCV(model, param_grids[name],
                                       n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)

    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # The search (refit=True by default) has already retrained the winning
    # configuration on the full training set, so it is used as-is.
    best_model = random_search.best_estimator_

    # Predict on the independent test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (support-weighted averages across classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():  # distinct name: do not shadow the estimator loop variable
    print(f"{model_name}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 5000, 'l1_ratio': 0.5, 'C': np.float64(0.1668100537200059)}

Logistic Regression Testing Accuracy: 98.58%
Precision: 0.99
Recall: 0.99
F1-Score: 0.98
MCC: 0.88
Confusion Matrix:
[[  7   2]
 [  0 132]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       0.99      1.00      0.99       132

    accuracy                           0.99       141
   macro avg       0.99      0.89      0.93       141
weighted avg       0.99      0.99      0.98       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'distance', 'n_neighbors': 4, 'metric': 'manhattan'}

K-Nearest Neighbors Testing Accuracy: 97.16%
Precision: 0.97
Recall: 0.97
F1-Score: 0.97
MCC: 0.73
Confusion Matrix:
[[  5   4]
 [  

# Tuning for Hub genes

# Grid Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training data: keep only the hub-gene columns listed below plus the label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['EHD2', 'TUBB', 'CSRP1', 'DPYSL3', 'TIMP2', 'RHOQ', 'ILK', 'SPARCL1', 'PALLD', 'PTRF',
                       'PPP3CB', 'MEF2C', 'RASL12', 'FLNA', 'PRNP', 'CAV2', 'SNCA', 'TNS1', 'TGFB1I1',
                       'SLIT2', 'ITPR1', 'LAMB2', 'RHOB', 'ACTN1','target']
df_train = df_train[columns_of_interest]  # 'target' is already the last column
X_train = df_train.drop(columns='target')
y_train = df_train['target']

# External test data: the first column is dropped by position
# (presumably a sample identifier -- TODO confirm against the CSV).
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.drop(columns='target')
y_test = df_test['target']

# Ensure common features in both train and test sets.
# Use an ordered list (training-column order) rather than a set: set iteration
# order for strings differs between Python sessions, which would change the
# feature order from run to run.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected genes are present in the test dataset.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter grid for each model
# Keys match the keys of the `models` dictionary; each value is passed as the
# `param_grid` argument of GridSearchCV.
param_grids = {
    # Two sub-grids for logistic regression: penalty='elasticnet' needs an
    # `l1_ratio`, and without one every elastic-net candidate fails to fit, so
    # that penalty was silently excluded from the search.
    'Logistic Regression': [
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['saga'],
            'max_iter': [500]
        },
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['elasticnet'],
            'l1_ratio': [0.25, 0.5, 0.75],
            'solver': ['saga'],
            'max_iter': [500]
        }
    ],
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
# Stochastic estimators are seeded so that the hub-gene results can be
# reproduced exactly; K-Nearest Neighbors has no random component.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                             random_state=RANDOM_STATE)
}

# Dictionary to store best parameters (model name -> best hyperparameter dict)
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Exhaustive grid search, scored by 10-fold CV accuracy on the training data.
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # With the default refit=True the best configuration is already trained on
    # the whole training set; fitting it again would only repeat that work.
    best_model = grid_search.best_estimator_

    # Predict on the independent test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (support-weighted averages across classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():  # distinct name: do not shadow the estimator loop variable
    print(f"{model_name}: {params}")

# Random Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training set restricted to the hub-gene columns listed below plus the label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['EHD2', 'TUBB', 'CSRP1', 'DPYSL3', 'TIMP2', 'RHOQ', 'ILK', 'SPARCL1', 'PALLD', 'PTRF',
                       'PPP3CB', 'MEF2C', 'RASL12', 'FLNA', 'PRNP', 'CAV2', 'SNCA', 'TNS1', 'TGFB1I1',
                       'SLIT2', 'ITPR1', 'LAMB2', 'RHOB', 'ACTN1','target']
df_train = df_train[columns_of_interest]  # label is already the final column
X_train = df_train.drop(columns='target')
y_train = df_train['target']

# Test set: drop the first column by position
# (presumably a sample identifier -- TODO confirm), then move the label last.
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.drop(columns='target')
y_test = df_test['target']

# Ensure common features in both train and test sets.
# An ordered list keeps the feature order identical on every run; a set would
# iterate in a session-dependent order.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected genes are present in the test dataset.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter grid for each model
# Keys match the keys of the `models` dictionary; each value is passed as the
# `param_distributions` argument of RandomizedSearchCV.
param_grids = {
    # `l1_ratio` is only meaningful for penalty='elasticnet', so it lives in its
    # own sub-grid. Drawing it together with 'l1'/'l2' wasted iterations on
    # identical models and reported an irrelevant l1_ratio in the best params.
    # The end points 0 and 1 are left out because they equal pure l2 / l1.
    # C is converted to plain floats so printed parameters are readable.
    'Logistic Regression': [
        {
            'C': np.logspace(-3, 2, 10).tolist(),
            'penalty': ['l1', 'l2'],
            'solver': ['saga'],
            'max_iter': [500, 1000, 5000]
        },
        {
            'C': np.logspace(-3, 2, 10).tolist(),
            'penalty': ['elasticnet'],
            'solver': ['saga'],
            'max_iter': [500, 1000, 5000],
            'l1_ratio': [0.25, 0.5, 0.75]
        }
    ],
    'K-Nearest Neighbors': {
        'n_neighbors': range(3, 15),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': range(100, 501, 100),
        'learning_rate': np.linspace(0.01, 0.2, 5),
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [2, 5, 10]
    },
    'AdaBoost': {
        'n_estimators': range(50, 301, 50),
        'learning_rate': np.linspace(0.01, 1, 5)
    },
    'Bagging': {
        'n_estimators': range(50, 301, 50),
        'max_samples': np.linspace(0.5, 1.0, 5),
        'max_features': np.linspace(0.5, 1.0, 5),
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
# Each estimator with a random component receives a fixed seed so that the
# randomized search is reproducible end to end; K-Nearest Neighbors has none.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                             random_state=RANDOM_STATE)
}

# Dictionary to store best parameters (model name -> best hyperparameter dict)
best_params = {}

# Tune and evaluate models using RandomizedSearchCV
for name, model in models.items():
    print(f"Tuning {name}...")

    # Draw 20 candidate configurations and score each by 10-fold CV accuracy
    # on the training data; random_state fixes which candidates are drawn.
    random_search = RandomizedSearchCV(model, param_grids[name],
                                       n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)

    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # refit=True (the default) means best_estimator_ is already trained on the
    # complete training set, so a second fit is unnecessary.
    best_model = random_search.best_estimator_

    # Predict on the independent test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (support-weighted averages across classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():  # distinct name: do not shadow the estimator loop variable
    print(f"{model_name}: {params}")

In [None]:
# Bare list expression: running this cell only displays it. The same names are
# assigned to `columns_of_interest` in the Pearson-correlation tuning cells.
['MT1M', 'ITM2A', 'VSNL1', 'BRCA2', 'AFF3', 'LGALS2', 'PCSK2', 'TMEM100', 'CDH3', 'LIFR', 'FAM107A', 'STMN2', 'ABCA8', 'CENPA', 'ADH1B','target']

In [None]:
# Grid Search CV Tuning for Pearson Correlation

# Pearson Correlation

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training data: keep only the genes listed below plus the class label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['MT1M', 'ITM2A', 'VSNL1', 'BRCA2', 'AFF3', 'LGALS2', 'PCSK2', 'TMEM100', 'CDH3', 'LIFR', 'FAM107A', 'STMN2', 'ABCA8', 'CENPA', 'ADH1B','target']
df_train = df_train[columns_of_interest]  # 'target' is already the last column
X_train = df_train.drop(columns='target')
y_train = df_train['target']

# External test data: the first column is dropped by position
# (presumably a sample identifier -- TODO confirm against the CSV).
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.drop(columns='target')
y_test = df_test['target']

# Ensure common features in both train and test sets.
# The shared columns follow the training-column order; building them from a set
# would give an order that changes between Python sessions.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected genes are present in the test dataset.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter grid for each model
# Keys match the keys of the `models` dictionary; each value is passed as the
# `param_grid` argument of GridSearchCV.
param_grids = {
    # Elastic-net gets its own sub-grid because it requires `l1_ratio`; listed
    # without it, every elastic-net candidate fails to fit and the penalty is
    # effectively never searched.
    'Logistic Regression': [
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['saga'],
            'max_iter': [500]
        },
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['elasticnet'],
            'l1_ratio': [0.25, 0.5, 0.75],
            'solver': ['saga'],
            'max_iter': [500]
        }
    ],
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
# All estimators that use randomness are given the same fixed seed so the
# search and the reported test metrics are reproducible between runs.
# K-Nearest Neighbors has no random component and takes no seed.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                             random_state=RANDOM_STATE)
}

# Dictionary to store best parameters (model name -> best hyperparameter dict)
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Exhaustive grid search; candidates are ranked by 10-fold CV accuracy
    # computed on the training data only.
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # best_estimator_ was already refit on the full training set by the search
    # (refit=True is the default), so it is used directly.
    best_model = grid_search.best_estimator_

    # Predict on the independent test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (support-weighted averages across classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():  # distinct name: do not shadow the estimator loop variable
    print(f"{model_name}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 95.04%
Precision: 0.97
Recall: 0.95
F1-Score: 0.96
MCC: 0.73
Confusion Matrix:
[[  9   0]
 [  7 125]]
Classification Report:
              precision    recall  f1-score   support

           0       0.56      1.00      0.72         9
           1       1.00      0.95      0.97       132

    accuracy                           0.95       141
   macro avg       0.78      0.97      0.85       141
weighted avg       0.97      0.95      0.96       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}

K-Nearest Neighbors Testing Accuracy: 53.90%
Precision: 0.94
Recall: 0.54
F1-Score: 0.64
MCC: 0.25
Confusion Matrix:
[[ 9  0]
 [65 67]]
Classification Report:
              pre

# Random Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier

import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training set restricted to the genes listed below plus the label column.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['MT1M', 'ITM2A', 'VSNL1', 'BRCA2', 'AFF3', 'LGALS2', 'PCSK2', 'TMEM100', 'CDH3', 'LIFR', 'FAM107A', 'STMN2', 'ABCA8', 'CENPA', 'ADH1B','target']
df_train = df_train[columns_of_interest]  # label is already the final column
X_train = df_train.drop(columns='target')
y_train = df_train['target']

# Test set: drop the first column by position
# (presumably a sample identifier -- TODO confirm), then move the label last.
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.drop(columns='target')
y_test = df_test['target']

# Ensure common features in both train and test sets.
# Ordered list instead of a set, so the feature order is the same on every run.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected genes are present in the test dataset.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Search space sampled by the randomized search, keyed by the model's display name.
# Random Forest and Extra Trees use the same candidate values.
_forest_space = {
    'n_estimators': range(100, 501, 100),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

param_grids = {}

param_grids['Logistic Regression'] = dict(
    C=np.logspace(-3, 2, 10),
    penalty=['l1', 'l2', 'elasticnet'],
    solver=['saga'],
    max_iter=[500, 1000, 5000],
    l1_ratio=[0.0, 0.25, 0.5, 0.75, 1.0],  # five evenly spaced values from 0 to 1
)

param_grids['K-Nearest Neighbors'] = dict(
    n_neighbors=range(3, 15),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

param_grids['Random Forest'] = dict(_forest_space)

param_grids['Gradient Boosting'] = dict(
    n_estimators=range(100, 501, 100),
    learning_rate=np.linspace(0.01, 0.2, 5),
    max_depth=[3, 5, 7, 10],
    subsample=[0.6, 0.8, 1.0],
    min_samples_split=[2, 5, 10],
)

param_grids['AdaBoost'] = dict(
    n_estimators=range(50, 301, 50),
    learning_rate=np.linspace(0.01, 1, 5),
)

param_grids['Bagging'] = dict(
    n_estimators=range(50, 301, 50),
    max_samples=np.linspace(0.5, 1.0, 5),
    max_features=np.linspace(0.5, 1.0, 5),
    bootstrap=[True, False],
)

param_grids['Extra Trees'] = dict(_forest_space)

param_grids['XGBoost'] = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Define models
# Every estimator that draws random numbers gets the same fixed seed so that
# tuning results and test metrics can be reproduced from a fresh kernel.
# KNeighborsClassifier is left without one because none is passed to it here
# or anywhere else in this notebook.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)}

# Best hyper-parameters found for each model, keyed by model name.
best_params = {}

# Tune every model on the training set with RandomizedSearchCV, then score the
# tuned model on the external test set.
for name, model in models.items():
    print(f"Tuning {name}...")

    # 20 sampled candidates, 10-fold cross-validation, selected by accuracy.
    random_search = RandomizedSearchCV(model, param_grids[name],
                                       n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)

    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # The search uses its default refit=True, so best_estimator_ has already
    # been retrained on the full training set; fitting it a second time would
    # only repeat that work.
    best_model = random_search.best_estimator_

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (weighted averages across both classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
# (`model_name` avoids reusing the `model` variable of the tuning loop.)
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():
    print(f"{model_name}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 5000, 'l1_ratio': 0.5, 'C': np.float64(0.1668100537200059)}

Logistic Regression Testing Accuracy: 96.45%
Precision: 0.97
Recall: 0.96
F1-Score: 0.97
MCC: 0.75
Confusion Matrix:
[[  8   1]
 [  4 128]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.89      0.76         9
           1       0.99      0.97      0.98       132

    accuracy                           0.96       141
   macro avg       0.83      0.93      0.87       141
weighted avg       0.97      0.96      0.97       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'uniform', 'n_neighbors': 4, 'metric': 'euclidean'}

K-Nearest Neighbors Testing Accuracy: 53.19%
Precision: 0.94
Recall: 0.53
F1-Score: 0.64
MCC: 0.24
Confusion Matrix:
[[ 9  0]
 [66 66

# Lasso Regression Tuning

# Grid Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training set: keep only the selected genes plus the class label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['NR3C2', 'BRCA2', 'AFF3', 'LGALS2', 'TMEM100', 'CDH3', 'ABCG2', 'STMN2', 'ABCA8', 'AZGP1', 'TSPAN7', 'ADH1B','target']
df_train = df_train[columns_of_interest]
# Put 'target' last so features and label can be split by position.
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

# External test set: the first column is dropped (presumably a sample
# identifier -- TODO confirm), then the same feature/label layout is applied.
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets.
# The training column order is kept on purpose: iterating a set of strings
# can yield a different order in every interpreter session, which would change
# the feature order (and therefore seeded model results) between runs.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected training features are present in the test set.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter grid for each model, keyed by the model's display name.
# GridSearchCV accepts either one dict or a list of dicts (sub-grids).
param_grids = {
    # Logistic Regression is split into two sub-grids: the elastic-net penalty
    # needs an explicit l1_ratio, otherwise every elastic-net candidate fails to
    # fit and is silently scored as NaN (the warning is suppressed above).
    # l1_ratio values 0 and 1 are left out because they duplicate 'l2' and 'l1'.
    'Logistic Regression': [
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['saga'],
            'max_iter': [500]
        },
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['elasticnet'],
            'l1_ratio': [0.25, 0.5, 0.75],
            'solver': ['saga'],
            'max_iter': [500]
        }
    ],
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
# Every estimator that draws random numbers gets the same fixed seed so that
# tuning results and test metrics can be reproduced from a fresh kernel.
# KNeighborsClassifier is left without one because none is passed to it here
# or anywhere else in this notebook.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
}

# Best hyper-parameters found for each model, keyed by model name.
best_params = {}

# Tune every model on the training set with GridSearchCV, then score the tuned
# model on the external test set.
for name, model in models.items():
    print(f"Tuning {name}...")

    # Exhaustive grid search, 10-fold cross-validation, selected by accuracy.
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # The search uses its default refit=True, so best_estimator_ has already
    # been retrained on the full training set; fitting it a second time would
    # only repeat that work.
    best_model = grid_search.best_estimator_

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (weighted averages across both classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Summary of the selected hyper-parameters.
# (`model_name` avoids reusing the `model` variable of the tuning loop.)
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():
    print(f"{model_name}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 97.87%
Precision: 0.98
Recall: 0.98
F1-Score: 0.98
MCC: 0.81
Confusion Matrix:
[[  7   2]
 [  1 131]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.78      0.82         9
           1       0.98      0.99      0.99       132

    accuracy                           0.98       141
   macro avg       0.93      0.89      0.91       141
weighted avg       0.98      0.98      0.98       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}

K-Nearest Neighbors Testing Accuracy: 73.76%
Precision: 0.95
Recall: 0.74
F1-Score: 0.80
MCC: 0.38
Confusion Matrix:
[[ 9  0]
 [37 95]]
Classification Report:
              prec

# Random Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training set: keep only the selected genes plus the class label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['NR3C2', 'BRCA2', 'AFF3', 'LGALS2', 'TMEM100', 'CDH3', 'ABCG2', 'STMN2', 'ABCA8', 'AZGP1', 'TSPAN7', 'ADH1B','target']
df_train = df_train[columns_of_interest]
# Put 'target' last so features and label can be split by position.
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

# External test set: the first column is dropped (presumably a sample
# identifier -- TODO confirm), then the same feature/label layout is applied.
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets.
# The training column order is kept on purpose: iterating a set of strings
# can yield a different order in every interpreter session, which would change
# the feature order (and therefore seeded model results) between runs.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected training features are present in the test set.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Candidate values sampled by the randomized search, one entry per model name.
param_distributions = {}

param_distributions['Logistic Regression'] = dict(
    C=np.logspace(-2, 2, 10),
    penalty=['l1', 'l2'],
    solver=['saga'],
    max_iter=[500],
)

param_distributions['K-Nearest Neighbors'] = dict(
    n_neighbors=range(3, 10),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

param_distributions['Random Forest'] = dict(
    n_estimators=range(100, 301, 100),
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

param_distributions['Gradient Boosting'] = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    min_samples_split=[2, 5],
)

param_distributions['AdaBoost'] = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1],
)

param_distributions['Bagging'] = dict(
    n_estimators=[50, 100, 200],
    max_samples=[0.5, 0.75, 1.0],
    max_features=[0.5, 0.75, 1.0],
    bootstrap=[True, False],
)

param_distributions['Extra Trees'] = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

param_distributions['XGBoost'] = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Define models
# Every estimator that draws random numbers gets the same fixed seed so that
# tuning results and test metrics can be reproduced from a fresh kernel.
# KNeighborsClassifier is left without one because none is passed to it here
# or anywhere else in this notebook.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
}

# Best hyper-parameters found for each model, keyed by model name.
best_params = {}

# Tune every model on the training set with RandomizedSearchCV, then score the
# tuned model on the external test set.
for name, model in models.items():
    print(f"Tuning {name}...")

    # 20 sampled candidates, 10-fold cross-validation, selected by accuracy.
    # random_state fixes which candidates are drawn so a re-run samples the
    # same hyper-parameter combinations.
    random_search = RandomizedSearchCV(model, param_distributions[name], n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)
    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # The search uses its default refit=True, so best_estimator_ has already
    # been retrained on the full training set; fitting it a second time would
    # only repeat that work.
    best_model = random_search.best_estimator_

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (weighted averages across both classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Summary of the selected hyper-parameters.
# (`model_name` avoids reusing the `model` variable of the tuning loop.)
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():
    print(f"{model_name}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 500, 'C': np.float64(0.01)}

Logistic Regression Testing Accuracy: 97.87%
Precision: 0.98
Recall: 0.98
F1-Score: 0.98
MCC: 0.81
Confusion Matrix:
[[  7   2]
 [  1 131]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.78      0.82         9
           1       0.98      0.99      0.99       132

    accuracy                           0.98       141
   macro avg       0.93      0.89      0.91       141
weighted avg       0.98      0.98      0.98       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'uniform', 'n_neighbors': 3, 'metric': 'euclidean'}

K-Nearest Neighbors Testing Accuracy: 73.76%
Precision: 0.95
Recall: 0.74
F1-Score: 0.80
MCC: 0.38
Confusion Matrix:
[[ 9  0]
 [37 95]]
Classification Report:
      

# mRMR Feature Selection Tuning


# Grid Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)

from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training set: keep only the selected genes plus the class label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['CDH3', 'GNG7', 'GDPD3', 'MMP1', 'ABCA8', 'VSNL1', 'ADH1B', 'AZGP1', 'LGALS2', 'TMEM100', 'DPEP1',
                       'AFF3', 'BRCA2', 'MS4A12', 'CENPA', 'LIFR', 'CEP72', 'ABCG2', 'ABI3BP', 'COL10A1', 'MT1M', 'MAD2L1',
                       'STMN2', 'NFE2L3', 'SLC4A4', 'KRT23', 'PRR7', 'target']
df_train = df_train[columns_of_interest]
# Put 'target' last so features and label can be split by position.
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

# External test set: the first column is dropped (presumably a sample
# identifier -- TODO confirm), then the same feature/label layout is applied.
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets.
# The training column order is kept on purpose: iterating a set of strings
# can yield a different order in every interpreter session, which would change
# the feature order (and therefore seeded model results) between runs.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected training features are present in the test set.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter grid for each model, keyed by the model's display name.
# GridSearchCV accepts either one dict or a list of dicts (sub-grids).
param_grids = {
    # Logistic Regression is split into two sub-grids: the elastic-net penalty
    # needs an explicit l1_ratio, otherwise every elastic-net candidate fails to
    # fit and is silently scored as NaN (the warning is suppressed above).
    # l1_ratio values 0 and 1 are left out because they duplicate 'l2' and 'l1'.
    'Logistic Regression': [
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['saga'],
            'max_iter': [500]
        },
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['elasticnet'],
            'l1_ratio': [0.25, 0.5, 0.75],
            'solver': ['saga'],
            'max_iter': [500]
        }
    ],
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
# Every estimator that draws random numbers gets the same fixed seed so that
# tuning results and test metrics can be reproduced from a fresh kernel.
# KNeighborsClassifier is left without one because none is passed to it here
# or anywhere else in this notebook.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
}

# Best hyper-parameters found for each model, keyed by model name.
best_params = {}

# Tune every model on the training set with GridSearchCV, then score the tuned
# model on the external test set.
for name, model in models.items():
    print(f"Tuning {name}...")

    # Exhaustive grid search, 10-fold cross-validation, selected by accuracy.
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # The search uses its default refit=True, so best_estimator_ has already
    # been retrained on the full training set; fitting it a second time would
    # only repeat that work.
    best_model = grid_search.best_estimator_

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (weighted averages across both classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Summary of the selected hyper-parameters.
# (`model_name` avoids reusing the `model` variable of the tuning loop.)
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():
    print(f"{model_name}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 97.87%
Precision: 0.98
Recall: 0.98
F1-Score: 0.98
MCC: 0.86
Confusion Matrix:
[[  9   0]
 [  3 129]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         9
           1       1.00      0.98      0.99       132

    accuracy                           0.98       141
   macro avg       0.88      0.99      0.92       141
weighted avg       0.98      0.98      0.98       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'uniform'}

K-Nearest Neighbors Testing Accuracy: 98.58%
Precision: 0.99
Recall: 0.99
F1-Score: 0.99
MCC: 0.88
Confusion Matrix:
[[  8   1]
 [  1 131]]
Classification Report:
              p

# Random Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training set: keep only the selected genes plus the class label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['CDH3', 'GNG7', 'GDPD3', 'MMP1', 'ABCA8', 'VSNL1', 'ADH1B', 'AZGP1', 'LGALS2', 'TMEM100', 'DPEP1',
                       'AFF3', 'BRCA2', 'MS4A12', 'CENPA', 'LIFR', 'CEP72', 'ABCG2', 'ABI3BP', 'COL10A1', 'MT1M', 'MAD2L1',
                       'STMN2', 'NFE2L3', 'SLC4A4', 'KRT23', 'PRR7', 'target']
df_train = df_train[columns_of_interest]
# Put 'target' last so features and label can be split by position.
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

# External test set: the first column is dropped (presumably a sample
# identifier -- TODO confirm), then the same feature/label layout is applied.
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets.
# The training column order is kept on purpose: iterating a set of strings
# can yield a different order in every interpreter session, which would change
# the feature order (and therefore seeded model results) between runs.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected training features are present in the test set.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter distributions for each model, keyed by the model's display name.
# Only models that are actually instantiated in `models` are listed; the former
# 'CatBoost' entry was removed because no CatBoost estimator is imported or
# defined in this cell, so that search space could never be used.
param_distributions = {
    'Logistic Regression': {
        'C': np.logspace(-2, 2, 10),
        'penalty': ['l1', 'l2'],
        'solver': ['saga'],
        'max_iter': [500]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': range(3, 10),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': range(100, 301, 100),
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
# Every estimator that draws random numbers gets the same fixed seed so that
# tuning results and test metrics can be reproduced from a fresh kernel.
# KNeighborsClassifier is left without one because none is passed to it here
# or anywhere else in this notebook.
RANDOM_STATE = 7
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Bagging': BaggingClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
}

# Best hyper-parameters found for each model, keyed by model name.
best_params = {}

# Tune every model on the training set with RandomizedSearchCV, then score the
# tuned model on the external test set.
for name, model in models.items():
    print(f"Tuning {name}...")

    # 20 sampled candidates, 10-fold cross-validation, selected by accuracy.
    # random_state fixes which candidates are drawn so a re-run samples the
    # same hyper-parameter combinations.
    random_search = RandomizedSearchCV(model, param_distributions[name], n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)
    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # The search uses its default refit=True, so best_estimator_ has already
    # been retrained on the full training set; fitting it a second time would
    # only repeat that work.
    best_model = random_search.best_estimator_

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (weighted averages across both classes)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Summary of the selected hyper-parameters.
# (`model_name` avoids reusing the `model` variable of the tuning loop.)
print("\nFinal Best Parameters for All Models:")
for model_name, params in best_params.items():
    print(f"{model_name}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 500, 'C': np.float64(0.027825594022071243)}

Logistic Regression Testing Accuracy: 97.87%
Precision: 0.98
Recall: 0.98
F1-Score: 0.98
MCC: 0.86
Confusion Matrix:
[[  9   0]
 [  3 129]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         9
           1       1.00      0.98      0.99       132

    accuracy                           0.98       141
   macro avg       0.88      0.99      0.92       141
weighted avg       0.98      0.98      0.98       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'distance', 'n_neighbors': 4, 'metric': 'manhattan'}

K-Nearest Neighbors Testing Accuracy: 98.58%
Precision: 0.99
Recall: 0.99
F1-Score: 0.99
MCC: 0.88
Confusion Matrix:
[[  8   1]
 [  1 131]]
Classifi

In [None]:
# Grid Search CV for 15 Dataset

# Elastic Net Feature Selection Tuning

# Grid Search CV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)

from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
# Training set: keep only the selected genes plus the class label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = [ 'NEBL.1', 'CDH3', 'LGI1', 'BRCA2', 'PTGDS', 'ASPA', 'ABCG2', 'SST', 'TNFRSF11B', 'SLC4A4', 'TAF1A', 'SPINK2', 'PTGDR', 'AFF3', 'DHRS9', 'TG', 'MMP12', 'ADH1B', 'IL23A', 'ACSL6', 'TSPAN7', 'MMP1', 'PDE9A', 'SPINK5', 'CDKN2A','target']
df_train = df_train[columns_of_interest]
# Put 'target' last so features and label can be split by position.
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

# External test set: the first column is dropped (presumably a sample
# identifier -- TODO confirm), then the same feature/label layout is applied.
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets.
# The training column order is kept on purpose: iterating a set of strings
# can yield a different order in every interpreter session, which would change
# the feature order (and therefore seeded model results) between runs.
common_cols = [col for col in X_train.columns if col in X_test.columns]
if not common_cols:
    raise ValueError("None of the selected training features are present in the test set.")
X_train_common = X_train[common_cols]
X_test_common = X_test[common_cols]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },

    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform GridSearchCV
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # Train model with best parameters
    best_model = grid_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 98.58%
Precision: 0.99
Recall: 0.99
F1-Score: 0.99
MCC: 0.88
Confusion Matrix:
[[  8   1]
 [  1 131]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       0.99      0.99      0.99       132

    accuracy                           0.99       141
   macro avg       0.94      0.94      0.94       141
weighted avg       0.99      0.99      0.99       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}

K-Nearest Neighbors Testing Accuracy: 98.58%
Precision: 0.99
Recall: 0.99
F1-Score: 0.99
MCC: 0.90
Confusion Matrix:
[[  9   0]
 [  2 130]]
Classification Report:
              

# Random Search CV

In [None]:
!pip install xgboost



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier



from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Fixed seed so both the sampled hyperparameter candidates and the stochastic
# estimators are identical on every run.
SEED = 42

# Load datasets
# Training data: restrict to the feature panel used in this section plus the label.
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = [
    'NEBL.1', 'CDH3', 'LGI1', 'BRCA2', 'PTGDS', 'ASPA', 'ABCG2', 'SST',
    'TNFRSF11B', 'SLC4A4', 'TAF1A', 'SPINK2', 'PTGDR', 'AFF3', 'DHRS9', 'TG',
    'MMP12', 'ADH1B', 'IL23A', 'ACSL6', 'TSPAN7', 'MMP1', 'PDE9A', 'SPINK5',
    'CDKN2A', 'target'
]
df_train = df_train[columns_of_interest]
# Move 'target' to the last position so features/label can be split positionally.
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

# External test data: the first column is dropped
# (presumably a sample identifier -- TODO confirm against the CSV).
df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets.
# A set has no stable iteration order across interpreter sessions, so the
# column order is taken from the training frame to keep runs reproducible.
common_cols = set(X_train.columns).intersection(X_test.columns)
feature_order = [col for col in X_train.columns if col in common_cols]
missing_in_test = [col for col in X_train.columns if col not in common_cols]
if missing_in_test:
    print(f"Features missing from the test set (dropped): {missing_in_test}")
if not feature_order:
    raise ValueError("No common features between the training and test sets.")
X_train_common = X_train[feature_order]
X_test_common = X_test[feature_order]

# Hyperparameter distributions for each model
param_distributions = {
    'Logistic Regression': {
        'C': np.logspace(-2, 2, 10),
        'penalty': ['l1', 'l2'],
        'solver': ['saga'],
        'max_iter': [500]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': range(3, 10),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': range(100, 301, 100),
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models (seeded wherever the estimator accepts a random_state;
# KNeighborsClassifier is deterministic and has none).
models = {
    'Logistic Regression': LogisticRegression(random_state=SEED),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=SEED),
    'AdaBoost': AdaBoostClassifier(random_state=SEED),
    'Bagging': BaggingClassifier(random_state=SEED),
    'Extra Trees': ExtraTreesClassifier(random_state=SEED),
    # use_label_encoder is deprecated in XGBoost and no longer needed.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=SEED)
}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Sample up to 20 candidates per model and score each with 10-fold CV.
    # random_state pins which candidates are drawn, so reruns compare like with like.
    random_search = RandomizedSearchCV(
        model,
        param_distributions[name],
        n_iter=20,
        cv=10,
        n_jobs=-1,
        scoring='accuracy',
        random_state=SEED,
    )
    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # With the default refit=True, best_estimator_ has already been retrained on
    # the full training set, so no additional fit is required here.
    best_model = random_search.best_estimator_

    # Predict on the held-out external test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics (weighted averages account for the class imbalance in support)
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 500, 'C': np.float64(0.01)}

Logistic Regression Testing Accuracy: 98.58%
Precision: 0.99
Recall: 0.99
F1-Score: 0.99
MCC: 0.88
Confusion Matrix:
[[  8   1]
 [  1 131]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       0.99      0.99      0.99       132

    accuracy                           0.99       141
   macro avg       0.94      0.94      0.94       141
weighted avg       0.99      0.99      0.99       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'distance', 'n_neighbors': 6, 'metric': 'manhattan'}

K-Nearest Neighbors Testing Accuracy: 97.87%
Precision: 0.98
Recall: 0.98
F1-Score: 0.98
MCC: 0.81
Confusion Matrix:
[[  7   2]
 [  1 131]]
Classification Report:
 

# Grid Search CV for 15 Dataset

# Random Search CV for 15 Dataset

# Grid Search CV for 82 Dataset

# Random Search CV for 82 Dataset