In [1]:
!pip install xgboost



# Now Tuning

# Grid Serach CV GSE106582

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['PTRF', 'FLNA', 'RHOB', 'TNS1', 'ACTN1', 'TGFB1I1', 'RHOQ','target']
df_train = df_train[columns_of_interest]
#df_train = df_train.iloc[:, 1:]
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv("GSE106582_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets
common_cols = set(X_train.columns).intersection(X_test.columns)
X_train_common = X_train[list(common_cols)]
X_test_common = X_test[list(common_cols)]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
     'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform GridSearchCV
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # Train model with best parameters
    best_model = grid_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 60.31%
Precision: 0.36
Recall: 0.60
F1-Score: 0.45
MCC: 0.00
Confusion Matrix:
[[117   0]
 [ 77   0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.60      1.00      0.75       117
           1       0.00      0.00      0.00        77

    accuracy                           0.60       194
   macro avg       0.30      0.50      0.38       194
weighted avg       0.36      0.60      0.45       194

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

K-Nearest Neighbors Testing Accuracy: 47.42%
Precision: 0.50
Recall: 0.47
F1-Score: 0.48
MCC: -0.04
Confusion Matrix:
[[53 64]
 [38 39]]
Classification Report:
              pr

# Random Search CV GSE106582

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['PTRF', 'FLNA', 'RHOB', 'TNS1', 'ACTN1', 'TGFB1I1', 'RHOQ','target']
df_train = df_train[columns_of_interest]
#df_train = df_train.iloc[:, 1:]
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv("GSE106582_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets
common_cols = set(X_train.columns).intersection(X_test.columns)
X_train_common = X_train[list(common_cols)]
X_test_common = X_test[list(common_cols)]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': np.logspace(-3, 2, 10),
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500, 1000, 5000],
        'l1_ratio': np.linspace(0, 1, 5).tolist()
    },
    'K-Nearest Neighbors': {
        'n_neighbors': range(3, 15),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': range(100, 501, 100),
        'learning_rate': np.linspace(0.01, 0.2, 5),
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [2, 5, 10]
    },
    'AdaBoost': {
        'n_estimators': range(50, 301, 50),
        'learning_rate': np.linspace(0.01, 1, 5)
    },
    'Bagging': {
        'n_estimators': range(50, 301, 50),
        'max_samples': np.linspace(0.5, 1.0, 5),
        'max_features': np.linspace(0.5, 1.0, 5),
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')

}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models using RandomizedSearchCV
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(model, param_grids[name],
                                       n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)

    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # Train model with best parameters
    best_model = random_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 1000, 'l1_ratio': 0.25, 'C': np.float64(7.742636826811277)}

Logistic Regression Testing Accuracy: 60.31%
Precision: 0.36
Recall: 0.60
F1-Score: 0.45
MCC: 0.00
Confusion Matrix:
[[117   0]
 [ 77   0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.60      1.00      0.75       117
           1       0.00      0.00      0.00        77

    accuracy                           0.60       194
   macro avg       0.30      0.50      0.38       194
weighted avg       0.36      0.60      0.45       194

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'uniform', 'n_neighbors': 3, 'metric': 'manhattan'}

K-Nearest Neighbors Testing Accuracy: 42.78%
Precision: 0.46
Recall: 0.43
F1-Score: 0.43
MCC: -0.12
Confusion Matrix:
[[46 71]
 [40 3

# GSE21815 Grid Search CV

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['PTRF', 'FLNA', 'RHOB', 'TNS1', 'ACTN1', 'TGFB1I1', 'RHOQ','target']
df_train = df_train[columns_of_interest]
#df_train = df_train.iloc[:, 1:]
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets
common_cols = set(X_train.columns).intersection(X_test.columns)
X_train_common = X_train[list(common_cols)]
X_test_common = X_test[list(common_cols)]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
     'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform GridSearchCV
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # Train model with best parameters
    best_model = grid_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 93.62%
Precision: 0.88
Recall: 0.94
F1-Score: 0.91
MCC: 0.00
Confusion Matrix:
[[  0   9]
 [  0 132]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.94      1.00      0.97       132

    accuracy                           0.94       141
   macro avg       0.47      0.50      0.48       141
weighted avg       0.88      0.94      0.91       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

K-Nearest Neighbors Testing Accuracy: 92.91%
Precision: 0.90
Recall: 0.93
F1-Score: 0.91
MCC: 0.16
Confusion Matrix:
[[  1   8]
 [  2 130]]
Classification Report:
             

# GSE21815 Random SearchCV

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['PTRF', 'FLNA', 'RHOB', 'TNS1', 'ACTN1', 'TGFB1I1', 'RHOQ','target']
df_train = df_train[columns_of_interest]
#df_train = df_train.iloc[:, 1:]
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv("GSE21815_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets
common_cols = set(X_train.columns).intersection(X_test.columns)
X_train_common = X_train[list(common_cols)]
X_test_common = X_test[list(common_cols)]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': np.logspace(-3, 2, 10),
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500, 1000, 5000],
        'l1_ratio': np.linspace(0, 1, 5).tolist()
    },
    'K-Nearest Neighbors': {
        'n_neighbors': range(3, 15),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': range(100, 501, 100),
        'learning_rate': np.linspace(0.01, 0.2, 5),
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [2, 5, 10]
    },
    'AdaBoost': {
        'n_estimators': range(50, 301, 50),
        'learning_rate': np.linspace(0.01, 1, 5)
    },
    'Bagging': {
        'n_estimators': range(50, 301, 50),
        'max_samples': np.linspace(0.5, 1.0, 5),
        'max_features': np.linspace(0.5, 1.0, 5),
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')

}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models using RandomizedSearchCV
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(model, param_grids[name],
                                       n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)

    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # Train model with best parameters
    best_model = random_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 1000, 'l1_ratio': 0.25, 'C': np.float64(7.742636826811277)}

Logistic Regression Testing Accuracy: 93.62%
Precision: 0.88
Recall: 0.94
F1-Score: 0.91
MCC: 0.00
Confusion Matrix:
[[  0   9]
 [  0 132]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.94      1.00      0.97       132

    accuracy                           0.94       141
   macro avg       0.47      0.50      0.48       141
weighted avg       0.88      0.94      0.91       141

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'uniform', 'n_neighbors': 3, 'metric': 'manhattan'}

K-Nearest Neighbors Testing Accuracy: 89.36%
Precision: 0.89
Recall: 0.89
F1-Score: 0.89
MCC: 0.06
Confusion Matrix:
[[  1   8]
 [  7

# GSE44076 Grid Serach CV

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['PTRF', 'FLNA', 'RHOB', 'TNS1', 'ACTN1', 'TGFB1I1', 'RHOQ','target']
df_train = df_train[columns_of_interest]
#df_train = df_train.iloc[:, 1:]
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv("GSE44076_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets
common_cols = set(X_train.columns).intersection(X_test.columns)
X_train_common = X_train[list(common_cols)]
X_test_common = X_test[list(common_cols)]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
     'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform GridSearchCV
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # Train model with best parameters
    best_model = grid_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 37.16%
Precision: 0.78
Recall: 0.37
F1-Score: 0.24
MCC: 0.13
Confusion Matrix:
[[50  0]
 [93  5]]
Classification Report:
              precision    recall  f1-score   support

           0       0.35      1.00      0.52        50
           1       1.00      0.05      0.10        98

    accuracy                           0.37       148
   macro avg       0.67      0.53      0.31       148
weighted avg       0.78      0.37      0.24       148

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

K-Nearest Neighbors Testing Accuracy: 62.84%
Precision: 0.65
Recall: 0.63
F1-Score: 0.64
MCC: 0.22
Confusion Matrix:
[[29 21]
 [34 64]]
Classification Report:
              precisi

# GSE44076 Random Search CV

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['PTRF', 'FLNA', 'RHOB', 'TNS1', 'ACTN1', 'TGFB1I1', 'RHOQ','target']
df_train = df_train[columns_of_interest]
#df_train = df_train.iloc[:, 1:]
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv("GSE44076_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets
common_cols = set(X_train.columns).intersection(X_test.columns)
X_train_common = X_train[list(common_cols)]
X_test_common = X_test[list(common_cols)]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': np.logspace(-3, 2, 10),
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500, 1000, 5000],
        'l1_ratio': np.linspace(0, 1, 5).tolist()
    },
    'K-Nearest Neighbors': {
        'n_neighbors': range(3, 15),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': range(100, 501, 100),
        'learning_rate': np.linspace(0.01, 0.2, 5),
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [2, 5, 10]
    },
    'AdaBoost': {
        'n_estimators': range(50, 301, 50),
        'learning_rate': np.linspace(0.01, 1, 5)
    },
    'Bagging': {
        'n_estimators': range(50, 301, 50),
        'max_samples': np.linspace(0.5, 1.0, 5),
        'max_features': np.linspace(0.5, 1.0, 5),
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')

}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models using RandomizedSearchCV
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(model, param_grids[name],
                                       n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)

    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # Train model with best parameters
    best_model = random_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 1000, 'l1_ratio': 0.25, 'C': np.float64(7.742636826811277)}

Logistic Regression Testing Accuracy: 33.78%
Precision: 0.11
Recall: 0.34
F1-Score: 0.17
MCC: 0.00
Confusion Matrix:
[[50  0]
 [98  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.34      1.00      0.51        50
           1       0.00      0.00      0.00        98

    accuracy                           0.34       148
   macro avg       0.17      0.50      0.25       148
weighted avg       0.11      0.34      0.17       148

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'uniform', 'n_neighbors': 3, 'metric': 'manhattan'}

K-Nearest Neighbors Testing Accuracy: 78.38%
Precision: 0.78
Recall: 0.78
F1-Score: 0.78
MCC: 0.50
Confusion Matrix:
[[28 22]
 [10 88]]
C

# GSE8671 Grid Search CV

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['PTRF', 'FLNA', 'RHOB', 'TNS1', 'ACTN1', 'TGFB1I1', 'RHOQ','target']
df_train = df_train[columns_of_interest]
#df_train = df_train.iloc[:, 1:]
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv("GSE8671_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets
common_cols = set(X_train.columns).intersection(X_test.columns)
X_train_common = X_train[list(common_cols)]
X_test_common = X_test[list(common_cols)]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'Bagging': {
        'n_estimators': [50, 100, 200],
        'max_samples': [0.5, 0.75, 1.0],
        'max_features': [0.5, 0.75, 1.0],
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
     'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform GridSearchCV
    grid_search = GridSearchCV(model, param_grids[name], cv=10, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    # Train model with best parameters
    best_model = grid_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}

Logistic Regression Testing Accuracy: 53.12%
Precision: 0.76
Recall: 0.53
F1-Score: 0.40
MCC: 0.18
Confusion Matrix:
[[ 2 30]
 [ 0 32]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.06      0.12        32
           1       0.52      1.00      0.68        32

    accuracy                           0.53        64
   macro avg       0.76      0.53      0.40        64
weighted avg       0.76      0.53      0.40        64

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

K-Nearest Neighbors Testing Accuracy: 96.88%
Precision: 0.97
Recall: 0.97
F1-Score: 0.97
MCC: 0.94
Confusion Matrix:
[[31  1]
 [ 1 31]]
Classification Report:
              precisi

# GSE8671 Random Search CV

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
)
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore")

# Load datasets
df_train = pd.read_csv("ML_DEG_dataset.csv")
columns_of_interest = ['PTRF', 'FLNA', 'RHOB', 'TNS1', 'ACTN1', 'TGFB1I1', 'RHOQ','target']
df_train = df_train[columns_of_interest]
#df_train = df_train.iloc[:, 1:]
df_train = df_train[[col for col in df_train.columns if col != 'target'] + ['target']]
X_train = df_train.iloc[:, 0:-1]
y_train = df_train.iloc[:, -1]

df_test = pd.read_csv("GSE8671_gene_expression_renamed.csv")
df_test = df_test.iloc[:, 1:]
df_test = df_test[[col for col in df_test.columns if col != 'target'] + ['target']]
X_test = df_test.iloc[:, 0:-1]
y_test = df_test.iloc[:, -1]

# Ensure common features in both train and test sets
common_cols = set(X_train.columns).intersection(X_test.columns)
X_train_common = X_train[list(common_cols)]
X_test_common = X_test[list(common_cols)]

# Hyperparameter grid for each model
param_grids = {
    'Logistic Regression': {
        'C': np.logspace(-3, 2, 10),
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'max_iter': [500, 1000, 5000],
        'l1_ratio': np.linspace(0, 1, 5).tolist()
    },
    'K-Nearest Neighbors': {
        'n_neighbors': range(3, 15),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Random Forest': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'Gradient Boosting': {
        'n_estimators': range(100, 501, 100),
        'learning_rate': np.linspace(0.01, 0.2, 5),
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [2, 5, 10]
    },
    'AdaBoost': {
        'n_estimators': range(50, 301, 50),
        'learning_rate': np.linspace(0.01, 1, 5)
    },
    'Bagging': {
        'n_estimators': range(50, 301, 50),
        'max_samples': np.linspace(0.5, 1.0, 5),
        'max_features': np.linspace(0.5, 1.0, 5),
        'bootstrap': [True, False]
    },
    'Extra Trees': {
        'n_estimators': range(100, 501, 100),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')

}

# Dictionary to store best parameters
best_params = {}

# Tune and evaluate models using RandomizedSearchCV
for name, model in models.items():
    print(f"Tuning {name}...")

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(model, param_grids[name],
                                       n_iter=20, cv=10,
                                       n_jobs=-1, scoring='accuracy', random_state=7)

    random_search.fit(X_train_common, y_train)

    # Store best parameters
    best_params[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}\n")

    # Train model with best parameters
    best_model = random_search.best_estimator_
    best_model.fit(X_train_common, y_train)

    # Predict on test set
    predictions = best_model.predict(X_test_common)

    # Compute metrics
    test_accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, predictions)

    print(f"{name} Testing Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"MCC: {mcc:.2f}")

    # Print confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("-" * 60)

# Print final best parameters
print("\nFinal Best Parameters for All Models:")
for model, params in best_params.items():
    print(f"{model}: {params}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 1000, 'l1_ratio': 0.25, 'C': np.float64(7.742636826811277)}

Logistic Regression Testing Accuracy: 54.69%
Precision: 0.76
Recall: 0.55
F1-Score: 0.43
MCC: 0.22
Confusion Matrix:
[[ 3 29]
 [ 0 32]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.09      0.17        32
           1       0.52      1.00      0.69        32

    accuracy                           0.55        64
   macro avg       0.76      0.55      0.43        64
weighted avg       0.76      0.55      0.43        64

------------------------------------------------------------
Tuning K-Nearest Neighbors...
Best parameters for K-Nearest Neighbors: {'weights': 'uniform', 'n_neighbors': 3, 'metric': 'manhattan'}

K-Nearest Neighbors Testing Accuracy: 76.56%
Precision: 0.82
Recall: 0.77
F1-Score: 0.76
MCC: 0.58
Confusion Matrix:
[[31  1]
 [14 18]]
C