# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Define correlation function
def correlation(dataset, threshold):
    """Return the names of columns that correlate strongly with an earlier column.

    Args:
        dataset: DataFrame whose pairwise column correlations are computed
            with ``DataFrame.corr()``.
        threshold: A column is flagged when the absolute correlation with any
            column positioned before it is strictly greater than this value.

    Returns:
        A set with the names of the flagged columns.
    """
    abs_corr = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(abs_corr.columns):
        # Only entries left of the diagonal are inspected, so for a correlated
        # pair the later column is the one that gets flagged.
        if (abs_corr.iloc[position, :position] > threshold).any():
            flagged.add(name)
    return flagged

def _stack_layer(classifiers, X_fit, y_fit, X_apply, inner_cv=5):
    """Fit one stacking layer and return its positive-class probabilities.

    The features produced for the training rows are out-of-fold predictions
    (via ``cross_val_predict``), so the next layer is never trained on a
    probability that came from a model which had already seen that row.

    Args:
        classifiers: Mapping of display name -> unfitted estimator that
            implements ``predict_proba``. Each estimator is fitted in place
            on ``(X_fit, y_fit)``.
        X_fit: Training features for this layer.
        y_fit: Training labels aligned with ``X_fit``.
        X_apply: Held-out features to push through the fitted estimators.
        inner_cv: ``cv`` argument forwarded to ``cross_val_predict``.

    Returns:
        Tuple ``(fit_features, apply_features)``; each array has one column
        per classifier holding the probability of the second class.
    """
    fit_columns, apply_columns = [], []
    for clf in classifiers.values():
        # cross_val_predict works on clones, so ``clf`` is still unfitted here.
        out_of_fold = cross_val_predict(clf, X_fit, y_fit, cv=inner_cv, method='predict_proba')
        fit_columns.append(out_of_fold[:, 1])
        clf.fit(X_fit, y_fit)
        apply_columns.append(clf.predict_proba(X_apply)[:, 1])
    return np.column_stack(fit_columns), np.column_stack(apply_columns)


# Load the dataset
print("Loading dataset...")
dataset = pd.read_csv('/content/mydrive/MyDrive/PCOS_extended_dataset.csv')

# Replace incorrect value in 'II beta-HCG(mIU/mL)'
print("Replacing incorrect values...")
hcg_column = 'II    beta-HCG(mIU/mL)'
# Fixing the typo alone leaves the column as strings (if that is how it was
# loaded); convert it so correlation, resampling and scaling see real numbers.
# Any other unparsable entry raises here instead of failing later.
dataset[hcg_column] = pd.to_numeric(dataset[hcg_column].replace('1.99.', '1.99'))

X = dataset.drop('PCOS (Y/N)', axis=1)
y = dataset['PCOS (Y/N)']

# Handle missing values (full-data copy; used only for the summary and the
# exploratory ranking below, never inside cross-validation)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Print the shape and class distribution of the original data
print("Original data shape:")
print("X_imputed shape:", X_imputed.shape)
print("y value counts:")
print(y.value_counts())

# Calculate mutual information scores on the full data (exploratory only;
# the ranking used for evaluation is recomputed inside every training fold)
mutual_info_scores = mutual_info_classif(X_imputed, y, random_state=42)

# Keep the k features with the highest mutual information
k = 20  # Number of top features to select
top_indices = np.argsort(mutual_info_scores)[::-1][:k]
selected_features = X.iloc[:, top_indices]

# Define feature selection methods (only the keys drive the loop below)
feature_selection_methods = {
    'Pearson Correlation': X.corr(),
    'Mutual Information': selected_features
}

plt.figure(figsize=(10, 5))

for method_name in feature_selection_methods:
    print(f"Using {method_name} for feature selection...")

    # Initialize KFold and lists for evaluation metrics
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    accuracies, sensitivities, specificities, f1_scores, roc_auc_scores = [], [], [], [], []

    # Perform 10-fold cross-validation
    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        print(f"\nFold {fold+1}:")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Handle missing values first: the resampler cannot work with NaNs.
        # The imputer is fitted on the training fold only.
        fold_imputer = SimpleImputer(strategy='mean')
        X_train_imputed = pd.DataFrame(fold_imputer.fit_transform(X_train), columns=X.columns)
        X_test_imputed = pd.DataFrame(fold_imputer.transform(X_test), columns=X.columns)

        # Apply SMOTETomek to the training fold only
        smote = SMOTETomek(random_state=0)
        X_resampled, y_resampled = smote.fit_resample(X_train_imputed, y_train)
        # Normalise the output type so column names are available either way.
        X_train_smote = pd.DataFrame(np.asarray(X_resampled), columns=X.columns)
        y_train_smote = np.asarray(y_resampled)

        if method_name == 'Pearson Correlation':
            # Pearson correlation-based feature selection
            corr_features = correlation(X_train_smote, 0.80)
            kept_columns = [col for col in X_train_smote.columns if col not in corr_features]
        else:
            # Mutual information-based feature selection, ranked on the
            # training fold so the test fold cannot influence the choice
            fold_scores = mutual_info_classif(X_train_smote, y_train_smote, random_state=42)
            kept_columns = list(X_train_smote.columns[np.argsort(fold_scores)[::-1][:k]])

        X_train_selected = X_train_smote[kept_columns]
        X_test_selected = X_test_imputed[kept_columns]

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        X_test_scaled = scaler.transform(X_test_selected)

        # First layer classifiers
        classifiers_first_layer = {
            'Naive Bayes': GaussianNB(),
            'Random Forest': RandomForestClassifier(random_state=42)
        }
        X_train_combined_first_layer, X_test_combined_first_layer = _stack_layer(
            classifiers_first_layer, X_train_scaled, y_train_smote, X_test_scaled)

        # Second layer classifiers (trained on training-fold outputs only)
        classifiers_second_layer = {'Logistic Regression': LogisticRegression()}
        X_train_combined_second_layer, X_test_combined_second_layer = _stack_layer(
            classifiers_second_layer, X_train_combined_first_layer, y_train_smote,
            X_test_combined_first_layer)

        # Train meta layer classifier
        mlp_parameters = {
            'hidden_layer_sizes': [(100,), (50, 50), (50,)],
            'alpha': [0.0001, 0.01, 0.001],
            'learning_rate_init': [0.001, 0.01, 0.1],
        }
        meta_classifier = GridSearchCV(
            MLPClassifier(max_iter=1000, early_stopping=True, solver='adam',
                          learning_rate='adaptive', random_state=42),
            mlp_parameters, cv=5)
        # The test fold is never used for fitting or model selection.
        meta_classifier.fit(X_train_combined_second_layer, y_train_smote)
        best_mlp = meta_classifier.best_estimator_

        # Make predictions using stacked ensemble model
        y_pred_stacked_ensemble = best_mlp.predict(X_test_combined_second_layer)
        y_score_stacked_ensemble = best_mlp.predict_proba(X_test_combined_second_layer)[:, 1]

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred_stacked_ensemble)
        # Passing the labels keeps the matrix 2x2 even if a class is absent
        # from this fold's truth or predictions.
        tn, fp, fn, tp = confusion_matrix(
            y_test, y_pred_stacked_ensemble, labels=best_mlp.classes_).ravel()
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)
        f1 = f1_score(y_test, y_pred_stacked_ensemble)
        # ROC AUC is a ranking metric, so it is scored on probabilities.
        roc_auc = roc_auc_score(y_test, y_score_stacked_ensemble)

        # Append metrics
        accuracies.append(accuracy)
        sensitivities.append(sensitivity)
        specificities.append(specificity)
        f1_scores.append(f1)
        roc_auc_scores.append(roc_auc)

    # Calculate and print average metrics
    avg_accuracy = np.mean(accuracies)
    avg_sensitivity = np.mean(sensitivities)
    avg_specificity = np.mean(specificities)
    avg_f1_score = np.mean(f1_scores)
    avg_roc_auc_score = np.mean(roc_auc_scores)

    print(f"\nFinal Average Results for {method_name}:")
    print(f"Accuracy: {avg_accuracy}, Sensitivity: {avg_sensitivity}, Specificity: {avg_specificity}")
    print(f"F1-score: {avg_f1_score}, ROC AUC: {avg_roc_auc_score}")

    # Plot loss curve (best_mlp is the meta model from the last fold)
    plt.plot(best_mlp.loss_curve_, label=f'{method_name}')

plt.title('Train Loss Comparison')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
