In [None]:
#!pip install tqdm > /dev/null 2>&1
#!pip install ipywidgets > /dev/null 2>&1
#!gdown -V > /dev/null 2>&1
#!gdown --folder url -O /content/data > /dev/null 2>&1

# Import necessary libraries and functions
import os
import shutil
import warnings
from time import time
import zipfile

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
from scipy.stats import norm

from sklearn.model_selection import (StratifiedKFold, cross_val_score, GridSearchCV,
                                     train_test_split)
from sklearn.metrics import (balanced_accuracy_score, classification_report,
                             matthews_corrcoef, confusion_matrix, mean_squared_error,
                             roc_curve, auc, roc_auc_score, accuracy_score,
                             f1_score, precision_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier

from mlxtend.plotting import plot_decision_regions


# Copy the zipped dataset from the source path (presumably a mounted Google
# Drive folder -- confirm) to /content so it can be extracted locally.
source_file_path = '/content/drive/MyDrive/PhD/1-DEDProject/8-ifocus-dataset/find biomarker in the metabolomics dataset/classification/datasets/data.zip'
destination_file_path = '/content/data.zip'
shutil.copy(source_file_path, destination_file_path)

# Unzip the copied dataset archive
zip_file_path = '/content/data.zip'  # same file as destination_file_path
extract_to_path = '/content/data/'

# Extract every member of the archive into extract_to_path
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

In [None]:
# Suppress all warnings for the rest of the session.
# NOTE(review): this is a global filter, so deprecation warnings from the
# imported libraries are hidden as well -- confirm that is intended.
warnings.filterwarnings('ignore')

############################################################################################# Plots
# Function to plot histograms for different performance metrics
def plot_histograms(auc_list, test_acc_list, mcc_list, bal_acc_list, model_name, method_name):
    """Save a 2x2 grid of histograms, one panel per performance metric.

    Args:
        auc_list: Values plotted in the "AUC" panel.
        test_acc_list: Values plotted in the "Test Accuracy" panel.
        mcc_list: Values plotted in the "MCC" panel.
        bal_acc_list: Values plotted in the "Balanced Accuracy" panel.
        model_name: First part of the output filename.
        method_name: Second part of the output filename.

    Returns:
        None. The figure is written to
        ``/content/report/plots/{model_name}_{method_name}.png`` at 300 dpi.
    """
    panels = (
        ("AUC", auc_list),
        ("Test Accuracy", test_acc_list),
        ("MCC", mcc_list),
        ("Balanced Accuracy", bal_acc_list),
    )

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))  # 2 rows, 2 columns
    try:
        # axes.flat walks the grid row by row, matching the panel order above
        for ax, (title, values) in zip(axes.flat, panels):
            ax.hist(values, bins=20, edgecolor='black', alpha=0.7)
            ax.set_title(title)
            ax.set_xlabel('Value')
            ax.set_ylabel('Frequency')

        fig.tight_layout()

        # Constructing the filename and saving the figure
        directory = "/content/report/plots"
        # savefig does not create missing parent directories, so make sure
        # the report folder exists before writing.
        os.makedirs(directory, exist_ok=True)
        filename = os.path.join(directory, f"{model_name}_{method_name}.png")

        fig.savefig(filename, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed
        plt.close(fig)

# Function to plot AUC with error bars for different models and methods
def plot_auc_with_error_bars(results_df):
    """Save a bar chart of mean AUC with standard-deviation error bars.

    Args:
        results_df: DataFrame with the columns "Model",
            "Feature Reduction Method", "AUC Mean" and "AUC Std". Each row
            becomes one bar labelled "<Model>-<Feature Reduction Method>".

    Returns:
        None. The figure is written to
        ``/content/report/auc_barplot_with_error_bars.png``.
    """
    # Extracting relevant data for plotting
    labels = results_df["Model"] + "-" + results_df["Feature Reduction Method"]
    auc_means = results_df["AUC Mean"].values
    auc_stds = results_df["AUC Std"].values
    positions = list(range(len(labels)))

    output_path = "/content/report/auc_barplot_with_error_bars.png"

    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        ax.bar(positions, auc_means, yerr=auc_stds, align='center', alpha=0.7, ecolor='black', capsize=10)
        ax.set_ylabel('AUC')
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, rotation=45, ha="right")

        # Print each mean just above the top of its bar
        for i, v in enumerate(auc_means):
            ax.text(i, v + 0.01, f"{v:.2f}", ha='center', va='bottom', fontsize=8, color='black')

        fig.tight_layout()  # Adjust layout
        ax.set_title('AUC with Error Bars for Different ML Model-FR Method Combinations', y=1.05)  # Adjust title position

        # savefig does not create missing parent directories
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        fig.savefig(output_path, bbox_inches='tight')
    finally:
        # Close the figure to free memory, even if plotting or saving failed
        plt.close(fig)

def plot_performance_heatmap(df, metric, save_path):
    """Save a Model x Feature-Reduction-Method heatmap of one metric.

    Each value in ``df[metric]`` is expected to be a string of the form
    ``"<mean>±<ci>"``. The mean part colours and annotates the cell; the part
    after ``±`` is drawn as a second, smaller annotation in the same cell.

    Args:
        df: DataFrame with "Model" and "Feature Reduction Method" columns plus
            the ``metric`` column. It is not modified.
        metric: Name of the column to plot; also used in the figure title.
        save_path: Destination passed to ``savefig``.

    Raises:
        ValueError: If no value in ``df[metric]`` contains ``±``.
    """
    model_col = "Model"
    method_col = "Feature Reduction Method"

    # Work on a copy so the caller's frame does not gain 'Mean'/'CI' columns.
    data = df.copy()
    # n=1 keeps the result at two columns even if a value contains extra '±'.
    parts = data[metric].str.split('±', n=1, expand=True)
    if parts.shape[1] < 2:
        raise ValueError(f"Column {metric!r} has no values of the form '<mean>±<ci>'")
    data['Mean'] = parts[0].astype(float)
    data['CI'] = parts[1]

    # Pivot tables for the heatmap values and their confidence-interval text.
    # Keyword arguments are required: pandas 2.x no longer accepts them
    # positionally.
    pivot_table = data.pivot(index=model_col, columns=method_col, values='Mean')
    ci_table = data.pivot(index=model_col, columns=method_col, values='CI')
    ci_table = ci_table.reindex(index=pivot_table.index, columns=pivot_table.columns)

    # Annotation settings for the confidence intervals
    fontsize_ci = 8
    ci_color = 'darkorange'

    fig = plt.figure(figsize=(10, 8))
    try:
        ax = sns.heatmap(pivot_table, annot=True, fmt=".4f", cmap="YlGnBu")

        # Add the confidence interval below the mean in every populated cell
        for i, ci_row in enumerate(ci_table.to_numpy()):
            for j, ci_text in enumerate(ci_row):
                if pd.isna(ci_text):
                    # No result for this model/method combination
                    continue
                ax.text(j + 0.5, i + 0.7, f'±{ci_text}', horizontalalignment='center',
                        verticalalignment='center', color=ci_color, fontsize=fontsize_ci)

        ax.set_title(f'Heatmap of {metric} across Models and FR Methods')
        fig.tight_layout()

        # savefig does not create missing parent directories; only attempt
        # this for path-like destinations (savefig also accepts file objects).
        if isinstance(save_path, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(save_path))
            if parent:
                os.makedirs(parent, exist_ok=True)

        # Save the heatmap
        fig.savefig(save_path)
    finally:
        # Release the figure even if plotting or saving failed
        plt.close(fig)


########################################################################################### Load dataset
# Define a function to load the dataset from an Excel file
def load_dataset(sheet_name, remove_unknowns=True):
    """Read one sheet of the metabolomics workbook and return it shuffled.

    Args:
        sheet_name: Sheet to read from
            ``/content/data/m1_esi_plus_minus_norm.xlsx``.
        remove_unknowns: When true, columns whose name starts with
            ``'unknown'`` are dropped.

    Returns:
        DataFrame with rows shuffled using a fixed seed (666), a fresh
        0..n-1 index, and the 'eye', 'gender' and 'age' columns removed.
    """
    shuffled = (
        pd.read_excel('/content/data/m1_esi_plus_minus_norm.xlsx', sheet_name=sheet_name, header=0)
        .sample(frac=1, random_state=666)  # frac=1 -> full-row shuffle
        .reset_index(drop=True)
        .drop(columns=['eye', 'gender', 'age'])
    )

    if not remove_unknowns:
        return shuffled

    # Keep only the columns that are not named 'unknown...'
    is_unknown = shuffled.columns.str.startswith('unknown')
    return shuffled.loc[:, ~is_unknown]

# Define a function to prepare the data for training
def prepare_data(dataset):
    """Split a dataset into feature columns and the 'class' label column.

    Args:
        dataset: DataFrame containing a 'class' column.

    Returns:
        Tuple ``(train_data, train_label, column_names)`` where ``train_data``
        is ``dataset`` without its first and last columns, ``train_label`` is
        ``dataset['class']``, and ``column_names`` lists all original columns.
    """
    # Labels are read first so a missing 'class' column fails before anything else
    labels = dataset['class']

    all_columns = dataset.columns
    edge_columns = [all_columns[0], all_columns[-1]]
    features = dataset.drop(columns=edge_columns)

    return features, labels, all_columns.tolist()


# Define a function to get the dataset and preprocess it based on feature selection
def get_data(sheet_name, train_features):
    """Load a sheet and return it split into features and labels.

    Args:
        sheet_name: Sheet name forwarded to ``load_dataset``.
        train_features: Feature-selection mode. The exact string
            ``'Only known metabolites'`` drops the 'unknown...' columns;
            any other value keeps them.

    Returns:
        Whatever ``prepare_data`` returns for the loaded dataset.
    """
    known_only = train_features == 'Only known metabolites'
    return prepare_data(load_dataset(sheet_name, remove_unknowns=known_only))