In [None]:
# X: Feature matrix
# y: Target vector

# Partition the feature columns by dtype: `cat` holds the names of the object
# columns and `num` the names of the float/int columns.
cat, num = (
    X.select_dtypes(include=dtype_spec).columns
    for dtype_spec in ('object', ['float', 'int'])
)

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

# Dense one-hot encoding for the categorical columns; handle_unknown='ignore'
# keeps transform from raising on categories that were not seen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 only accepts `sparse`
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Missing values are filled from the 10 nearest neighbours.
fill_nan = KNNImputer(n_neighbors=10)
scaler = StandardScaler()


# Categorical branch: encode first, then impute.
# NOTE(review): the imputer runs on the encoder's output, so it presumably has
# nothing left to fill - confirm whether imputing before encoding was intended.
cat_transformer = Pipeline(
    steps=[('encode', encoder), ("fillna", fill_nan)]
)


# Route the numeric columns (`num`) through impute -> scale and the
# categorical columns (`cat`) through the branch above. Columns listed in
# neither group are not passed to any transformer here.
# NOTE(review): the numeric branch imputes on unscaled features, and KNN
# imputation is distance based - confirm that this order is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([('impute', fill_nan), ('scale', scaler)]), num),
        ("cat", cat_transformer, cat),
    ]
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statistics
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, average_precision_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import FactorAnalysis, KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt



# Estimator wrapped by the sequential forward selector below.
svm_classifier = SVC()

# Tunable sizes shared by the registries below.
N_SELECTED_FEATURES = 5      # features kept by every selector
ENSEMBLE_N_ESTIMATORS = 392  # estimators used by bagging / boosting ensembles


# Feature selection methods, keyed by the label reported in the results.
# NOTE(review): 'Correlation-based Feature Selection' and 'SelectKBest' both
# keep the k best features by f_classif, so they select the same columns -
# confirm whether a different score function was intended for one of them.
feature_selection_methods = {
    'Correlation-based Feature Selection': GenericUnivariateSelect(
        score_func=f_classif, mode='k_best', param=N_SELECTED_FEATURES),
    'Mutual information': GenericUnivariateSelect(
        score_func=mutual_info_classif, mode='k_best', param=N_SELECTED_FEATURES),
    'SelectKBest': SelectKBest(score_func=f_classif, k=N_SELECTED_FEATURES),
    'Sequential Forward Selection': SequentialFeatureSelector(
        estimator=svm_classifier,
        n_features_to_select=N_SELECTED_FEATURES,
        direction='forward',
        cv=LeaveOneOut(),
    ),
}


# Dimensionality reduction methods, keyed by the label reported in the results.
reductioner = {
    'PCA': PCA(n_components=0.9),
    'LDA': LinearDiscriminantAnalysis(n_components=1),
    'FA': FactorAnalysis(n_components=2),
    'kPCA': KernelPCA(n_components=1, kernel='rbf'),
}


# Base classifiers, keyed by the label reported in the results.
classifiers = {
    'Random Forest': RandomForestClassifier(max_depth=7, min_samples_split=5, min_samples_leaf=9),
    'KNN': KNeighborsClassifier(n_neighbors=2),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(C=1.0),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(reg_alpha=0.0, reg_lambda=1.0),
    'LightGBM': LGBMClassifier(reg_alpha=0.0, reg_lambda=1.0),
    'SVM': SVC(C=1.0),
    'Gaussian Process': GaussianProcessClassifier(),
}


# scikit-learn 1.2 renamed BaggingClassifier's `base_estimator` to `estimator`
# and 1.4 removed the old keyword; try the current name first and fall back
# for older releases so the cell runs on either.
try:
    bagging_classifier = BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=ENSEMBLE_N_ESTIMATORS,
        random_state=0,
    )
except TypeError:  # scikit-learn < 1.2 only accepts `base_estimator`
    bagging_classifier = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        n_estimators=ENSEMBLE_N_ESTIMATORS,
        random_state=0,
    )


# Ensemble classifiers, keyed by the label reported in the results. The voting
# ensemble takes a hard (majority) vote over every base classifier above.
ensemble_classifiers = {
    'Voting Classifier': VotingClassifier(estimators=list(classifiers.items()), voting='hard'),
    'Bagging Classifier': bagging_classifier,
    'AdaBoost': AdaBoostClassifier(n_estimators=ENSEMBLE_N_ESTIMATORS),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=ENSEMBLE_N_ESTIMATORS),
}



# Cross-validated sweep over every feature-selection x dimensionality-reduction
# x classifier combination. For each combination the pipeline is refitted on
# every fold of N_REPETITIONS rounds of N_SPLITS-fold cross-validation; fold
# metrics go to the log file, a SHAP waterfall plot is saved, a row-normalised
# confusion matrix is plotted and one summary row is appended to `results`.

N_REPETITIONS = 10  # cross-validation rounds per configuration
N_SPLITS = 4        # folds per round
CLASS_LABELS = ['Without Depression', 'With Depression']


def _iter_configurations():
    """Yield one tuple per pipeline configuration to evaluate.

    Each tuple is ``(model_label, method_name, method, reduction_name,
    reduction, clf_name, clf)``. All base classifiers are yielded first (for
    every selection/reduction pair), then all ensemble classifiers.
    ``model_label`` is only used in the progress message.
    """
    model_groups = (
        ('classifier', classifiers),
        ('ensemble classifier', ensemble_classifiers),
    )
    for model_label, models in model_groups:
        for method_name, method in feature_selection_methods.items():
            for reduction_name, reduction in reductioner.items():
                for clf_name, clf in models.items():
                    yield (model_label, method_name, method,
                           reduction_name, reduction, clf_name, clf)


def _mean_and_std(scores):
    """Return ``(mean, sample standard deviation)`` of ``scores``.

    The standard deviation is 0 when fewer than two scores are available.
    """
    mean = sum(scores) / len(scores)
    std = statistics.stdev(scores) if len(scores) > 1 else 0
    return mean, std


def _average_fold_shap(fold_values):
    """Average per-fold SHAP value arrays element-wise.

    Returns ``None`` when the folds do not all share one shape (for example
    when the sample count is not divisible by the number of folds), because
    such arrays cannot be stacked into a single array.
    """
    if len({np.shape(values) for values in fold_values}) != 1:
        return None
    return np.mean(np.array(fold_values), axis=0)


results = []

# The LIME explainer depends only on the full feature matrix, so it is built
# once instead of once per fold. The same row (the first one) is explained in
# every fold.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X.values,
    feature_names=X.columns,
    class_names=CLASS_LABELS,
    discretize_continuous=True
)
instance = X.iloc[0].values

file_path = 'log.txt'
with open(file_path, 'w') as file:

    for (model_label, method_name, method,
         reduction_name, reduction, clf_name, clf) in _iter_configurations():
        y_true_all = []
        y_pred_all = []
        accuracies = []
        f1_scores = []
        precision = []
        recall = []
        shap_values_list = []

        for repetition in range(N_REPETITIONS):
            print(f"Evaluating feature selection: {method_name} - feature reduction: {reduction_name} - {model_label}: {clf_name}")
            file.write(f"{method_name} - {reduction_name} - {clf_name} - {repetition}\n")

            # Without shuffling, KFold returns the same consecutive folds on
            # every repetition, so repeating adds no information. Shuffle
            # with a per-repetition seed: folds differ between repetitions
            # but the whole sweep stays reproducible.
            kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=repetition)

            for train_index, test_index in kf.split(X):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('feature_selection', method),
                    ('reduction', reduction),
                    ('classifier', clf)
                ])

                pipeline.fit(X_train, y_train)
                y_pred = pipeline.predict(X_test)

                # SHAP: explain the fitted pipeline's predictions on the test fold.
                shap_explainer = shap.Explainer(pipeline.predict, X_train)
                shap_values = shap_explainer(X_test)
                shap_values_list.append(shap_values.values)

                # LIME needs class probabilities. Estimators such as SVC
                # without probability=True or a hard-voting ensemble do not
                # expose predict_proba, so skip the explanation for them
                # instead of aborting the whole sweep.
                if hasattr(pipeline, 'predict_proba'):
                    exp = lime_explainer.explain_instance(instance, pipeline.predict_proba, num_features=7)
                    exp.show_in_notebook(show_table=True)
                    fig = exp.as_pyplot_figure()

                # Append this fold's score to each running list and log the
                # list accumulated so far.
                for scores, metric in (
                    (accuracies, accuracy_score),
                    (f1_scores, f1_score),
                    (precision, precision_score),
                    (recall, recall_score),
                ):
                    scores.append(metric(y_test, y_pred))
                    file.write(f"{scores}\n")

                y_true_all.extend(y_test)
                y_pred_all.extend(y_pred)
                file.write(f"{y_true_all}\n")
                file.write(f"{y_pred_all}\n")

        # Confusion matrix over all folds, normalised so each true-label row sums to 1.
        cm = confusion_matrix(y_true_all, y_pred_all)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        # SHAP: average the per-fold values when the folds are stackable.
        # NOTE(review): the average is taken position by position across
        # folds that contain different samples, and the resulting object is
        # not plotted below - confirm what aggregation was intended.
        aggregated_shap_values = _average_fold_shap(shap_values_list)
        if aggregated_shap_values is None:
            aggregated_shap_values_object = None
        else:
            aggregated_shap_values_object = shap.Explanation(
                values=aggregated_shap_values,
                base_values=shap_values.base_values,
                data=shap_values.data,
                feature_names=shap_values.feature_names
            )

        # Waterfall plot for the first test sample of the last fold, saved
        # under the name of the model that was actually evaluated.
        plt.figure(figsize=(12, 8))
        shap.plots.waterfall(shap_values[0], max_display=14, show=False)
        plt.savefig(f"./waterfall_plot_{method_name}_{reduction_name}_{clf_name}.png", bbox_inches='tight', dpi=1080)

        # Summarise every metric, echo it, log it and collect the result row.
        summary_row = {
            'Feature Selection Method': method_name,
            'Dimensionality Reduction Method': reduction_name,
            'Classification Method': clf_name,
        }
        for metric_label, scores in (
            ('Accuracy', accuracies),
            ('F1-score', f1_scores),
            ('Precision', precision),
            ('Recall', recall),
        ):
            mean_value, std_value = _mean_and_std(scores)
            print(f"Mean {metric_label}:", mean_value)
            print(f"STD {metric_label}:", std_value)
            file.write(f"{mean_value}\n")
            file.write(f"{std_value}\n")
            summary_row[f'Mean {metric_label}'] = mean_value
            summary_row[f'STD {metric_label}'] = std_value
        summary_row['Confusion Matrix'] = cm
        results.append(summary_row)

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, cmap='Blues', fmt='.2f', xticklabels=CLASS_LABELS, yticklabels=CLASS_LABELS)
        plt.title(f"Confusion Matrix - {method_name} - {reduction_name} - {clf_name}")
        plt.xlabel('Predicted Label')
        plt.ylabel('Actual Label')
        plt.show()


# One row per evaluated configuration.
results_df = pd.DataFrame(results)
results_df

