# Imports 

In [1]:
import pandas as pd
import numpy as np
import joblib
import sys
import os
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import r_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline 
from sklearn.pipeline import make_pipeline
from sklearn.metrics import matthews_corrcoef, roc_auc_score, balanced_accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc, confusion_matrix
from sklearn.base import clone
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [4]:
from src.functions import NestedCrossVal, Classifier
from src.LightGBM_nCV import LightGBMNestedCV 
classifier=Classifier()
ncv=NestedCrossVal()
lgb_ncv=LightGBMNestedCV()

# Load the data

In [5]:
path_to_data="/home/user_stel/Assignment-2/data/breast_cancer.csv"
data_df=classifier.load_data(path_to_data)

#print(data_df.head()) #it should display a 512x32 dataframe

# Preprocessing

In [6]:
data_new_df=classifier.preprocess_data(data_df, columns_to_drop=[])

In [7]:
X, y=classifier.separate_features_target(data_new_df, target='diagnosis', columns_to_remove=None)
#print(X)
#print(y)

In [8]:
selected_features, correlations=classifier.select_features(X, y, threshold=0.5)
print(selected_features)

# Creates a new dataset that contains only the selected features 
X_selected=X[selected_features]
#print(X_selected)

selected_feature_names = X_selected.columns.tolist()
target = 'diagnosis'
data_selected_df = data_new_df[selected_feature_names + [target]]
#print(data_selected_df) # the way this new dataframe is built the diagnosis column is last

The selected features of 31 were: 15
['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'radius_worst', 'perimeter_worst', 'area_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst']


In [9]:
models = {
    'LogisticRegression-elasticnet': LogisticRegression(
        penalty='elasticnet', solver='saga', random_state=0, max_iter=10000
    ),
    'GaussianNB': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(random_state=0),
    'RandomForest': RandomForestClassifier(random_state=0),
    'LightGBM': lgb.LGBMClassifier(random_state=0, verbose=-1, silent=True)
}

param_grid = {
    'LogisticRegression-elasticnet': {
        'C': [0.01, 0.1, 1, 10],
        'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]
    },
    'GaussianNB': {
        'var_smoothing': np.logspace(-9, -1, 9)
    },
    'LDA': [
        {'solver': ['svd']},
        {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto']}
    ],
    'SVC': [
        {'kernel': ['linear'], 'C': [0.1, 1, 10]},
        {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto', 0.01, 0.1]}
    ],
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'LightGBM': {
        'n_estimators': [100, 200],
        'num_leaves': [31, 50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20]
    }
}

In [10]:
model_combinations=ncv.generate_param_combinations(param_grid=param_grid)
model_combo_df_summary = pd.DataFrame.from_dict(model_combinations, orient='index')
#print(model_combo_df_summary)

In [11]:
print(data_selected_df) #it should output 512x16 (15 selected features + target)

     radius_mean  perimeter_mean    area_mean  compactness_mean  \
0          14.68           94.74   684.500000           0.07200   
1          11.50           73.28   407.400000           0.05991   
2          15.85          103.70   782.700000           0.10020   
3          18.82          123.70  1110.000000           0.13890   
4          12.95           83.14   513.700000           0.07943   
..           ...             ...          ...               ...   
507        13.00           82.61   520.200000           0.05073   
508        14.20           92.41   657.616929           0.11080   
509        13.86           90.96   578.900000           0.15170   
510        17.30          113.00   928.200000           0.10410   
511        23.27          152.10  1686.000000           0.11450   

     concavity_mean  concave points_mean  radius_se  perimeter_se  area_se  \
0          0.073950             0.052590     0.4727         3.195    45.40   
1          0.026380             0.02069

In [12]:
X, y=classifier.separate_features_target(data_selected_df, target='diagnosis', columns_to_remove=None)
#print(X)

X_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

#print(X_train)

# Nested Cross Validation (no iterations)

### Test SVC model

In [12]:
metrics_df, best_param_list = ncv.outer_loop(
    data_selected_df,
    'diagnosis',
    'SVC',
    outer_cv=5,
    inner_cv=3
)

print(metrics_df)
print('Best params per fold:', best_param_list)

[SVC] Tested {'kernel': 'linear', 'C': 0.1} -> AUC 0.9835
[SVC] New best AUC 0.9835, params {'kernel': 'linear', 'C': 0.1}
[SVC] Tested {'kernel': 'linear', 'C': 1} -> AUC 0.9848
[SVC] New best AUC 0.9848, params {'kernel': 'linear', 'C': 1}
[SVC] Tested {'kernel': 'linear', 'C': 10} -> AUC 0.9760
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 'scale'} -> AUC 0.9812
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 'auto'} -> AUC 0.9812
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.01} -> AUC 0.9820
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.1} -> AUC 0.9797
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'} -> AUC 0.9850
[SVC] New best AUC 0.9850, params {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'}
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'} -> AUC 0.9850
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 0.01} -> AUC 0.9836
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 0.1} -> AUC 0.9829
[SVC] Tested {'kernel': 'rbf', 'C': 10, 'gamma': 'scale'} -> AU

# Repeated Nested Cross Validation Results 

### Test Logistic Regression with Elastic Net Regularization algorithm

In [13]:
results = ncv.run_repeated_nested_cv(data_selected_df, 'diagnosis', 'LogisticRegression-elasticnet', outer_cv=5, inner_cv=3, num_rounds=10, columns_to_remove=None)

[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.0} -> AUC 0.9818
[LogisticRegression-elasticnet] New best AUC 0.9818, params {'C': 0.01, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.25} -> AUC 0.9809
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.5} -> AUC 0.9823
[LogisticRegression-elasticnet] New best AUC 0.9823, params {'C': 0.01, 'l1_ratio': 0.5}
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 0.75} -> AUC 0.9818
[LogisticRegression-elasticnet] Tested {'C': 0.01, 'l1_ratio': 1.0} -> AUC 0.9702
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.0} -> AUC 0.9837
[LogisticRegression-elasticnet] New best AUC 0.9837, params {'C': 0.1, 'l1_ratio': 0.0}
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.25} -> AUC 0.9831
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.5} -> AUC 0.9827
[LogisticRegression-elasticnet] Tested {'C': 0.1, 'l1_ratio': 0.75} -> AUC 0.9

ValueError: not enough values to unpack (expected 4, got 2)

In [13]:
summary_log_reg = results['summary']
print(summary_log_reg)

               median  ci_lower  ci_upper
MCC          0.876263  0.834553  0.938607
AUC          0.992388  0.964227  0.995888
PRAUC        0.989754  0.964052  0.993442
BA           0.942434  0.900202  0.971554
F1           0.923077  0.885714  0.962025
F2           0.937500  0.842391  0.969388
Recall       0.947368  0.815789  0.974359
Specificity  0.968750  0.937500  0.984615
Precision    0.942857  0.900000  0.968750
NPV          0.967742  0.901408  0.984127


### Test Gaussian Naive Bayes algorithm

In [16]:
results_gnb = ncv.run_repeated_nested_cv(data_selected_df, 'diagnosis', 'GaussianNB', outer_cv=5, inner_cv=3, num_rounds=10, columns_to_remove=None)

[GaussianNB] Tested {'var_smoothing': np.float64(1e-09)} -> AUC 0.9797
[GaussianNB] New best AUC 0.9797, params {'var_smoothing': np.float64(1e-09)}
[GaussianNB] Tested {'var_smoothing': np.float64(1e-08)} -> AUC 0.9797
[GaussianNB] Tested {'var_smoothing': np.float64(1e-07)} -> AUC 0.9797
[GaussianNB] Tested {'var_smoothing': np.float64(1e-06)} -> AUC 0.9797
[GaussianNB] Tested {'var_smoothing': np.float64(1e-05)} -> AUC 0.9797
[GaussianNB] Tested {'var_smoothing': np.float64(0.0001)} -> AUC 0.9797
[GaussianNB] Tested {'var_smoothing': np.float64(0.001)} -> AUC 0.9797
[GaussianNB] Tested {'var_smoothing': np.float64(0.01)} -> AUC 0.9796
[GaussianNB] Tested {'var_smoothing': np.float64(0.1)} -> AUC 0.9808
[GaussianNB] New best AUC 0.9808, params {'var_smoothing': np.float64(0.1)}
[GaussianNB] Tested {'var_smoothing': np.float64(1e-09)} -> AUC 0.9833
[GaussianNB] New best AUC 0.9833, params {'var_smoothing': np.float64(1e-09)}
[GaussianNB] Tested {'var_smoothing': np.float64(1e-08)} -> 

In [17]:
summary_gnb = results_gnb['summary']
print(summary_gnb)

               median  ci_lower  ci_upper
MCC          0.852608  0.768380  0.877454
AUC          0.983964  0.969636  0.995888
PRAUC        0.971906  0.955614  0.993442
BA           0.921053  0.873766  0.925280
F1           0.906667  0.849315  0.914286
F2           0.869565  0.796703  0.902062
Recall       0.842105  0.763158  0.897436
Specificity  0.953125  0.938462  1.000000
Precision    0.921053  0.885714  1.000000
NPV          0.914286  0.875000  0.938462


### Test Linear Discriminant Analysis algorithm

In [18]:
results_lda = ncv.run_repeated_nested_cv(data_selected_df, 'diagnosis', 'LDA', outer_cv=5, inner_cv=3, num_rounds=10, columns_to_remove=None)

[LDA] Tested {'solver': 'svd'} -> AUC 0.9803
[LDA] New best AUC 0.9803, params {'solver': 'svd'}
[LDA] Tested {'solver': 'lsqr', 'shrinkage': None} -> AUC 0.9803
[LDA] Tested {'solver': 'lsqr', 'shrinkage': 'auto'} -> AUC 0.9852
[LDA] New best AUC 0.9852, params {'solver': 'lsqr', 'shrinkage': 'auto'}
[LDA] Tested {'solver': 'eigen', 'shrinkage': None} -> AUC 0.9803
[LDA] Tested {'solver': 'eigen', 'shrinkage': 'auto'} -> AUC 0.9852
[LDA] Tested {'solver': 'svd'} -> AUC 0.9811
[LDA] New best AUC 0.9811, params {'solver': 'svd'}
[LDA] Tested {'solver': 'lsqr', 'shrinkage': None} -> AUC 0.9811
[LDA] Tested {'solver': 'lsqr', 'shrinkage': 'auto'} -> AUC 0.9888
[LDA] New best AUC 0.9888, params {'solver': 'lsqr', 'shrinkage': 'auto'}
[LDA] Tested {'solver': 'eigen', 'shrinkage': None} -> AUC 0.9811
[LDA] Tested {'solver': 'eigen', 'shrinkage': 'auto'} -> AUC 0.9888
[LDA] Tested {'solver': 'svd'} -> AUC 0.9848
[LDA] New best AUC 0.9848, params {'solver': 'svd'}
[LDA] Tested {'solver': 'lsqr

In [19]:
summary_lda = results_lda['summary']
print(summary_lda)

               median  ci_lower  ci_upper
MCC          0.873603  0.759063  0.938607
AUC          0.990954  0.977385  0.994391
PRAUC        0.986615  0.970323  0.992249
BA           0.926398  0.842105  0.971554
F1           0.916667  0.812500  0.962025
F2           0.887097  0.730337  0.969388
Recall       0.868421  0.684211  0.974359
Specificity  0.984375  0.968750  1.000000
Precision    0.968750  0.944444  1.000000
NPV          0.926471  0.842105  0.984127


### Test Support Vector Machines algorithm 

In [12]:
results_svc = ncv.run_repeated_nested_cv(data_selected_df, 'diagnosis', 'SVC', outer_cv=5, inner_cv=3, num_rounds=10, columns_to_remove=None)

[SVC] Tested {'kernel': 'linear', 'C': 0.1} -> AUC 0.9835
[SVC] New best AUC 0.9835, params {'kernel': 'linear', 'C': 0.1}
[SVC] Tested {'kernel': 'linear', 'C': 1} -> AUC 0.9848
[SVC] New best AUC 0.9848, params {'kernel': 'linear', 'C': 1}
[SVC] Tested {'kernel': 'linear', 'C': 10} -> AUC 0.9760
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 'scale'} -> AUC 0.9812
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 'auto'} -> AUC 0.9812
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.01} -> AUC 0.9820
[SVC] Tested {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.1} -> AUC 0.9797
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'} -> AUC 0.9850
[SVC] New best AUC 0.9850, params {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'}
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'} -> AUC 0.9850
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 0.01} -> AUC 0.9836
[SVC] Tested {'kernel': 'rbf', 'C': 1, 'gamma': 0.1} -> AUC 0.9829
[SVC] Tested {'kernel': 'rbf', 'C': 10, 'gamma': 'scale'} -> AU

In [15]:
summary_svc = results_svc['summary']
print(summary_svc)

               median  ci_lower  ci_upper
MCC          0.916036  0.768380  0.937991
AUC          0.988898  0.973279  0.997533
PRAUC        0.979417  0.965287  0.995886
BA           0.952714  0.877126  0.966546
F1           0.945946  0.849315  0.961039
F2           0.930851  0.801105  0.963542
Recall       0.921053  0.763158  0.973684
Specificity  0.984375  0.938462  1.000000
Precision    0.972222  0.885714  1.000000
NPV          0.954545  0.876712  0.983871


### Test Random Forests algorithm

In [22]:
results_rf = ncv.run_repeated_nested_cv(data_selected_df, 'diagnosis', 'RandomForest', outer_cv=5, inner_cv=3, num_rounds=10, columns_to_remove=None)

[RandomForest] Tested {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1} -> AUC 0.9782
[RandomForest] New best AUC 0.9782, params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1}
[RandomForest] Tested {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2} -> AUC 0.9797
[RandomForest] New best AUC 0.9797, params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2}
[RandomForest] Tested {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 1} -> AUC 0.9797
[RandomForest] Tested {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 2} -> AUC 0.9794
[RandomForest] Tested {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1} -> AUC 0.9782
[RandomForest] Tested {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2} -> AUC 0.979

In [23]:
summary_rf = results_rf['summary']
print(summary_rf)

               median  ci_lower  ci_upper
MCC          0.895900  0.791498  0.958059
AUC          0.991365  0.959109  0.997533
PRAUC        0.986155  0.956598  0.996370
BA           0.950247  0.895749  0.979030
F1           0.935065  0.868421  0.973684
F2           0.942408  0.868421  0.973684
Recall       0.947368  0.868421  0.973684
Specificity  0.968750  0.923077  0.984375
Precision    0.942857  0.868421  0.973684
NPV          0.968254  0.923077  0.984375


### Test LightGBM algorithm

In [9]:
results_gbm = lgb_ncv.run_repeated_nested_cv(data_selected_df, 'diagnosis', outer_cv=5, inner_cv=3, num_rounds=10, columns_to_remove=None)

In [10]:
summary_gbm = results_gbm['summary']
print(summary_gbm)

               median  ci_lower  ci_upper
MCC          0.876263  0.749798  0.958468
AUC          0.992188  0.952429  0.993832
PRAUC        0.987818  0.952203  0.991137
BA           0.942434  0.874899  0.973684
F1           0.923077  0.842105  0.972973
F2           0.937500  0.842105  0.957447
Recall       0.947368  0.842105  0.948718
Specificity  0.953125  0.907692  1.000000
Precision    0.916667  0.842105  1.000000
NPV          0.967742  0.907692  0.969697
