In [None]:
import pandas as pd
import copy
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.impute import SimpleImputer, KNNImputer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.inspection import permutation_importance

# Remove pandas' display limits: no cap on the number of rows or columns shown,
# and wide frames are not wrapped across several blocks of lines.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

In [None]:
# Load the cleaned training data into `df`.
# NOTE(review): absolute, machine-specific path - this cell only runs where this exact location exists.
df = pd.read_csv('F:/Python/project 650/_CDI.Training.cleaned.csv')

In [None]:
# Modelling constants shared by the cells below.
target = 'inhospital_death'
random_state = 1

# Class-balance weight: count of class 0 divided by count of class 1, rounded to one decimal.
# 3.8 is kept only as the fallback for a non-binary target; it used to overwrite the
# computed ratio unconditionally, which made the computation above it dead code.
scale_pos_weight = 3.8
if set(df[target].unique()) == {0, 1}:
    counts = df[target].value_counts()
    scale_pos_weight = round(float(counts.get(0) / counts.get(1)), 1) #used by some models for balancing
    print(f"scale_pos_weight = {scale_pos_weight}")
else:
    print("Target is not binary!")


In [None]:
def cstat(df, sortstyle='Feature', return_lists = False, show_table = True, nan_threshold = 50, zero_threshold = 50, rnd_lvl=2):
    """Summarise every column of ``df`` in one table and optionally derive column lists from it.

    Parameters
    ----------
    df : DataFrame to profile.
    sortstyle : summary column to sort by. 'Feature' sorts by name; 'Dtype' sorts dtype
        descending then name ascending; anything else sorts by that column, then name.
    return_lists : when True, print and return the derived column lists (see Returns).
    show_table : when True, show the summary table through the notebook's ``display``.
    nan_threshold, zero_threshold : percentages above which a column is listed in
        ``nancols2drop`` / ``zerocols2drop``.
    rnd_lvl : number of decimals used for rounding and for pandas' float display format.

    Returns
    -------
    None when ``return_lists`` is False; otherwise the tuple
    (nancols2drop, zerocols2drop, cols2normalize, cols2winsorize,
    binary_cols, numeric_cols, object_cols).

    Side effect: sets ``pd.options.display.float_format`` for the whole session.
    """
    blank = "."

    def _or_blank(stat):
        # NaN (or anything that is not an int/float) is shown as the "." placeholder.
        if pd.isna(stat) or not isinstance(stat, (int, float)):
            return blank
        return stat

    def _quoted(names):
        # 'a', 'b', 'c' - ready to paste into a Python list literal.
        return ', '.join(f"'{name}'" for name in names)

    rows = []
    for name in df.columns:
        series = df[name]
        repeated = series.value_counts()

        # Numeric statistics default to the placeholder for non-numeric columns.
        lowest = highest = average = middle = spread = kurt = skew = blank
        if pd.api.types.is_numeric_dtype(series):
            lowest = round(series.min(), rnd_lvl)
            highest = round(series.max(), rnd_lvl)
            average = round(series.mean(), rnd_lvl)
            middle = round(series.median(), rnd_lvl)
            spread = round(series.std(), rnd_lvl)
            kurt = _or_blank(round(series.kurtosis(), rnd_lvl))
            skew = _or_blank(round(series.skew(), rnd_lvl))

        rows.append({
            "Feature": name,
            "Dtype": str(series.dtype),
            "# Count": series.count(),
            "# NaN": series.isna().sum(),
            "# Unique": series.nunique(),
            # number of distinct values that occur more than once
            "# Dupes": repeated[repeated > 1].count(),
            "Min": lowest,
            "Max": highest,
            "% NaN": round(series.isna().mean() * 100, rnd_lvl),
            "% Zero": round((series == 0).mean() * 100, rnd_lvl),
            "Mean": average,
            "Median": middle,
            "Std Dev": spread,
            "Kurtosis (3)": kurt,
            "Skew (0)": skew
        })

    if sortstyle == 'Feature':
        sort_kwargs = {'by': sortstyle}
    elif sortstyle == 'Dtype':
        sort_kwargs = {'by': [sortstyle, 'Feature'], 'ascending': [False, True]}
    else:
        sort_kwargs = {'by': [sortstyle, 'Feature']}
    results_df = pd.DataFrame(rows).sort_values(**sort_kwargs)

    pd.options.display.float_format = f'{{:.{rnd_lvl}f}}'.format

    if show_table:
        display(results_df)

    if not return_lists:
        return None

    # Columns with at most two distinct values are excluded from the threshold-based lists.
    multi = results_df[results_df['# Unique'] > 2]

    nancols2drop = multi.loc[multi['% NaN'] > nan_threshold, 'Feature'].tolist()
    print(f"nancols2drop = [{_quoted(nancols2drop)}] #nan threshold: {nan_threshold}%")

    zerocols2drop = multi.loc[multi['% Zero'] > zero_threshold, 'Feature'].tolist()
    print(f"zerocols2drop = [{_quoted(zerocols2drop)}] #zero threshold: {zero_threshold}%")

    # Only rows with an actual skew value; flag skew outside [-1, 1].
    with_skew = multi[multi['Skew (0)'] != blank]
    off_centre = (with_skew['Skew (0)'] < -1) | (with_skew['Skew (0)'] > 1)
    cols2normalize = with_skew.loc[off_centre, 'Feature'].tolist()
    print(f"cols2normalize = [{_quoted(cols2normalize)}]")

    # Only rows with an actual kurtosis value; flag kurtosis above 5.
    with_kurt = multi[multi['Kurtosis (3)'] != blank]
    cols2winsorize = with_kurt.loc[with_kurt['Kurtosis (3)'] > 5, 'Feature'].tolist()
    print(f"cols2winsorize = [{_quoted(cols2winsorize)}]")

    # The remaining lists come from the input frame itself, in its own column order.
    number_columns = df.select_dtypes(include=['number']).columns

    binary_cols = [col for col in number_columns if df[col].nunique() == 2]
    print(f"binary_cols = [{_quoted(binary_cols)}]")

    numeric_cols = [col for col in number_columns if df[col].nunique() > 2]
    print(f"numeric_cols = [{_quoted(numeric_cols)}]")

    object_cols = df.select_dtypes(include=['object']).columns
    print(f"object_cols = [{_quoted(object_cols)}]")

    return nancols2drop, zerocols2drop, cols2normalize, cols2winsorize, binary_cols, numeric_cols, object_cols

In [None]:
# Profile the raw frame (sorted by dtype) and capture the derived column lists for later cells.
nancols2drop, zerocols2drop, cols2normalize, cols2winsorize, binary_cols, numeric_cols, object_cols = cstat(df, 'Dtype', True)

Unnamed: 0,Feature,Dtype,# Count,# NaN,# Unique,# Dupes,Min,Max,% NaN,% Zero,Mean,Median,Std Dev,Kurtosis (3),Skew (0)
4,chronic_ischemic_heart_disease,int64,1294,0,2,2,0.0,1.0,0.0,71.72,0.28,0.0,0.45,-1.07,0.97
3,chronic_kidney_disease,int64,1294,0,2,2,0.0,1.0,0.0,71.64,0.28,0.0,0.45,-1.08,0.96
2,diabetes,int64,1294,0,2,2,0.0,1.0,0.0,65.53,0.34,0.0,0.48,-1.57,0.65
0,gender_binaried,int64,1294,0,2,2,0.0,1.0,0.0,48.69,0.51,1.0,0.5,-2.0,-0.05
1,inhospital_death,int64,1294,0,2,2,0.0,1.0,0.0,79.13,0.21,0.0,0.41,0.06,1.44
6,albumin,float64,1097,197,39,35,1.0,5.0,15.22,0.0,2.96,3.0,0.68,-0.34,0.07
5,anion_gap,float64,1294,0,35,30,4.0,41.0,0.0,0.0,16.23,16.0,4.74,3.92,1.4
7,bicarbonate,float64,1293,1,38,35,5.0,52.0,0.08,0.0,23.79,24.0,5.36,1.74,0.07
8,bilirubin_total,float64,1098,196,92,43,0.0,82.8,15.15,0.15,1.34,0.5,3.88,202.51,11.94
26,body_temperature,float64,1264,30,108,84,36.0,104.8,2.32,0.0,97.88,98.0,3.84,189.46,-12.24


nancols2drop = ['hemoglobin_bloodgas'] #nan threshold: 50%
zerocols2drop = [] #zero threshold: 50%
cols2normalize = ['anion_gap', 'bilirubin_total', 'body_temperature', 'bun', 'calcium_total', 'creatinine', 'glucose_bloodgas', 'inr_pt', 'lactate', 'mean_arterial_pressure', 'platelet_count', 'potassium_blood', 'ptt', 'respiratory_rate', 'spo2', 'systolic_blood_pressure', 'wbc_blood']
cols2winsorize = ['bilirubin_total', 'body_temperature', 'bun', 'calcium_total', 'creatinine', 'free_calcium', 'glucose_bloodgas', 'inr_pt', 'lactate', 'mean_arterial_pressure', 'platelet_count', 'ptt', 'respiratory_rate', 'spo2', 'wbc_blood']
binary_cols = ['gender_binaried', 'inhospital_death', 'diabetes', 'chronic_kidney_disease', 'chronic_ischemic_heart_disease']
numeric_cols = ['anion_gap', 'albumin', 'bicarbonate', 'bilirubin_total', 'creatinine', 'chloride', 'glucose_bloodgas', 'hematocrit_blood', 'hemoglobin_bloodgas', 'lactate', 'platelet_count', 'potassium_blood', 'ptt', 'inr_pt', 'sodium', 'bun',

In [None]:
# Drop the columns flagged as mostly-NaN (mutates `df` in place), then re-profile so the
# column lists used downstream no longer mention the dropped columns.
df.drop(columns=nancols2drop, inplace=True)
nancols2drop, zerocols2drop, cols2normalize, cols2winsorize, binary_cols, numeric_cols, object_cols = cstat(df, 'Dtype', True)

Unnamed: 0,Feature,Dtype,# Count,# NaN,# Unique,# Dupes,Min,Max,% NaN,% Zero,Mean,Median,Std Dev,Kurtosis (3),Skew (0)
4,chronic_ischemic_heart_disease,int64,1294,0,2,2,0.0,1.0,0.0,71.72,0.28,0.0,0.45,-1.07,0.97
3,chronic_kidney_disease,int64,1294,0,2,2,0.0,1.0,0.0,71.64,0.28,0.0,0.45,-1.08,0.96
2,diabetes,int64,1294,0,2,2,0.0,1.0,0.0,65.53,0.34,0.0,0.48,-1.57,0.65
0,gender_binaried,int64,1294,0,2,2,0.0,1.0,0.0,48.69,0.51,1.0,0.5,-2.0,-0.05
1,inhospital_death,int64,1294,0,2,2,0.0,1.0,0.0,79.13,0.21,0.0,0.41,0.06,1.44
6,albumin,float64,1097,197,39,35,1.0,5.0,15.22,0.0,2.96,3.0,0.68,-0.34,0.07
5,anion_gap,float64,1294,0,35,30,4.0,41.0,0.0,0.0,16.23,16.0,4.74,3.92,1.4
7,bicarbonate,float64,1293,1,38,35,5.0,52.0,0.08,0.0,23.79,24.0,5.36,1.74,0.07
8,bilirubin_total,float64,1098,196,92,43,0.0,82.8,15.15,0.15,1.34,0.5,3.88,202.51,11.94
25,body_temperature,float64,1264,30,108,84,36.0,104.8,2.32,0.0,97.88,98.0,3.84,189.46,-12.24


nancols2drop = [] #nan threshold: 50%
zerocols2drop = [] #zero threshold: 50%
cols2normalize = ['anion_gap', 'bilirubin_total', 'body_temperature', 'bun', 'calcium_total', 'creatinine', 'glucose_bloodgas', 'inr_pt', 'lactate', 'mean_arterial_pressure', 'platelet_count', 'potassium_blood', 'ptt', 'respiratory_rate', 'spo2', 'systolic_blood_pressure', 'wbc_blood']
cols2winsorize = ['bilirubin_total', 'body_temperature', 'bun', 'calcium_total', 'creatinine', 'free_calcium', 'glucose_bloodgas', 'inr_pt', 'lactate', 'mean_arterial_pressure', 'platelet_count', 'ptt', 'respiratory_rate', 'spo2', 'wbc_blood']
binary_cols = ['gender_binaried', 'inhospital_death', 'diabetes', 'chronic_kidney_disease', 'chronic_ischemic_heart_disease']
numeric_cols = ['anion_gap', 'albumin', 'bicarbonate', 'bilirubin_total', 'creatinine', 'chloride', 'glucose_bloodgas', 'hematocrit_blood', 'lactate', 'platelet_count', 'potassium_blood', 'ptt', 'inr_pt', 'sodium', 'bun', 'wbc_blood', 'calcium_total', 'free_calcium

In [None]:
# Hyper-parameter sets, keyed first by model code and then by run number.
# model_comparer looks a set up as model_params[model_type][run] and passes it
# to the matching classifier constructor as keyword arguments.
model_params = {
'ab': {
       1: {'estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 20, 'learning_rate': 1.0, 'random_state': random_state}
       },
'bc': {
       1: {'estimator': DecisionTreeClassifier(max_depth=5), 'n_estimators': 40, 'random_state': random_state, 'max_samples': 0.8, 'max_features': 0.9}
       },
'cb': {
       1: {'iterations': 50, 'scale_pos_weight': scale_pos_weight, 'learning_rate': 0.1, 'depth': 3, 'verbose': 0, 'random_state': random_state}
       }, # optional extra, not set here: cat_features = cols2categorize (no such list is defined in the visible cells)
'gb': {
       1: {'n_estimators': 50, 'learning_rate': 0.1, 'max_depth': 3, 'random_state': random_state}
       },
'hg': {
       1: {'max_iter': 50, 'learning_rate': 0.1, 'max_depth': 3, 'tol': 1e-3, 'min_samples_leaf': 15, 'random_state': random_state}
       },
'lg': { # boosting_type 'dart' or 'gbdt'; dart randomly drops trees for training, which is theoretically better against overfitting; gbdt builds on errors of previous trees, better for stable data
       1: {'objective': 'binary', 'boosting_type': 'dart', 'n_estimators': 50, 'learning_rate': 0.1, 'max_depth': 3, 'num_leaves': 15, 'subsample': 0.9, 'min_child_samples': 50, 'reg_alpha': 0.1, 'lambda_l2': 0.1, 'verbosity': -1, 'scale_pos_weight': scale_pos_weight, 'random_state': random_state}
       }, # optional extra, not set here: 'categorical_feature': cols2categorize  <--- first convert object columns to category dtype: df['object_col'] = df['object_col'].astype('category')
'lr': {
       1: {'tol': 1e-1, 'max_iter': 1000, 'random_state': random_state, 'solver': 'liblinear'}
       # 'solver': 'liblinear' for small data sets of thousands of rows
       # ... for large datasets: lbfgs (default), sag, saga, newton-cg
       },
'rf': {
       1: {'n_estimators': 50, 'max_depth': 3, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': random_state}
       },
'xg': {
       1: {'reg_alpha': 0.05, 'reg_lambda': 1, 'scale_pos_weight': scale_pos_weight, 'n_estimators': 50, 'learning_rate': 0.1, 'max_depth': 3, 'n_jobs': -1, 'random_state': random_state}
       }
}

In [None]:
# Two module-level switches, both left off (presumably debug modes - they are not read
# anywhere in the visible cells). Set either one to True to turn it on.
db_mode0 = False
db_mode1 = False
def winsorizer(df, df2, cols2winsor, upper_percentile=95):
    """Clamp the tails of selected columns in two frames, using limits learned from the first.

    For every column in ``cols2winsor`` the limits are the ``100 - upper_percentile`` and
    ``upper_percentile`` percentiles of ``df``. Values below/above those limits are replaced
    by the limit itself in both ``df`` and ``df2``; everything else is left as it was.

    Both frames are modified in place and returned as ``(df, df2)``.
    """
    lower_q = (100-upper_percentile) / 100
    upper_q = upper_percentile / 100

    # All limits are taken from the untouched first frame before anything is clamped.
    limits = {col: (df[col].quantile(lower_q), df[col].quantile(upper_q)) for col in cols2winsor}

    def _clamp(values, low, high):
        capped_above = np.where(values > high, high, values)
        return np.where(values < low, low, capped_above)

    for col, (low, high) in limits.items():
        df[col] = _clamp(df[col], low, high)
        df2[col] = _clamp(df2[col], low, high)
    return df, df2

def model_comparer(dftmp, model_type, impute_type, scale_type, cols2scale, run):
    """Train one configured classifier on a stratified 80/20 split and report its metrics.

    Relies on notebook-level names: ``target``, ``random_state``, ``model_params``,
    ``cols2winsorize`` and ``numeric_cols``.

    Parameters
    ----------
    dftmp : DataFrame holding the features and the ``target`` column; it is not modified.
    model_type : model code: 'cb', 'lg', 'ab', 'bc', 'gb', 'hg', 'lr', 'rf' or 'xg'.
    impute_type : 'noimpute', 'knn', or a ``SimpleImputer`` strategy name (e.g. 'mean', 'median').
    scale_type : 'noscale', 'maxabs', 'minmax', 'power', 'quantile', 'robust', 'standard'
        or 'logarithm'. Any other value leaves the data unscaled.
    cols2scale : columns the scaler / log transform is applied to.
    run : key of the parameter set to use inside ``model_params[model_type]``.

    Returns
    -------
    tuple
        (model_name, y_pred_prob, y_test, roc_auc, accuracy_train, accuracy), where
        ``y_pred_prob`` is the predicted probability of class 1 on the test split.

    Raises
    ------
    ValueError
        If ``model_type`` is not a supported code, or ``model_params`` has no entry for
        ``model_type`` / ``run``. (This used to surface later as a NameError on ``model``.)
    """
    model_registry = {
        'cb': (CatBoostClassifier, "CatBoost Classifier"),
        'lg': (lgb.LGBMClassifier, "LightGBM Classifier"),
        'ab': (AdaBoostClassifier, "AdaBoost Classifier"),
        'bc': (BaggingClassifier, "Bagging Classifier"),
        'gb': (GradientBoostingClassifier, "Gradient Boosting Classifier"),
        'hg': (HistGradientBoostingClassifier, "Histogram-Based Gradient Boosting Classifier"),
        'lr': (LogisticRegression, "Logistic Regression"),
        'rf': (RandomForestClassifier, "Random Forest Classifier"),
        'xg': (XGBClassifier, "XGBoost Classifier"),
    }
    # Factories rather than instances, so only the requested scaler is ever built.
    scaler_factories = {
        'maxabs': MaxAbsScaler,
        'minmax': MinMaxScaler,
        'power': lambda: PowerTransformer(method='yeo-johnson'),
        'quantile': lambda: QuantileTransformer(output_distribution='uniform'),
        'robust': RobustScaler,
        'standard': StandardScaler,
    }

    # Fail fast, before any splitting or preprocessing work is done.
    if model_type not in model_registry:
        raise ValueError(f"Unknown model_type {model_type!r}; expected one of {sorted(model_registry)}")
    if model_type not in model_params or run not in model_params[model_type]:
        raise ValueError(f"model_params has no parameter set for model_type={model_type!r}, run={run!r}")
    model_class, model_name = model_registry[model_type]
    params = model_params[model_type][run]

    X = dftmp.drop(columns=[target])
    y = dftmp[target]

    X_train_tmp, X_test_tmp, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)
    # Independent copies, so the in-place preprocessing below never touches the caller's frame.
    X_train = X_train_tmp.copy()
    X_test = X_test_tmp.copy()

    # Clip extremes at the 1st/99th percentiles of the training split.
    X_train, X_test = winsorizer(X_train, X_test, cols2winsorize, 99)

    # Imputation and scaling
    # NOTE(review): scaling sits inside the imputation branch, so impute_type='noimpute'
    # also skips scaling whatever scale_type says - confirm this is intended.
    if impute_type != 'noimpute':
        use_knn = impute_type == 'knn'

        if not use_knn:
            # Simple strategies are applied before scaling.
            imputer = SimpleImputer(strategy=impute_type)
            X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
            X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

        if scale_type == 'logarithm':
            for col in cols2scale:
                # log(x + 1) is only applied to columns that are non-negative in both splits.
                if X_train[col].min() >= 0 and X_test[col].min() >= 0:
                    # NOTE(review): fillna(0) replaces any still-missing value with 0, which
                    # pre-empts the KNN imputation below when impute_type='knn' - confirm.
                    X_train[col] = np.log(X_train[col].fillna(0) + 1)
                    X_test[col] = np.log(X_test[col].fillna(0) + 1)
        elif scale_type in scaler_factories:
            scaler = scaler_factories[scale_type]()
            X_train[cols2scale] = scaler.fit_transform(X_train[cols2scale])
            X_test[cols2scale] = scaler.transform(X_test[cols2scale])

        if use_knn:
            # KNN imputation runs after scaling, on the already-scaled columns.
            imputer = KNNImputer(n_neighbors=5)
            X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
            X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

    # Model selection and fitting
    model = model_class(**params)
    model.fit(X_train, y_train)

    # Predictions and metrics
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    x_pred = model.predict(X_train)

    roc_auc = roc_auc_score(y_test, y_pred_prob)
    precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
    pr_auc = auc(recall, precision)
    accuracy_train = accuracy_score(y_train, x_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision_metric = precision_score(y_test, y_pred, zero_division=0)
    recall_metric = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f'Metrics ({model_name}):')
    print(f'ROC AUC: {roc_auc:.3f}')
    print(f'PR AUC: {pr_auc:.3f}')
    print(f'Accuracy: {accuracy:.3f}')
    print(f'Precision: {precision_metric:.3f}')
    print(f'Recall: {recall_metric:.3f}')
    print(f'F1 Score: {f1:.3f}')

    # Coefficients and feature importance
    if model_type == 'bc':
        print("Bagging Classifier does not have coefficients\n")
    else:
        if model_type == 'lr':
            coefficients = model.coef_[0]
        elif model_type == 'hg':
            # This model falls back to permutation importance measured on the test split.
            result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=random_state)
            coefficients = result.importances_mean
        else:
            coefficients = model.feature_importances_
        coeff_series = pd.Series(data=coefficients, index=X_train.columns)
        sorted_coeff_series = coeff_series.sort_values(ascending=False).round(3)
        print(f"Coefficients ({model_name})\n{sorted_coeff_series}\n")

    print('---------------------------------------------------------------------------------')

    return model_name, y_pred_prob, y_test, roc_auc, accuracy_train, accuracy


In [None]:
#ENTER model configs here
# One line per model slot N (1-9); each set of values is passed to model_comparer in the next cell:
#   model_type_N   - model code, a key of model_params
#   impute_type_N  - imputation choice ('knn', 'mean', 'median', ...)
#   scale_type_N   - scaling choice ('robust', 'standard', 'quantile', 'minmax', ...)
#   cols2scale_N   - list of columns the scaling is applied to
#   depth_level_N  - despite the name, this is passed as model_comparer's `run` argument,
#                    i.e. the key of the parameter set inside model_params[model_type_N]
model_type_1 = 'ab'; impute_type_1 = 'knn'; scale_type_1 = 'robust'; cols2scale_1 = numeric_cols; depth_level_1 = 1
model_type_2 = 'bc'; impute_type_2 = 'knn'; scale_type_2 = 'standard'; cols2scale_2 = cols2normalize; depth_level_2 = 1
model_type_3 = 'cb'; impute_type_3 = 'knn'; scale_type_3 = 'quantile'; cols2scale_3 = numeric_cols; depth_level_3 = 1
model_type_4 = 'gb'; impute_type_4 = 'knn'; scale_type_4 = 'robust'; cols2scale_4 = numeric_cols; depth_level_4 = 1
model_type_5 = 'hg'; impute_type_5 = 'knn'; scale_type_5 = 'standard'; cols2scale_5 = numeric_cols; depth_level_5 = 1
model_type_6 = 'lg'; impute_type_6 = 'mean'; scale_type_6 = 'standard'; cols2scale_6 = numeric_cols; depth_level_6 = 1
model_type_7 = 'lr'; impute_type_7 = 'median'; scale_type_7 = 'minmax'; cols2scale_7 = numeric_cols; depth_level_7 = 1
model_type_8 = 'rf'; impute_type_8 = 'knn'; scale_type_8 = 'quantile'; cols2scale_8 = numeric_cols; depth_level_8 = 1
model_type_9 = 'xg'; impute_type_9 = 'knn'; scale_type_9 = 'robust'; cols2scale_9 = cols2normalize; depth_level_9 = 1

In [None]:
#Now plot the metrics
# Train and evaluate the nine configured models in turn. Each call prints that model's
# metrics and returns (name, test probabilities, test labels, ROC AUC, train accuracy, test accuracy).
model_name_1, y_pred_prob_1, y_test_1, auc_1, accuracy_train_1, accuracy_test_1 = model_comparer(df, model_type_1, impute_type_1, scale_type_1, cols2scale_1, depth_level_1)
model_name_2, y_pred_prob_2, y_test_2, auc_2, accuracy_train_2, accuracy_test_2 = model_comparer(df, model_type_2, impute_type_2, scale_type_2, cols2scale_2, depth_level_2)
model_name_3, y_pred_prob_3, y_test_3, auc_3, accuracy_train_3, accuracy_test_3 = model_comparer(df, model_type_3, impute_type_3, scale_type_3, cols2scale_3, depth_level_3)
model_name_4, y_pred_prob_4, y_test_4, auc_4, accuracy_train_4, accuracy_test_4 = model_comparer(df, model_type_4, impute_type_4, scale_type_4, cols2scale_4, depth_level_4)
model_name_5, y_pred_prob_5, y_test_5, auc_5, accuracy_train_5, accuracy_test_5 = model_comparer(df, model_type_5, impute_type_5, scale_type_5, cols2scale_5, depth_level_5)
model_name_6, y_pred_prob_6, y_test_6, auc_6, accuracy_train_6, accuracy_test_6 = model_comparer(df, model_type_6, impute_type_6, scale_type_6, cols2scale_6, depth_level_6)
model_name_7, y_pred_prob_7, y_test_7, auc_7, accuracy_train_7, accuracy_test_7 = model_comparer(df, model_type_7, impute_type_7, scale_type_7, cols2scale_7, depth_level_7)
model_name_8, y_pred_prob_8, y_test_8, auc_8, accuracy_train_8, accuracy_test_8 = model_comparer(df, model_type_8, impute_type_8, scale_type_8, cols2scale_8, depth_level_8)
model_name_9, y_pred_prob_9, y_test_9, auc_9, accuracy_train_9, accuracy_test_9 = model_comparer(df, model_type_9, impute_type_9, scale_type_9, cols2scale_9, depth_level_9)

# Gather the per-model results into parallel lists (position i = model slot i + 1) for the
# summary table and charts below. `models` is not read anywhere in the visible cells.
models = [model_type_1, model_type_2, model_type_3, model_type_4, model_type_5, model_type_6, model_type_7, model_type_8, model_type_9]
model_names = [model_name_1, model_name_2, model_name_3, model_name_4, model_name_5, model_name_6, model_name_7, model_name_8, model_name_9]

y_pred_probs = [y_pred_prob_1, y_pred_prob_2, y_pred_prob_3, y_pred_prob_4,
                  y_pred_prob_5, y_pred_prob_6, y_pred_prob_7, y_pred_prob_8,
                  y_pred_prob_9]

y_tests = [y_test_1, y_test_2, y_test_3, y_test_4,
                  y_test_5, y_test_6, y_test_7, y_test_8,
                  y_test_9]

aucs = [auc_1, auc_2, auc_3, auc_4,
                  auc_5, auc_6, auc_7, auc_8,
                  auc_9]

accuracy_trains = [accuracy_train_1, accuracy_train_2, accuracy_train_3, accuracy_train_4,
                  accuracy_train_5, accuracy_train_6, accuracy_train_7, accuracy_train_8,
                  accuracy_train_9]

accuracy_tests = [accuracy_test_1, accuracy_test_2, accuracy_test_3, accuracy_test_4,
                 accuracy_test_5, accuracy_test_6, accuracy_test_7, accuracy_test_8,
                 accuracy_test_9]

# Minichart: ROC AUC, PR AUC and test accuracy per model, rounded to two decimals.
# ROC AUC and accuracy are reused from the model runs; only PR AUC is recomputed here.
metrics_data = []
for idx, (truth, scores) in enumerate(zip(y_tests, y_pred_probs)):
    pr_precision, pr_recall, _ = precision_recall_curve(truth, scores)
    metrics_data.append([aucs[idx], auc(pr_recall, pr_precision), accuracy_tests[idx]])

metrics_df = pd.DataFrame(metrics_data, columns=['ROC AUC', 'PR AUC', 'Accuracy'], index=model_names).round(2)
print("\nMinichart of Model AUC's and Accuracies")
print(f'{metrics_df}\n')

# ROC curves of all models on one figure, with each model's AUC in the legend.
lw = 3
colors = ['blue', 'green', 'red', 'purple', 'orange', 'magenta', 'pink', 'cyan', 'lime']

fig, ax = plt.subplots(figsize=(12, 6))
for idx, (truth, scores) in enumerate(zip(y_tests, y_pred_probs)):
    fpr, tpr, _ = roc_curve(truth, scores)
    ax.plot(fpr, tpr, color=colors[idx], lw=lw, label=f'AUC: {aucs[idx]:.3f} ({model_names[idx]})')

ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level diagonal
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title("ROC AUC Curves")
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)  # legend outside, right of the axes
ax.grid(True)
fig.tight_layout()
plt.show()

# Precision-recall curves of all models on one figure; the PR AUC of each is kept in pr_aucs.
pr_aucs = []
fig, ax = plt.subplots(figsize=(12, 6))
for idx, (truth, scores) in enumerate(zip(y_tests, y_pred_probs)):
    precision, recall, _ = precision_recall_curve(truth, scores)
    pr_auc = auc(recall, precision)  # area under this model's PR curve
    pr_aucs.append(pr_auc)
    ax.plot(recall, precision, color=colors[idx], lw=lw, label=f'PR AUC: {pr_auc:.3f} ({model_names[idx]})')

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title("Precision-Recall AUC Curves")
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)  # legend outside, right of the axes
ax.grid(True)
fig.tight_layout()
plt.show()


#accuracies racing graph
# Train vs. test accuracy per model as horizontal bars.
# The frame is built in one shot from the parallel lists: the columns are numeric from the
# start, and every model slot keeps its own row. (Filling an empty frame with
# .loc[model_name, ...] overwrote every row sharing a name when a model was configured twice.)
accuracies = pd.DataFrame({'train': accuracy_trains, 'test': accuracy_tests}, index=model_names)

fig, ax = plt.subplots(figsize=(9, 8))
accuracies_sorted = accuracies.sort_index(ascending=False)  # reverse alphabetical; use ascending=True for alphabetical order
accuracies_sorted[['test', 'train']].plot(kind='barh', ax=ax, stacked=False, zorder=3)
ax.grid(zorder=0)  # grid drawn behind the bars

plt.title("Accuracy Comparison")
plt.xlabel("Accuracy Score")
plt.ylabel("Model")
# Bars are plotted as test, train; swap the legend entries so it reads train, test.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=[handles[1], handles[0]], labels=[labels[1], labels[0]], loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)
plt.show()