# Installing required libraries

In [None]:
!pip install xgboost catboost polars optuna shap -q

In [None]:
!pip install xgboost --upgrade

In [None]:
!pip install scikit-learn-intelex -q

In [None]:
!pip install mlflow -q

# Importing required libraries

In [None]:
import pandas as pd
import polars as pl
import optuna
import pickle

import joblib

import numpy as np
## Enabling Intel optimizations for scikit-learn (sklearnex `patch_sklearn`, applied below before sklearn is imported)
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
import mlflow

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,f1_score, roc_auc_score, accuracy_score

In [None]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

# Helper functions

In [None]:
# Helper functions
def diag_med_lab_pid_exist_check(modeling_pids, diag_pid, medications_pid, lab_pid, age_data=None):
    """Encode, per patient, whether diagnosis / medication / lab records exist.

    Args:
        modeling_pids: Iterable of patient ids to encode (order is preserved).
        diag_pid: Iterable of patient ids that have diagnosis records.
        medications_pid: Iterable of patient ids that have medication records.
        lab_pid: Iterable of patient ids that have lab records.
        age_data: Optional iterable aligned with ``modeling_pids``. When given
            and non-empty, each code is prefixed with ``"<age>_"``.

    Returns:
        A list of strings, one per zipped (pid, age) pair or per pid. Each
        string is three 0/1 digits in the order diagnosis, medication, lab
        (e.g. ``"101"``), optionally prefixed with the age value
        (e.g. ``"45_101"``).
    """
    # Sets give O(1) membership tests inside the loop below.
    diag_pid_set = set(diag_pid)
    medications_pid_set = set(medications_pid)
    lab_pid_set = set(lab_pid)

    def _presence_code(pid):
        return f"{int(pid in diag_pid_set)}{int(pid in medications_pid_set)}{int(pid in lab_pid_set)}"

    # `if age_data:` raises for numpy arrays / pandas or polars Series because
    # their truth value is ambiguous, so test for None and emptiness explicitly.
    # An empty sized container still behaves like "no age data", as before.
    use_age = age_data is not None
    if use_age:
        try:
            use_age = len(age_data) > 0
        except TypeError:
            # Unsized iterables (e.g. generators) were truthy before; keep that.
            use_age = True

    if use_age:
        return [f"{age}_{_presence_code(pid)}" for pid, age in zip(modeling_pids, age_data)]
    return [_presence_code(pid) for pid in modeling_pids]

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, average_precision_score
from tqdm import tqdm

def get_metrics(model, X_test, y_test, n_bootstrap=1000, random_state=42):
    """Evaluate a fitted binary classifier and bootstrap 95% CIs for key metrics.

    Prints and returns accuracy, ROC AUC, average precision, sensitivity,
    specificity and PPV on ``(X_test, y_test)``, plus percentile confidence
    intervals for ROC AUC, sensitivity, specificity and PPV obtained from a
    class-stratified bootstrap.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix passed to the model.
        y_test: True labels; the code assumes they are encoded as 0 / 1.
        n_bootstrap: Number of bootstrap replicates.
        random_state: Seed for the bootstrap random generator.

    Returns:
        dict with keys ``confusion_matrix``, ``accuracy``, ``roc_auc``,
        ``roc_auc_ci``, ``auc_pr``, ``sensitivity``, ``sensitivity_ci``,
        ``specificity``, ``specificity_ci``, ``ppv``, ``ppv_ci`` and
        ``classification_report``. Each ``*_ci`` is a ``(lower, upper)`` tuple,
        or ``(nan, nan)`` when no bootstrap replicate could be evaluated.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print("AUC:", roc_auc)

    auc_pr = average_precision_score(y_test, y_pred_proba)
    print("Precision-Recall AUC:", auc_pr)

    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)

    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()

    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    ppv = TP / (TP + FP) if (TP + FP) > 0 else 0   # PPV calculation

    print("Sensitivity (Recall):", sensitivity)
    print("Specificity:", specificity)
    print("PPV (Precision):", ppv)

    def stratified_bootstrap(y_true, y_pred, y_pred_proba, n_bootstrap=1000, random_state=42):
        """Resample positives and negatives separately and score each replicate."""
        # Convert once, outside the loop. This also makes `== 1` an element-wise
        # comparison even when y_true is a plain Python list.
        y_true_arr = np.asarray(y_true)
        y_pred_arr = np.asarray(y_pred)
        y_proba_arr = np.asarray(y_pred_proba)

        pos_idx = np.flatnonzero(y_true_arr == 1)
        neg_idx = np.flatnonzero(y_true_arr == 0)

        aucs, sensitivities, specificities, ppvs = [], [], [], []
        if len(pos_idx) == 0 or len(neg_idx) == 0:
            # Nothing to stratify on; the caller turns empty lists into NaN CIs.
            return aucs, sensitivities, specificities, ppvs

        # Each replicate draws one tenth of each class (with replacement).
        # NOTE(review): this is smaller than the usual n-out-of-n bootstrap and
        # makes the intervals wider than for the full test set -- confirm intended.
        # max(1, ...) keeps a class with fewer than 10 samples from vanishing.
        n_pos = max(1, len(pos_idx) // 10)
        n_neg = max(1, len(neg_idx) // 10)
        rng = np.random.default_rng(seed=random_state)

        for _ in tqdm(range(n_bootstrap), desc="Bootstrapping"):
            pos_bs = rng.choice(pos_idx, size=n_pos, replace=True)
            neg_bs = rng.choice(neg_idx, size=n_neg, replace=True)
            idx = np.concatenate([pos_bs, neg_bs])
            y_bs = y_true_arr[idx]
            y_pred_bs = y_pred_arr[idx]
            y_pred_proba_bs = y_proba_arr[idx]

            try:
                auc = roc_auc_score(y_bs, y_pred_proba_bs)
            except ValueError:
                # roc_auc_score rejects degenerate replicates (e.g. one class).
                continue

            cm_bs = confusion_matrix(y_bs, y_pred_bs)
            if cm_bs.shape != (2, 2):
                continue
            TN_bs, FP_bs, FN_bs, TP_bs = cm_bs.ravel()
            sens_bs = TP_bs / (TP_bs + FN_bs) if (TP_bs + FN_bs) > 0 else 0
            spec_bs = TN_bs / (TN_bs + FP_bs) if (TN_bs + FP_bs) > 0 else 0
            ppv_bs = TP_bs / (TP_bs + FP_bs) if (TP_bs + FP_bs) > 0 else 0

            # Append together so all four lists describe the same replicates.
            aucs.append(auc)
            sensitivities.append(sens_bs)
            specificities.append(spec_bs)
            ppvs.append(ppv_bs)

        return aucs, sensitivities, specificities, ppvs

    aucs, sensitivities, specificities, ppvs = stratified_bootstrap(y_test, y_pred, y_pred_proba, n_bootstrap, random_state)

    def get_ci(data, alpha=0.05):
        """Return the (alpha/2, 1 - alpha/2) percentile interval of ``data``."""
        if len(data) == 0:
            return float('nan'), float('nan')
        lower = np.percentile(data, 100 * (alpha/2))
        upper = np.percentile(data, 100 * (1 - alpha/2))
        return lower, upper

    roc_auc_ci = get_ci(aucs)
    sensitivity_ci = get_ci(sensitivities)
    specificity_ci = get_ci(specificities)
    ppv_ci = get_ci(ppvs)

    print(f"ROC AUC 95% CI: {roc_auc_ci}")
    print(f"Sensitivity 95% CI: {sensitivity_ci}")
    print(f"Specificity 95% CI: {specificity_ci}")
    print(f"PPV 95% CI: {ppv_ci}")

    return {
        'confusion_matrix': cm,
        'accuracy': accuracy,
        'roc_auc': roc_auc, 'roc_auc_ci': roc_auc_ci,
        'auc_pr': auc_pr,
        'sensitivity': sensitivity, 'sensitivity_ci': sensitivity_ci,
        'specificity': specificity, 'specificity_ci': specificity_ci,
        'ppv': ppv, 'ppv_ci': ppv_ci,
        'classification_report': report
    }

# Config class

In [None]:
mlflow.set_tracking_uri("../Determine_ML_FLOW_Experiment")

In [None]:
class ml_config:
    """Notebook-wide configuration: input paths, experiment flags and MLflow names.

    All values are class attributes read directly as ``ml_config.<name>``.
    The three pickle files at the bottom are loaded when the class body
    executes, so defining this class requires those files to exist.
    """
    # Folder + file name of the modeling dataset (read with pl.read_parquet).
    base_folder ='../Determine_final_modeling_datasets/'
    # Identifier / label columns excluded from both feature lists.
    columns_to_ignore_cat = ['PATIENT_NUM','FirstOutcomeDate','Outcome']
    # Name of the label column.
    target_column = 'Outcome'
    file = 'Determine_joined_med_usage_lab_median_domain_expert_diag_phemap_without_icd10z_bmi_bp_cvs_ordinal_nominal_encoded.parquet'
    
    # Parquet file whose PATIENT_NUM column lists patients with encounter info.
    patient_enc_info_path = '../Determine_cohort_after_visit_index_details.parquet'
    
    # When True, Outcome==0 training rows whose PATIENT_NUM is absent from
    # patient_enc_info_path are dropped (test rows are left untouched).
    no_enc_ppid_experiment =  True
    # When True, training rows listed in
    # Patient_ids_feature_count_less_than_1_no_bmi are dropped.
    low_feature_count_less_than_1_bmi_missing_exclude = False
    
    # When True, training rows listed in either Outcome0_after_vi_*_no_enc_pids_over_50
    # list are dropped.
    no_enc_after_2_plus_vi_above_50_exlcude = False
    
    # When True, patients listed in 'pat_num_ignore_dia_meds.pkl' are removed
    # from the whole dataset before splitting.
    pat_before_index_visit_dia_meds_remove = True
    
    pat_outcome0_with_dia_meds_remove = True ### Drop patients listed as taking an active ingredient of diabetes medications
    
    sdoh_screening_experiment = False ### SDOH screening experiment flag (separate pid files; 'SDH*' columns dropped)
    
    # When True, every -100 value in the dataset is replaced (the cell uses NaN).
    replace_neg100_with_none = True
    
    
    ### MLflow variables
    mlflow_experiment_name = 'Low feature count experiments'
    run_name = 'Effect of removing patient records who have less than 5 feature'
    
    # Initial model choice; a later cell reassigns this to 'xgboost'.
    model_name = 'catboost'
    
    # NOTE: pickle.load executes arbitrary code from the file -- only load
    # pickles produced by this project.
    with open('./Experiements_related_files/Outcome0_after_vi+1_no_enc_pids_over_50.pkl','rb') as f:
        Outcome0_after_vi_1_no_enc_pids_over_50 = pickle.load(f)
    
    with open('./Experiements_related_files/Outcome0_after_vi+2_no_enc_pids_over_50.pkl','rb') as f:
        Outcome0_after_vi_2_no_enc_pids_over_50 = pickle.load(f)
        
    with open('./Experiements_related_files/Patient_ids_feature_count_less_than_1_no_bmi.pkl','rb') as f:
        Patient_ids_feature_count_less_than_1_no_bmi = pickle.load(f)
    

# Loading modeling data file

In [None]:
modeling_df = pl.read_parquet(ml_config.base_folder + ml_config.file)

In [None]:
'LOINC:2532-0' in modeling_df.columns

In [None]:
modeling_df.head()

In [None]:
### Replacing -100 with None
if ml_config.replace_neg100_with_none:
    print('Replacing -100 with None')
    modeling_df = modeling_df.with_columns([
        pl.when(pl.col(c) == -100).then(np.nan).otherwise(pl.col(c)).alias(c)
        for c in modeling_df.columns
    ])

In [None]:
print("Number of data points in the dataset: ",len(modeling_df))

In [None]:
modeling_df.filter(pl.col('Outcome')==1).sort('FirstOutcomeDate', descending=False).head()

In [None]:
modeling_df.filter(pl.col('PATIENT_NUM') == 297249)

In [None]:
np.unique(modeling_df['Outcome'].to_list(),return_counts = True)

In [None]:
if ml_config.pat_before_index_visit_dia_meds_remove:
    with open('pat_num_ignore_dia_meds.pkl','rb') as f:
        med_ignore_patient_num = pickle.load(f)
    modeling_df = modeling_df.filter(~pl.col('PATIENT_NUM').is_in(med_ignore_patient_num))
    print(len(modeling_df))
    
    

In [None]:
'metformin' in modeling_df.columns

In [None]:
if ml_config.no_enc_ppid_experiment:
    modeling_patient_ids = modeling_df['PATIENT_NUM'].to_list()
    pids_enc_info_after_vi = pl.read_parquet(ml_config.patient_enc_info_path)['PATIENT_NUM'].to_list()
    pids_to_drop = set(modeling_patient_ids) - set(pids_enc_info_after_vi)
#     modeling_df = modeling_df.filter(((~pl.col('PATIENT_NUM').is_in(pids_to_drop)) & (pl.col('Outcome')==0)) | (pl.col('Outcome')==1))
    print("Number of data points that needs to be excluded if seen in train: ",len(pids_to_drop))
    print("The target distribution is:", np.unique(modeling_df['Outcome'].to_list(),return_counts = True) )

In [None]:
# if ml_config.low_feature_count_exclude:
    
    
if ml_config.pat_outcome0_with_dia_meds_remove:
    with open('Determine_outcome0_act_ing_dia_after_index_visit_patient_nums.pkl', 'rb') as f:
        pat_outcome0_with_dia_meds_remove = pickle.load(f)
    print("Number of patient ids removed from dataset: ",len(pat_outcome0_with_dia_meds_remove))    
    modeling_df = modeling_df.filter(~pl.col('PATIENT_NUM').is_in(pat_outcome0_with_dia_meds_remove)) 
    print(np.unique(modeling_df['Outcome'].to_list(),return_counts = True))

In [None]:
# modeling_df  = modeling_df.drop(ml_config.columns_to_drop)
# modeling_df.head()

In [None]:
### Defining categorical columns
cat_features = [col for col in modeling_df.columns if  not (col.startswith('LOINC') 
                                                            or col in ml_config.columns_to_ignore_cat
                                                            or col in ['BMI',
                                                                       'mode_height',
                                                                       'average_weight',
                                                                         'average_diastolic_value',
                                                                         'average_systolic_value',
                                                                          "ACS_MedHHIncome", 
                                                                       "ACS_GINI", 
                                                                       "ACS_Unemployment", 
                                                                       "ACS_pctPoverty100", 
                                                                       "ACS_pctCollGrad"]
                                                                          )]
numerical_features = [col for col in modeling_df.columns if ((col.startswith('LOINC') 
#                                                             or col not in ml_config.columns_to_ignore_cat
                                                            or col in ['BMI',
                                                                       'mode_height',
                                                                       'average_weight',
                                                                         'average_diastolic_value',
                                                                         'average_systolic_value',
                                                                          "ACS_MedHHIncome", 
                                                                       "ACS_GINI", 
                                                                       "ACS_Unemployment", 
                                                                       "ACS_pctPoverty100", 
                                                                       "ACS_pctCollGrad"
                                                                          ] + [
                                                                            "mode_height",
#                                                                             "median_value",
#                                                                             "slope_weight",
                                                                            "BMI",
                                                                            "median_diastolic_value",
#                                                                             "slope_dia_bp",
                                                                            "median_systolic_value",
#                                                                             "slope_sys_bp"
                                                                        ])
                                                            and (col not in ml_config.columns_to_ignore_cat))]

In [None]:
'average_weight' in cat_features

In [None]:
print("Number of categorical features: ",len(cat_features))
print("Number of numerical features: ",len(numerical_features))

In [None]:
loinc_columns = [col for col in modeling_df.columns if col.startswith('LOINC')]
print("Number of lab results features: ",len(loinc_columns))

In [None]:
### VERY IMPORTANT!!!!
modeling_df = modeling_df.with_columns([
    pl.col(col).cast(pl.Float32)
    for col in loinc_columns
])

In [None]:
modeling_df.filter(pl.col('metformin')==1)

In [None]:
# Get unique data types
unique_dtypes = set(modeling_df.dtypes)
print(unique_dtypes)

In [None]:
np.unique(modeling_df['Outcome'].to_list(),return_counts = True)

# Train/test split

In [None]:
data_train_pids, data_test_pids = train_test_split(modeling_df['PATIENT_NUM'], test_size=0.2, stratify=modeling_df['Outcome'], random_state = 42)

In [None]:
data_train_pids = data_train_pids.to_list()
data_test_pids = data_test_pids.to_list()

In [None]:
train_pids = data_train_pids
test_pids = data_test_pids

In [None]:
len(data_train_pids)

In [None]:
len(data_test_pids)

In [None]:
if ml_config.sdoh_screening_experiment:

    with open('sdohscreen-test_data_pids.pkl', 'wb') as file: 
        # A new file will be created 
        pickle.dump(data_test_pids, file) 

    with open('sdohscreen-train_data_pids.pkl', 'wb') as file: 
        # A new file will be created 
        pickle.dump(data_train_pids, file) 

    #Open the file in binary mode 
    with open('sdohscreen-train_data_pids.pkl', 'rb') as file: 
        train_pids = pickle.load(file) 
    with open('sdohscreen-test_data_pids.pkl', 'rb') as file: 
        test_pids = pickle.load(file)  
    
    
else:
    # saving test pids
    with open('test_data_pids.pkl', 'wb') as file: 
        # A new file will be created 
        pickle.dump(data_test_pids, file) 

    with open('train_data_pids.pkl', 'wb') as file: 
        # A new file will be created 
        pickle.dump(data_train_pids, file) 

    
    
print("Number of train pids: ", len(train_pids))
print("Number of test pids: ", len(test_pids))

In [None]:
if ml_config.sdoh_screening_experiment:
    print("In screening sdoh experiment")
#Open the file in binary mode 
    with open('sdohscreen-train_data_pids.pkl', 'rb') as file: 
        train_pids = pickle.load(file) 
    with open('sdohscreen-test_data_pids.pkl', 'rb') as file: 
        test_pids = pickle.load(file) 
    
else:    
    #Open the file in binary mode 
    with open('train_data_pids.pkl', 'rb') as file: 
        train_pids = pickle.load(file) 
    with open('test_data_pids.pkl', 'rb') as file: 
        test_pids = pickle.load(file)

In [None]:
print(len(train_pids))
print(len(test_pids))

In [None]:
# train_pids = list(set(data_train_pids+data_test_pids) - set(test_pids))

In [None]:
data_train = modeling_df.filter(pl.col('PATIENT_NUM').is_in(train_pids))
data_test = modeling_df.filter(pl.col('PATIENT_NUM').is_in(test_pids))

In [None]:
print(len(data_train))
print(len(data_test))

### Experiments exclusion part

In [None]:
if ml_config.no_enc_ppid_experiment:
    print("Dropping patients with no encounters")
    data_train = data_train.filter(((~pl.col('PATIENT_NUM').is_in(pids_to_drop)) & (pl.col('Outcome')==0)) | (pl.col('Outcome')==1))

In [None]:
if ml_config.no_enc_after_2_plus_vi_above_50_exlcude:
    print("Dropping patients with no encounters after bi +2 year and age over 50")
    data_train = data_train.filter(~pl.col('PATIENT_NUM').is_in(ml_config.Outcome0_after_vi_1_no_enc_pids_over_50 +
                                                                ml_config.Outcome0_after_vi_2_no_enc_pids_over_50
                                                               ))

In [None]:
if ml_config.low_feature_count_less_than_1_bmi_missing_exclude:
    
    print("Dropping patients with fetaures less than 1 and bmi is missing")
    data_train = data_train.filter(~pl.col('PATIENT_NUM').is_in(ml_config.Patient_ids_feature_count_less_than_1_no_bmi))
    

In [None]:
print(len(data_train))
print(len(data_test))

In [None]:
np.unique(data_test['Outcome'].to_list(), return_counts =True)

In [None]:
del modeling_df

In [None]:
test_data_firstoutcome_df = data_test.select(['PATIENT_NUM','FirstOutcomeDate','Outcome'])

In [None]:
X_train,y_train = data_train.drop(['PATIENT_NUM','FirstOutcomeDate','Outcome','mode_height']).to_pandas(), data_train['Outcome'].to_pandas()
X_test,y_test = data_test.drop(['PATIENT_NUM','FirstOutcomeDate','Outcome', 'mode_height']).to_pandas(), data_test['Outcome'].to_pandas()

In [None]:
type(y_train)

In [None]:
del data_train#, data_test

In [None]:
X_test[['Sex_CD_F','Sex_CD_M']].sum()

# Boruta feature selection

In [None]:
!pip install boruta -q 

In [None]:
# loinc_columns = [col for col in X_train.columns if col.startswith('LOINC')]
# len(loinc_columns)
from xgboost import XGBClassifier

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

In [None]:
# from sklearn.model_selection import train_test_split

# X_train, X_train_2, y_train, y_train_2 = train_test_split(
#     X_train, y_train,
#     train_size=0.6,       # Or e.g. 0.2 if you want a % instead of raw size
#     stratify=y_train,
#     random_state=42
# )

In [None]:
# Negative-to-positive class ratio, used to up-weight the positive class.
class_counts = y_train.value_counts()
ratio = float(class_counts[0]) / class_counts[1]


model = XGBClassifier(n_jobs=-1, n_estimators = 300, max_depth = 4, scale_pos_weight = ratio)

# Initialize Boruta (max_iter = number of iterations to perform)
feat_selector = BorutaPy(verbose=2, estimator=model, max_iter=150)

# Train Boruta
# N.B.: X and y must be numpy arrays, so pass the underlying arrays rather
# than the pandas objects. Column order is preserved, so the boolean mask in
# feat_selector.support_ still lines up with X_train.columns.
feat_selector.fit(X_train.values, y_train.values)

In [None]:
selected_features = X_train.columns[feat_selector.support_]
print(selected_features)

In [None]:
bf

# ML models

## Single objective function

In [None]:
use_boruta = True

In [None]:
if use_boruta:
    boruta_features = ['Age_group', 'azithromycin', 'levothyroxine', 'acyclovir',
       'ceftriaxone', 'phe_401.1', 'phe_271.3', 'phe_41.0', 'phe_278.11',
       'phe_649.1', 'LOINC:2085-9', 'LOINC:2345-7', 'LOINC:74774-1',
       'LOINC:27353-2', 'LOINC:9318-7', 'LOINC:62238-1', 'mode_height', 'BMI',
       'median_diastolic_value', 'ACS_MedHHIncome', 'ACS_pctCollGrad',
       'Race_CD_02', 'Race_CD_05', 'Hispanic_CD_Y', 'Gender_CD_M',
       'Gender_CD_W']
    boruta_features.remove('mode_height')
    
    X_train = X_train[boruta_features]
    X_test =X_test[boruta_features]
    
    print("Number of featues being used: ",len(boruta_features))

In [None]:
# Remove columns starting with 'SDH'

if ml_config.sdoh_screening_experiment:
    print('SDH features excluded')
    X_train = X_train.loc[:, ~X_train.columns.str.startswith('SDH')]
    X_test = X_test.loc[:, ~X_test.columns.str.startswith('SDH')]

In [None]:
X_train.columns

#### Adjusting features based on model_name (some models need feature scaling for numerical data)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
numerical_features_use

In [None]:
# Initialize the scaler

if ml_config.model_name == 'logistic_regression':
    numerical_features_use = [i for i in X_train.columns if i in numerical_features]
    scaler = StandardScaler()
    print("Scaling data")
    # Fit and transform only the specified columns
    X_train[numerical_features_use] = scaler.fit_transform(X_train[numerical_features_use])
    X_test[numerical_features_use] = scaler.transform(X_test[numerical_features_use])

### Optuna

In [None]:
ml_config.model_name = 'xgboost'

In [None]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def objective(trial):
    """Optuna objective: train one candidate model and return its positive-class recall.

    The model family is chosen by ``ml_config.model_name`` ('xgboost',
    'catboost', 'random_forest' or 'logistic_regression'). The candidate is
    fitted on 80% of the training data and scored on the remaining 20%.
    The test set is deliberately NOT used here: tuning on it would leak test
    information into the hyperparameters and bias the final test metrics.

    Args:
        trial: ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        Recall of class 1 on the held-out validation split.

    Raises:
        ValueError: If ``ml_config.model_name`` is not a supported model.
    """
    # Fixed seed => every trial is scored on the same validation split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
    )

    if ml_config.model_name == 'xgboost':
        n_positive = float(np.sum(y_fit))
        param = {
            'objective': 'binary:logistic',  # Binary classification
            'seed': 42,
            'eta': trial.suggest_float('eta', 0.01, 0.3, step=0.01),
            'n_estimators': trial.suggest_int('n_estimators', 100, 600, step =100),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),
            'gamma': trial.suggest_float('gamma', 0, 5.0),
            # Class weight for imbalance: negatives / positives.
            'scale_pos_weight': (len(y_fit) - n_positive) / n_positive,
        }

        model = XGBClassifier(**param, enable_categorical=True, device="cuda")

    elif ml_config.model_name == 'catboost':
        param = {
            "iterations": trial.suggest_categorical('iterations',[200, 400, 600, 800]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "depth": trial.suggest_int("depth", 4, 12),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
            'task_type':"GPU",
        }
        model = CatBoostClassifier(**param, auto_class_weights='Balanced',allow_writing_files=False,
                                   silent=True)

    elif ml_config.model_name == 'random_forest':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 800),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        }
        model = RandomForestClassifier(**param, class_weight ='balanced', n_jobs = -1)

    elif ml_config.model_name == 'logistic_regression':
        param = {
            # suggest_loguniform is deprecated; suggest_float(log=True) is the
            # equivalent already used for the CatBoost learning rate above.
            'C': trial.suggest_float('C', 1e-2, 10.0, log=True),
            'max_iter': trial.suggest_int('max_iter', 100, 600),
            'solver': trial.suggest_categorical('solver', ['liblinear']),
        }
        # n_jobs is omitted: it has no effect with the liblinear solver.
        model = LogisticRegression(**param, class_weight ='balanced')

    else:
        raise ValueError("Unsupported model name")

    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_val)

    return recall_score(y_val, y_pred, pos_label=1)



In [None]:
# Maximize the value returned by objective(), i.e. positive-class recall.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials= 10)

print('')
print('Best hyperparameters:', study.best_params)
# The objective returns recall, not AUC, so label the value accordingly.
print('Best recall (positive class):', study.best_value)
print('')

In [None]:
# Hyperparameters found by the Optuna study. This must be defined: the
# random_forest and logistic_regression branches unpack it and previously
# raised NameError because the assignment was commented out.
best_params = study.best_params

if ml_config.model_name == 'xgboost':
    # NOTE(review): the tuned parameters are intentionally not applied here
    # (library defaults are used); pass **best_params to use them.
    model = XGBClassifier(
        scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train),
        device='cuda',
    )

elif ml_config.model_name == 'catboost':
    # NOTE(review): tuned parameters intentionally not applied, as for XGBoost.
    model = CatBoostClassifier(
        auto_class_weights='Balanced',
        allow_writing_files=False,
        task_type='GPU',
        silent=True,
    )

elif ml_config.model_name == 'random_forest':
    model = RandomForestClassifier(**best_params,
                                   class_weight ='balanced', n_jobs = -1)

elif ml_config.model_name == 'logistic_regression':
    model = LogisticRegression(**best_params,
                               class_weight ='balanced', n_jobs = -1)

else:
    raise ValueError("Unsupported model name")

In [None]:
model.fit(X_train,y_train)

In [None]:
all_metrics = get_metrics(model,X_test, y_test)

In [None]:
## Confusion matrix
cm = all_metrics['confusion_matrix']
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cm.flatten()/np.sum(cm)]
labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_counts)]
labels = np.asarray(labels).reshape(2,2)

fig, ax = plt.subplots()  # Use fig, ax instead of plt.subplot()
sns.heatmap(cm, annot=labels, fmt='', cmap='Blues', ax=ax)

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix') 
plt.show()

In [None]:
from joblib import dump  

In [None]:
# Save model using joblib
if use_boruta:
    feat_use = 'boruta_features'
else:
    feat_use = 'all_features'

model_filename_base = f"../Determine_trained_models/{ml_config.model_name}_dataset_{ml_config.file.split('.')[0]}_{feat_use}"

if "xgboost" in ml_config.model_name:
    # XGBoost - use built-in save_model
    model_filename = model_filename_base + ".json"
    model.save_model(model_filename)
    print(f"XGBoost model saved as {model_filename}")
elif "catboost" in ml_config.model_name:
    # CatBoost - use built-in save_model
    model_filename = model_filename_base + ".cbm"
    model.save_model(model_filename)
    print(f"CatBoost model saved as {model_filename}")
elif  "random_forest" in ml_config.model_name:
    # RandomForest (scikit-learn) - use joblib
    model_filename = model_filename_base + ".pkl"
    dump(model, model_filename)
    print(f"RandomForest model saved as {model_filename}")
else:
    # Default to joblib for 'sklearn' compatible models
    model_filename = model_filename_base + ".pkl"
    dump(model, model_filename)
    print(f"Model saved as {model_filename}")

# Feature Importance

In [None]:
# get_feature_importance() exists only on CatBoost models; XGBoost and
# RandomForest expose feature_importances_, and LogisticRegression exposes
# coef_. Pick whichever the fitted model provides.
if hasattr(model, 'get_feature_importance'):
    feature_importances = model.get_feature_importance()
elif hasattr(model, 'feature_importances_'):
    feature_importances = model.feature_importances_
else:
    # Linear model: use absolute coefficient size as the importance proxy.
    feature_importances = np.abs(model.coef_).ravel()
feature_names = X_train.columns  # Assuming X_train is a DataFrame

# Combine feature names with their importances
feature_importance_dict = dict(zip(feature_names, feature_importances))

In [None]:
features_not_ignored = [feature for feature, importance in feature_importance_dict.items() if importance != 0]

print("Features that are not ignored:", len(features_not_ignored))
print("Actual feature count: ",len(feature_names))

# Fairness Metrics

In [None]:
!pip install fairlearn -q

from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference, equalized_odds_difference
from sklearn.metrics import accuracy_score

In [None]:
y_pred = model.predict(X_test)

In [None]:
race_columns = []

In [None]:
X_test

In [None]:
def row_collapse(row):
    """Collapse a one-hot style row into one group index.

    Returns -1 when the row sums to zero (no indicator set); otherwise the
    position of the row's maximum value as given by ``np.argmax``.
    """
    any_indicator_set = row.sum() != 0
    return np.argmax(row) if any_indicator_set else -1

race_columns = ['Race_CD_01','Race_CD_02','Race_CD_03','Race_CD_04', 'Race_CD_05']

if set(race_columns).issubset(X_test.columns):
    sensitive_feature_df = X_test[race_columns]
else:
    # Feature selection (e.g. the Boruta subset) can remove some race
    # indicators from X_test, which made the original lookup raise KeyError.
    # Fairness should still be assessed over all race groups, so read the
    # indicators from the unselected test frame instead.
    # NOTE(review): assumes data_test still has the same rows, in the same
    # order, as X_test and contains all Race_CD_* columns -- confirm.
    sensitive_feature_df = data_test.select(race_columns).to_pandas()

# One group id per row: position of the active race indicator, -1 if none.
sensitive_feature = sensitive_feature_df.apply(row_collapse, axis =1)

In [None]:
print('For RACE')
metricframe = MetricFrame(
    metrics={
        'accuracy': accuracy_score,
        'selection_rate': selection_rate
    },
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sensitive_feature
)

print(metricframe.by_group)

# Demographic parity difference (difference in positive rate between groups)
dp_diff = demographic_parity_difference(y_test, y_pred, sensitive_features=sensitive_feature)
print("Demographic Parity Difference:", dp_diff)

# Equalized odds difference (difference in TPR/FPR between groups)
eo_diff = equalized_odds_difference(y_test, y_pred, sensitive_features=sensitive_feature)
print("Equalized Odds Difference:", eo_diff)

In [None]:
[col for col in X_train.columns if 'Gender' in col]

In [None]:
# Gender indicator columns present in the evaluated feature set.
gender_columns = [col for col in X_test.columns if 'Gender' in col]
sensitive_feature_df = X_test[gender_columns]

# One group id per row: position of the active gender indicator, -1 if none.
sensitive_feature = sensitive_feature_df.apply(row_collapse, axis =1)
print('For gender')
metricframe = MetricFrame(
    metrics={
        'accuracy': accuracy_score,
        'selection_rate': selection_rate
    },
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sensitive_feature
)

print(metricframe.by_group)

# The gender results get their own names. Reusing dp_diff / eo_diff here
# overwrote the race results, which are later logged to MLflow under
# "... for RACE" metric names.

# Demographic parity difference (difference in positive rate between groups)
dp_diff_gender = demographic_parity_difference(y_test, y_pred, sensitive_features=sensitive_feature)
print("Demographic Parity Difference:", dp_diff_gender)

# Equalized odds difference (difference in TPR/FPR between groups)
eo_diff_gender = equalized_odds_difference(y_test, y_pred, sensitive_features=sensitive_feature)
print("Equalized Odds Difference:", eo_diff_gender)

In [None]:
np.unique(sensitive_feature.to_list(), return_counts =True)

In [None]:
# Demographic Parity Difference: 0.1147
# What it means: The difference between the highest and lowest group selection rates.
# Interpretation: The selection rate (share of positive predictions) of the most-favored group is 11.47 percentage points higher than that of the least-favored group, when comparing all racial groups. Ideally, for fairness, this should be close to 0.
# Equalized Odds Difference: 0.1308
# What it means: The larger of two between-group gaps: the gap in true positive rates and the gap in false positive rates.
# Interpretation: A value of 0.13 (13.08%) means there is a noticeable disparity in how accurately the model assigns outcomes between groups. Again, closer to 0 is more fair.

## Saving results and details to MLflow

In [None]:
import io
import json

class mlflow_config:
    """Switches intended to control which optional items are logged to MLflow.

    NOTE(review): none of these flags is read in the visible logging cell --
    confirm whether they are used elsewhere.
    """
    # Whether fairness metrics should be logged.
    log_fairness_metrics = False
    
    # Whether feature importances should be logged.
    log_feature_importance = False
    
    # Presumably: log dataset row counts before/after exclusions -- TODO confirm.
    log_df_count_after_before = True
    
    
def log_target_distribution(y, split_name):
    """Log a bar chart of the class counts in ``y`` to the active MLflow run.

    The figure is stored as ``"<split_name>_target_distribution.png"`` and is
    closed afterwards.
    """
    class_counts = pd.Series(y).value_counts().sort_index()
    # Gap between the top of a bar and its label: 1% of the tallest bar.
    label_gap = 0.01*max(class_counts)

    fig, ax = plt.subplots(figsize=(6,4))
    class_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Target distribution for {split_name}')
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')

    # Write each count just above its bar.
    for bar_position, count in enumerate(class_counts):
        ax.text(bar_position, count + label_gap, str(count), ha='center')

    # Log the figure directly, then release it.
    mlflow.log_figure(fig, f"{split_name}_target_distribution.png")
    plt.close(fig)

In [None]:
params = model.get_params()

# Create (or select) the MLflow experiment for this model family
mlflow.set_experiment(f"{ml_config.mlflow_experiment_name}_ModelName_{ml_config.model_name}")

# Tags describing the data-exclusion configuration of this run.
# Every value is read from ml_config. The original referenced a nonexistent
# attribute (low_feature_count_less_than_5_bmi_exclude) and a bare, undefined
# name (no_enc_after_2_plus_vi_above_50_exlcude).
tags = {
    'model_name': ml_config.model_name,
    'dataset_filename': ml_config.file,
    "no_enc_ppid_experiment": ml_config.no_enc_ppid_experiment,
    # Tag key renamed to match the ml_config attribute it reports.
    "low_feature_count_less_than_1_bmi_missing_exclude": ml_config.low_feature_count_less_than_1_bmi_missing_exclude,
    'no_enc_after_2_plus_vi_above_50_exlcude': ml_config.no_enc_after_2_plus_vi_above_50_exlcude,
    "pat_before_index_visit_dia_meds_remove": ml_config.pat_before_index_visit_dia_meds_remove,
    "pat_outcome0_with_dia_meds_remove": ml_config.pat_outcome0_with_dia_meds_remove,
    "sdoh_screening_experiment": ml_config.sdoh_screening_experiment,
}

# Start an MLflow run
with mlflow.start_run(run_name=ml_config.run_name, tags=tags):

    # Log train and test PIDs
    mlflow.log_text(json.dumps(train_pids), "train_pids.json")
    mlflow.log_text(json.dumps(test_pids), "test_pids.json")

    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the evaluation metrics. They live in the dict returned by
    # get_metrics(); there are no module-level accuracy / roc_auc / ... names.
    for metric_name in ("accuracy", "roc_auc", "sensitivity", "specificity", "auc_pr"):
        mlflow.log_metric(metric_name, float(all_metrics[metric_name]))
    # The classification report is a formatted string, not a dict, so it is
    # logged as text rather than with log_dict.
    mlflow.log_text(all_metrics['classification_report'], "classification_report.txt")

    log_target_distribution(y_train, 'train')
    log_target_distribution(y_test, 'test')

    # `fig` is the confusion-matrix figure created in the plotting cell.
    mlflow.log_figure(fig, "confusion_matrix.png")
    plt.close(fig)

    ### Fairness metrics
    # NOTE(review): dp_diff / eo_diff hold the values of whichever fairness
    # cell ran last; make sure that was the RACE analysis before logging.
    mlflow.log_metric("demographic parity difference for RACE", dp_diff)
    mlflow.log_metric("Equalized odds difference for RACE", eo_diff)
    





In [None]:
!mlflow ui

# Model performance on patients who did not develop the outcome within 5 years but did later

In [None]:
data_test_after_5 = data_test.with_columns(pl.Series('model_prediction',model.predict(X_test)))
data_test_after_5 = data_test_after_5.with_columns(pl.Series('model_prediction_prob',model.predict_proba(X_test)[:,1]))

In [None]:
# Keep patients labelled Outcome == 0 that nevertheless have a FirstOutcomeDate.
# Filter the frame that already carries the model_prediction columns:
# rebuilding it from data_test discarded them, so the accuracy cell below
# failed on the missing 'model_prediction' column.
data_test_after_5 = data_test_after_5.filter((pl.col('Outcome') == 0) &
                 (~pl.col('FirstOutcomeDate').is_null())
                )

In [None]:
from sklearn.metrics import accuracy_score

print(accuracy_score(data_test_after_5['Outcome'].to_list(), data_test_after_5['model_prediction'].to_list()))

# Model Performance over years

In [None]:
data_test = data_test.with_columns(pl.Series('model_prediction',model.predict(X_test)))
data_test = data_test.with_columns(pl.Series('model_prediction_prob',model.predict_proba(X_test)[:,1]))

In [None]:
# def adjust_prediction(df: pl.DataFrame) -> pl.DataFrame:
#     return df.with_columns(
#         pl.when(pl.col("Outcome") == 0)
#         .then(1 - pl.col("model_prediction_prob"))
#         .otherwise(pl.col("model_prediction_prob"))
#         .alias("model_prediction_prob")
#     )

# data_test = adjust_prediction(data_test)

In [None]:
data_test.filter((pl.col('Outcome') == 0) &  (pl.col('model_prediction_prob') >0.95))

In [None]:
req_test_df = data_test.select(['PATIENT_NUM','FirstOutcomeDate',
                                'Outcome','model_prediction','model_prediction_prob']).filter(pl.col('Outcome') == 1)
len(req_test_df)

In [None]:
req_test_df.head()

In [None]:
np.unique(data_test['Outcome'], return_counts =True)

In [None]:
np.unique(data_test.filter(pl.col('model_prediction_prob')>0.9)['Outcome'].to_list(), return_counts = True)

In [None]:
before_04_2020_prob_09 = data_test.filter(pl.col('model_prediction_prob')>0.97)


print(classification_report(before_04_2020_prob_09['Outcome'].to_list(), before_04_2020_prob_09['model_prediction'].to_list()))

In [None]:
req_test_df

In [None]:
# Evaluate predictions on `req_test_df`, bucketed by the date of first outcome.
import datetime

# Half-open [start, end) windows on FirstOutcomeDate
date_ranges = [
    (datetime.date(2016, 4, 1), datetime.date(2018, 4, 1)),
    (datetime.date(2018, 4, 1), datetime.date(2019, 4, 1)),
    (datetime.date(2019, 4, 1), datetime.date(2020, 4, 1)),
    (datetime.date(2020, 4, 1), datetime.date(2021, 4, 1)),
    (datetime.date(2021, 4, 1), datetime.date(2022, 4, 1)),
]

# Running total of patients across all windows (inspected in the next cell)
sum_pat = 0
for start, end in date_ranges:
    print(start)
    bucket_df = req_test_df.filter(
        (pl.col("FirstOutcomeDate") >= pl.lit(start)) &
        (pl.col("FirstOutcomeDate") < pl.lit(end))
    )

    # Share of rows whose predicted label equals the true label, as a plain
    # scalar (None when the window is empty). A bucket-specific name is used
    # so the notebook-level `accuracy` that the MLflow cell logs is not
    # overwritten by the value of the last window.
    bucket_accuracy = bucket_df.select(
        (pl.col("Outcome") == pl.col("model_prediction")).mean()
    ).item()

    # Distribution of the predicted class-1 probability for positives in this window
    plt.hist(bucket_df.filter(pl.col('Outcome') == 1)['model_prediction_prob'].to_list(), bins=20)
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.title('Probability distribution of class 1')
    plt.show()

    print("Number of data points: ", len(bucket_df))

    sum_pat += len(bucket_df)
    print("Accuracy: ", bucket_accuracy)


In [None]:
sum_pat

In [None]:
req_test_df_2 = data_test.select(['PATIENT_NUM','FirstOutcomeDate',
                                'Outcome','model_prediction']).filter((pl.col('Outcome') == 0) & 
                                                                      (~pl.col('FirstOutcomeDate').is_null()))
req_test_df_2

In [None]:
len(req_test_df_2.filter(pl.col('Outcome')==pl.col('model_prediction')))

# Pre-diabetes and general population performance

In [None]:
pids_dia_loincs_values  = pl.read_parquet('Determine_Outcome1_dia_labloinc_values.parquet')
print("Number of data points with data available for determining pre-diabetes or not: ",len(pids_dia_loincs_values))
pids_dia_loincs_values.head()

In [None]:
data_test.filter(pl.col('PATIENT_NUM').is_in(pids_dia_loincs_values['PATIENT_NUM'].to_list()))

In [None]:
pids_dia_loincs_values.filter(
    pl.any_horizontal(
        pl.col(col) > 10 for col in pids_dia_loincs_values.columns[1:]
    ))

In [None]:
# Lab-value columns: every column except the first
# (assumes the first column is PATIENT_NUM — TODO confirm).
columns_to_check = pids_dia_loincs_values.columns[1:]

# NOTE(review): the cut-offs look like HbA1c percentages (normal < 5.7,
# pre-diabetes 5.7 up to but excluding 6.5, diabetes >= 6.5) — confirm the
# unit of these columns. The diabetic test previously used `> 6.5`, which
# dropped a value of exactly 6.5, and values between 6.4 and 6.5 fell into no
# group at all.
# Each filter keeps a patient if ANY checked column matches, so a patient with
# several values can appear in more than one list.
pids_pre_diabetic = pids_dia_loincs_values.filter(
    pl.any_horizontal(
        ((pl.col(col) >= 5.7) & (pl.col(col) < 6.5)) for col in columns_to_check
    ))['PATIENT_NUM'].to_list()

pids_normal_value = pids_dia_loincs_values.filter(
    pl.any_horizontal(
        pl.col(col) < 5.7 for col in columns_to_check
    ))['PATIENT_NUM'].to_list()

pids_diab_value = pids_dia_loincs_values.filter(
    pl.any_horizontal(
        pl.col(col) >= 6.5 for col in columns_to_check
    ))['PATIENT_NUM'].to_list()

In [None]:
len(pids_diab_value)

In [None]:
len(pids_normal_value)

In [None]:
dt_pre_dai = data_test.filter(pl.col('PATIENT_NUM').is_in(pids_pre_diabetic))
dt_normal = data_test.filter(pl.col('PATIENT_NUM').is_in(pids_normal_value))

In [None]:
len(dt_pre_dai)

In [None]:
len(set(data_test_pids_fp) & set(dt_pre_dai['PATIENT_NUM'].to_list()))

In [None]:
np.unique(dt_pre_dai['Outcome'].to_list(), return_counts = True)

In [None]:
### For pre-diabetic patients: per-class precision/recall/F1 of the model's predicted labels
# NOTE(review): this rebinds the notebook-level `report` to a text report; the
# MLflow cell passes `report` to mlflow.log_dict — confirm cell execution order.
report = classification_report(dt_pre_dai['Outcome'].to_list(), dt_pre_dai['model_prediction'].to_list())
print("Classification Report:\n", report)

In [None]:
### FOR normal range patients
report = classification_report(dt_normal['Outcome'].to_list(), dt_normal['model_prediction'].to_list())
print("Classification Report:\n", report)

In [None]:
report = classification_report(dt_pre_dai['Outcome'].to_list(), dt_pre_dai['model_prediction'].to_list())
print("Classification Report:\n", report)

In [None]:
# Test-set rows for patients in the pre-diabetic and normal-range ID lists.
# No filter on Outcome is applied here, so the printed counts cover every such
# patient in the test set, not only those who went on to be diagnosed; the
# messages state that explicitly.
test_pre_dia_df = data_test.filter(pl.col('PATIENT_NUM').is_in(pids_pre_diabetic))
print('Number of test patients with a pre-diabetic lab value: ', len(test_pre_dia_df))
test_normal_df = data_test.filter(pl.col('PATIENT_NUM').is_in(pids_normal_value))
print('Number of test patients with a normal-range lab value: ', len(test_normal_df))

In [None]:
test_pre_dia_df.filter((pl.col("Outcome") == pl.col("model_prediction")) & (pl.col("Outcome") == 0))

# SHAP

In [None]:
import shap

In [None]:
# Create a SHAP explainer  
explainer = shap.TreeExplainer(model, X_train)

# Calculate SHAP values
shap_values = explainer(X_test)

# Plot the summary


In [None]:
shap.summary_plot(shap_values, X_test, max_display=20)

In [None]:
# Draw the SHAP summary plot for the 10 most important features, but defer
# display (show=False) so the axis labels can be edited first.
shap.summary_plot(shap_values, X_test, max_display= 10, show=False)

# Replace the raw feature names on the y-axis with human-readable labels.
# NOTE(review): the labels are hard-coded and applied by position, so they are
# only correct while the plotted features and their order match this list —
# confirm after any retraining. The list is reversed with [::-1] before being
# applied (presumably because tick 0 is the bottom row of the plot).
ax = plt.gca()
ax.set_yticklabels(['Age group', 'BMI', 'Gender: women', 'Percent of adults age >25 who graduated from college', 
                    'Gender: man','Hispanic: yes', 'Glucose [Mass/volume] in Serum or Plasma', 'Race: white', 'Diastolic blood pressure',
                   'Median household income'][::-1])  # human-readable labels, listed top-to-bottom
plt.show()

In [None]:
# Save the computed SHAP values to disk; the file name combines the model name
# and the dataset file name with its extension removed.
with open(f'shap_values_{ml_config.model_name}_{ml_config.file.split(".")[0]}.pkl', 'wb') as f:
    pickle.dump(shap_values, f)

# Reload the SHAP values (not the explainer) from the same file.
# pickle.load executes arbitrary code from the file — only load files this notebook wrote.
with open(f'shap_values_{ml_config.model_name}_{ml_config.file.split(".")[0]}.pkl', 'rb') as f:
    shap_values = pickle.load(f)

In [None]:
ml_config.file

In [None]:
explainer = shap.TreeExplainer(model, X_test[:75000])

# Calculate SHAP values
shap_values = explainer(X_test[:75000])

# Plot the summary
shap.summary_plot(shap_values, X_test[:75000])

## Individual experiments

## Bias From Gender_CD column

In [None]:
!pip install fairlearn -q

from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference, equalized_odds_difference
from sklearn.metrics import accuracy_score

In [None]:
model = XGBClassifier(#**best_params, 
                          scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train),  
#                           use_label_encoder=False,
                          device = 'cuda', 
                          )

model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [None]:
def row_collapse(row):
    """Collapse a row of indicator values into a single group id.

    Returns the position of the largest entry (``np.argmax``), or ``-1`` when
    the entries sum to zero, i.e. when no indicator is set.
    """
    if row.sum() != 0:
        return np.argmax(row)
    return -1

In [None]:
# Fairness of the current predictions (`y_pred`) with respect to gender.
# Every column whose name contains 'Gender' is treated as a gender indicator.
gender_columns = [name for name in X_train.columns if 'Gender' in name]
sensitive_feature_df = X_test[gender_columns]

# One group id per row: position of the set indicator, or -1 when none is set
sensitive_feature = sensitive_feature_df.apply(row_collapse, axis=1)
print('For gender')

# Per-group accuracy and selection rate (share of rows predicted positive)
fairness_metrics = {
    'accuracy': accuracy_score,
    'selection_rate': selection_rate,
}
metricframe = MetricFrame(
    metrics=fairness_metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sensitive_feature,
)
print(metricframe.by_group)

# Demographic parity difference: gap in positive-prediction rate between groups
dp_diff = demographic_parity_difference(y_test, y_pred, sensitive_features=sensitive_feature)
print("Demographic Parity Difference:", dp_diff)

# Equalized odds difference: gap in TPR/FPR between groups
eo_diff = equalized_odds_difference(y_test, y_pred, sensitive_features=sensitive_feature)
print("Equalized Odds Difference:", eo_diff)

In [None]:
model = XGBClassifier(#**best_params, 
                          scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train),  
#                           use_label_encoder=False,
                          device = 'cuda', 
                          )

In [None]:
X_train_rm_gen = X_train.drop(['Gender_CD_M', 'Gender_CD_W'], axis =1)
X_test_rm_gen = X_test.drop(['Gender_CD_M', 'Gender_CD_W'], axis =1)

In [None]:
model.fit(X_train_rm_gen, y_train)

In [None]:
y_pred = model.predict(X_test_rm_gen)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
metricframe = MetricFrame(
    metrics={
        'accuracy': accuracy_score,
        'selection_rate': selection_rate
    },
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sensitive_feature
)

print(metricframe.by_group)

# Demographic parity difference (difference in positive rate between groups)
dp_diff = demographic_parity_difference(y_test, y_pred, sensitive_features=sensitive_feature)
print("Demographic Parity Difference:", dp_diff)

# Equalized odds difference (difference in TPR/FPR between groups)
eo_diff = equalized_odds_difference(y_test, y_pred, sensitive_features=sensitive_feature)
print("Equalized Odds Difference:", eo_diff)

In [None]:
# Create a SHAP explainer  
explainer = shap.TreeExplainer(model, X_train_rm_gen)

# Calculate SHAP values
shap_values = explainer(X_test_rm_gen)

# Plot the summary


### Performance before April 2020

In [None]:
data_test.head()

In [None]:
# Keep test patients with no recorded outcome date, or whose first outcome
# falls before 1 April 2020. The cut-off literal previously read '2019-04-01',
# which contradicted both the section heading and the variable name.
test_before_apr2020 = data_test.filter(
    (pl.col('FirstOutcomeDate').is_null()) |
    (pl.col('FirstOutcomeDate') < pl.lit('2020-04-01').str.strptime(pl.Datetime, "%Y-%m-%d"))
)

In [None]:
test_before_apr2020.head()

In [None]:
np.unique(test_before_apr2020['Outcome'].to_list(), return_counts = True)

In [None]:
test_df_before_2020_set = test_before_apr2020.drop(['PATIENT_NUM','FirstOutcomeDate','Outcome', 'mode_height']).to_pandas()

In [None]:
all_metrics = get_metrics(model,test_df_before_2020_set, test_before_apr2020['Outcome'])

### Experiment: Understanding how data points with a low feature count are classified

In [None]:
with open('../Experiment related files/pids_1_feat_bmi_missing.pkl','rb') as f:
    pids_1_feat_bmi_missing = pickle.load(f).to_list()
print("Number of patient IDs:", len(pids_1_feat_bmi_missing))

In [None]:
pids_1_feat_bmi_missing[:5]

In [None]:
data_test_with_index = data_test.with_row_index("row_nr", offset=1) 
data_test_with_index.filter(pl.col('PATIENT_NUM').is_in(pids_1_feat_bmi_missing))

In [None]:
# polars DataFrames have no `.index` attribute; attach an explicit 0-based row
# index first, then filter, so the `index` column shows the patient's row position.
data_test.with_row_index().filter(pl.col('PATIENT_NUM') == 2386950)

In [None]:
shap.plots.waterfall(shap_values[9], max_display=50)

#### Hypothesis: removing data points that have a feature count of 1 and are missing BMI and BP will improve performance on the test dataset

In [None]:
data_train.filter(pl.col('PATIENT_NUM').is_in(pids_1_feat_bmi_missing))

In [None]:
print("Data train before removing pids:",len(data_train))

In [None]:
data_train = data_train.filter(~pl.col('PATIENT_NUM').is_in(pids_1_feat_bmi_missing))
# data_test = data_test.filter(~pl.col('PATIENT_NUM').is_in(pids_1_feat_bmi_missing))

In [None]:
print("Data train after removing pids:",len(data_train))

In [None]:
X_train,y_train = data_train.drop(['PATIENT_NUM','FirstOutcomeDate','Outcome']).to_pandas(), data_train['Outcome'].to_pandas()
X_test,y_test = data_test.drop(['PATIENT_NUM','FirstOutcomeDate','Outcome']).to_pandas(), data_test['Outcome'].to_pandas()

In [None]:
model = CatBoostClassifier( auto_class_weights='Balanced',allow_writing_files=False,
                                   task_type ='GPU',
                                   silent=True)

In [None]:
model.fit(X_train,y_train)

In [None]:
## Metrics
conf_matrix, accuracy, roc_auc, sensitivity, specificity, auc_pr, cm = get_metrics(model,X_test, y_test)

## TP, TN, FN and FP indexes

In [None]:
# model  = CatBoostClassifier().load_model('../Determine_trained_models/Catboost'+ ml_config.file)

In [None]:
# Split the TEST-set predictions of the fitted `model` into the four
# confusion-matrix groups and record the row positions of each group.
predictions = model.predict(X_test)

predicted_positive = predictions == 1
predicted_negative = predictions == 0
actual_positive = y_test == 1
actual_negative = y_test == 0

# One boolean mask per group (True where the row belongs to the group)
false_positives_mask = predicted_positive & actual_negative
false_negatives_mask = predicted_negative & actual_positive
true_positives_mask = predicted_positive & actual_positive
true_negatives_mask = predicted_negative & actual_negative

# Positional row indices of each group
false_positive_indices = np.where(false_positives_mask)[0]
false_negatives_indices = np.where(false_negatives_mask)[0]
true_positives_indices = np.where(true_positives_mask)[0]
true_negatives_indices = np.where(true_negatives_mask)[0]

print("Indices of False Positives:", false_positive_indices)
print("Indices of False Negatives:", false_negatives_indices)
print("Indices of True Positives:", true_positives_indices)
print("Indices of True Negatives:", true_negatives_indices)

In [None]:
len(false_positive_indices)

In [None]:
len(false_negatives_indices)

In [None]:
data_test_pids_fp = data_test[false_positive_indices]['PATIENT_NUM'].to_list()
data_test_pids_fn = data_test[false_negatives_indices]['PATIENT_NUM'].to_list()
data_test_pids_tp = data_test[true_positives_indices]['PATIENT_NUM'].to_list()
data_test_pids_tn = data_test[true_negatives_indices]['PATIENT_NUM'].to_list()


In [None]:
directory_path = f"./FP_FN_{ml_config.file.split('.')[0]}"

# Create the directory
os.makedirs(directory_path, exist_ok=True)

In [None]:
import pickle

labels = {
    'false_positives': data_test_pids_fp,
    'false_negatives': data_test_pids_fn,
    'true_positives': data_test_pids_tp,
    'true_negatives': data_test_pids_tn
}

# Save all
for label, data in labels.items():
    with open(f'./{directory_path}/{ml_config.model_name}_{label}_pids_test.pkl', 'wb') as file:
        pickle.dump(data, file)

# Load all
loaded = {}
for label in labels:
    with open(f'./{directory_path}/{ml_config.model_name}_{label}_pids_test.pkl', 'rb') as file:
        loaded[label] = pickle.load(file)

# Unpack if needed
data_test_pids_fp = loaded['false_positives']
data_test_pids_fn = loaded['false_negatives']
data_test_pids_tp = loaded['true_positives']
data_test_pids_tn = loaded['true_negatives']

In [None]:
from datetime import datetime

In [None]:
data_test.filter(pl.col('PATIENT_NUM').is_in(data_test_pids_fn)).sort('FirstOutcomeDate').filter(pl.col("FirstOutcomeDate") < datetime(2019, 1, 2))
          #,return_counts = True)

In [None]:
data_test.filter(pl.col('PATIENT_NUM').is_in(data_test_pids_fp)).filter(pl.col("Outcome")== 1)
          #,return_counts = True)

In [None]:
data_test[[36, 54]]

# Probability Distribution across FP, FN, TN and TP

In [None]:
predictions = model.predict(X_test)

# Boolean arrays
false_positives_mask = (predictions == 1) & (y_test == 0)
false_negatives_mask = (predictions == 0) & (y_test == 1)
true_positives_mask = (predictions == 1) & (y_test == 1)
true_negatives_mask = (predictions == 0) & (y_test == 0)

# Indices for each group
false_positive_indices = np.where(false_positives_mask)[0]
false_negatives_indices = np.where(false_negatives_mask)[0]
true_positives_indices = np.where(true_positives_mask)[0]
true_negatives_indices = np.where(true_negatives_mask)[0]

print("Indices of False Positives:", false_positive_indices)
print("Indices of False Negatives:", false_negatives_indices)
print("Indices of True Positives:", true_positives_indices)
print("Indices of True Negatives:", true_negatives_indices)

# Probability predictions
x_test_prob = model.predict_proba(X_test)

# Probability distributions for each group
fp_probs = x_test_prob[false_positive_indices]  # False positives
fn_probs = x_test_prob[false_negatives_indices] # False negatives
tp_probs = x_test_prob[true_positives_indices]  # True positives
tn_probs = x_test_prob[true_negatives_indices]  # True negatives

print("Probabilities for False Positives:", fp_probs)
print("Probabilities for False Negatives:", fn_probs)
print("Probabilities for True Positives:", tp_probs)
print("Probabilities for True Negatives:", tn_probs)

In [None]:
# Probability assigned to the positive class (column 1 of predict_proba) for
# every group. All four series are overlaid on one axis labelled "Predicted
# Probability of Positive Class", so they must share the same quantity; the FN
# and TN groups previously took column 0 (the class-0 probability), which made
# their histograms disagree with the axis label.
fp_probs_pos = fp_probs[:, 1]
fn_probs_pos = fn_probs[:, 1]
tp_probs_pos = tp_probs[:, 1]
tn_probs_pos = tn_probs[:, 1]

plt.figure(figsize=(10,6))
# density=True normalises each histogram so groups of very different size are comparable
for group_probs, group_label, group_color in (
    (fp_probs_pos, 'False Positives', 'red'),
    (fn_probs_pos, 'False Negatives', 'blue'),
    (tp_probs_pos, 'True Positives', 'green'),
    (tn_probs_pos, 'True Negatives', 'gray'),
):
    plt.hist(group_probs, bins=20, alpha=0.6, label=group_label, color=group_color, density=True)
plt.xlabel('Predicted Probability of Positive Class')
plt.ylabel('Density')
plt.title('Histogram of Predicted Probabilities by Prediction Type')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plt.suptitle('Probability distribution across FP, FN, TN and TP')
# False Positives
axes[0, 0].hist(fp_probs_pos, bins=20, color='red')
axes[0, 0].set_title('False Positives')
axes[0, 0].set_xlabel('Predicted Probability (Positive Class)')
axes[0, 0].set_ylabel('Count')
axes[0, 0].grid(True)

# False Negatives
axes[0, 1].hist(fn_probs_pos, bins=20, color='blue')
axes[0, 1].set_title('False Negatives')
axes[0, 1].set_xlabel('Predicted Probability (Positive Class)')
axes[0, 1].set_ylabel('Count')
axes[0, 1].grid(True)

# True Positives
axes[1, 0].hist(tp_probs_pos, bins=20, color='green')
axes[1, 0].set_title('True Positives')
axes[1, 0].set_xlabel('Predicted Probability (Positive Class)')
axes[1, 0].set_ylabel('Count')
axes[1, 0].grid(True)

# True Negatives
axes[1, 1].hist(tn_probs_pos, bins=20, color='gray')
axes[1, 1].set_title('True Negatives')
axes[1, 1].set_xlabel('Predicted Probability (Positive Class)')
axes[1, 1].set_ylabel('Count')
axes[1, 1].grid(True)



plt.tight_layout()
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

# Sweep the decision threshold from 0 to 1 in steps of 0.01 and record how
# accuracy, precision, recall and F1 on the test set change.
y_prob = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1
thresholds = np.linspace(0, 1, 101)

accuracies = []
precisions = []
recalls = []
f1s = []

for thresh in thresholds:
    y_pred_thresh = (y_prob >= thresh).astype(int)
    accuracies.append(accuracy_score(y_test, y_pred_thresh))
    # zero_division=0 on every ratio metric: at high thresholds nothing may be
    # predicted positive, which otherwise emits an undefined-metric warning for
    # F1 (and for recall when there are no positive labels).
    precisions.append(precision_score(y_test, y_pred_thresh, zero_division=0))
    recalls.append(recall_score(y_test, y_pred_thresh, zero_division=0))
    f1s.append(f1_score(y_test, y_pred_thresh, zero_division=0))

plt.figure(figsize=(10,6))
plt.plot(thresholds, accuracies, label='Accuracy')
plt.plot(thresholds, precisions, label='Precision')
plt.plot(thresholds, recalls, label='Recall')
plt.plot(thresholds, f1s, label='F1 Score')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Model Performance at Different Thresholds')
plt.legend()
plt.grid(True)

plt.show()

In [None]:
# Predict class and probabilities
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 0]


false_negatives_mask = (y_pred == 0) & (y_test == 1)


high_prob_mask = y_prob > 0.90


fn_high_prob_mask = false_negatives_mask & high_prob_mask

fn_high_prob_indices = np.where(fn_high_prob_mask)[0]

# Get data points and optionally their probabilities
X_fn_high_prob = X_test.iloc[fn_high_prob_indices]
probs_fn_high = y_prob[fn_high_prob_indices]

print("Indices of FNs with prob > 0.90:", fn_high_prob_indices)
print("Probs of those:", probs_fn_high)

In [None]:
data_test[fn_high_prob_indices]

In [None]:
# Locate false positives that the model predicted with high confidence
# (predicted probability of class 1 above 0.90).
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Predicted 1, actually 0
false_positives_mask = (y_pred == 1) & (y_test == 0)

high_prob_mask = y_prob > 0.90

fp_high_prob_mask = false_positives_mask & high_prob_mask

# Positional row indices of the selected rows
fp_high_prob_indices = np.where(fp_high_prob_mask)[0]

# Feature rows and class-1 probabilities of the selected rows
X_fp_high_prob = X_test.iloc[fp_high_prob_indices]
probs_fp_high = y_prob[fp_high_prob_indices]

# The message previously said "FNs" (copied from the false-negative cell)
print("Indices of FPs with prob > 0.90:", fp_high_prob_indices)
print("Probs of those:", probs_fp_high)

In [None]:
data_test[fp_high_prob_indices]

In [None]:
# Locate true positives that the model predicted with high confidence
# (predicted probability of class 1 above 0.90).
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Predicted 1, actually 1
true_positives_mask = (y_pred == 1) & (y_test == 1)

high_prob_mask = y_prob > 0.90

tp_high_prob_mask = true_positives_mask & high_prob_mask

# Positional row indices of the selected rows
tp_high_prob_indices = np.where(tp_high_prob_mask)[0]

# Feature rows and class-1 probabilities of the selected rows
X_tp_high_prob = X_test.iloc[tp_high_prob_indices]
probs_tp_high = y_prob[tp_high_prob_indices]

# The message previously said "FNs" (copied from the false-negative cell)
print("Indices of TPs with prob > 0.90:", tp_high_prob_indices)
print("Probs of those:", probs_tp_high)

In [None]:
data_test[tp_high_prob_indices]

In [None]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# True Negatives mask (predicted 0, actual 0)
true_negatives_mask = (y_pred == 0) & (y_test == 0)

# High confidence negatives: probability of class 1 is LOW (< 0.10)
low_prob_mask = y_prob < 0.10

# TN with high probability (really low proba for class 1)
tn_high_conf_mask = true_negatives_mask & low_prob_mask

tn_high_conf_indices = np.where(tn_high_conf_mask)[0]

# Get data points and their probabilities
X_tn_high_conf = X_test.iloc[tn_high_conf_indices]
probs_tn_high = y_prob[tn_high_conf_indices]

print("Indices of TNs with prob < 0.10:", tn_high_conf_indices)
print("Probs of those:", probs_tn_high)

In [None]:
data_test[tn_high_conf_indices]

### Looking at waterfall plots for TN, TP, FN and FP predicted with >90% probability

In [None]:
feature_missing_df = pl.read_parquet('../Determine_missing_features_OCHIN.parquet')

In [None]:
feature_missing_df.filter(pl.col('PATIENT_NUM').is_in(data_test[fn_high_prob_indices]['PATIENT_NUM'].to_list())).filter(pl.col('Total_Feature_Count')!=1)

In [None]:
feature_missing_df.filter(pl.col('PATIENT_NUM').is_in(data_test[fn_high_prob_indices]['PATIENT_NUM'].to_list())).filter(pl.col('Total_Feature_Count')<40)

In [None]:
# Find the row index of the test-set row with PATIENT_NUM 4314903
# (with_row_index adds a 0-based `index` column before filtering).
data_test.with_row_index().filter(pl.col('PATIENT_NUM')==4314903)

# Waterfall plot

## FN 

In [None]:
patient_nums_of_interest = (
    feature_missing_df
    .filter(
        pl.col('PATIENT_NUM').is_in(
            data_test[fn_high_prob_indices]['PATIENT_NUM'].to_list()
#         ) & (pl.col('Total_Feature_Count') < 10)
    ))
    .select('PATIENT_NUM')
    .to_series()
    .to_list()
)

# Add row index to data_test for easy lookup
test_with_index = data_test.with_row_index(name="ROW_INDEX")

# Map each PATIENT_NUM to its row index
patnum_to_index = dict(
    test_with_index
    .filter(pl.col('PATIENT_NUM').is_in(patient_nums_of_interest))
    .select(['PATIENT_NUM', 'ROW_INDEX'])
    .rows()
)

# Plot SHAP waterfall plot for each index
for pat_num, idx in patnum_to_index.items():
    print(f"Waterfall plot for PATIENT_NUM {pat_num} (row index {idx}):")
    print("Total feature count(meds, labs and dxs): ",feature_missing_df.filter(pl.col('PATIENT_NUM')==pat_num)['Total_Feature_Count'][0])
    shap.plots.waterfall(shap_values[idx])
    plt.show()

## FP

In [None]:
patient_nums_of_interest = (
    feature_missing_df
    .filter(
        pl.col('PATIENT_NUM').is_in(
            data_test[fp_high_prob_indices]['PATIENT_NUM'].to_list()
        ) 
        
#         & (pl.col('Total_Feature_Count') != 1)
    )
    .select('PATIENT_NUM')
    .to_series()
    .to_list()
)

# Add row index to data_test for easy lookup
test_with_index = data_test.with_row_index(name="ROW_INDEX")

# Map each PATIENT_NUM to its row index
patnum_to_index = dict(
    test_with_index
    .filter(pl.col('PATIENT_NUM').is_in(patient_nums_of_interest))
    .select(['PATIENT_NUM', 'ROW_INDEX'])
    .rows()
)

# Plot SHAP waterfall plot for each index
for pat_num, idx in patnum_to_index.items():
    print(f"Waterfall plot for PATIENT_NUM {pat_num} (row index {idx}):")
    print("Total feature count(meds, labs and dxs): ",feature_missing_df.filter(pl.col('PATIENT_NUM')==pat_num)['Total_Feature_Count'][0])
    shap.plots.waterfall(shap_values[idx])
    plt.show()

# TP

In [None]:
patient_nums_of_interest = (
    feature_missing_df
    .filter(
        pl.col('PATIENT_NUM').is_in(
            data_test[tp_high_prob_indices]['PATIENT_NUM'].to_list()[:200]
        ) & (pl.col('Total_Feature_Count') >20)
    )
    .select('PATIENT_NUM')
    .to_series()
    .to_list()
)

# Add row index to data_test for easy lookup
test_with_index = data_test.with_row_index(name="ROW_INDEX")

# Map each PATIENT_NUM to its row index
patnum_to_index = dict(
    test_with_index
    .filter(pl.col('PATIENT_NUM').is_in(patient_nums_of_interest))
    .select(['PATIENT_NUM', 'ROW_INDEX'])
    .rows()
)

# Plot SHAP waterfall plot for each index
for pat_num, idx in patnum_to_index.items():
    print(f"Waterfall plot for PATIENT_NUM {pat_num} (row index {idx}):")
    print("Total feature count(meds, labs and dxs): ",feature_missing_df.filter(pl.col('PATIENT_NUM')==pat_num)['Total_Feature_Count'][0])
    shap.plots.waterfall(shap_values[idx])
    plt.show()

In [None]:
data_train = data_train.with_columns(pl.Series('Outcome',y_train.values))


# Count occurrences of each combination
counts = data_train.group_by(['Gender_CD_W', 'Gender_CD_M', 'Outcome']).count().sort('Outcome')

print(counts)

In [None]:
# Re-attach the test labels as the `Outcome` column
data_test = data_test.with_columns(pl.Series('Outcome',y_test.values))


# Count test rows for each (woman flag, man flag, outcome) combination
counts_test = data_test.group_by(['Gender_CD_W', 'Gender_CD_M', 'Outcome']).count().sort('Outcome')

# Print the test-set counts; this previously printed `counts`, the
# training-set table from the cell above.
print(counts_test)

In [None]:
model.fit(X_train.drop(['Gender_CD_W', 'Gender_CD_M'], axis = 1),y_train)

conf_matrix, accuracy, roc_auc, sensitivity, specificity, auc_pr, cr = get_metrics(model,X_test.drop(['Gender_CD_W', 'Gender_CD_M'], axis =1), y_test)

In [None]:
# NOTE(review): assumes `model` is the one refit in the previous cell, i.e.
# trained WITHOUT the two gender indicator columns — confirm execution order.
# SHAP must be given exactly the features the model was trained on; passing
# the full X_train / X_test (which still contain those columns) mismatches the
# model's feature set.
gender_cols_removed = ['Gender_CD_W', 'Gender_CD_M']
X_train_no_gender = X_train.drop(gender_cols_removed, axis=1)
X_test_no_gender = X_test.drop(gender_cols_removed, axis=1)

# Background data for the explainer is the (gender-free) training set
explainer1 = shap.TreeExplainer(model, X_train_no_gender)

# Calculate SHAP values for the test set
shap_values1 = explainer1(X_test_no_gender)

# Plot the summary
shap.summary_plot(shap_values1, X_test_no_gender)

In [None]:
shap.plots.waterfall(shap_values[71179])

In [None]:
# data point with 42 features
shap.plots.waterfall(shap_values[126669])

In [None]:
## with 33 features
shap.plots.waterfall(shap_values[69511])

# Fairness metrics

# Sub-group performance

In [None]:
## model to use
model  = CatBoostClassifier().load_model('../Determine_trained_models/catboost_whole_dataset_phecodes_bmi_bp')

In [None]:
X_test.columns[:15]

In [None]:
y_test.values

In [None]:
def predict_on_data(sub_x_test,sub_y_test, model):
    """Evaluate ``model`` on one subgroup, print its metrics and return them.

    Parameters
    ----------
    sub_x_test : feature rows of the subgroup, passed to ``model.predict`` and
        ``model.predict_proba``.
    sub_y_test : true binary labels (0/1) for the same rows.
    model : fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(sensitivity, specificity, roc_auc, accuracy)``. A metric that is
        undefined for the subgroup is returned as ``nan`` instead of raising:
        sensitivity when there are no positive labels, specificity when there
        are no negative labels, ROC AUC when only one class is present.
    """
    y_pred = model.predict(sub_x_test)
    y_pred_probs = model.predict_proba(sub_x_test)[:,1]

    # Pin the label order so a full 2x2 matrix comes back even when the
    # subgroup (or the predictions) contain a single class; without `labels`
    # the matrix collapses to 1x1 and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(sub_y_test, y_pred, labels=[0, 1]).ravel()

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
    specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')

    print("Sensitivity: ", sensitivity)
    print("Specificity: ", specificity)

    # roc_auc_score raises ValueError when y_true holds a single class
    if len(np.unique(sub_y_test)) < 2:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(sub_y_test, y_pred_probs)
    print("ROC curve: ", roc_auc)

    acc = accuracy_score(sub_y_test, y_pred)
    print('Accuracy: ', acc)
    return sensitivity, specificity, roc_auc, acc
    
     

In [None]:
# - **'DEM|RACE:01'**: White
# - **'DEM|RACE:02'**: Black or African American
# - **'DEM|RACE:03'**: Asian
# - **'DEM|RACE:04'**: Native Hawaiian or Other Pacific Islander
# - **'DEM|RACE:05'**: American Indian or Alaska Native
# - **'DEM|RACE:06'**: Other or Unknown
# - **'DEM|RACE:07'**: More than one race

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-race subgroup performance of `model` on the test set.
race_cols = [
    "Race_CD_01",
    "Race_CD_02",
    "Race_CD_03",
    "Race_CD_04",
    "Race_CD_05",
    "Race_CD_06",
    "Race_CD_07",
    "Race_CD_UNK"
]

# Display label for each race-code suffix.
# NOTE(review): the labels for 06 and 07 differ from the DEM|RACE legend
# documented in this notebook (06 = Other or Unknown, 07 = More than one
# race) — confirm which mapping is correct.
race_labels = {
    '01': 'white',
    '02': 'Black or African American',
    '03': 'Asian',
    '04': 'Native Hawaiian or Other Pacific Islander',
    '05': 'American Indian or Alaska Native',
    '06': 'Multiple Race',
    '07': 'refuse to Answer',
    'UNK': 'UNK',
}

# One entry per race column, appended in `race_cols` order
races = []
sens = []
specs = []
roc_aucs = []
counts = []
class_1_count = []

for col in race_cols:
    race = race_labels[col.rsplit('_', 1)[-1]]

    # Rows whose indicator for this race is set, and their labels (matched by index)
    X_test_race = X_test[X_test[col] == 1]
    y_test_race = y_test.loc[X_test_race.index]

    print("Number of data points with race {} are: {}".format( col, X_test_race.shape[0] ))

    # predict_on_data returns four values; accuracy is not tabulated here.
    # (Unpacking into three names raised "too many values to unpack".)
    sensitivity, specificity, roc_auc, _ = predict_on_data(X_test_race, y_test_race, model)
    races.append(race)
    sens.append(sensitivity)
    specs.append(specificity)
    roc_aucs.append(roc_auc)
    class_1_count.append(sum(y_test_race.values))
    counts.append(X_test_race.shape[0])
    

    

In [None]:
df = pd.DataFrame({
    'Race': races,
    'counts':counts,
    'class_1 count': class_1_count,
    'Sensitivity': sens,
    'Specificity': specs,
    'ROC_AUC': roc_aucs
})
df

In [None]:
# Merge the last three rows of the table (positions 5, 6 and 7) into a single row.
to_collapse = df.iloc[5:8]

# Counts are summed; the rate metrics are plain (unweighted) means of the
# three per-group values.
# NOTE(review): an unweighted mean ignores the very different group sizes —
# confirm this is the intended aggregation.
new_row = {
    'Race': 'Collapsed Race',
    'counts': to_collapse['counts'].sum(),
    'class_1 count': to_collapse['class_1 count'].sum(),
    'Sensitivity': to_collapse['Sensitivity'].mean(),
    'Specificity': to_collapse['Specificity'].mean(),
    'ROC_AUC': to_collapse['ROC_AUC'].mean()
}

# Drop the old rows and add the collapsed one. DataFrame.append was removed in
# pandas 2.0, so the row is wrapped in a one-row frame and concatenated.
df = pd.concat(
    [df.drop(index=range(5, 8)), pd.DataFrame([new_row])],
    ignore_index=True,
)

In [None]:
new_row

In [None]:
# Per-age-group subgroup performance of `model` on the test set.
age_groups = np.unique(X_test['Age_group'].values)

# Display label for each encoded age group; any other code is shown as '75_older'.
# NOTE(review): '54-65' overlaps the neighbouring '45-54' and '65-74' ranges —
# probably meant '55-64'; confirm against the encoder before changing it.
age_labels = {0: '18-34', 1: '35-44', 2: '45-54', 3: '54-65', 4: '65-74'}

# One entry per age group, appended in `age_groups` order
age_ranges = []
sens = []
specs = []
roc_aucs = []
counts =[]
class_1_count = []

for age_enc in age_groups:
    age = age_labels.get(age_enc, '75_older')

    # Rows in this age group, and their labels (matched by index)
    X_test_age = X_test[X_test['Age_group'] == age_enc]
    y_test_age = y_test.loc[X_test_age.index]

    print("Number of data points with age group {} are: {}".format( age_enc, X_test_age.shape[0] ))

    # predict_on_data returns four values; accuracy is not tabulated here.
    # (Unpacking into three names raised "too many values to unpack".)
    sensitivity, specificity, roc_auc, _ = predict_on_data(X_test_age, y_test_age, model)
    age_ranges.append(age)
    sens.append(sensitivity)
    specs.append(specificity)
    roc_aucs.append(roc_auc)
    class_1_count.append(sum(y_test_age.values))
    counts.append(X_test_age.shape[0])


In [None]:
df = pd.DataFrame({
    'Age_Group': age_ranges,
    'counts': counts,
    'class_1 count': class_1_count,
    'Sensitivity': sens,
    'Specificity': specs,
    'ROC_AUC': roc_aucs,
})
df

## Gender

In [None]:
gender_cols = ["Gender_CD_GQ", "Gender_CD_M", "Gender_CD_TG", "Gender_CD_UNK", "Gender_CD_W"]

In [None]:
# Per-gender subgroup performance of `model` on the test set.
# One entry per gender column, appended in `gender_cols` order.
gender_list = []
sens = []
specs = []
roc_aucs = []
counts =[]
class_1_count = []

races = []

# (column-name suffix, display label), checked in this order; first match wins
gender_suffix_labels = (
    ('GQ', 'GenderQueer'),
    ('M', 'Man'),
    ('W', 'Woman'),
    ('TG', 'Transgender'),
    ('UNK', 'Unknown'),
)

for gen in gender_cols:

    for suffix, label in gender_suffix_labels:
        if gen.endswith(suffix):
            gender = label
            break

    # Rows whose indicator for this gender is set, and their labels (matched by index)
    X_test_gen = X_test[X_test[gen] == 1]
    indexes = X_test_gen.index
    y_test_gen = y_test.loc[indexes]

    print("Number of data points with gender group {} are: {}".format( gender, X_test_gen.shape[0] ))

    # Accuracy (fourth return value) is not tabulated here
    sensitivity, specificity, roc_auc, _ = predict_on_data(X_test_gen, y_test_gen, model)
    gender_list.append(gender)
    sens.append(sensitivity)
    specs.append(specificity)
    roc_aucs.append(roc_auc)
    class_1_count.append(sum(y_test_gen.values))
    counts.append(X_test_gen.shape[0])

In [None]:
df = pd.DataFrame({
    'Gender': gender_list,
    'counts': counts,
    'class_1 count': class_1_count,
    'Sensitivity': sens,
    'Specificity': specs,
    'ROC_AUC': roc_aucs,
})
df.reset_index(drop=True, inplace=True)
df

# 5-year performance

In [None]:
model  = CatBoostClassifier().load_model('../Determine_trained_models/catboost_whole_dataset_phecodes_bmi_bp')

In [None]:
req_test_dates_df = test_data_firstoutcome_df.drop_nulls().to_pandas()
req_test_dates_df

In [None]:
# Constants
start_date = pd.Timestamp('2017-04-01')

# Calculate the difference in years between the Index_Start_date and the start_date if data is available
req_test_dates_df['year_difference'] = (req_test_dates_df['FirstOutcomeDate'] - start_date).dt.days / 365.25

# Filter DataFrames based on year difference, ignore NaNs as they indicate missing data in cohort_df
df_less_than_1 = req_test_dates_df[req_test_dates_df['year_difference'] < 1].dropna()
df_1_to_2 = req_test_dates_df[(req_test_dates_df['year_difference'] >= 1) & (req_test_dates_df['year_difference'] < 2)].dropna()
df_2_to_3 = req_test_dates_df[(req_test_dates_df['year_difference'] >= 2) & (req_test_dates_df['year_difference'] < 3)].dropna()
df_3_to_4 = req_test_dates_df[(req_test_dates_df['year_difference'] >= 3) & (req_test_dates_df['year_difference'] < 4)].dropna()
df_4_to_5 = req_test_dates_df[(req_test_dates_df['year_difference'] >= 4) & (req_test_dates_df['year_difference'] < 5)].dropna()

In [None]:
year_ranges = []
sens = []
specs = []
roc_aucs = []
counts =[]
class_1_count = []
acc_scores = []

In [None]:
# Evaluate the model separately on each years-to-outcome bucket.
# The 4-to-5-year bucket is included: it is computed above and belongs to the
# 5-year analysis, but was previously left out of this loop.
year_buckets = [
    ('Less than 1 year', df_less_than_1),
    ('Between 1 and 2 year', df_1_to_2),
    ('Between 2 and 3 year', df_2_to_3),
    ('Between 3 and 4 year', df_3_to_4),
    ('Between 4 and 5 year', df_4_to_5),
]

for year, buf_df in year_buckets:
    # NOTE(review): rows are matched to X_test purely by index label. This is
    # only correct if req_test_dates_df carries the same index as X_test; a
    # frame produced by polars `.to_pandas()` gets a fresh 0..n-1 index —
    # confirm, or join on PATIENT_NUM instead.
    shared_index = buf_df.index.intersection(X_test.index)
    X_test_buf = X_test.loc[shared_index]
    y_test_buf = y_test.loc[shared_index]

    print(year)
    if X_test_buf.empty:
        # Nothing to score; skip rather than call the model on an empty frame
        print("No test rows in this bucket; skipping")
        continue

    sensitivity, specificity, roc_auc, acc = predict_on_data(X_test_buf, y_test_buf, model)
    year_ranges.append(year)
    sens.append(sensitivity)
    specs.append(specificity)
    roc_aucs.append(roc_auc)
    counts.append(len(y_test_buf))
    acc_scores.append(acc)

In [None]:
# Summary table with one row per years-to-outcome bucket.
# The first column holds the bucket labels from `year_ranges`; it was
# previously mislabelled 'Age_Group' (copied from the age-group table).
df = pd.DataFrame({
    'Year_Range': year_ranges,
    'counts (class 1)': counts,
    'Accuracy (class 1)': acc_scores,
    'Sensitivity': sens,
    'Specificity': specs,
    'ROC_AUC': roc_aucs,
})
df

# Ensemble

In [None]:
### 1. Fold Ensemble (catboost)

import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, confusion_matrix, classification_report

# Number of stratified folds; one CatBoost model is trained per fold.
n_folds = 5

# Shuffled splitter with a fixed seed so the fold assignment is repeatable.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Per-fold outputs: positive-class test probabilities and validation accuracies.
test_preds, valid_scores = [], []

# Settings shared by every fold model (class-balanced weights, GPU training, silent).
catboost_settings = dict(
    auto_class_weights='Balanced',
    allow_writing_files=False,
    task_type='GPU',
    verbose=0,
)

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Training fold {fold + 1}/{n_folds}")

    # Positional slicing: the splitter yields integer positions, not index labels.
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    model = CatBoostClassifier(**catboost_settings)
    model.fit(X_tr, y_tr)

    # Hold-out accuracy of this fold's model.
    val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    valid_scores.append(acc)
    print(f"Validation Accuracy for Fold {fold + 1}: {acc:.4f}")

    # Column 1 of predict_proba is kept as the positive-class probability on the test set.
    test_pred = model.predict_proba(X_test)[:, 1]
    test_preds.append(test_pred)


In [None]:
# Soft voting: average the per-fold probabilities, then threshold at 0.5.
mean_test_pred = np.mean(np.asarray(test_preds), axis=0)
final_test_pred = (mean_test_pred >= 0.5).astype(int)

# --- Compute the metrics -------------------------------------------------
test_acc = accuracy_score(y_test, final_test_pred)

# Sensitivity is the recall of the positive class (recall_score default).
sensitivity = recall_score(y_test, final_test_pred)

# Specificity is the recall of the negative class, read off the confusion
# matrix, whose layout is [[TN, FP], [FN, TP]].
cm = confusion_matrix(y_test, final_test_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)

# --- Report --------------------------------------------------------------
print(f"Ensemble Test Accuracy: {test_acc:.4f}")
print(f"Sensitivity (Recall for positive class): {sensitivity:.4f}")
print(f"Specificity (Recall for negative class): {specificity:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, final_test_pred))

In [None]:
# Sanity check: number of ensemble predictions (displayed as the cell output).
len(final_test_pred)

In [None]:
# Preview of the per-sample majority vote over the collected probability vectors.
# `mode` is imported in this cell so it also works in a top-to-bottom run; the only
# visible import of it sits in the following cell.
from scipy.stats import mode

# axis=0 (the SciPy default, spelled out) takes the vote across models for each sample.
mode((np.array(test_preds) > 0.5).astype(int), axis=0).mode

In [None]:
from scipy.stats import mode

# Majority (hard) voting: binarise every model's probabilities, then keep the
# most frequent label per test sample.
# NOTE: the cut-off here is strict (> 0.5) whereas the averaging cell uses >= 0.5.
votes = (np.array(test_preds) > 0.5).astype(int)

# axis=0 votes across models. np.ravel makes the result 1-D on every SciPy
# version: older releases keep the reduced axis and return shape (1, n_samples),
# which the sklearn metrics below would reject.
final_test_pred = np.ravel(mode(votes, axis=0).mode)

# Sensitivity (recall of the positive class)
sensitivity = recall_score(y_test, final_test_pred)
print(f"Sensitivity (Recall for positive class): {sensitivity:.4f}")

# Specificity (recall of the negative class)
cm = confusion_matrix(y_test, final_test_pred)
# cm format: [[TN, FP],
#             [FN, TP]]
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
print(f"Specificity (Recall for negative class): {specificity:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, final_test_pred))

In [None]:
### 2. Different Models Ensemble
###    -> Avg. Probability
###    -> Voting


In [None]:
import joblib
from xgboost import XGBClassifier, XGBRegressor, Booster
from catboost import CatBoostClassifier, CatBoostRegressor
import os

def load_model(model_path, model_name=None):
    """Load a persisted model, choosing the loader from a name hint or the path.

    The loader is selected by looking for one of the keywords ``"xgboost"``,
    ``"catboost"`` or ``"random_forest"`` (checked in that order), first in
    ``model_name`` when it is given, and otherwise, or when the hint contains
    no keyword, in ``model_path``.

    Args:
        model_path: Path to the serialized model (``str`` or ``os.PathLike``).
        model_name: Optional hint naming the model family. When it contains a
            known keyword it takes precedence over the path.

    Returns:
        The loaded model object: an ``XGBClassifier``, a ``CatBoostClassifier``
        (or ``CatBoostRegressor`` when the classifier load fails), or whatever
        object ``joblib.load`` returns for any other file.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    path_str = os.fspath(model_path)
    if not os.path.exists(path_str):
        # Fail early with the offending path instead of a library-specific error.
        raise FileNotFoundError(f"Model file not found: {path_str}")

    known_kinds = ("xgboost", "catboost", "random_forest")

    def _match_kind(text):
        # First known keyword contained in `text`, or None when nothing matches.
        return next((kind for kind in known_kinds if kind in text), None)

    # An explicit hint wins; fall back to the path when the hint is absent or unknown.
    kind = _match_kind(model_name) if isinstance(model_name, str) else None
    if kind is None:
        kind = _match_kind(path_str)

    if kind == "xgboost":
        model = XGBClassifier()
        model.load_model(path_str)
        return model

    if kind == "catboost":
        try:
            model = CatBoostClassifier()
            model.load_model(path_str)
        except Exception:
            # Deliberate best-effort retry with the regressor class
            # (presumably for files saved from a regressor — confirm).
            model = CatBoostRegressor()
            model.load_model(path_str)
        return model

    # "random_forest" and every unrecognised file go through joblib.
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files from trusted sources.
    return joblib.load(path_str)


# Locations of the three serialized models; each literal is split after the
# shared directory prefix purely for readability (the path values are unchanged).
catboost_path = (
    "../Determine_trained_models/"
    "catboost_dataset_Determine_joined_med_usage_lab_median_domain_expert_diag_phemap_without_icd10z_bmi_bp_cvs_ordinal_nominal_encoded_boruta_features.cbm"
)
xgboost_path = (
    "../Determine_trained_models/"
    "xgboost_dataset_Determine_joined_med_usage_lab_median_domain_expert_diag_phemap_without_icd10z_bmi_bp_cvs_ordinal_nominal_encoded_boruta_features.json"
)
randomforest_path = (
    "../Determine_trained_models/"
    "random_forest_dataset_Determine_joined_med_usage_lab_median_domain_expert_diag_phemap_without_icd10z_bmi_bp_cvs_ordinal_nominal_encoded_boruta_features.pkl"
)

# Load each model through load_model; the second argument is its `model_name`.
catboost_model = load_model(catboost_path, "catboost")
xgboost_model = load_model(xgboost_path, "xgboost")
rf_model = load_model(randomforest_path, "random_forest")

In [None]:
# Positive-class test probabilities, one vector per loaded model
# (order: CatBoost, XGBoost, random forest). This restarts `test_preds`
# and rebinds the global `model` on every iteration.
test_preds = []
for model in (catboost_model, xgboost_model, rf_model):
    class_probabilities = model.predict_proba(X_test)
    # Column 1 is kept as the positive-class probability.
    test_pred = class_probabilities[:, 1]
    test_preds.append(test_pred)


In [None]:
# Sanity check: number of collected probability vectors (one per model in the loop above).
len(test_preds)

In [None]:
# Soft voting across the loaded models: average their positive-class
# probabilities per sample and label a sample positive when the mean is >= 0.5.
stacked_probabilities = np.array(test_preds)
mean_test_pred = stacked_probabilities.mean(axis=0)
final_test_pred = (mean_test_pred >= 0.5).astype(int)

# Overall accuracy of the averaged ensemble on the test set.
test_acc = accuracy_score(y_test, final_test_pred)
print(f"Ensemble Test Accuracy: {test_acc:.4f}")

# Sensitivity: recall of the positive class (recall_score default).
sensitivity = recall_score(y_test, final_test_pred)
print(f"Sensitivity (Recall for positive class): {sensitivity:.4f}")

# Specificity: recall of the negative class, taken from the confusion matrix
# laid out as [[TN, FP], [FN, TP]].
cm = confusion_matrix(y_test, final_test_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
print(f"Specificity (Recall for negative class): {specificity:.4f}")

# Per-class precision / recall / F1 summary.
print("\nClassification Report:")
print(classification_report(y_test, final_test_pred))

In [None]:
from scipy.stats import mode

# Majority (hard) voting across the loaded models: binarise every model's
# probabilities, then keep the most frequent label per test sample.
# NOTE: the cut-off here is strict (> 0.5) whereas the averaging cell uses >= 0.5.
votes = (np.array(test_preds) > 0.5).astype(int)

# axis=0 votes across models. np.ravel makes the result 1-D on every SciPy
# version: older releases keep the reduced axis and return shape (1, n_samples),
# which the sklearn metrics below would reject.
final_test_pred = np.ravel(mode(votes, axis=0).mode)

# Sensitivity (recall of the positive class)
sensitivity = recall_score(y_test, final_test_pred)
print(f"Sensitivity (Recall for positive class): {sensitivity:.4f}")

# Specificity (recall of the negative class)
cm = confusion_matrix(y_test, final_test_pred)
# cm format: [[TN, FP],
#             [FN, TP]]
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
print(f"Specificity (Recall for negative class): {specificity:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, final_test_pred))