# Installing required libraries

In [1]:
!pip install xgboost catboost polars optuna -q

In [2]:
!pip install scikit-learn-intelex -q

In [3]:
!pip install mlflow -q

# Importing required libraries

In [4]:
import pandas as pd
import polars as pl
import optuna
import pickle

import joblib

import numpy as np
## Enabling intel optimizations to 
import matplotlib.pyplot as plt
from sklearnex import patch_sklearn
patch_sklearn()

Matplotlib is building the font cache; this may take a moment.
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
import mlflow

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,f1_score, roc_auc_score, accuracy_score

# Helper functions

In [7]:
# Helper functions
def diag_med_lab_pid_exist_check(modeling_pids, diag_pid, medications_pid, lab_pid,age_data = None):
    # Convert sets of pids for faster lookup
    diag_pid_set = set(diag_pid)
    medications_pid_set = set(medications_pid)
    lab_pid_set = set(lab_pid)

    # Create the result list using a single loop
    if age_data:
        result = [
        f"{age}_{int(pid in diag_pid_set)}{int(pid in medications_pid_set)}{int(pid in lab_pid_set)}"
        for pid,age in zip(modeling_pids,age_data)
    ]
    else:
        result = [
            f"{int(pid in diag_pid_set)}{int(pid in medications_pid_set)}{int(pid in lab_pid_set)}"
            for pid in modeling_pids
        ]
    
    return result

In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, average_precision_score

def get_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
    print("AUC:", roc_auc)

    # Calculate the Precision-Recall AUC
    auc_pr = average_precision_score(y_test, y_pred_proba[:, 1])
    print("Precision-Recall AUC:", auc_pr)

    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)

    # Compute confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()

    # Calculate Sensitivity and Specificity
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0  # handle division by zero
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0  # handle division by zero

    print("Sensitivity (Recall):", sensitivity)
    print("Specificity:", specificity)
    
    return cm, accuracy, roc_auc, auc_pr, sensitivity, specificity, report

# Config class

In [9]:
mlflow.set_tracking_uri("../Determine_ML_FLOW_Experiment")

In [17]:
class ml_config:
    base_folder ='../Determine_final_modeling_datasets/'
    columns_to_ignore_cat = ['PATIENT_NUM','FirstOutcomeDate','Outcome']
    target_column = 'Outcome'
    file = 'Determine_joined_med_usage_lab_domain_expert_median_diag_phemap_with_icd10z_bmi_bp_cvs_ordinal_nominal_encoded.parquet'
    
    patient_enc_info_path = '../Determine_cohort_after_visit_index_details.parquet'
    no_enc_ppid_experiment =  True
    
    mlflow_experiment_name = 'Test101_mlflow'
    
    model_name = 'catboost'
    
    

# Loading modeling data file

In [11]:
modeling_df = pl.read_parquet(ml_config.base_folder + ml_config.file)

In [12]:
modeling_df.filter(pl.col('PATIENT_NUM') == 297249)

PATIENT_NUM,Age_group,FirstOutcomeDate,Outcome,nitrofurantoin,nystatin,atorvastatin,amlodipine,lisinopril_hydrochlorothiazide,tadalafil,albuterol,chlorthalidone,potassium chloride,vilanterol_fluticasone,hydrocortisone,lisinopril,"sennosides, USP_docusate",mupirocin,folic acid,tamsulosin,meloxicam,amitriptyline,quetiapine,gabapentin,tranexamic acid,polyvinyl alcohol,terbinafine,ascorbic acid,ferrous sulfate,losartan,clavulanate_amoxicillin,penicillin V,duloxetine,clobetasol,methadone,bupropion,tretinoin,…,LOINC:14805-6,LOINC:62255-5,LOINC:13988-1,LOINC:18488-7,LOINC:21458-5,LOINC:60474-4,LOINC:734-4,LOINC:14118-4,LOINC:32693-4,LOINC:1994-3,LOINC:29945-3,mode_height,median_value,slope_weight,BMI,median_diastolic_value,slope_dia_bp,median_systolic_value,slope_sys_bp,ACS_MedHHIncome,ACS_GINI,ACS_Unemployment,ACS_pctPoverty100,ACS_pctCollGrad,Sex_CD_F,Sex_CD_M,Race_CD_01,Race_CD_02,Race_CD_03,Race_CD_04,Race_CD_05,Hispanic_CD_N,Hispanic_CD_Y,Gender_CD_GQ,Gender_CD_M,Gender_CD_TG,Gender_CD_W
i64,i64,datetime[μs],i64,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8


In [13]:
np.unique(modeling_df['Outcome'].to_list(),return_counts = True)

(array([0, 1]), array([992825,  49388]))

In [14]:
if ml_config.no_enc_ppid_experiment:
    modeling_patient_ids = modeling_df['PATIENT_NUM'].to_list()
    pids_enc_info_after_vi = pl.read_parquet(ml_config.patient_enc_info_path)['PATIENT_NUM'].to_list()
    pids_to_drop = set(modeling_patient_ids) - set(pids_enc_info_after_vi)
    modeling_df = modeling_df.filter(((~pl.col('PATIENT_NUM').is_in(pids_to_drop)) & (pl.col('Outcome')==0)) | (pl.col('Outcome')==1))
    print("Number of records after dropping the patients who dont have encounters after their index visit:",len(modeling_df))
    print("The target distribution is:", np.unique(modeling_df['Outcome'].to_list(),return_counts = True) )

Number of records after dropping the patients who dont have encounters after their index visit: 868792
The target distribution is: (array([0, 1]), array([819404,  49388]))


In [15]:
# modeling_df  = modeling_df.drop(ml_config.columns_to_drop)
# modeling_df.head()

In [16]:
### Defining categorical columns
cat_features = [col for col in modeling_df.columns if  not (col.startswith('LOINC') 
                                                            or col in ml_config.columns_to_ignore_cat
                                                            or col in ['BMI',
                                                                       'mode_height',
                                                                       'average_weight',
                                                                         'average_diastolic_value',
                                                                         'average_systolic_value',
                                                                          "ACS_MedHHIncome", 
                                                                       "ACS_GINI", 
                                                                       "ACS_Unemployment", 
                                                                       "ACS_pctPoverty100", 
                                                                       "ACS_pctCollGrad"]
                                                                          )]
numerical_features = [col for col in modeling_df.columns if col.startswith('LOINC') 
                                                            or col not in ml_config.columns_to_ignore_cat
                                                            or col in ['BMI',
                                                                       'mode_height',
                                                                       'average_weight',
                                                                         'average_diastolic_value',
                                                                         'average_systolic_value',
                                                                          "ACS_MedHHIncome", 
                                                                       "ACS_GINI", 
                                                                       "ACS_Unemployment", 
                                                                       "ACS_pctPoverty100", 
                                                                       "ACS_pctCollGrad"]
                                                                          ] +
                                                                        [
                                                                            "mode_height",
                                                                            "median_value",
                                                                            "slope_weight",
                                                                            "BMI",
                                                                            "median_diastolic_value",
                                                                            "slope_dia_bp",
                                                                            "median_systolic_value",
                                                                            "slope_sys_bp"
                                                                        ]

SyntaxError: invalid syntax (4056229119.py, line 15)

In [None]:
'average_weight' in cat_features

In [None]:
print("Number of categorical features: ",len(cat_features))
print("Number of numerical features: ",len(numerical_features))

In [None]:
loinc_columns = [col for col in modeling_df.columns if col.startswith('LOINC')]
print("Number of lab results features: ",len(loinc_codes))

In [None]:
### VERY IMPORTANT!!!!
modeling_df = modeling_df.with_columns([
    pl.col(col).cast(pl.Float32)
    for col in loinc_columns
])

In [None]:
modeling_df.head()

# Train/test split

In [None]:
data_train_pids, data_test_pids = train_test_split(modeling_df['PATIENT_NUM'], test_size=0.2, stratify=modeling_df['Outcome'], random_state = 42)

In [None]:
train_pids = data_train_pids.to_list()
test_pids = data_test_pids.to_list()

In [None]:
len(test_pids)

In [None]:
## saving test pids
# with open('test_data_pids.pkl', 'wb') as file: 
#     # A new file will be created 
#     pickle.dump(test_pids, file) 
    
# with open('train_data_pids.pkl', 'wb') as file: 
#     # A new file will be created 
#     pickle.dump(train_pids, file) 

# Open the file in binary mode 
# with open('train_data_pids.pkl', 'rb') as file: 
#     train_pids = pickle.load(file) 
# with open('test_data_pids.pkl', 'rb') as file: 
#     test_pids = pickle.load(file) 

In [None]:
data_train = modeling_df.filter(pl.col('PATIENT_NUM').is_in(train_pids))
data_test = modeling_df.filter(pl.col('PATIENT_NUM').is_in(test_pids))

In [25]:
data_test

PATIENT_NUM,Age_group,FirstOutcomeDate,Outcome,nitrofurantoin,nystatin,atorvastatin,amlodipine,lisinopril_hydrochlorothiazide,tadalafil,albuterol,chlorthalidone,potassium chloride,vilanterol_fluticasone,hydrocortisone,lisinopril,"sennosides, USP_docusate",mupirocin,folic acid,tamsulosin,meloxicam,amitriptyline,quetiapine,gabapentin,tranexamic acid,polyvinyl alcohol,terbinafine,ascorbic acid,ferrous sulfate,losartan,clavulanate_amoxicillin,penicillin V,duloxetine,clobetasol,methadone,bupropion,tretinoin,…,LOINC:14805-6,LOINC:62255-5,LOINC:13988-1,LOINC:18488-7,LOINC:21458-5,LOINC:60474-4,LOINC:734-4,LOINC:14118-4,LOINC:32693-4,LOINC:1994-3,LOINC:29945-3,mode_height,median_value,slope_weight,BMI,median_diastolic_value,slope_dia_bp,median_systolic_value,slope_sys_bp,ACS_MedHHIncome,ACS_GINI,ACS_Unemployment,ACS_pctPoverty100,ACS_pctCollGrad,Sex_CD_F,Sex_CD_M,Race_CD_01,Race_CD_02,Race_CD_03,Race_CD_04,Race_CD_05,Hispanic_CD_N,Hispanic_CD_Y,Gender_CD_GQ,Gender_CD_M,Gender_CD_TG,Gender_CD_W
i64,i64,datetime[μs],i64,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8
9958038,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,62.25,134.199997,-0.199997,24.346085,62.5,4.3,106.5,-4.9,120772.0,38.360001,2.55798,5.60921,41.368198,1,0,0,0,0,0,0,0,1,0,0,0,1
7615966,4,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,100.0,0.0,134.0,0.0,57422.0,40.57,2.27416,21.816099,5.5336,1,0,0,0,0,0,1,1,0,0,0,0,1
9387592,3,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,67.0,222.0,2.5,34.766316,72.0,-2.0,119.0,1.5,85047.0,40.709999,2.94748,6.97079,34.150398,0,1,0,0,0,0,1,1,0,0,1,0,0
10655534,5,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,67599.0,45.59,3.90124,11.2949,32.2836,1,0,0,0,0,0,1,0,1,0,0,0,1
3693315,1,,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,72.0,165.0,-0.5,22.37558,90.0,5.5,140.0,-4.6927e-16,67147.0,45.27,4.77789,29.688499,52.203701,0,1,0,0,1,0,0,1,0,0,1,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2030050,3,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,68.5,134.0,0.2,20.076083,72.5,-0.9,117.0,-0.8,59062.0,42.5,3.78051,17.4809,24.3978,0,1,0,0,0,0,1,1,0,0,1,0,0
7740621,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,65.0,115.0,0.0,19.134911,62.0,0.0,104.0,0.0,34075.0,48.869999,3.82143,25.0921,19.6677,1,0,0,0,0,0,1,1,0,0,0,0,1
5092649,1,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,65.0,139.0,3.5,23.128285,62.0,3.0,110.0,-7.0,58590.0,53.290001,2.96276,18.1803,60.6026,1,0,0,0,0,0,1,1,0,0,0,0,1
601971,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,66.93,151.800003,6.399994,23.822395,61.5,1.3,114.5,3.3,103421.0,38.419998,4.40513,4.33584,54.3027,1,0,0,0,0,0,1,1,0,0,0,0,1


In [26]:
np.unique(data_test['Outcome'].to_list(), return_counts =True)

(array([0, 1]), array([163881,   9878]))

In [27]:
del modeling_df

In [28]:
test_data_firstoutcome_df = data_test.select(['PATIENT_NUM','FirstOutcomeDate','Outcome'])

In [29]:
X_train,y_train = data_train.drop(['PATIENT_NUM','FirstOutcomeDate','Outcome']).to_pandas(), data_train['Outcome'].to_pandas()
X_test,y_test = data_test.drop(['PATIENT_NUM','FirstOutcomeDate','Outcome']).to_pandas(), data_test['Outcome'].to_pandas()

In [30]:
type(y_train)

pandas.core.series.Series

In [31]:
del data_train#, data_test

In [None]:
X_train.dtypes[-200:]

### Adjusting features based on model_name (some models need feature scaling for numerical data)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Initialize the scaler

if ml_config.model_name == 'logistic_regression':
    scaler = StandardScaler()

    # Fit and transform only the specified columns
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Boruta feature selection

In [None]:
!pip install boruta -q 

In [None]:
loinc_columns = [col for col in X_train.columns if col.startswith('LOINC')]
len(loinc_columns)

In [None]:
# X_train[loinc_columns] = X_train[loinc_columns].astype('float') 
# X_test[loinc_columns] = X_test[loinc_columns].astype('float')

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs =-1)

In [None]:
nan_columns = X_train.columns[X_train.isna().any()].tolist()

print(nan_columns)

In [None]:
print("hello")

In [None]:
trans = BorutaPy(clf, max_iter=5, random_state=42)
sel = trans.fit_transform(X_train.values, y_train.values)

In [None]:
print('Hello')

# ML models

## Single objective function

In [None]:
def objective(trial):
    if ml_config.model_name == 'xgboost':
    
        param = {
        'objective': 'binary:logistic',  # Binary classification
        #'eval_metric': 'auc',             # Evaluation metric
        'seed': 42,
        'eta': trial.suggest_float('eta', 0.01, 0.3, step=0.01),
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, step =100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),
        'gamma': trial.suggest_float('gamma', 0, 5.0),
        'scale_pos_weight': (len(y_train) - sum(y_train)) / sum(y_train),  # Class weight for imbalance
        }
        
        model = XGBClassifier(**param, enable_categorical=True, device="cuda")

    elif ml_config.model_name == 'catboost':
         params = {
            "iterations": trial.suggest_categorical('iterations',[200, 400, 600, 800]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "depth": trial.suggest_int("depth", 4, 12),
            #"subsample": trial.suggest_float("subsample", 0.05, 1.0),
            #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
            'task_type':"GPU",
        }
        model = CatBoostClassifier(**param, auto_class_weights='Balanced',allow_writing_files=False,
                                   silent=True)

    elif ml_config.model_name == 'random_forest':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 800),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        }
        model = RandomForestClassifier(**param, class_weight ='balanced', n_jobs = -1)

    elif ml_config.model_name == 'logistic_regression':
        param = {
            'C': trial.suggest_loguniform('C', 1e-2, 10.0),
            'max_iter': trial.suggest_int('max_iter', 100, 600),
            'solver': trial.suggest_categorical('solver', ['liblinear']),
        }
        model = LogisticRegression(**param,class_weight ='balanced', n_jobs = -1 )
        
    
    

    else:
        raise ValueError("Unsupported model name")
    
    
    model.fit(X_train, y_train)
    y_pred_proba = model._predict_proba(X_test)[:, 1]
        
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    return roc_auc


In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials= 5)

print('')
print('Best hyperparameters:', study.best_params)
print('Best AUC:', study.best_value)
print('')

In [None]:
if ml_config.model_name == 'xgboost':
        
    model = XGBClassifier(**best_params, 
                          scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train),  
#                           use_label_encoder=False,
                          device = 'cuda', 
                          )
    
elif ml_config.model_name == 'catboost':
    model = CatBoostClassifier(**best_params, auto_class_weights = 'Balanced', verbose=0)
elif ml_config.model_name == 'random_forest':
    model = RandomForestClassifier(**best_params, class_weight ='balanced', n_jobs = -1)
elif ml_config.model_name == 'logistic_regression':
    model = LogisticRegression(**best_params, class_weight ='balanced', n_jobs = -1)
else:
    raise ValueError("Unsupported model name")

In [None]:
model.fit(X_train,y_train)

In [None]:
## Metrics
conf_matrix, accuracy, roc_auc, sensitivity, specificity, auc_pr, classification_report = get_metrics(logistic_model,X_test, y_test)

In [None]:
## Confusion matrix
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in
                conf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in
                     conf_matrix.flatten()/np.sum(conf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
cm_plot = plt.subplot()
sns.heatmap(conf_matrix, annot=labels, fmt='', cmap='Blues')

#Labels, title and ticks
cm_plot.set_xlabel('Predicted labels');ax.set_ylabel('True labels');
cm_plot.set_title('Confusion Matrix'); 
plt.show()

In [None]:
# Save model using joblib
model_filename = f"../Determine_trained_models/{ml_config.model_name}_dataset_{ml_config.file.split('.')[0]}.pkl"
joblib.dump(model, model_filename)

print(f"Model saved as {model_filename}")

## Saving results and details to ml flow

In [None]:
params = model.get_params()


# Create a new MLflow Experiment
mlflow.set_experiment(f"{ml_cofig.mlflow_experiment_name}_ModelName_{ml_config.model_name}",
                     description=ml_config.description)

# Start an MLflow run
with mlflow.start_run():
    
    #set tags
    tags = {
        'model_name': ml_config.model_name
        'dataset_filename':ml_config.file
    }
    
    # Log the hyperparameters   
    mlflow.log_params(params)
    
    mlflow.log_input(X_train, context="training")
    mlflow.log_input(X_test, context="testing")

    # Log the loss metric
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("sensitivity", sensitivity)
    mlflow.log_metric("specificity", sensitivity)
    mlflow.log_metric("auc_pr", accuracy)
    mlflow.log_dict(classification_report, "classification_report.json")
    
    mlflow.log_image(cm_plot, "confusion_matrix.png")



# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
loinc_columns = [col for col in X_train.columns if col.startswith('LOINC')]
len(loinc_columns)

In [None]:
X_train.dtypes[-100:]

In [None]:
X_train[loinc_columns] = X_train[loinc_columns].astype('float') 
X_test[loinc_columns] = X_test[loinc_columns].astype('float')

In [None]:
print('Hello')

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform only the specified columns
X_train[loinc_columns] = scaler.fit_transform(X_train[loinc_columns])
X_test[loinc_columns] = scaler.transform(X_test[loinc_columns])

In [None]:
# Initialize the Logistic Regression model with class_weight set to 'balanced'
logistic_model = LogisticRegression(class_weight='balanced', random_state=42, n_jobs =-1, max_iter =300)

# Fit the model on the training data
logistic_model.fit(X_train, y_train)

In [None]:
get_metrics(logistic_model,X_test, y_test)

In [None]:
print('Hello')

## Catboost

In [32]:
from catboost import CatBoostClassifier

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [33]:
len(X_train.columns)

2805

In [34]:
def objective(trial):
        params = {
            "iterations": trial.suggest_categorical('iterations',[200, 400, 600, 800]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "depth": trial.suggest_int("depth", 4, 12),
            #"subsample": trial.suggest_float("subsample", 0.05, 1.0),
            #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
            'task_type':"GPU",
        }

        model = CatBoostClassifier(**params , auto_class_weights='Balanced',allow_writing_files=False,
                                   #cat_features=cat_features,
                                   silent=True)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
#         f1 = f1_score(y_test, predictions,average ='macro')
        y_pred_probs = model.predict_proba(X_test)[:,1]
        roc_auc = roc_auc_score(y_test, y_pred_probs)
        return roc_auc
    
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

best_params  = study.best_params

[I 2025-04-11 17:35:58,481] A new study created in memory with name: no-name-914e0cef-8084-455b-8ae6-298eda97c754
[I 2025-04-11 17:36:33,130] Trial 0 finished with value: 0.8278207332944943 and parameters: {'iterations': 600, 'learning_rate': 0.04889464272188424, 'depth': 8, 'min_data_in_leaf': 9}. Best is trial 0 with value: 0.8278207332944943.
[I 2025-04-11 17:37:25,406] Trial 1 finished with value: 0.809710439648479 and parameters: {'iterations': 600, 'learning_rate': 0.0033212105108746228, 'depth': 9, 'min_data_in_leaf': 37}. Best is trial 0 with value: 0.8278207332944943.
[I 2025-04-11 17:37:50,730] Trial 2 finished with value: 0.8222787222634456 and parameters: {'iterations': 800, 'learning_rate': 0.027260807090613178, 'depth': 4, 'min_data_in_leaf': 235}. Best is trial 0 with value: 0.8278207332944943.
[I 2025-04-11 17:38:10,377] Trial 3 finished with value: 0.8031796482447309 and parameters: {'iterations': 400, 'learning_rate': 0.007425618043132102, 'depth': 5, 'min_data_in_lea

Number of finished trials: 5
Best trial:
  Value: 0.8278207332944943
  Params: 
    iterations: 600
    learning_rate: 0.04889464272188424
    depth: 8
    min_data_in_leaf: 9


In [35]:
best_params = study.best_params

In [36]:
# best_params = {'iterations': 500, 'learning_rate': 0.043419222852673814, 'depth': 12, 'min_data_in_leaf': 54}

model = CatBoostClassifier(**best_params, auto_class_weights='Balanced',task_type = "GPU",allow_writing_files=False,
                           silent=True)
model.fit(X_train, y_train)


<catboost.core.CatBoostClassifier at 0x7f2e1c17d960>

In [37]:
get_metrics(model,X_test, y_test)

Accuracy: 0.7369344897242732
AUC:  0.8278207116737611
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.74      0.84    163881
           1       0.15      0.76      0.25      9878

    accuracy                           0.74    173759
   macro avg       0.56      0.75      0.54    173759
weighted avg       0.93      0.74      0.81    173759

Sensitivity (Recall): 0.755011135857461
Specificity: 0.7358449118567741


In [None]:
X_train.columns[2814]

In [None]:
### saving model

model.save_model('../Determine_trained_models/Catboost'+ ml_config.file)

### False positive indexes

In [None]:
model  = CatBoostClassifier().load_model('../Determine_trained_models/Catboost'+ ml_config.file)

In [None]:
# Assuming 'model' is your trained model and it has a predict method
# Predict on the train data
predictions = model.predict(X_test)

# Boolean array where True indicates false positives
false_positives_mask = (predictions == 1) & (y_test == 0)
false_negatives_mask = (predictions == 0) & (y_test == 1)
# Get indices of false positives
false_positive_indices = np.where(false_positives_mask)[0]
false_negatives_indices = np.where(false_negatives_mask)[0]

print("Indices of False Positives:", false_positive_indices)
print("Indices of False Negatives:", false_negatives_indices)

In [None]:
len(false_positive_indices)

In [None]:
len(false_negatives_indices)

In [None]:
data_test_pids_fp = data_test[false_positive_indices]['PATIENT_NUM'].to_list()
data_test_pids_fn = data_test[false_negatives_indices]['PATIENT_NUM'].to_list()

In [None]:
directory_path = f'./FP_FN_{ml_config.file.split('.')[0]}'

# Create the directory
os.makedirs(directory_path, exist_ok=True)

In [None]:
with open(f'./{directory_path}/{ml_config.model_name}_false_positives_pids_test.pkl', 'wb') as file: 
    pickle.dump(data_test_pids_fp, file) 

with open(f'./{directory_path}/{ml_config.model_name}_false_positives_pids_test.pkl', 'rb') as file: 
    data_test_pids_fp = pickle.load(file) 
    
    
with open(f'./{directory_path}/{ml_config.model_name}_false_negatives_pids_test.pkl', 'wb') as file: 
    pickle.dump(data_test_pids_fn, file) 

with open(f'./{directory_path}/{ml_config.model_name}_false_negatives_pids_test.pkl', 'rb') as file: 
    data_test_pids_fn = pickle.load(file) 

In [None]:
from datetime import datetime

In [None]:
data_test.filter(pl.col('PATIENT_NUM').is_in(data_test_pids_fn)).sort('FirstOutcomeDate').filter(pl.col("FirstOutcomeDate") < datetime(2019, 1, 2))
          #,return_counts = True)

In [None]:
data_test.filter(pl.col('PATIENT_NUM').is_in(data_test_pids_fp)).filter(pl.col("Outcome")== 1)
          #,return_counts = True)

In [None]:
data_test[[36, 54]]

### Loading model

In [None]:
model  = CatBoostClassifier().load_model('../Determine_trained_models/catboost_whole_dataset_phecodes_bmi_bp')

In [None]:
get_metrics(model,X_test, y_test)

In [None]:
# Get feature importances
feature_importances = model.get_feature_importance()

# Get feature names from the training data (X_train should be a DataFrame)
feature_names = X_train.columns

# Combine feature names and importances into a list of tuples
features = list(zip(feature_names, feature_importances))

# Sort features by importance
sorted_features = sorted(features, key=lambda x: x[1], reverse=True)

# Select top 10 features
top_10_features = sorted_features[:10]
# [('average_bmi', 10.15971291924134),
# ('Age_group', 9.380785704361632),
# ('LOINC:2345-7', 8.962230022321915),
# ('average_systolic_value', 3.329221847438418),
# ('LOINC:27353-2', 3.2908540684274525),
# ('Hispanic_CD_Y', 2.6087360120255334),
# ('LOINC:9318-7', 2.0793359364139765),
# ('average_diastolic_value', 1.9829892483669505),
# ('Race_CD_05', 1.755046409270668),
# ('LOINC:2349-9', 1.6860192611322427)]

# Separate the feature names and their importances for plotting
top_feature_names, top_importances = zip(*top_10_features)

# Plot top 10 feature importances
plt.figure(figsize=(10, 6))
plt.barh(top_feature_names, top_importances, color='skyblue')
plt.xlabel('Importance')
plt.title('Top 10 Feature Importances')
plt.gca().invert_yaxis()  # Invert y-axis to show the highest importance at the top
plt.show()

In [None]:
top_10_features

In [None]:
feature_importance

## Xgboost

In [None]:
import xgboost as xgb

In [None]:
X_train.dtypes[-100:]

In [None]:
loinc_columns = [col for col in X_train.columns if col.startswith('LOINC')]
len(loinc_columns)

In [None]:
X_train[loinc_columns] = X_train[loinc_columns].astype('float') 

In [None]:
X_test[loinc_columns] = X_test[loinc_columns].astype('float')

In [None]:
print("hello worlds")

In [None]:
def objective(trial):
    # Suggest hyperparameters
    param = {
        'objective': 'binary:logistic',  # Binary classification
        #'eval_metric': 'auc',             # Evaluation metric
        'seed': 42,
        'eta': trial.suggest_float('eta', 0.01, 0.3, step=0.01),
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, step =100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),
        'gamma': trial.suggest_float('gamma', 0, 5.0),
        'scale_pos_weight': (len(y_train) - sum(y_train)) / sum(y_train),  # Class weight for imbalance
    }
    
    # Train the model
    model = xgb.XGBClassifier(**param, enable_categorical=True, device="cuda")
    model.fit(X_train, y_train)

    # Predict and evaluate
    preds = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, preds)
    return auc


In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

best_params  = study.best_params

In [None]:
scale_pos_weight =  (len(y_train) - sum(y_train)) / sum(y_train)
model = xgb.XGBClassifier(#**param, 
                          scale_pos_weight = scale_pos_weight, enable_categorical=True, device="cuda")
model.fit(X_train, y_train)

In [None]:
np.unique(y_train.values, return_counts = True)

In [None]:
get_metrics(model,X_test, y_test)

## Random Forest

In [None]:
### CPU Optimaizations ###
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
### GPU Optimizations ###
from sklearnex import patch_sklearn, config_context
patch_sklearn()


In [None]:
def objective(trial):
    # Number of trees in random forest
    n_estimators = trial.suggest_int(name="n_estimators", low=100, high=700, step=100)

    # Number of features to consider at every split
    max_features = trial.suggest_categorical(name="max_features", choices=['auto', 'sqrt']) 

    # Maximum number of levels in tree
    max_depth = trial.suggest_int(name="max_depth", low=10, high=110, step=20)

    # Minimum number of samples required to split a node
    min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=10, step=2)

    # Minimum number of samples required at each leaf node
    min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)
    
    params = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf
    }
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(y_test, y_pred_probs)

    return mean_cv_accuracy

In [None]:
study = optuna.create_study()
study.optimize(objective, n_trials=10)
print(study.best_params)

In [None]:

# Train a new model using the best parameters
best_model = RandomForestClassifier(**study.best_params)
best_model.fit(X_train, y_train)

# Feature Importance using SHaP

# Fairness metrics

# Sub-group performance

In [None]:
## model to use
model  = CatBoostClassifier().load_model('../Determine_trained_models/catboost_whole_dataset_phecodes_bmi_bp')

In [None]:
X_test.columns[:15]

In [None]:
y_test.values

In [None]:
def predict_on_data(sub_x_test,sub_y_test, model):
    y_pred = model.predict(sub_x_test)
    y_pred_probs = model.predict_proba(sub_x_test)[:,1]
    
    tn, fp, fn, tp = confusion_matrix(sub_y_test, y_pred).ravel()

    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)

    print("Sensitivity: ", sensitivity)
    print("Specificity: ", specificity)
    roc_auc = roc_auc_score(sub_y_test, y_pred_probs)
    print("ROC curve: ", roc_auc)
    
    acc = accuracy_score(sub_y_test, y_pred)
    print('Accuracy: ', acc)
    return sensitivity, specificity, roc_auc, acc
    
     

In [None]:
# - **'DEM|RACE:01'**: White
# - **'DEM|RACE:02'**: Black or African American
# - **'DEM|RACE:03'**: Asian
# - **'DEM|RACE:04'**: Native Hawaiian or Other Pacific Islander
# - **'DEM|RACE:05'**: American Indian or Alaska Native
# - **'DEM|RACE:06'**: Other or Unknown
# - **'DEM|RACE:07'**: More than one race

In [None]:
import matplotlib.pyplot as plt

In [None]:
race_cols = [
    "Race_CD_01",
    "Race_CD_02",
    "Race_CD_03",
    "Race_CD_04",
    "Race_CD_05",
    "Race_CD_06",
    "Race_CD_07",
    "Race_CD_UNK"
]

races = []
sens = []
specs = []
roc_aucs = []
counts = []
races = []
class_1_count = []

for col in race_cols:
    if col.endswith('01'):
        race = 'white'
    if col.endswith('02'):
        race = 'Black or African American'
    if col.endswith('03'):
        race = 'Asian'
    if col.endswith('04'):
        race = 'Native Hawaiian or Other Pacific Islander'
    if col.endswith('05'):
        race = 'American Indian or Alaska Native'
    if col.endswith('06'):
        race = 'Multiple Race'
    if col.endswith('07'):
        race = 'refuse to Answer'
    elif col.endswith('UNK'):
        race = 'UNK'
        
    print("Number of data points with race {} are: {}".format( col, X_test[X_test[col]==1].shape[0] ))
    
    # Get indexes where the value is 1 for the current race column
    X_test_race = X_test[X_test[col] == 1]
    indexes = X_test_race.index
    
    # Retrieve the respective y_test data using these indexes
    y_test_race = y_test.loc[indexes]
    
    sensitivity, specificity, roc_auc = predict_on_data(X_test_race, y_test_race, model)
    races.append(race)
    sens.append(sensitivity)
    specs.append(specificity)
    roc_aucs.append(roc_auc)
    class_1_count.append(sum(y_test_race.values))
    counts.append(X_test[X_test[col]==1].shape[0])
    

    

In [None]:
df = pd.DataFrame({
    'Race': races,
    'counts':counts,
    'class_1 count': class_1_count,
    'Sensitivity': sens,
    'Specificity': specs,
    'ROC_AUC': roc_aucs
})
df

In [None]:
to_collapse = df.iloc[5:8]

# Calculate the new collapsed row
new_row = {
    'Race': 'Collapsed Race',
    'counts': to_collapse['counts'].sum(),
    'class_1 count': to_collapse['class_1 count'].sum(),
    'Sensitivity': to_collapse['Sensitivity'].mean(),
    'Specificity': to_collapse['Specificity'].mean(),
    'ROC_AUC': to_collapse['ROC_AUC'].mean()
}

# Drop the old rows and append the new row
df = df.drop(index=range(5, 8)).append(new_row, ignore_index=True)

In [None]:
new_row

In [None]:
age_groups = np.unique(X_test['Age_group'].values)

age_ranges = []
sens = []
specs = []
roc_aucs = []
counts =[]
class_1_count = []

races = []
for age_enc in age_groups:
    
    if age_enc == 0:
        age = '18-34'
    elif age_enc == 1:
        age = '35-44'
    elif age_enc == 2:
        age = '45-54'
    elif age_enc == 3:
        age = '54-65'
    elif age_enc == 4:
        age = '65-74'
    else:
        age = '75_older'
        
    print("Number of data points with age group {} are: {}".format( age_enc, X_test[X_test['Age_group']== age_enc].shape[0] ))
    
    # Get indexes where the value is 1 for the current race column
    X_test_race = X_test[X_test['Age_group']== age_enc]
    indexes = X_test_race.index
    
    # Retrieve the respective y_test data using these indexes
    y_test_race = y_test.loc[indexes]
    
    sensitivity, specificity, roc_auc = predict_on_data(X_test_race, y_test_race, model)
    age_ranges.append(age)
    sens.append(sensitivity)
    specs.append(specificity)
    roc_aucs.append(roc_auc)
    class_1_count.append(sum(y_test_race.values))
    counts.append(X_test[X_test['Age_group']== age_enc].shape[0])
    
#     # SHAP analysis for top features
#     explainer = shap.TreeExplainer(model)
#     shap_values = explainer.shap_values(X_test_race)
    
#     # Compute mean absolute SHAP values for each feature
#     mean_shap_values = np.abs(shap_values).mean(axis=0)
    
#     # Get feature names
#     feature_names = X_test_race.columns
    
#     # Get and display top 5 features for the current age group
#     top_5_indices = np.argsort(mean_shap_values)[-5:][::-1]
#     top_5_features = [feature_names[i] for i in top_5_indices]
# #     top_features.append(top_5_features)
    
#     # Plotting top 5 features for the current age group
#     plt.figure(figsize=(10, 6))
#     sns.barplot(x=mean_shap_values[top_5_indices], y=[feature_names[i] for i in top_5_indices], palette="viridis")
#     plt.title(f"Top 5 Features for Age Group {age}: SHAP Importance")
#     plt.xlabel("Mean SHAP Value (absolute)")
#     plt.ylabel("Features")
#     plt.show()
    

In [None]:
df = pd.DataFrame({
    'Age_Group': age_ranges,
    'counts': counts,
    'class_1 count': class_1_count,
    'Sensitivity': sens,
    'Specificity': specs,
    'ROC_AUC': roc_aucs,
})
df

## Gender

In [None]:
gender_cols = ["Gender_CD_GQ", "Gender_CD_M", "Gender_CD_TG", "Gender_CD_UNK", "Gender_CD_W"]

In [None]:
gender_list = []
sens = []
specs = []
roc_aucs = []
counts =[]
class_1_count = []

races = []
for gen in gender_cols:
    
    if gen.endswith('GQ'):
        gender = 'GenderQueer'
    elif gen.endswith('M'):
        gender = 'Man'
    elif gen.endswith('W'):
        gender = 'Woman'
    elif gen.endswith('TG'):
        gender = 'Transgender'
    elif gen.endswith('UNK'):
        gender = 'Unknown'

        
    print("Number of data points with gender group {} are: {}".format( gender, X_test[X_test[gen]== 1].shape[0] ))
    
    # Get indexes where the value is 1 for the current race column
    X_test_gen = X_test[X_test[gen]== 1]
    indexes = X_test_gen.index
    
    # Retrieve the respective y_test data using these indexes
    y_test_gen = y_test.loc[indexes]
    
    sensitivity, specificity, roc_auc,_ = predict_on_data(X_test_gen, y_test_gen, model)
    gender_list.append(gender)
    sens.append(sensitivity)
    specs.append(specificity)
    roc_aucs.append(roc_auc)
    class_1_count.append(sum(y_test_gen.values))
    counts.append(X_test[X_test[gen]== 1].shape[0])

In [None]:
df = pd.DataFrame({
    'Gender': gender_list,
    'counts': counts,
    'class_1 count': class_1_count,
    'Sensitivity': sens,
    'Specificity': specs,
    'ROC_AUC': roc_aucs,
})
df.reset_index(drop=True, inplace=True)
df

# 5-year performance

In [None]:
model  = CatBoostClassifier().load_model('../Determine_trained_models/catboost_whole_dataset_phecodes_bmi_bp')

In [None]:
req_test_dates_df = test_data_firstoutcome_df.drop_nulls().to_pandas()
req_test_dates_df

In [None]:
# Constants
start_date = pd.Timestamp('2019-04-01')

# Calculate the difference in years between the Index_Start_date and the start_date if data is available
req_test_dates_df['year_difference'] = (req_test_dates_df['FirstOutcomeDate'] - start_date).dt.days / 365.25

# Filter DataFrames based on year difference, ignore NaNs as they indicate missing data in cohort_df
df_less_than_1 = req_test_dates_df[req_test_dates_df['year_difference'] < 1].dropna()
df_1_to_2 = req_test_dates_df[(req_test_dates_df['year_difference'] >= 1) & (req_test_dates_df['year_difference'] < 2)].dropna()
df_2_to_3 = req_test_dates_df[(req_test_dates_df['year_difference'] >= 2) & (req_test_dates_df['year_difference'] < 3)].dropna()
df_3_to_4 = req_test_dates_df[(req_test_dates_df['year_difference'] >= 3) & (req_test_dates_df['year_difference'] < 4)].dropna()
df_4_to_5 = req_test_dates_df[(req_test_dates_df['year_difference'] >= 4) & (req_test_dates_df['year_difference'] < 5)].dropna()

In [None]:
year_ranges = []
sens = []
specs = []
roc_aucs = []
counts =[]
class_1_count = []
acc_scores = []

In [None]:
for i,buf_df in enumerate([df_less_than_1, df_1_to_2, df_2_to_3, df_3_to_4]):
     # Use the index of buf_df
    indices = buf_df.index
    # Use the index to filter X_test
    X_test_buf = X_test.loc[indices.intersection(X_test.index)]
    y_test_buf = y_test.loc[indices.intersection(X_test.index)]
    
    if i == 0:
        year ='Less than 1 year'
    if i == 1:
        year ='Between 1 and 2 year'
    if i == 2:
        year ='Between 2 and 3 year'
    if i == 3:
        year ='Between 3 and 4 year'
    
    print(year)
    sensitivity, specificity, roc_auc, acc = predict_on_data(X_test_buf, y_test_buf, model)
    year_ranges.append(year)
    sens.append(sensitivity)
    specs.append(specificity)
    roc_aucs.append(roc_auc)
    counts.append(len(y_test_buf))
    acc_scores.append(acc)

In [None]:
df = pd.DataFrame({
    'Age_Group': year_ranges,
    'counts (class 1)': counts,
    'Accuracy (class 1)': acc_scores,
    'Sensitivity': sens,
    'Specificity': specs,
    'ROC_AUC': roc_aucs,
})
df