In [None]:
import datetime
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom as py
import shap
import skimage as sk
from scipy.stats import gmean

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
#from sklearn import metrics
from sklearn.metrics import make_scorer, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, roc_auc_score, balanced_accuracy_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Base directory for every file this notebook reads or writes.
# Later cells build paths by plain string concatenation (root + "classification/" + ...),
# so this value must end with a path separator.
root= "the root/path of the folders"

In [None]:
# Load the training, validation and testing splits.
# Each split is a feature table (CSV) plus a label array (.npy) under "<root>classification/".

xtrain_df, ytrain = (
    pd.read_csv(f"{root}classification/xtrain_df.csv"),
    np.load(f"{root}classification/ytrain.npy"),
)
print("Done")

xval_df, yval = (
    pd.read_csv(f"{root}classification/xval_df.csv"),
    np.load(f"{root}classification/yval.npy"),
)
print("Done")

xtest_df, ytest = (
    pd.read_csv(f"{root}classification/xtest_df.csv"),
    np.load(f"{root}classification/ytest.npy"),
)
print("Done")

# Shapes of the three feature tables.
print(f"\nTraining size= {xtrain_df.shape}")
print(f"Validation size= {xval_df.shape}")
print(f"Testing size= {xtest_df.shape}")

# Distinct label values and their counts in each split.
print(f"\nytrain= {np.unique(ytrain, return_counts= True)}")
print(f"yval= {np.unique(yval, return_counts= True)}")
print(f"ytest= {np.unique(ytest, return_counts= True)}\n")

In [None]:
# Custom metric and scorer objects


def tnr(y_true, y_pred):
    """Return the True Negative Rate, TN / (TN + FP).

    The flattened confusion matrix is unpacked into exactly four cells, so
    this only works when the confusion matrix is 2x2.
    """
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_true, y_pred).ravel()
    negatives = true_neg + false_pos
    return true_neg / negatives


# Each metric wrapped by make_scorer with its default settings.
tnr_scorer = make_scorer(tnr)
roc_auc_scorer = make_scorer(roc_auc_score)
f1_scorer = make_scorer(f1_score)
recall_scorer = make_scorer(recall_score)
precision_scorer = make_scorer(precision_score)
balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)
mcc_scorer = make_scorer(matthews_corrcoef)


# CCS function
def compute_ccs(metrics_dict):
    """Combine four metrics into one CCS value via their geometric mean.

    Parameters
    ----------
    metrics_dict : dict
        Must contain the keys "Balanced Accuracy", "F1 Score", "ROC_AUC"
        and "MCC". Other keys are ignored.

    Returns
    -------
    float
        Geometric mean of the four metrics after clipping each at 0
        (MCC can be negative, and the geometric mean needs non-negative
        inputs). The result is 0.0 as soon as any clipped metric is 0.

    Raises
    ------
    KeyError
        If one of the four required keys is missing.
    """
    metric_keys = ("Balanced Accuracy", "F1 Score", "ROC_AUC", "MCC")
    clipped = [max(0, metrics_dict[key]) for key in metric_keys]

    # A single zero forces the geometric mean to zero; returning early
    # avoids taking log(0) inside gmean.
    if min(clipped) == 0:
        return 0.0

    return gmean(clipped)

In [None]:
#Random Forest Classifier: train on the training data, then evaluate on the validation (and optionally the testing) data

def _ccs_percent(model, features, labels):
    """Score a fitted classifier on one split and return its CCS as a percentage."""
    predictions = model.predict(features)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    probabilities = model.predict_proba(features)[:, 1]

    split_metrics = {
        "Balanced Accuracy": balanced_accuracy_score(labels, predictions),
        "F1 Score": f1_score(labels, predictions),
        "ROC_AUC": roc_auc_score(labels, probabilities),
        "MCC": matthews_corrcoef(labels, predictions),
    }
    return compute_ccs(split_metrics) * 100


def train_and_evaluate(train_x, train_y, val_x, val_y, test_x=None, test_y=None):
    """Fit 20 differently-seeded random forests and report their CCS scores.

    For each seed in range(20) a RandomForestClassifier (250 trees, balanced
    class weights) is fitted on the training data and scored on the
    validation data, and on the testing data when it is supplied.

    Parameters
    ----------
    train_x, train_y : training features and labels.
    val_x, val_y : validation features and labels.
    test_x, test_y : optional testing features and labels. They must be
        given together. When omitted, the test evaluation and its printed
        summary are skipped.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Per-seed validation CCS and per-seed testing CCS, both in percent.
        The second array is empty when no testing data was supplied.

    Raises
    ------
    ValueError
        If only one of test_x / test_y is provided.
    """
    if (test_x is None) != (test_y is None):
        raise ValueError("test_x and test_y must be provided together")
    has_test = test_x is not None

    # CCS scores collected across the random runs
    val_ccs_scores = []
    test_ccs_scores = []

    for seed in range(20):  # 20 different random states
        model = RandomForestClassifier(250, random_state=seed, n_jobs=-1, class_weight="balanced")
        model.fit(train_x, train_y)

        val_ccs_scores.append(_ccs_percent(model, val_x, val_y))
        if has_test:
            test_ccs_scores.append(_ccs_percent(model, test_x, test_y))

    # Print final CCS
    print("\n")
    print(f"Validation CCS = {val_ccs_scores}")
    print(f"Mean validation CCS = {round(np.mean(val_ccs_scores), 3)}")

    if has_test:
        print(f"\nTesting CCS = {test_ccs_scores}")
        print(f"Mean testing CCS = {round(np.mean(test_ccs_scores), 3)}\n")

    return np.array(val_ccs_scores), np.array(test_ccs_scores)

In [None]:
# Random-forest feature importance (FI), averaged over k differently-seeded forests

k = 100
fi_scores = []           # one importance vector per run
feature_importances = 0  # running sum; becomes an array after the first addition

for i in range(k):
    rfc = RandomForestClassifier(n_estimators=250, class_weight="balanced", n_jobs=-1, random_state=i)
    history = rfc.fit(xtrain_df, ytrain)

    # Accumulate the total and keep this run's scores as well
    feature_importances += rfc.feature_importances_
    fi_scores.append(rfc.feature_importances_)

    print(f"{i+1}) Done.")

# Mean importance per feature across the k runs
fi = feature_importances / k

# One row per feature, in the column order of xtrain_df
rfc_fi = pd.DataFrame({"feature": xtrain_df.columns, "mean_fi": fi})

# Min-max scale to [0, 1] so the scores are comparable with the other importance measures
rfc_fi["fi_normalized"] = MinMaxScaler(feature_range=(0, 1)).fit_transform(rfc_fi[["mean_fi"]])

In [None]:
# "SHAP Averaging" or "Ensemble SHAP"

def _positive_class_shap(values):
    """Return the SHAP matrix for class index 1 from TreeExplainer.shap_values output.

    Two return layouts are handled:
    * a list with one (n_samples, n_features) array per class;
    * a single 3-D array whose last axis indexes the classes.
    A 2-D array is returned unchanged.
    NOTE(review): the 2-D case assumes the array already refers to a single
    output - confirm against the installed SHAP version.
    """
    if isinstance(values, list):
        return np.asarray(values[1])

    values = np.asarray(values)
    if values.ndim == 3:
        # Copy so that the full 3-D array is not kept alive by the slice.
        return values[:, :, 1].copy()
    return values


# Initialize variables
shap_score = None  # Running sum of SHAP matrices; shape is unknown until the first run
shap_scores = []   # SHAP matrix of every run

k = 100  # Number of runs

for i in range(k):
    # Train the Random Forest model
    rfc = RandomForestClassifier(250, random_state=i, n_jobs=-1, class_weight="balanced")
    history = rfc.fit(xtrain_df, ytrain)

    # Compute SHAP values on the training data
    explainer = shap.TreeExplainer(rfc)
    shap_value = explainer.shap_values(np.array(xtrain_df), check_additivity=False)

    # Indexing shap_value[1] directly is only right for the list layout;
    # on a 3-D array it would select sample 1 instead of class 1.
    class1_shap = _positive_class_shap(shap_value)
    shap_scores.append(class1_shap)

    # Accumulate SHAP values
    if shap_score is None:
        # Copy, so the in-place additions below do not modify shap_scores[0]
        shap_score = np.array(class1_shap)
    else:
        shap_score += class1_shap

    print(f"{i+1}) SHAP calculation done.")


# Per feature: absolute value of the across-run sum, averaged over samples,
# then divided by k (i.e. the sample-mean of |mean SHAP across runs|).
sh = np.mean(np.abs(shap_score), axis=0) / k


#MinMax scaling is used for normalizing the scores to bring them to a same range for comparison.
rfc_shap = pd.DataFrame({"feature": xtrain_df.columns, "si": sh})


# Normalize SHAP
rfc_shap['si_normalized'] = MinMaxScaler(feature_range=(0, 1)).fit_transform(rfc_shap[['si']])

In [None]:
#Permutation importance (PI)

rfc = RandomForestClassifier(250, random_state= 61, n_jobs= -1, class_weight= "balanced")

# permutation_importance scores an already-fitted estimator; it does not fit it.
# Without this call the freshly constructed forest is unfitted and the next line fails.
rfc.fit(xtrain_df, ytrain)

# Mean drop in binary F1 over 100 shuffles of each feature, measured on the training data.
perm_importance= permutation_importance(rfc, xtrain_df, ytrain, scoring= make_scorer(f1_score, average= "binary"),
                                        n_repeats= 100, random_state= 51, n_jobs= -1)


#MinMax scaling is used for normalizing the scores to bring them to a same range for comparison.
rfc_pi= pd.DataFrame({"feature": xtrain_df.columns, "pi": perm_importance.importances_mean})

# Normalize PI
rfc_pi['pi_normalized'] = MinMaxScaler(feature_range=(0, 1)).fit_transform(rfc_pi[['pi']])

In [None]:
# Gather the three normalized importance scores into one table, one row per feature

rfc_fs = pd.DataFrame({"feature": xtrain_df.columns}).assign(
    fi_normalized=rfc_fi["fi_normalized"],
    si_normalized=rfc_shap["si_normalized"],
    pi_normalized=rfc_pi["pi_normalized"],
)

In [None]:
                                                        #Feature Selection Strategies

In [None]:
# Summary statistics of the normalized RF feature importance across features.
# The "95_CI" bounds are the 2.5th and 97.5th percentiles of the scores.

fi_mean, fi_std = (round(stat(rfc_fs["fi_normalized"]), 5) for stat in (np.mean, np.std))
fi_ci_lower, fi_ci_upper = (
    round(np.percentile(rfc_fs["fi_normalized"], q), 5) for q in (2.5, 97.5)
)

print(
    "RF Feature Importance\n"
    f"Mean= {fi_mean}\n"
    f"Stdev= {fi_std}\n"
    f"95_CI_lower= {fi_ci_lower}\n"
    f"95_CI_upper= {fi_ci_upper}"
)

In [None]:
# Summary statistics of the normalized SHAP scores across features.
# The "95_CI" bounds are the 2.5th and 97.5th percentiles of the scores.

si_mean, si_std = (round(stat(rfc_fs["si_normalized"]), 5) for stat in (np.mean, np.std))
si_ci_lower, si_ci_upper = (
    round(np.percentile(rfc_fs["si_normalized"], q), 5) for q in (2.5, 97.5)
)

print(
    "SHAP values\n"
    f"Mean= {si_mean}\n"
    f"Stdev= {si_std}\n"
    f"95_CI_lower= {si_ci_lower}\n"
    f"95_CI_upper= {si_ci_upper}"
)

In [None]:
# Summary statistics of the normalized permutation importance across features.
# The "95_CI" bounds are the 2.5th and 97.5th percentiles of the scores.

pi_mean, pi_std = (round(stat(rfc_fs["pi_normalized"]), 5) for stat in (np.mean, np.std))
pi_ci_lower, pi_ci_upper = (
    round(np.percentile(rfc_fs["pi_normalized"], q), 5) for q in (2.5, 97.5)
)

print(
    "Permutation Importance\n"
    f"Mean= {pi_mean}\n"
    f"Stdev= {pi_std}\n"
    f"95_CI_lower= {pi_ci_lower}\n"
    f"95_CI_upper= {pi_ci_upper}"
)

In [None]:
                    # Feature selection using the mean of each normalized score as its threshold (the best results were achieved with mean thresholds)

In [None]:
# Keep the features whose normalized RF importance is at least the mean.
# The scores live in rfc_fs (built above); the previously referenced
# rfc_fs_val is never defined in this notebook.
cols_rfc_fi = rfc_fs.loc[rfc_fs["fi_normalized"] >= fi_mean, "feature"].tolist()
print("Total=", len(rfc_fs))
print("Retained=", len(cols_rfc_fi))
print("Rejected=", len(rfc_fs) - len(cols_rfc_fi))

In [None]:
# Keep the features whose normalized SHAP score is at least the mean.
# The scores live in rfc_fs (built above); the previously referenced
# rfc_fs_val is never defined in this notebook.
cols_rfc_si = rfc_fs.loc[rfc_fs["si_normalized"] >= si_mean, "feature"].tolist()
print("Total=", len(rfc_fs))
print("Retained=", len(cols_rfc_si))
print("Rejected=", len(rfc_fs) - len(cols_rfc_si))

In [None]:
# Keep the features whose normalized permutation importance is at least the mean.
# The scores live in rfc_fs (built above); the previously referenced
# rfc_fs_val is never defined in this notebook.
cols_rfc_pi = rfc_fs.loc[rfc_fs["pi_normalized"] >= pi_mean, "feature"].tolist()
print("Total=", len(rfc_fs))
print("Retained=", len(cols_rfc_pi))
print("Rejected=", len(rfc_fs) - len(cols_rfc_pi))

In [None]:
# Best result was achieved at AND operation
# Features selected from all methods combined, where threshold is crossed in all 3 methods: AND.

# Build the intersection as an ordered list (following cols_rfc_fi) rather than
# list(set & set & set): set iteration order for strings can change between
# kernel sessions, which would reorder the columns fed to the models below.
_in_si_and_pi = set(cols_rfc_si) & set(cols_rfc_pi)
cols_selected = [col for col in cols_rfc_fi if col in _in_si_and_pi]

# rfc_fs is the score table built above; rfc_fs_val is never defined in this notebook.
print("Total=", len(rfc_fs))
print("If combined, features retained=", len(cols_selected))
print("If combined, features rejected=", len(rfc_fs) - len(cols_selected))

In [None]:
                                  #RFECV on the data: from feature selection based on Means as thresholds

In [None]:
# Restrict the training and validation tables to the jointly selected features
x_train_red = xtrain_df.loc[:, cols_selected]
x_val_red = xval_df.loc[:, cols_selected]

print(x_train_red.shape, x_val_red.shape, sep="\n")

In [None]:
# RFECV with 20-fold stratified CV, repeated for step sizes 100, 95, ..., 5

for step_size in range(100, 4, -5):
    print("\n*************************************************\nStep-size=", step_size)

    skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=11)
    rfc = RandomForestClassifier(n_estimators=250, class_weight="balanced", n_jobs=-1, random_state=46)

    # Recursive elimination scored by MCC, dropping `step_size` features per iteration
    rfecv = RFECV(estimator=rfc, step=step_size, cv=skf, scoring=mcc_scorer, n_jobs=-1)
    rfecv.fit(x_train_red, ytrain)

    selected_features = x_train_red.columns[rfecv.support_]
    print(f"Optimal number of features: {rfecv.n_features_}")

    # Re-train on the retained columns and report the CCS
    train_and_evaluate(x_train_red[selected_features], ytrain, x_val_red[selected_features], yval)

In [None]:
# Final RFECV run at step-size 30, the setting that gave the best result

skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=11)
rfc = RandomForestClassifier(n_estimators=250, class_weight="balanced", n_jobs=-1, random_state=46)

# Recursive elimination scored by MCC, dropping 30 features per iteration
rfecv = RFECV(estimator=rfc, step=30, cv=skf, scoring=mcc_scorer, n_jobs=-1)
rfecv.fit(x_train_red, ytrain)

selected_features = x_train_red.columns[rfecv.support_]
print(f"Optimal number of features: {rfecv.n_features_}")

# Kept as the cell's last expression so the notebook displays the returned scores
train_and_evaluate(x_train_red[selected_features], ytrain, x_val_red[selected_features], yval)

In [None]:
# Persist the RFECV-selected column names as a one-column table ("Feature").
# selected_features is expected to be the Index produced by the RFECV cell.
selected_features_df = pd.DataFrame({"Feature": list(selected_features)})

# Written without the row index
selected_features_df.to_csv(f"{root}classification/rfecv_cols_df.csv", index=False)

In [None]:
                    #Save the xtrain, xval and xtest datasets restricted to the selected_features columns for subsequent analyses