In [14]:
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score


import pandas as pd
import numpy as np
import os

from tqdm import tqdm

In [2]:
# Columns of the feature table that are identifiers/targets, never model inputs.
MASK_FEATS = ["id", "label"]

# Input feature table and the directory that receives all outputs of this notebook.
XL_PATH = "inputs/radiomicsFeatures.csv"
OUT_DIR = "outputs/backwardSFS"

In [3]:
# Load the feature table from the CSV configured in XL_PATH.
feats_df = pd.read_csv(XL_PATH)

# Preview the first five rows.
feats_df.head(5)

Unnamed: 0,id,label,sub_wout_original_glcm_ClusterProminence,adc_original_firstorder_Minimum,sub_wout_original_glszm_LowGrayLevelZoneEmphasis,sub_wout_original_firstorder_Maximum,adc_original_glcm_ClusterShade,sub_wout_original_firstorder_Mean,sub_win_original_glcm_Autocorrelation,adc_original_glszm_LargeAreaLowGrayLevelEmphasis,...,sub_win_original_glszm_ZoneEntropy,t2w_original_glszm_SizeZoneNonUniformityNormalized,t2w_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaHighGrayLevelEmphasis,sub_win_original_glszm_SizeZoneNonUniformityNormalized,sub_wout_original_glszm_SmallAreaHighGrayLevelEmphasis,sub_win_original_glcm_MaximumProbability,sub_win_original_glcm_Imc1,sub_wout_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaLowGrayLevelEmphasis
0,2535039,1,4677862.0,0.0,0.003103,600.0,14835.837461,299.900214,3755.933491,0.010393,...,6.339939,0.28647,10.166389,27423.571919,0.4611,2946.8378,0.034622,-0.041978,10.452108,0.033786
1,2417361,0,4834267.0,0.0,0.001672,600.0,-17634.03485,299.918235,3941.494865,0.058145,...,7.42477,0.350004,11.649157,21732.551407,0.604518,3322.225544,0.002107,-0.109242,11.891117,0.009861
2,2602563,1,5159220.0,0.0,0.0016,600.0,-19736.4305,299.820687,2455.254084,0.019202,...,7.23927,0.350692,10.919838,15567.069802,0.574356,3407.597573,0.004002,-0.194449,11.214368,0.018991
3,2902440,0,3613791.0,0.0,0.002428,600.0,-12881.976888,299.240444,3954.079034,0.576021,...,7.45439,0.380537,11.53,18389.243521,0.566131,3121.573712,0.004134,-0.116415,11.669841,0.007846
4,2921898,0,5773968.0,0.0,0.00172,600.0,2116.811733,299.983523,3793.819336,0.011764,...,6.75517,0.265413,9.504938,245786.779116,0.469149,3175.569089,0.027634,-0.05868,11.459667,0.024444


### Stratified CV Fold Generation (Consistent Across Feature-Selection Algorithms)

In [4]:
# Row-aligned arrays of sample ids and class labels used to build the CV folds.
pids = feats_df["id"].to_numpy()
labels = feats_df["label"].to_numpy()

# Folds are stored as arrays of ids and mapped back to rows with `isin`, so a
# repeated id would put the same rows in both the train and validation split.
assert len(np.unique(pids)) == len(pids), "duplicate ids in feats_df: fold membership by id would be ambiguous"

In [5]:
cv_count = 5

# Shuffled, seeded stratified splitter so the same folds are produced on every run.
skf = StratifiedKFold(n_splits=cv_count, shuffle=True, random_state=0)

# Map fold number -> ids of the samples in its train / validation partition.
cv_dict = {
    fold_no: {"train": pids[train_rows], "val": pids[val_rows]}
    for fold_no, (train_rows, val_rows) in enumerate(skf.split(pids, labels))
}

cv_dict

{0: {'train': array([2602563, 2921898, 3039346, 3110297, 3110706, 3137563, 3207798,
         3213683, 3222346, 3226033, 3303911, 3325442, 3327697, 3329611,
         3336537, 3405013, 3416781, 3419338, 3502691, 3504033, 3513664,
         3519247, 3522629, 3534419, 3536230, 3607842, 3610014, 3613524,
         3616819, 3618480, 3621681, 3621824, 3622974, 3631910, 3632788,
         3701079, 3702147, 3707565, 3713983, 3714280, 3715560, 3716356,
         3718385, 3720950, 3724846, 3725583, 3726460, 3727030, 3727850,
         3729691, 3730269, 3800022, 3802504, 3808093, 3811134, 3811967,
         3812057, 3815317, 3817381, 3819464, 3821188, 3821859, 3822353,
         3823428, 3825318, 3827579, 3828403, 3901619, 3904119, 3904751,
         3906071, 3906505, 3907211, 3907314, 3907344, 3908895, 3911843,
         9534972, 9803775, 9816715]),
  'val': array([2535039, 2417361, 2902440, 3310301, 3332798, 3534604, 3605303,
         3621917, 3702859, 3703425, 3712766, 3728041, 3805884, 3811851,
       

### Feature Selection Pipeline

In [6]:
def run_bsfs(estimator, feats_df, cv=5, scoring="roc_auc"):
    """Rank features by greedy backward sequential feature selection.

    Candidate features are all columns of ``feats_df`` that are not listed in
    the module-level ``MASK_FEATS``. In each round the remaining ``n`` features
    are standardised and a ``SequentialFeatureSelector`` is asked to keep
    ``n - 1`` of them; the single feature it drops is recorded with rank ``n``.
    The last surviving feature is recorded with rank 1.

    Parameters
    ----------
    estimator : estimator object
        Handed unchanged to the selector; only its class name is read here
        (for the progress-bar label).
    feats_df : pandas.DataFrame
        Must contain a ``label`` column (used as the target) and at least one
        column that is not in ``MASK_FEATS``.
    cv : int or CV splitter, default 5
        Forwarded to ``SequentialFeatureSelector(cv=...)``.
    scoring : str or callable, default "roc_auc"
        Forwarded to ``SequentialFeatureSelector(scoring=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``rank``, one row per candidate feature, in
        elimination order (first eliminated first, survivor last).

    Raises
    ------
    ValueError
        If ``feats_df`` has no columns other than those in ``MASK_FEATS``.
    """
    feature_names = feats_df.columns[~feats_df.columns.isin(MASK_FEATS)].to_list()
    if not feature_names:
        raise ValueError(
            f"feats_df has no feature columns besides the masked ones: {MASK_FEATS}"
        )

    features = np.array(feature_names)
    # The target does not change between rounds, so extract it once.
    y = feats_df["label"].to_numpy().ravel()

    rank_df = {"feature": [], "rank": []}

    # One round per elimination: n, n-1, ..., 2 remaining features. Iterating
    # through tqdm lets it advance and close the bar by itself.
    rounds = tqdm(
        range(len(features), 1, -1),
        desc=f"Running Backwards SFS with {estimator.__class__.__name__}",
        position=0,
    )
    for n in rounds:
        X = feats_df[features].to_numpy()

        # NOTE(review): the scaler is fitted on every row passed in before the
        # selector runs its own internal CV, so the inner folds share scaling
        # statistics. Kept as-is to preserve the existing rankings.
        sfs_pipeline = make_pipeline(
            StandardScaler(),
            SFS(estimator, n_features_to_select=n - 1, direction='backward', scoring=scoring, cv=cv),
        )
        sfs_pipeline.fit(X, y)

        # Boolean mask over `features`: True for the n-1 features that were kept.
        kept = sfs_pipeline['sequentialfeatureselector'].support_

        rank_df["feature"].append(features[~kept][0])
        rank_df["rank"].append(n)

        features = features[kept]

    # Exactly one feature is left; it is the top-ranked one.
    rank_df["feature"].append(features[0])
    rank_df["rank"].append(1)

    return pd.DataFrame(rank_df)

In [7]:
# Every stochastic estimator is seeded (same seed as the fold splitter) so the
# feature rankings written below can be regenerated exactly.
estimators = [
    LogisticRegression(penalty=None, max_iter=10_000),
    SVC(kernel="linear", max_iter=10_000, probability=True, random_state=0),
    RandomForestClassifier(random_state=0),
    MLPClassifier(max_iter=10_000, random_state=0),
]

for estimator in estimators:

    # One output directory per estimator class, e.g. <OUT_DIR>/SVC.
    out_dir = os.path.join(OUT_DIR, estimator.__class__.__name__)
    os.makedirs(out_dir, exist_ok=True)

    for fold in cv_dict:

        print(f"Running for fold - {fold}")
        print("-"*50)

        # Rank features using only this fold's training rows.
        fold_feats_df = feats_df[feats_df["id"].isin(cv_dict[fold]["train"])]

        rank_df = run_bsfs(estimator, fold_feats_df)

        rank_df.to_csv(os.path.join(out_dir, f"rank_df{fold}.csv"), index=False)

        
        

Running for fold - 0
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|█████████████████████████████████████████████| 88/88 [01:59<00:00,  1.35s/it]


Running for fold - 1
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|█████████████████████████████████████████████| 88/88 [02:00<00:00,  1.37s/it]


Running for fold - 2
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|█████████████████████████████████████████████| 88/88 [02:03<00:00,  1.40s/it]


Running for fold - 3
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|█████████████████████████████████████████████| 88/88 [01:59<00:00,  1.36s/it]


Running for fold - 4
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|█████████████████████████████████████████████| 88/88 [02:12<00:00,  1.51s/it]


Running for fold - 0
--------------------------------------------------


Running Backwards SFS with SVC: 100%|████████████████████████████████████████████████████████████| 88/88 [01:16<00:00,  1.15it/s]


Running for fold - 1
--------------------------------------------------


Running Backwards SFS with SVC: 100%|████████████████████████████████████████████████████████████| 88/88 [01:17<00:00,  1.13it/s]


Running for fold - 2
--------------------------------------------------


Running Backwards SFS with SVC: 100%|████████████████████████████████████████████████████████████| 88/88 [01:19<00:00,  1.11it/s]


Running for fold - 3
--------------------------------------------------


Running Backwards SFS with SVC: 100%|████████████████████████████████████████████████████████████| 88/88 [01:14<00:00,  1.18it/s]


Running for fold - 4
--------------------------------------------------


Running Backwards SFS with SVC: 100%|████████████████████████████████████████████████████████████| 88/88 [01:19<00:00,  1.11it/s]


Running for fold - 0
--------------------------------------------------


Running Backwards SFS with RandomForestClassifier: 100%|█████████████████████████████████████████| 88/88 [42:41<00:00, 29.11s/it]


Running for fold - 1
--------------------------------------------------


Running Backwards SFS with RandomForestClassifier: 100%|█████████████████████████████████████████| 88/88 [42:52<00:00, 29.23s/it]


Running for fold - 2
--------------------------------------------------


Running Backwards SFS with RandomForestClassifier: 100%|█████████████████████████████████████████| 88/88 [42:53<00:00, 29.25s/it]


Running for fold - 3
--------------------------------------------------


Running Backwards SFS with RandomForestClassifier: 100%|█████████████████████████████████████████| 88/88 [43:06<00:00, 29.39s/it]


Running for fold - 4
--------------------------------------------------


Running Backwards SFS with RandomForestClassifier: 100%|█████████████████████████████████████████| 88/88 [43:01<00:00, 29.33s/it]


Running for fold - 0
--------------------------------------------------


Running Backwards SFS with MLPClassifier: 100%|██████████████████████████████████████████████████| 88/88 [51:13<00:00, 34.93s/it]


Running for fold - 1
--------------------------------------------------


Running Backwards SFS with MLPClassifier: 100%|██████████████████████████████████████████████████| 88/88 [50:08<00:00, 34.19s/it]


Running for fold - 2
--------------------------------------------------


Running Backwards SFS with MLPClassifier: 100%|██████████████████████████████████████████████████| 88/88 [56:12<00:00, 38.32s/it]


Running for fold - 3
--------------------------------------------------


Running Backwards SFS with MLPClassifier: 100%|██████████████████████████████████████████████████| 88/88 [55:30<00:00, 37.85s/it]


Running for fold - 4
--------------------------------------------------


Running Backwards SFS with MLPClassifier: 100%|██████████████████████████████████████████████████| 88/88 [53:51<00:00, 36.72s/it]


### Cross-Validation Performance

In [30]:
# Number of top-ranked features (rank 1..NUM_SELECTED_FEATS) given to each model.
NUM_SELECTED_FEATS = 5

# Every stochastic estimator is seeded (same seed as the fold splitter) so the
# reported cross-validation scores can be regenerated exactly.
estimators = [
    LogisticRegression(penalty=None, max_iter=10_000),
    SVC(kernel="linear", max_iter=10_000, probability=True, random_state=0),
    RandomForestClassifier(random_state=0),
    MLPClassifier(max_iter=10_000, random_state=0),
]

In [31]:
# One record per (estimator, fold); turned into a DataFrame once at the end.
metric_columns = ["estimator", "fold", "roc_auc", "prc_auc"]
fold_records = []

for estimator in estimators:

    estimator_name = estimator.__class__.__name__
    print(f"Evaluating estimator - {estimator_name}")

    # Scaling is re-fitted together with the model on each fold's training rows.
    pipeline = make_pipeline(StandardScaler(), estimator)

    for fold, split in cv_dict.items():

        train_feats_df = feats_df[feats_df["id"].isin(split["train"])]
        test_feats_df = feats_df[feats_df["id"].isin(split["val"])]

        # Ranking produced for this estimator on this fold's training rows.
        rank_path = os.path.join(OUT_DIR, estimator_name, f"rank_df{fold}.csv")
        rank_df = pd.read_csv(rank_path)

        is_top_ranked = rank_df["rank"] <= NUM_SELECTED_FEATS
        selected_features = rank_df[is_top_ranked]["feature"].to_list()

        train_X = train_feats_df[selected_features].to_numpy()
        train_y = train_feats_df["label"].to_numpy().ravel()
        test_X = test_feats_df[selected_features].to_numpy()
        test_y = test_feats_df["label"].to_numpy().ravel()

        pipeline.fit(train_X, train_y)
        # Second column of predict_proba is used as the score for both metrics.
        prob_y = pipeline.predict_proba(test_X)[:, 1]

        fold_records.append(
            {
                "estimator": estimator_name,
                "fold": fold,
                "roc_auc": roc_auc_score(test_y, prob_y),
                "prc_auc": average_precision_score(test_y, prob_y),
            }
        )

performance_df = pd.DataFrame(fold_records, columns=metric_columns)
performance_df.to_csv(os.path.join(OUT_DIR, "performance_df.csv"), index=False)

Evaluating estimator - LogisticRegression
Evaluating estimator - SVC
Evaluating estimator - RandomForestClassifier
Evaluating estimator - MLPClassifier


In [33]:
# Average within each estimator first, then across estimators.
# The `fold` entry of the result is only the mean of the fold numbers, not a metric.
per_estimator_means = performance_df.groupby(by="estimator").mean()
per_estimator_means.mean()

fold       2.000000
roc_auc    0.502488
prc_auc    0.383078
dtype: float64