In [1]:
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score


import time
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

In [2]:
# Input feature table and root directory for all outputs written by this notebook
# (both relative to the working directory the notebook is run from).
XL_PATH = "inputs/radiomicsFeatures.csv"
OUT_DIR = "outputs/backwardSFS"

# Columns that are not candidate features and are excluded from selection.
MASK_FEATS = ["id", "label"]

# Number of repeated train/test splits run per estimator.
NUM_REPEATS = 100

In [3]:
# Read the feature table from the configured CSV path, then preview its first rows.
feats_df = pd.read_csv(filepath_or_buffer=XL_PATH)
feats_df.head(n=5)

Unnamed: 0,id,label,sub_wout_original_glcm_ClusterProminence,adc_original_firstorder_Minimum,sub_wout_original_glszm_LowGrayLevelZoneEmphasis,sub_wout_original_firstorder_Maximum,adc_original_glcm_ClusterShade,sub_wout_original_firstorder_Mean,sub_win_original_glcm_Autocorrelation,adc_original_glszm_LargeAreaLowGrayLevelEmphasis,...,sub_win_original_glszm_ZoneEntropy,t2w_original_glszm_SizeZoneNonUniformityNormalized,t2w_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaHighGrayLevelEmphasis,sub_win_original_glszm_SizeZoneNonUniformityNormalized,sub_wout_original_glszm_SmallAreaHighGrayLevelEmphasis,sub_win_original_glcm_MaximumProbability,sub_win_original_glcm_Imc1,sub_wout_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaLowGrayLevelEmphasis
0,2535039,1,4677862.0,0.0,0.003103,600.0,14835.837461,299.900214,3755.933491,0.010393,...,6.339939,0.28647,10.166389,27423.571919,0.4611,2946.8378,0.034622,-0.041978,10.452108,0.033786
1,2417361,0,4834267.0,0.0,0.001672,600.0,-17634.03485,299.918235,3941.494865,0.058145,...,7.42477,0.350004,11.649157,21732.551407,0.604518,3322.225544,0.002107,-0.109242,11.891117,0.009861
2,2602563,1,5159220.0,0.0,0.0016,600.0,-19736.4305,299.820687,2455.254084,0.019202,...,7.23927,0.350692,10.919838,15567.069802,0.574356,3407.597573,0.004002,-0.194449,11.214368,0.018991
3,2902440,0,3613791.0,0.0,0.002428,600.0,-12881.976888,299.240444,3954.079034,0.576021,...,7.45439,0.380537,11.53,18389.243521,0.566131,3121.573712,0.004134,-0.116415,11.669841,0.007846
4,2921898,0,5773968.0,0.0,0.00172,600.0,2116.811733,299.983523,3793.819336,0.011764,...,6.75517,0.265413,9.504938,245786.779116,0.469149,3175.569089,0.027634,-0.05868,11.459667,0.024444


In [4]:
# Row identifiers (`id` column) and class labels (`label` column) as NumPy arrays;
# the experiment loop splits these into stratified train/test partitions.
pids = feats_df["id"].to_numpy()
labels = feats_df["label"].to_numpy()

### Feature Selection Pipeline

In [5]:
def run_bsfs(estimator, feats_df, mask_feats=None, scoring="roc_auc", cv=5, n_jobs=None):
    """Rank features by repeated one-step backward sequential feature selection.

    Starting from every column of ``feats_df`` that is not listed in
    ``mask_feats``, one feature is eliminated per round: a ``StandardScaler``
    followed by a backward ``SequentialFeatureSelector`` is fitted so that
    ``n - 1`` of the ``n`` remaining features are kept. The feature dropped in
    a round receives rank ``n`` (so the first feature eliminated gets the
    largest rank) and the single feature left at the end receives rank 1.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model handed to ``SequentialFeatureSelector``.
    feats_df : pandas.DataFrame
        Table holding a ``"label"`` column (the target) plus the feature columns.
    mask_feats : list of str, optional
        Column names that are not candidate features. When omitted, the
        notebook-level ``MASK_FEATS`` constant is used.
    scoring : str or callable, default "roc_auc"
        Forwarded to ``SequentialFeatureSelector(scoring=...)``.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``SequentialFeatureSelector(cv=...)``.
    n_jobs : int or None, default None
        Forwarded to ``SequentialFeatureSelector(n_jobs=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"feature"`` and ``"rank"``, one row per candidate feature in
        elimination order. Empty when there are no candidate features.
    """
    if mask_feats is None:
        mask_feats = MASK_FEATS

    is_candidate = ~feats_df.columns.isin(mask_feats)
    features = feats_df.columns[is_candidate].to_numpy()

    n = len(features)
    rank_df = {"feature": [], "rank": []}

    # Nothing to rank: return an empty table instead of failing on features[0] below.
    if n == 0:
        return pd.DataFrame(rank_df)

    # The target is the same in every round, so extract it once.
    y = feats_df["label"].to_numpy().ravel()

    # The context manager closes the progress bar even if a fit raises.
    with tqdm(total=n - 1, desc=f"Running Backwards SFS with {estimator.__class__.__name__}", position=0) as pbar:
        while n > 1:
            X = feats_df[list(features)].to_numpy()

            # NOTE(review): the scaler is fitted on all rows of X before the selector
            # runs its internal cross-validation, so scaling statistics are shared
            # across folds. Placing the scaler inside the selector's estimator would
            # avoid that but changes the scores -- confirm before changing.
            sfs_pipeline = make_pipeline(
                StandardScaler(),
                SFS(
                    estimator,
                    n_features_to_select=n - 1,
                    direction="backward",
                    scoring=scoring,
                    cv=cv,
                    n_jobs=n_jobs,
                ),
            )
            sfs_pipeline.fit(X, y)

            support = sfs_pipeline["sequentialfeatureselector"].support_

            # Keeping n - 1 of n features leaves exactly one unselected feature.
            rank_df["feature"].append(features[~support][0])
            rank_df["rank"].append(n)

            features = features[support]
            n -= 1

            pbar.update()

    # The sole survivor is the best-ranked feature (n == 1 here).
    rank_df["feature"].append(features[0])
    rank_df["rank"].append(n)

    return pd.DataFrame(rank_df)

In [None]:
# Fixed seed for the stochastic learners so that re-running the notebook
# reproduces the same rankings for a given outer split.
ESTIMATOR_SEED = 0

estimators = [
    LogisticRegression(penalty=None, max_iter=10_000),
    SVC(kernel="linear", max_iter=10_000, probability=True, random_state=ESTIMATOR_SEED),
    RandomForestClassifier(random_state=ESTIMATOR_SEED),
    MLPClassifier(max_iter=10_000, random_state=ESTIMATOR_SEED),
]

RESULTS_COLUMNS = ["outer_seed", "estimator", "exe_time"]
results_path = os.path.join(OUT_DIR, "results_df.csv")
os.makedirs(OUT_DIR, exist_ok=True)

# One record per (estimator, repeat); turned into a DataFrame when written out.
timing_records = []
for estimator in estimators:

    estimator_name = type(estimator).__name__

    # One sub-directory of ranking files per estimator class.
    out_dir = os.path.join(OUT_DIR, estimator_name)
    os.makedirs(out_dir, exist_ok=True)

    for i in range(NUM_REPEATS):

        # The repeat index doubles as the split seed, so split i is reproducible.
        train_pids, test_pids, train_labels, test_labels = train_test_split(pids, labels, test_size=0.25, random_state=i, stratify=labels)

        print(f"Running for repeat#- {i+1}")
        print("-"*50)

        # Feature selection only ever sees the training partition of this split.
        train_feats_df = feats_df[feats_df["id"].isin(train_pids)]

        # perf_counter is monotonic, so the elapsed time is immune to clock adjustments.
        start_time = time.perf_counter()
        rank_df = run_bsfs(estimator, train_feats_df)
        exe_time = time.perf_counter() - start_time

        rank_df.to_csv(os.path.join(out_dir, f"rank_df{i}.csv"), index=False)

        timing_records.append({"outer_seed": i, "estimator": estimator_name, "exe_time": exe_time})

        # Checkpoint the timing table after every repeat: the full experiment is
        # long-running and an interrupted run would otherwise lose all timings.
        pd.DataFrame(timing_records, columns=RESULTS_COLUMNS).to_csv(results_path)

results_df = pd.DataFrame(timing_records, columns=RESULTS_COLUMNS)
results_df.to_csv(results_path)

Running for repeat#- 1
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:06<00:00,  1.44s/it]


Running for repeat#- 2
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:04<00:00,  1.42s/it]


Running for repeat#- 3
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:06<00:00,  1.44s/it]


Running for repeat#- 4
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:08<00:00,  1.46s/it]


Running for repeat#- 5
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:02<00:00,  1.39s/it]


Running for repeat#- 6
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:05<00:00,  1.43s/it]


Running for repeat#- 7
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:06<00:00,  1.44s/it]


Running for repeat#- 8
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:08<00:00,  1.47s/it]


Running for repeat#- 9
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:12<00:00,  1.51s/it]


Running for repeat#- 10
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:10<00:00,  1.48s/it]


Running for repeat#- 11
--------------------------------------------------


Running Backwards SFS with LogisticRegression: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:07<00:00,  1.45s/it]


Running for repeat#- 12
--------------------------------------------------


Running Backwards SFS with LogisticRegression:  34%|██████████████████████████████████████▊                                                                           | 30/88 [01:11<01:54,  1.97s/it]