In [1]:
import pandas as pd
import os
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score

In [10]:
# --- Experiment configuration -------------------------------------------------
# Number of repeated hold-out splits; the repeat index is also used as the split seed
# and as the suffix of the rank_df{i}.csv files read by the evaluation loop.
NUM_REPEATS = 100
# Features whose "rank" value is <= this threshold are kept for model fitting.
NUM_SELECTED_FEATS = 4

# Seed for the stochastic estimators so that Restart & Run All reproduces the same scores.
# (LogisticRegression with the default solver and no penalty needs no seed.)
RANDOM_STATE = 0

# Classifiers evaluated on every selected feature subset.
# NOTE: these instances are reused (re-fitted) across all splits and feature-selection methods.
ESTIMATORS = [
    LogisticRegression(penalty=None, max_iter=10_000),
    SVC(kernel="linear", max_iter=10_000, probability=True, random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    MLPClassifier(max_iter=10_000, random_state=RANDOM_STATE),
]

# Feature-selection methods; each name is a sub-directory of OUT_DIR holding the rank files.
FS_METHODS = [
    "random",
    "oneDSAE",
    "bayesianDSAE",
    "ensembleDSAE",
    "backwardSFS/LogisticRegression",
    "backwardSFS/SVC",
    "backwardSFS/RandomForestClassifier",
    "backwardSFS/MLPClassifier",
]

XL_PATH = r"inputs/radiomicsFeatures.csv"
OUT_DIR = r"outputs"

In [11]:
# Read the feature table from XL_PATH and preview its first rows.
feats_df = pd.read_csv(filepath_or_buffer=XL_PATH)
feats_df.head(n=5)

Unnamed: 0,id,label,sub_wout_original_glcm_ClusterProminence,adc_original_firstorder_Minimum,sub_wout_original_glszm_LowGrayLevelZoneEmphasis,sub_wout_original_firstorder_Maximum,adc_original_glcm_ClusterShade,sub_wout_original_firstorder_Mean,sub_win_original_glcm_Autocorrelation,adc_original_glszm_LargeAreaLowGrayLevelEmphasis,...,sub_win_original_glszm_ZoneEntropy,t2w_original_glszm_SizeZoneNonUniformityNormalized,t2w_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaHighGrayLevelEmphasis,sub_win_original_glszm_SizeZoneNonUniformityNormalized,sub_wout_original_glszm_SmallAreaHighGrayLevelEmphasis,sub_win_original_glcm_MaximumProbability,sub_win_original_glcm_Imc1,sub_wout_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaLowGrayLevelEmphasis
0,2535039,1,4677862.0,0.0,0.003103,600.0,14835.837461,299.900214,3755.933491,0.010393,...,6.339939,0.28647,10.166389,27423.571919,0.4611,2946.8378,0.034622,-0.041978,10.452108,0.033786
1,2417361,0,4834267.0,0.0,0.001672,600.0,-17634.03485,299.918235,3941.494865,0.058145,...,7.42477,0.350004,11.649157,21732.551407,0.604518,3322.225544,0.002107,-0.109242,11.891117,0.009861
2,2602563,1,5159220.0,0.0,0.0016,600.0,-19736.4305,299.820687,2455.254084,0.019202,...,7.23927,0.350692,10.919838,15567.069802,0.574356,3407.597573,0.004002,-0.194449,11.214368,0.018991
3,2902440,0,3613791.0,0.0,0.002428,600.0,-12881.976888,299.240444,3954.079034,0.576021,...,7.45439,0.380537,11.53,18389.243521,0.566131,3121.573712,0.004134,-0.116415,11.669841,0.007846
4,2921898,0,5773968.0,0.0,0.00172,600.0,2116.811733,299.983523,3793.819336,0.011764,...,6.75517,0.265413,9.504938,245786.779116,0.469149,3175.569089,0.027634,-0.05868,11.459667,0.024444


In [12]:
# The evaluation loop selects rows with `feats_df["id"].isin(...)`, so a repeated id
# could end up in both the train and the test partition. Fail loudly instead of leaking.
assert feats_df["id"].is_unique, "duplicate ids in feats_df would leak rows across train/test splits"

# Identifiers and class labels as arrays, used to draw the stratified splits.
pids = feats_df["id"].to_numpy()
labels = feats_df["label"].to_numpy()

In [13]:
# Evaluate every (feature-selection method, estimator) pair on NUM_REPEATS stratified
# 75/25 hold-out splits, scoring ROC-AUC and average precision on the held-out part.
# The result is one row per (split seed, fs_method, estimator), saved to OUT_DIR.
PERFORMANCE_COLUMNS = ["outer_seed", "fs_method", "estimator", "roc_auc", "prc_auc"]
performance_records = []

for i in tqdm(range(NUM_REPEATS), position=0):

    # The repeat index doubles as the split seed and as the suffix of the rank file.
    # NOTE(review): presumably rank_df{i}.csv was computed on the same seed-i training split — confirm.
    train_pids, test_pids, _, _ = train_test_split(pids, labels, test_size=0.25, random_state=i, stratify=labels)

    # These depend only on the split, so build them once per repeat.
    train_feats_df = feats_df[feats_df["id"].isin(train_pids)]
    test_feats_df = feats_df[feats_df["id"].isin(test_pids)]
    train_y = train_feats_df["label"].to_numpy()
    test_y = test_feats_df["label"].to_numpy()

    for fs_method in FS_METHODS:

        # The ranking depends only on (fs_method, split), so read it once per method.
        rank_df = pd.read_csv(os.path.join(OUT_DIR, fs_method, f"rank_df{i}.csv"))
        selected_features = rank_df.loc[rank_df["rank"] <= NUM_SELECTED_FEATS, "feature"].to_list()

        train_X = train_feats_df[selected_features].to_numpy()
        test_X = test_feats_df[selected_features].to_numpy()

        for estimator in ESTIMATORS:

            # The scaler is fitted inside the pipeline, i.e. on the training rows only.
            pipeline = make_pipeline(StandardScaler(), estimator)
            pipeline.fit(train_X, train_y)

            # Probability of the positive class (second column of predict_proba).
            prob_y = pipeline.predict_proba(test_X)[:, 1]

            performance_records.append(
                {
                    "outer_seed": i,
                    "fs_method": fs_method,
                    "estimator": estimator.__class__.__name__,
                    "roc_auc": roc_auc_score(test_y, prob_y),
                    "prc_auc": average_precision_score(test_y, prob_y),
                }
            )


# Explicit columns keep the schema stable even if no record was produced.
performance_df = pd.DataFrame(performance_records, columns=PERFORMANCE_COLUMNS)
performance_df.to_csv(os.path.join(OUT_DIR, "performance_df.csv"), index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [13:47<00:00,  8.27s/it]


In [21]:
# MASK_FEATS = ["id", "label"]
# feats = feats_df.columns[~feats_df.columns.isin(MASK_FEATS)].to_list()

# len(feats)

89

In [22]:
# feats

['sub_wout_original_glcm_ClusterProminence',
 'adc_original_firstorder_Minimum',
 'sub_wout_original_glszm_LowGrayLevelZoneEmphasis',
 'sub_wout_original_firstorder_Maximum',
 'adc_original_glcm_ClusterShade',
 'sub_wout_original_firstorder_Mean',
 'sub_win_original_glcm_Autocorrelation',
 'adc_original_glszm_LargeAreaLowGrayLevelEmphasis',
 'sub_win_original_glrlm_ShortRunHighGrayLevelEmphasis',
 'sub_wout_original_firstorder_10Percentile',
 'adc_original_firstorder_10Percentile',
 'sub_wout_original_glszm_LargeAreaLowGrayLevelEmphasis',
 'sub_win_original_glcm_ClusterProminence',
 'sub_wout_original_firstorder_Minimum',
 'sub_wout_original_firstorder_Range',
 'sub_wout_original_glcm_ClusterTendency',
 't2w_original_glszm_GrayLevelVariance',
 'sub_wout_original_firstorder_RootMeanSquared',
 'sub_wout_original_glrlm_HighGrayLevelRunEmphasis',
 'sub_wout_original_glcm_JointAverage',
 'sub_win_original_firstorder_Mean',
 'sub_wout_original_firstorder_Variance',
 'sub_wout_original_firsto

In [14]:
# Preview the first ten result rows.
performance_df.head(n=10)

Unnamed: 0,outer_seed,fs_method,estimator,roc_auc,prc_auc
0,0,random,LogisticRegression,0.674603,0.586271
1,0,random,SVC,0.587302,0.529796
2,0,random,RandomForestClassifier,0.666667,0.436985
3,0,random,MLPClassifier,0.595238,0.417934
4,0,oneDSAE,LogisticRegression,0.587302,0.430914
5,0,oneDSAE,SVC,0.309524,0.247613
6,0,oneDSAE,RandomForestClassifier,0.519841,0.49739
7,0,oneDSAE,MLPClassifier,0.47619,0.330474
8,0,bayesianDSAE,LogisticRegression,0.706349,0.717288
9,0,bayesianDSAE,SVC,0.301587,0.272186


In [19]:
# First average over the repeats for each (fs_method, estimator) pair,
# then average those per-pair means over the estimators of each fs_method.
per_pair_mean = performance_df.groupby(by=["fs_method", "estimator"]).mean()
per_pair_mean.groupby(by="fs_method").mean()

Unnamed: 0_level_0,outer_seed,roc_auc,prc_auc
fs_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
backwardSFS/LogisticRegression,49.5,0.5337,0.396037
backwardSFS/MLPClassifier,49.5,0.539782,0.395234
backwardSFS/RandomForestClassifier,49.5,0.53999,0.392295
backwardSFS/SVC,49.5,0.529276,0.393799
bayesianDSAE,49.5,0.528165,0.390728
ensembleDSAE,49.5,0.54377,0.405384
oneDSAE,49.5,0.53369,0.391805
random,49.5,0.544087,0.399354


In [None]:
# performance_df.groupby(by="estimator").mean()