In [5]:
import os
import pandas as pd
import numpy as np

In [15]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score

In [6]:
# Columns that are not candidate features; passed to the feature selector as
# `mask_feats` so they are left out of the ranking.
MASK_FEATS = ["id", "label"]
# CSV file loaded into `feats_df`.
XL_PATH = r"inputs/radiomicsFeatures.csv"
# Directory that receives the per-fold rank CSVs and the performance table.
OUT_DIR = r"outputs/random"

In [None]:
# Load the feature table. Later cells read its "id" and "label" columns and
# treat every column not listed in MASK_FEATS as a candidate feature.
feats_df = pd.read_csv(XL_PATH)

### Stratified CV Fold Generation (consistent across feature-selection algorithms)

In [16]:
# Pull the id and label columns out as NumPy arrays for fold generation.
pids = feats_df["id"].to_numpy()
labels = feats_df["label"].to_numpy()

In [17]:
cv_count = 5

# Shuffled, label-stratified splitter with a fixed random_state so the same
# folds are produced on every run.
skf = StratifiedKFold(n_splits=cv_count, shuffle=True, random_state=0)

# Map fold index -> ids that fall in the train / validation partition.
cv_dict = {
    fold_idx: {"train": pids[train_rows], "val": pids[val_rows]}
    for fold_idx, (train_rows, val_rows) in enumerate(skf.split(pids, labels))
}

cv_dict

{0: {'train': array([2602563, 2921898, 3039346, 3110297, 3110706, 3137563, 3207798,
         3213683, 3222346, 3226033, 3303911, 3325442, 3327697, 3329611,
         3336537, 3405013, 3416781, 3419338, 3502691, 3504033, 3513664,
         3519247, 3522629, 3534419, 3536230, 3607842, 3610014, 3613524,
         3616819, 3618480, 3621681, 3621824, 3622974, 3631910, 3632788,
         3701079, 3702147, 3707565, 3713983, 3714280, 3715560, 3716356,
         3718385, 3720950, 3724846, 3725583, 3726460, 3727030, 3727850,
         3729691, 3730269, 3800022, 3802504, 3808093, 3811134, 3811967,
         3812057, 3815317, 3817381, 3819464, 3821188, 3821859, 3822353,
         3823428, 3825318, 3827579, 3828403, 3901619, 3904119, 3904751,
         3906071, 3906505, 3907211, 3907314, 3907344, 3908895, 3911843,
         9534972, 9803775, 9816715]),
  'val': array([2535039, 2417361, 2902440, 3310301, 3332798, 3534604, 3605303,
         3621917, 3702859, 3703425, 3712766, 3728041, 3805884, 3811851,
       

### Feature Selection Pipeline

In [7]:
class RandomFS:
    """Baseline feature "selector" that assigns every feature a random rank.

    Parameters
    ----------
    seed : int or None, optional
        When given, rankings are drawn from a private
        ``np.random.default_rng(seed)`` generator, so two instances built with
        the same seed produce the same sequence of rankings and NumPy's global
        random state is left untouched. When ``None`` (the default) the
        ranking is drawn from NumPy's global RNG, so it follows whatever
        ``np.random.seed`` call (if any) was made beforehand.
    """

    def __init__(self, seed=None):
        # Keep None as "use the global RNG" so existing no-argument callers
        # behave exactly as before.
        self._rng = None if seed is None else np.random.default_rng(seed)

    def __call__(self, feats_df, mask_feats):
        """Return a random ranking of the non-masked columns of ``feats_df``.

        Parameters
        ----------
        feats_df : pandas.DataFrame
            Table whose column names are the candidate features.
        mask_feats : list-like
            Column names to exclude from the ranking.

        Returns
        -------
        pandas.DataFrame
            Columns ``"feature"`` (column names, in their original order) and
            ``"rank"`` (a random permutation of ``1..n_features``).
        """
        features = feats_df.columns[~feats_df.columns.isin(mask_feats)].to_list()

        # Ranks are 1-based: 1 is the "best" feature.
        ordered_ranks = np.arange(1, len(features) + 1)
        if self._rng is None:
            shuffled = np.random.permutation(ordered_ranks)
        else:
            shuffled = self._rng.permutation(ordered_ranks)

        return pd.DataFrame({"feature": features, "rank": shuffled.tolist()})


In [8]:
# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safe to re-run.
os.makedirs(OUT_DIR, exist_ok=True)

# The random rankings come from NumPy's global RNG. Seed it (0, matching the
# random_state used for the CV split) so that Restart & Run All rewrites the
# same rank files instead of a fresh random draw each time.
np.random.seed(0)

for fold in range(cv_count):
    # One independent random ranking per CV fold, saved as rank_df<fold>.csv.
    rank_df = RandomFS()(feats_df, MASK_FEATS)
    rank_df.to_csv(os.path.join(OUT_DIR, f"rank_df{fold}.csv"), index=False)

### Cross Validation Performance

In [18]:
# Features with rank <= this value are fed to every estimator.
NUM_SELECTED_FEATS = 5

# Seed for the stochastic estimators, matching the random_state=0 used for the
# CV split, so the reported metrics can be reproduced on a fresh kernel.
ESTIMATOR_SEED = 0

estimators = [
    LogisticRegression(penalty=None, max_iter=10_000),
    # probability=True fits an internal cross-validated calibration, which is
    # randomised; hence the random_state.
    SVC(kernel="linear", max_iter=10_000, probability=True, random_state=ESTIMATOR_SEED),
    RandomForestClassifier(random_state=ESTIMATOR_SEED),
    MLPClassifier(max_iter=10_000, random_state=ESTIMATOR_SEED),
]

In [19]:
# One record per (estimator, fold); turned into a DataFrame after the loops.
metric_rows = []

for estimator in estimators:

    print(f"Evaluating estimator - {estimator.__class__.__name__}")

    # Standardise the features before the classifier; the pipeline is re-fit
    # from scratch on every fold.
    pipeline = make_pipeline(StandardScaler(), estimator)

    for fold, split_ids in cv_dict.items():

        # Row masks for this fold, based on the stored ids.
        in_train = feats_df["id"].isin(split_ids["train"])
        in_val = feats_df["id"].isin(split_ids["val"])

        # Feature ranking written earlier for this fold; keep the top ranks.
        rank_df = pd.read_csv(os.path.join(OUT_DIR, f"rank_df{fold}.csv"))
        top_ranked = rank_df["rank"] <= NUM_SELECTED_FEATS
        selected_features = rank_df.loc[top_ranked, "feature"].to_list()

        train_X = feats_df.loc[in_train, selected_features].to_numpy()
        train_y = feats_df.loc[in_train, "label"].to_numpy().ravel()

        test_X = feats_df.loc[in_val, selected_features].to_numpy()
        test_y = feats_df.loc[in_val, "label"].to_numpy().ravel()

        pipeline.fit(train_X, train_y)
        # Second column of predict_proba is used as the score for both metrics.
        prob_y = pipeline.predict_proba(test_X)[:, 1]

        metric_rows.append(
            {
                "estimator": estimator.__class__.__name__,
                "fold": fold,
                "roc_auc": roc_auc_score(test_y, prob_y),
                "prc_auc": average_precision_score(test_y, prob_y),
            }
        )


# Explicit column list keeps the header stable even when there are no rows.
performance_df = pd.DataFrame(metric_rows, columns=["estimator", "fold", "roc_auc", "prc_auc"])
performance_df.to_csv(os.path.join(OUT_DIR, "performance_df.csv"), index=False)

Evaluating estimator - LogisticRegression
Evaluating estimator - SVC
Evaluating estimator - RandomForestClassifier
Evaluating estimator - MLPClassifier


In [21]:
# Average each numeric column within an estimator, then average those
# per-estimator means (every estimator gets equal weight).
# NOTE(review): "fold" is numeric, so it is averaged as well; that entry of the
# result is not a performance metric.
per_estimator_means = performance_df.groupby(by="estimator").mean()
per_estimator_means.mean()

fold       2.000000
roc_auc    0.546881
prc_auc    0.408978
dtype: float64