In [3]:
# IPython shell escape: run `nvidia-smi` to list the GPUs visible to this session and their current usage.
!nvidia-smi

Tue Apr 30 08:08:52 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 460.27.04    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    35W /  70W |   3152MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   74C    P0    33W /  70W |   1709MiB / 15109MiB |     19%      Default |
|       

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score

In [None]:
# Number of top-ranked features kept per fold (rows with rank <= this value).
NUM_SELECTED_FEATS = 5

# Seed shared by every stochastic estimator below so that a fresh
# "Restart Kernel -> Run All" reproduces the same per-fold metrics.
ESTIMATOR_SEED = 42

# Classifiers to compare. LogisticRegression is left unseeded because its
# default solver is deterministic; the other three draw random numbers
# (SVC only for its probability calibration).
estimators = [
    LogisticRegression(penalty=None, max_iter=10_000),
    SVC(kernel="linear", max_iter=10_000, probability=True, random_state=ESTIMATOR_SEED),
    RandomForestClassifier(random_state=ESTIMATOR_SEED),
    MLPClassifier(max_iter=10_000, random_state=ESTIMATOR_SEED),
]

In [None]:
# Evaluate every estimator on every cross-validation fold, using only the
# features whose rank in that fold's rank_df{fold}.csv is <= NUM_SELECTED_FEATS,
# then save one row of (estimator, fold, roc_auc, prc_auc) per combination.

# The per-fold arrays do not depend on the estimator, so build them once here
# instead of re-reading each rank CSV and re-filtering feats_df per estimator.
fold_arrays = {}
for fold in cv_dict:
    fold_split = cv_dict[fold]
    train_feats_df = feats_df[feats_df["id"].isin(fold_split["train"])]
    test_feats_df = feats_df[feats_df["id"].isin(fold_split["val"])]

    rank_df = pd.read_csv(os.path.join(OUT_DIR, f"rank_df{fold}.csv"))
    selected_features = rank_df.loc[rank_df["rank"] <= NUM_SELECTED_FEATS, "feature"].to_list()
    if not selected_features:
        # Fail early with a clear message rather than letting the scaler
        # reject a zero-column array further down.
        raise ValueError(
            f"No features with rank <= {NUM_SELECTED_FEATS} found for fold {fold!r}"
        )

    fold_arrays[fold] = (
        train_feats_df[selected_features].to_numpy(),
        train_feats_df["label"].to_numpy().ravel(),
        test_feats_df[selected_features].to_numpy(),
        test_feats_df["label"].to_numpy().ravel(),
    )

# One record per (estimator, fold); the DataFrame is built once at the end.
performance_records = []

for estimator in estimators:

    print(f"Evaluating estimator - {estimator.__class__.__name__}")

    # Standardise features before the classifier; fit() re-learns both steps
    # from scratch on every fold's training split.
    pipeline = make_pipeline(StandardScaler(), estimator)

    for fold, (train_X, train_y, test_X, test_y) in fold_arrays.items():

        pipeline.fit(train_X, train_y)
        # Column 1 is the probability of the second fitted class; this assumes
        # binary labels with the positive class ordered last -- TODO confirm
        # against the label encoding.
        prob_y = pipeline.predict_proba(test_X)[:, 1]

        roc_auc = roc_auc_score(test_y, prob_y)
        prc_auc = average_precision_score(test_y, prob_y)

        performance_records.append(
            {
                "estimator": estimator.__class__.__name__,
                "fold": fold,
                "roc_auc": roc_auc,
                "prc_auc": prc_auc,
            }
        )


# Explicit columns keep the schema stable even when no rows were produced.
performance_df = pd.DataFrame(
    performance_records, columns=["estimator", "fold", "roc_auc", "prc_auc"]
)
performance_df.to_csv(os.path.join(OUT_DIR, "performance_df.csv"), index=False)

In [None]:
# Mean ROC-AUC and PR-AUC per estimator across folds. The aggregation is
# restricted to the metric columns: "fold" is an identifier, so averaging it is
# meaningless for numeric fold keys and raises a TypeError on pandas >= 2.0
# when the fold keys are strings.
performance_df.groupby(by="estimator")[["roc_auc", "prc_auc"]].mean()

### Statistical Analysis (U test between the 4 stability estimates from conventional methods and the 3 stability estimates from autoencoder variants)