In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import os
import time as time
import copy as copy

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import utils as utils
import similarity_index as similarity_index

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split

import tracemalloc
import GPUtil


In [2]:
# ---- Experiment configuration ----

# Columns of the feature table that are bookkeeping (sample id, class label)
# and therefore excluded from the model inputs.
MASK_FEATS = ["id", "label"]

# Input feature table and the directory that receives all result CSVs.
XL_PATH = "inputs/radiomicsFeatures.csv"
OUT_DIR = "outputs_new/bayesianDSAE"

# Number of outer resampling repeats (each one uses its own split seed).
NUM_REPEATS = 100

# Number of stochastic forward passes performed per trained model.
B = 100

# Index of the CUDA device handed to the model and queried for memory usage.
CUDA_DEVICE_ID = 2

In [3]:
# Read the feature table from disk and preview its first rows.
feats_df = pd.read_csv(filepath_or_buffer=XL_PATH)
feats_df.head(n=5)

Unnamed: 0,id,label,sub_wout_original_glcm_ClusterProminence,adc_original_firstorder_Minimum,sub_wout_original_glszm_LowGrayLevelZoneEmphasis,sub_wout_original_firstorder_Maximum,adc_original_glcm_ClusterShade,sub_wout_original_firstorder_Mean,sub_win_original_glcm_Autocorrelation,adc_original_glszm_LargeAreaLowGrayLevelEmphasis,...,sub_win_original_glszm_ZoneEntropy,t2w_original_glszm_SizeZoneNonUniformityNormalized,t2w_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaHighGrayLevelEmphasis,sub_win_original_glszm_SizeZoneNonUniformityNormalized,sub_wout_original_glszm_SmallAreaHighGrayLevelEmphasis,sub_win_original_glcm_MaximumProbability,sub_win_original_glcm_Imc1,sub_wout_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaLowGrayLevelEmphasis
0,2535039,1,4677862.0,0.0,0.003103,600.0,14835.837461,299.900214,3755.933491,0.010393,...,6.339939,0.28647,10.166389,27423.571919,0.4611,2946.8378,0.034622,-0.041978,10.452108,0.033786
1,2417361,0,4834267.0,0.0,0.001672,600.0,-17634.03485,299.918235,3941.494865,0.058145,...,7.42477,0.350004,11.649157,21732.551407,0.604518,3322.225544,0.002107,-0.109242,11.891117,0.009861
2,2602563,1,5159220.0,0.0,0.0016,600.0,-19736.4305,299.820687,2455.254084,0.019202,...,7.23927,0.350692,10.919838,15567.069802,0.574356,3407.597573,0.004002,-0.194449,11.214368,0.018991
3,2902440,0,3613791.0,0.0,0.002428,600.0,-12881.976888,299.240444,3954.079034,0.576021,...,7.45439,0.380537,11.53,18389.243521,0.566131,3121.573712,0.004134,-0.116415,11.669841,0.007846
4,2921898,0,5773968.0,0.0,0.00172,600.0,2116.811733,299.983523,3793.819336,0.011764,...,6.75517,0.265413,9.504938,245786.779116,0.469149,3175.569089,0.027634,-0.05868,11.459667,0.024444


In [4]:
# Per-row identifiers and class labels as NumPy arrays; they drive the
# stratified train/test splitting performed in the resampling loop.
pids = feats_df["id"].to_numpy()
labels = feats_df["label"].to_numpy()

### Feature Selection Pipeline with Monte Carlo Resampling

In [5]:
# Monte Carlo feature-ranking experiment.
#
# For each of NUM_REPEATS stratified resamplings of the data:
#   1. fit a dropout autoencoder (utils.bayesianAutoencoder) on "normal" rows only,
#   2. run B stochastic forward passes (dropout left on) over a held-out set of
#      normal rows plus all anomaly rows of the training split,
#   3. record per-sample, per-feature squared reconstruction errors,
#   4. rank features by (mean anomaly error - mean normal error) and save the
#      ranking to OUT_DIR/rank_df{i}.csv.
# All per-sample records are finally written to OUT_DIR/results_df.csv.

# Every column that is not an id/label mask column is used as a model input.
feats = feats_df.columns[~feats_df.columns.isin(MASK_FEATS)].to_list()

# Long-format accumulator: one entry per (repeat, stochastic pass, test sample).
results_df = {**{"outer_seed":[], "exe_time":[], "memory":[], "b":[], "re_mean":[]}, **{"re_"+feat:[] for feat in feats}, **{"label":[]}} # {**dict1, **dict2,...} is a way to merge multiple dictionaries

os.makedirs(OUT_DIR, exist_ok=True)

# Immutable hyper-parameters; identical for every repeat, so defined once.
num_epochs = 1_000
batch_size = 32
lr = 1e-3
h_lambda = 1e-2 #with l1 regularization
input_dim = len(feats)
latent_dim = 10
clip_z = 3  # standardized features are clipped to [-clip_z, clip_z]

# Element-wise squared error (no reduction) used to score reconstructions.
elementwise_mse = nn.MSELoss(reduction="none")

for i in range(NUM_REPEATS):

    print(f"Running for repeat#- {i+1}")
    print("-"*50)

    # Offset of this repeat's first record; lets the per-repeat summary look at
    # only the rows added below instead of re-framing everything collected so far.
    repeat_start = len(results_df["label"])

    start_time = time.time()
    tracemalloc.start()

    # Mutable / module objects are created fresh per repeat so that no state can
    # be shared between successive models.
    loss_fn = nn.MSELoss()
    activation_fn = nn.LeakyReLU()
    encoder_layers = [50, 30, 20] #under-complete hidden layers

    # The repeat index doubles as the split seed. Only the training part of the
    # split is used in this cell; test_pids / test_labels are not referenced.
    train_pids, test_pids, train_labels, test_labels = train_test_split(pids, labels, test_size=0.25, random_state=i, stratify=labels)

    train_rows = feats_df[feats_df["id"].isin(train_pids)]
    X = train_rows[feats].to_numpy()
    y = train_rows.label.to_numpy()

    # NOTE(review): presumably label 0 -> normal and label 1 -> anomaly; confirm
    # against utils.norm_anomaly_split.
    X_norm, X_anomaly = utils.norm_anomaly_split(X, y)

    # NOTE: the fixed seed re-seeds NumPy's *global* RNG on every repeat. It is kept
    # as-is so results stay comparable with earlier runs.
    np.random.seed(0)
    idx = np.random.permutation(len(X_norm))

    # Hold out as many normal rows as there are anomaly rows; train on the rest.
    # NOTE: this slicing assumes len(X_anomaly) > 0 (a zero count would leave the
    # training set empty).
    X_train= X_norm[idx[:-len(X_anomaly)]]
    X_test_norm = X_norm[idx[-len(X_anomaly):]]
    X_test_anomaly = X_anomaly

    # Standardize with statistics of the normal training rows only, then clip.
    scaler = StandardScaler()
    X_train = np.clip(scaler.fit_transform(X_train), -clip_z, clip_z)
    X_test_norm = np.clip(scaler.transform(X_test_norm), -clip_z, clip_z)
    X_test_anomaly = np.clip(scaler.transform(X_test_anomaly), -clip_z, clip_z)

    X_train =  torch.from_numpy(X_train).float()
    X_test_norm = torch.from_numpy(X_test_norm).float()
    X_test_anomaly = torch.from_numpy(X_test_anomaly).float()
    X_test = torch.cat([X_test_norm, X_test_anomaly])

    # Test labels follow the concatenation order above: normals (0) then anomalies (1).
    y_test = torch.cat([torch.zeros(len(X_test_norm)), torch.ones(len(X_test_anomaly))])
    row_labels = y_test.tolist()

    # NOTE: the "val" loader wraps the same tensor as the "train" loader, so any
    # validation loss reported by model.fit is measured on the training data.
    train_ds = utils.Dataset(X_train)
    val_ds = utils.Dataset(X_train)
    dls = {"train":torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True),"val":torch.utils.data.DataLoader(val_ds, batch_size=batch_size)}

    bayesian_dsae = utils.bayesianAutoencoder(input_dim, encoder_layers=encoder_layers, latent_dim=latent_dim, activation_fn = activation_fn, dropout_prob=0.5)
    model = utils.Model(bayesian_dsae)
    model.compile(lr, h_lambda, loss_fn, cuda_device_id=CUDA_DEVICE_ID)
    _ = model.fit(dls, num_epochs, verbose=False)

    # NOTE(review): GPUtil's memoryUsed looks like a device-wide figure, so other
    # processes on the same GPU would be included - confirm before interpreting.
    gpu_mem = GPUtil.getGPUs()[CUDA_DEVICE_ID].memoryUsed
    _current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # Wall-clock time of data preparation + training (inference below is excluded).
    exe_time = time.time()-start_time
    # GPU figure plus the peak traced Python allocation converted from bytes to MiB.
    memory_mb = gpu_mem + (peak/2**20)

    model.net.train() #to enable dropout for stochasticity during inference

    for b in range(B):

        # Inference only: no autograd graph is needed for the forward passes.
        with torch.no_grad():
            recon_X_test_norm, h_norm = model.net(X_test_norm)
            recon_X_test_anomaly, h_anomaly = model.net(X_test_anomaly)

            recon_X_test = torch.cat([recon_X_test_norm, recon_X_test_anomaly])
            re_test = elementwise_mse(recon_X_test, X_test)

        # Per-sample mean reconstruction error (one value per test row).
        row_means = [re_row.mean().item() for re_row in re_test]
        n_rows = len(row_means)

        results_df["outer_seed"].extend([i]*n_rows)
        results_df["exe_time"].extend([exe_time]*n_rows)
        results_df["memory"].extend([memory_mb]*n_rows)
        results_df["b"].extend([b]*n_rows)
        results_df["re_mean"].extend(row_means)

        # Column-wise transfer: one tolist() per pass instead of one .item() per cell.
        for feat, feat_errors in zip(feats, re_test.T.tolist()):
            results_df["re_"+feat].extend(feat_errors)

        results_df["label"].extend(row_labels)

        # Summarize this pass from its own rows only (no full-history DataFrame rebuild).
        pass_means = pd.DataFrame({"label":row_labels, "re_mean":row_means}).groupby(by=["label"]).mean()
        normal_mse = pass_means.loc[0].re_mean
        anomaly_mse = pass_means.loc[1].re_mean

        print("b=", b, "normal_mse=",normal_mse, "anomaly_mse=", anomaly_mse, "anomaly_mse>normal_mse=", anomaly_mse>normal_mse)


    # Per-repeat summary over all B passes, built from this repeat's rows only.
    repeat_df = pd.DataFrame({col: vals[repeat_start:] for col, vals in results_df.items()})
    grp_mean_df = repeat_df.groupby(by=["label"]).mean()

    print("normal_mse=",grp_mean_df.loc[0].re_mean, "anomaly_mse=", grp_mean_df.loc[1].re_mean, "anomaly_mse>normal_mse=", grp_mean_df.loc[1].re_mean>grp_mean_df.loc[0].re_mean)

    # Per-feature gap between mean anomaly error and mean normal error.
    grp_mean_df = grp_mean_df[["re_"+feat for feat in feats]]
    delta = grp_mean_df.loc[1] - grp_mean_df.loc[0]

    # Double argsort gives each feature's 0-based ascending position; the arithmetic
    # flips it so that rank 1 is the feature with the largest delta.
    rank = len(delta) - (delta.argsort().argsort() + 1) + 1
    rank_df = pd.DataFrame({"feature":feats, "rank":rank})
    rank_df.to_csv(os.path.join(OUT_DIR, f"rank_df{i}.csv"), index=False)


# Persist every per-sample record collected across all repeats and passes.
results_df = pd.DataFrame(results_df)
results_df.to_csv(os.path.join(OUT_DIR, "results_df.csv"), index=False)

Running for repeat#- 1
--------------------------------------------------
Training complete in 0m 6s
Best val Loss: 0.873144
b= 0 normal_mse= 0.9350703209638596 anomaly_mse= 0.852136371487921 anomaly_mse>normal_mse= False
b= 1 normal_mse= 0.9352096820419485 anomaly_mse= 0.8518519645387476 anomaly_mse>normal_mse= False
b= 2 normal_mse= 0.9354249049316753 anomaly_mse= 0.8519502404061231 anomaly_mse>normal_mse= False
b= 3 normal_mse= 0.9353347705169157 anomaly_mse= 0.8519748672842979 anomaly_mse>normal_mse= False
b= 4 normal_mse= 0.9353783645413138 anomaly_mse= 0.8517469716343012 anomaly_mse>normal_mse= False
b= 5 normal_mse= 0.9351850341666829 anomaly_mse= 0.8513988181948662 anomaly_mse>normal_mse= False
b= 6 normal_mse= 0.9355925741520795 anomaly_mse= 0.8517499613490972 anomaly_mse>normal_mse= False
b= 7 normal_mse= 0.9349590539932251 anomaly_mse= 0.8518849441950972 anomaly_mse>normal_mse= False
b= 8 normal_mse= 0.9354103424332358 anomaly_mse= 0.8518470607020638 anomaly_mse>normal_mse= 