In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import os
import time as time
import copy as copy

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import utils as utils
import similarity_index as similarity_index

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold


In [2]:
# Notebook-wide configuration.
XL_PATH = "radiomicsFeatures.csv"   # CSV file loaded into the feature table
OUT_DIR = "outputs/oneDSAE"         # directory that receives the CSV outputs
MASK_FEATS = ["id", "label"]        # columns excluded from the feature list

In [3]:
# Read the feature table from the CSV at XL_PATH, then preview its first five rows.
feats_df = pd.read_csv(XL_PATH)
feats_df.head(n=5)

Unnamed: 0,id,label,sub_wout_original_glcm_ClusterProminence,adc_original_firstorder_Minimum,sub_wout_original_glszm_LowGrayLevelZoneEmphasis,sub_wout_original_firstorder_Maximum,adc_original_glcm_ClusterShade,sub_wout_original_firstorder_Mean,sub_win_original_glcm_Autocorrelation,adc_original_glszm_LargeAreaLowGrayLevelEmphasis,...,sub_win_original_glszm_ZoneEntropy,t2w_original_glszm_SizeZoneNonUniformityNormalized,t2w_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaHighGrayLevelEmphasis,sub_win_original_glszm_SizeZoneNonUniformityNormalized,sub_wout_original_glszm_SmallAreaHighGrayLevelEmphasis,sub_win_original_glcm_MaximumProbability,sub_win_original_glcm_Imc1,sub_wout_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaLowGrayLevelEmphasis
0,2535039,1,4677862.0,0.0,0.003103,600.0,14835.837461,299.900214,3755.933491,0.010393,...,6.339939,0.28647,10.166389,27423.571919,0.4611,2946.8378,0.034622,-0.041978,10.452108,0.033786
1,2417361,0,4834267.0,0.0,0.001672,600.0,-17634.03485,299.918235,3941.494865,0.058145,...,7.42477,0.350004,11.649157,21732.551407,0.604518,3322.225544,0.002107,-0.109242,11.891117,0.009861
2,2602563,1,5159220.0,0.0,0.0016,600.0,-19736.4305,299.820687,2455.254084,0.019202,...,7.23927,0.350692,10.919838,15567.069802,0.574356,3407.597573,0.004002,-0.194449,11.214368,0.018991
3,2902440,0,3613791.0,0.0,0.002428,600.0,-12881.976888,299.240444,3954.079034,0.576021,...,7.45439,0.380537,11.53,18389.243521,0.566131,3121.573712,0.004134,-0.116415,11.669841,0.007846
4,2921898,0,5773968.0,0.0,0.00172,600.0,2116.811733,299.983523,3793.819336,0.011764,...,6.75517,0.265413,9.504938,245786.779116,0.469149,3175.569089,0.027634,-0.05868,11.459667,0.024444


### Stratified CV Fold Generation

In [4]:
# Pull the "id" and "label" columns out as NumPy arrays; both are fed to the
# stratified splitter when the CV folds are generated.
pids = feats_df["id"].to_numpy()
labels = feats_df["label"].to_numpy()

In [5]:
cv_count = 5

# Shuffled, label-stratified splitter; the fixed random_state keeps the folds
# identical from run to run.
skf = StratifiedKFold(n_splits=cv_count, shuffle=True, random_state=0)

# Fold number -> the entries of `pids` assigned to that fold's "train" and
# "val" partitions.
cv_dict = {
    fold_no: {"train": pids[train_pos], "val": pids[val_pos]}
    for fold_no, (train_pos, val_pos) in enumerate(skf.split(pids, labels))
}

cv_dict

{0: {'train': array([2602563, 2921898, 3039346, 3110297, 3110706, 3137563, 3207798,
         3213683, 3222346, 3226033, 3303911, 3325442, 3327697, 3329611,
         3336537, 3405013, 3416781, 3419338, 3502691, 3504033, 3513664,
         3519247, 3522629, 3534419, 3536230, 3607842, 3610014, 3613524,
         3616819, 3618480, 3621681, 3621824, 3622974, 3631910, 3632788,
         3701079, 3702147, 3707565, 3713983, 3714280, 3715560, 3716356,
         3718385, 3720950, 3724846, 3725583, 3726460, 3727030, 3727850,
         3729691, 3730269, 3800022, 3802504, 3808093, 3811134, 3811967,
         3812057, 3815317, 3817381, 3819464, 3821188, 3821859, 3822353,
         3823428, 3825318, 3827579, 3828403, 3901619, 3904119, 3904751,
         3906071, 3906505, 3907211, 3907314, 3907344, 3908895, 3911843,
         9534972, 9803775, 9816715]),
  'val': array([2535039, 2417361, 2902440, 3310301, 3332798, 3534604, 3605303,
         3621917, 3702859, 3703425, 3712766, 3728041, 3805884, 3811851,
       

### Feature Selection Pipeline

In [6]:
# Per-fold feature ranking with an under-complete autoencoder.
#
# For every CV fold:
#   1. take the fold's training rows and split them with utils.norm_anomaly_split
#      (presumably label 0 -> "normal", label 1 -> "anomaly" -- confirm in utils),
#   2. hold out as many normal rows as there are anomaly rows and train the
#      autoencoder on the remaining normal rows,
#   3. compute the per-feature reconstruction MSE on the held-out normal rows
#      and on the anomaly rows,
#   4. rank features by (anomaly MSE - normal MSE), rank 1 = largest gap,
#      and write the ranking to rank_df<fold>.csv.
# A per-fold / per-group MSE summary is written to results_df.csv at the end.

# Every column except the masked id/label columns is treated as a feature.
feats = [col for col in feats_df.columns if col not in MASK_FEATS]

# Column order of the summary table: fold, overall mean, one column per feature, label.
result_columns = ["fold", "mse_mean"] + ["mse_" + feat for feat in feats] + ["label"]
fold_records = []  # one record per (fold, group); turned into a DataFrame once at the end

os.makedirs(OUT_DIR, exist_ok=True)

# Hyperparameters are identical for every fold, so they are set once up front.
seed = 0
num_epochs = 10_000
batch_size = 32
lr = 1e-3
h_lambda = 1e-2  # with l1 regularization
input_dim = len(feats)
latent_dim = 5
encoder_layers = [50, 25, 10]  # under-complete hidden layers

for fold in cv_dict:

    print(f"Running for fold - {fold}")
    print("-"*50)

    loss_fn = nn.MSELoss()
    activation_fn = nn.LeakyReLU()

    # Filter the fold's training rows once and reuse them for features and labels.
    train_rows = feats_df[feats_df["id"].isin(cv_dict[fold]["train"])]
    X = train_rows[feats].to_numpy()
    y = train_rows["label"].to_numpy()

    X_norm, X_anomaly = utils.norm_anomaly_split(X, y)

    # Features are intentionally left unscaled (original author's note:
    # "better not to use normalization").

    # Hold out as many normal rows as there are anomaly rows. Negative slicing
    # (idx[:-n] / idx[-n:]) silently inverts when n == 0 and leaves nothing to
    # train on when n >= len(X_norm), so use explicit counts and fail loudly.
    n_test = len(X_anomaly)
    n_train = len(X_norm) - n_test
    if n_test == 0 or n_train <= 0:
        raise ValueError(
            f"fold {fold}: cannot build a balanced hold-out from "
            f"{len(X_norm)} normal and {n_test} anomaly rows"
        )

    np.random.seed(seed)
    idx = np.random.permutation(len(X_norm))

    X_train = X_norm[idx[:n_train]]
    X_test_norm = X_norm[idx[n_train:]]
    X_test_anomaly = X_anomaly

    X_train = torch.from_numpy(X_train).float()
    X_test_norm = torch.from_numpy(X_test_norm).float()
    X_test_anomaly = torch.from_numpy(X_test_anomaly).float()

    # NOTE(review): the "val" loader iterates over the training rows, so the
    # reported "Best val Loss" is a training loss -- confirm this is intended.
    train_ds = utils.Dataset(X_train)
    val_ds = utils.Dataset(X_train)
    dls = {
        "train": torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True),
        "val": torch.utils.data.DataLoader(val_ds, batch_size=batch_size),
    }

    # Seed PyTorch so weight initialisation and DataLoader shuffling are
    # reproducible; previously only NumPy was seeded.
    torch.manual_seed(seed)

    # A fresh list is passed per fold in case the constructor modifies it.
    dsae = utils.Autoencoder(input_dim, encoder_layers=list(encoder_layers), latent_dim=latent_dim, activation_fn=activation_fn)
    model = utils.Model(dsae)
    model.compile(lr, h_lambda, loss_fn)
    _ = model.fit(dls, num_epochs, verbose=False)

    # Evaluation only: no autograd graph is needed for the reconstructions.
    # NOTE(review): if the network contains dropout/batch-norm layers it should
    # also be in eval mode here -- confirm what utils.Model.fit leaves it in.
    with torch.no_grad():
        recon_X_test_norm, h_norm = model.net(X_test_norm)
        recon_X_test_anomaly, h_anomaly = model.net(X_test_anomaly)

    # Per-feature MSE: element-wise squared error averaged over the rows.
    elementwise_mse = nn.MSELoss(reduction="none")
    mse = {
        0: elementwise_mse(recon_X_test_norm, X_test_norm).mean(dim=0),
        1: elementwise_mse(recon_X_test_anomaly, X_test_anomaly).mean(dim=0),
    }

    for label, feat_mse in mse.items():
        record = {"fold": fold, "mse_mean": feat_mse.mean().item()}
        record.update({"mse_" + feat: value for feat, value in zip(feats, feat_mse.tolist())})
        record["label"] = label
        fold_records.append(record)

    print("normal_mse=", mse[0].mean().item(), "anomaly_mse=", mse[1].mean().item(), "anomaly_mse>normal_mse=", mse[1].mean().item()>mse[0].mean().item())

    # Rank 1 goes to the feature whose anomaly MSE exceeds its normal MSE the most.
    # argsort().argsort() gives each feature's 0-based ascending position.
    delta = mse[1] - mse[0]
    rank = len(delta) - delta.argsort().argsort()

    rank_df = pd.DataFrame({"feature": feats, "rank": rank.numpy()})

    rank_df.to_csv(os.path.join(OUT_DIR, f"rank_df{fold}.csv"), index=False)


# Build the summary table once from the collected records.
results_df = pd.DataFrame(fold_records, columns=result_columns)
results_df.to_csv(os.path.join(OUT_DIR, "results_df.csv"), index=False)

Running for fold - 0
Training complete in 1m 11s
Best val Loss: 737846448.000000
normal_mse= 16893336576.0 anomaly_mse= 19473541120.0 anomaly_mse>normal_mse= True
Running for fold - 1
Training complete in 1m 10s
Best val Loss: 2010712960.000000
normal_mse= 11176532992.0 anomaly_mse= 13891337216.0 anomaly_mse>normal_mse= True
Running for fold - 2
Training complete in 1m 9s
Best val Loss: 7266706176.000000
normal_mse= 17464766464.0 anomaly_mse= 17497548800.0 anomaly_mse>normal_mse= True
Running for fold - 3
Training complete in 1m 10s
Best val Loss: 4590728800.000000
normal_mse= 14707665920.0 anomaly_mse= 15832340480.0 anomaly_mse>normal_mse= True
Running for fold - 4
Training complete in 0m 43s
Best val Loss: 1027645952.000000
normal_mse= 18043648000.0 anomaly_mse= 13705658368.0 anomaly_mse>normal_mse= False
