In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import os
import time as time
import copy as copy

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [3]:
XL_PATH = r"curated_df.csv"

In [4]:
feats_df = pd.read_csv(XL_PATH)

feats_df.head()

Unnamed: 0,id,label,sub_wout_original_glcm_ClusterProminence,adc_original_firstorder_Minimum,sub_wout_original_glszm_LowGrayLevelZoneEmphasis,sub_wout_original_firstorder_Maximum,adc_original_glcm_ClusterShade,sub_wout_original_firstorder_Mean,sub_win_original_glcm_Autocorrelation,adc_original_glszm_LargeAreaLowGrayLevelEmphasis,...,sub_win_original_glszm_ZoneEntropy,t2w_original_glszm_SizeZoneNonUniformityNormalized,t2w_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaHighGrayLevelEmphasis,sub_win_original_glszm_SizeZoneNonUniformityNormalized,sub_wout_original_glszm_SmallAreaHighGrayLevelEmphasis,sub_win_original_glcm_MaximumProbability,sub_win_original_glcm_Imc1,sub_wout_original_glcm_JointEntropy,t2w_original_glszm_LargeAreaLowGrayLevelEmphasis
0,2535039,1,4677862.0,0.0,0.003103,600.0,14835.837461,299.900214,3755.933491,0.010393,...,6.339939,0.28647,10.166389,27423.571919,0.4611,2946.8378,0.034622,-0.041978,10.452108,0.033786
1,2417361,0,4834267.0,0.0,0.001672,600.0,-17634.03485,299.918235,3941.494865,0.058145,...,7.42477,0.350004,11.649157,21732.551407,0.604518,3322.225544,0.002107,-0.109242,11.891117,0.009861
2,2602563,1,5159220.0,0.0,0.0016,600.0,-19736.4305,299.820687,2455.254084,0.019202,...,7.23927,0.350692,10.919838,15567.069802,0.574356,3407.597573,0.004002,-0.194449,11.214368,0.018991
3,2902440,0,3613791.0,0.0,0.002428,600.0,-12881.976888,299.240444,3954.079034,0.576021,...,7.45439,0.380537,11.53,18389.243521,0.566131,3121.573712,0.004134,-0.116415,11.669841,0.007846
4,2921898,0,5773968.0,0.0,0.00172,600.0,2116.811733,299.983523,3793.819336,0.011764,...,6.75517,0.265413,9.504938,245786.779116,0.469149,3175.569089,0.027634,-0.05868,11.459667,0.024444


### Autoencoder Utilities

##### Dataset

In [5]:
class Dataset(torch.utils.data.Dataset):
    
    def __init__(self, X):
        self.X = X
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, i):
        return self.X[i], self.X[i]

In [6]:
class NoisyDataset(torch.utils.data.Dataset):
    
    def __init__(self, X, p = 0.3):
        self.X = X
        self.p = p
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, i):
        
        x = self.X[i]
        
        noise = nn.Dropout(np.random.uniform(0, self.p))(torch.ones(x.shape))
        
        noisy_x = x * noise
        
        return noisy_x, x

In [7]:
class SyntheticNoisyDataset(torch.utils.data.Dataset):
    
    def __init__(self, X, p = 0.3):
        self.ds = NoisyDataset(X, p)
        
    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, i):
        noisy_x, _ = self.ds[i]
        return noisy_x, noisy_x


##### Architecture

In [8]:
class FC_Block(nn.Module):
    
    def __init__(self, in_feats, hidden_layers, activation_fn = nn.LeakyReLU()):
        
        super(FC_Block, self).__init__()
        
        layers = []
        
        for out_feats in hidden_layers:
            layers += [nn.Linear(in_feats, out_feats), activation_fn]
            in_feats = out_feats
            
        self.block = nn.Sequential(*layers)
        
    def forward(self, x):
        
        x = self.block(x)
        
        return x
        

In [9]:
class Autoencoder(nn.Module):
    
    def __init__(self, input_dim, encoder_layers=[100,50,25], latent_dim=5, activation_fn = nn.LeakyReLU()):
        
        super(Autoencoder, self).__init__()
        
        self.encoder_block = FC_Block(input_dim, encoder_layers, activation_fn)
        
        self.embedding_layer = nn.Sequential(*[nn.Linear(encoder_layers[-1], latent_dim), activation_fn])
        
        decoder_layers = list(reversed(encoder_layers))
        self.decoder_block = FC_Block(latent_dim, decoder_layers, activation_fn)
        self.scores = nn.Linear(decoder_layers[-1], input_dim)
    
    def forward(self, x):
        
        x = self.encoder_block(x)
        h = x = self.embedding_layer(x)
        x = self.decoder_block(x)
        x = self.scores(x)
        
        return x, h
            

##### Training/Validation

In [133]:
class Model:
    
    def __init__(self, net):
        self.net = net
        
    def compile(self, lr, h_lambda, loss_fn, cuda_device_id=0):
        
        self.h_lambda = h_lambda
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr)
        self.loss_fn = loss_fn 
        self.device = torch.device(f"cuda:{cuda_device_id}" if torch.cuda.is_available() else "cpu")
        
        self.net.to(self.device)
        
    def prepare_minibatch(self, mini_batch):
        
        inputs, targets = mini_batch
        
        return inputs.float().to(self.device), targets.float().to(self.device)
        
    def fit(self, dls, num_epochs, verbose=True):
        
        since = time.time()
        
        hist = {'train':{'loss':[]}, 'val':{'loss':[]}}
        
        best_loss = np.inf
        
        for epoch in range(num_epochs):
            
            if verbose:
                
                print('Epoch {}/{}'.format(epoch,num_epochs-1))
                print('-'*10)
                
            for phase in ["train", "val"]:
                
                if phase=="train":
                    self.net.train()
                else:
                    self.net.eval()
                    
                running_loss = 0.0
                
                for mini_batch in dls[phase]:
                    
                    self.optimizer.zero_grad()
                    
                    inputs, targets = self.prepare_minibatch(mini_batch)
                    
                    with torch.set_grad_enabled(phase=="train"):
                        
                        recon_inputs, h = self.net(inputs)
                        
                        loss = self.loss_fn(recon_inputs, targets) + self.h_lambda * h.flatten().abs().sum()
                        
                        if phase=="train":
                            
                            loss.backward()
                            self.optimizer.step()
                            
                        running_loss += loss.item()
                            
                epoch_loss = running_loss/len(dls[phase])
                hist[phase]["loss"].append(epoch_loss)
                
                if verbose:
                    print("{} Loss :{:.4f}".format(phase,epoch_loss))
                    
                if phase == "val":
                    
                    if epoch_loss<best_loss:
                        best_loss = epoch_loss
                        best_model_wts = copy.deepcopy(self.net.state_dict())
                        if verbose:
                            print(f"Checkpoing made at {epoch}")
                        
            if verbose:
                print()
                
            
        time_elapsed = time.time() - since
        
        
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        print('Best val Loss: {:4f}'.format(best_loss)) 

        
        self.net.load_state_dict(best_model_wts)
        
        return self.net.cpu()

In [37]:
def norm_anomaly_split(X, y):
    
    normal_indeces = np.argwhere(y==0).ravel()
    anomaly_indeces = np.argwhere(y==1).ravel()
    
    X_norm = X[normal_indeces]
    X_anomaly = X[anomaly_indeces]

    return X_norm, X_anomaly
    

In [12]:
def visualize_using_tsne(X, y, n_components=2):
    
    X_transformed = TSNE(n_components = n_components, random_state=0).fit_transform(X)
    
    plt.scatter(*zip(*X_transformed[y==1]), marker='o', color='r', s=10, label='Anomalous')
    plt.scatter(*zip(*X_transformed[y==0]), marker='o', color='g', s=10, label='Normal')
    plt.legend()
    plt.show()
    
    
    

### Autoencoder Parameters

In [13]:
# https://github.com/mvaldenegro/UncertaintyML-course-ESSAI-labs
# https://github.com/mvaldenegro/UncertaintyML-course-ESSAI-labs/blob/main/02_eval_uncertainty_calibration.ipynb
# https://atcold.github.io/NYU-DLSP20/en/week01/01-3/

In [14]:
feats = [column for column in feats_df.columns if column not in ["id","label"]]
print(len(feats))

89


In [308]:
scaler = StandardScaler()
X = feats_df[feats].to_numpy()
y = feats_df["label"].to_numpy()

# X = scaler.fit_transform(X)

X_norm, X_anomaly = norm_anomaly_split(X, y)

X_norm = scaler.fit_transform(X_norm) #this works better than the alternative where you standardize the whole X
X_anomaly = scaler.transform(X_anomaly)

X_norm = torch.from_numpy(X_norm).float()
X_anomaly = torch.from_numpy(X_anomaly).float()

_, input_dim = X.shape

torch.manual_seed(0)
idx = torch.randperm(len(X_norm))

X_train = X_norm[idx[:-len(X_anomaly)]]

X_test_norm = X_norm[idx[-len(X_anomaly):]]
X_test_anomaly = X_anomaly

# X_test = torch.concat([X_test_norm, X_test_anomaly])


In [None]:
# visualize_using_tsne(X,y)

In [309]:
num_epochs = 10_000
batch_size = 32
loss_fn = nn.MSELoss()

lr = 1e-3

latent_dim = 5

activation_fn = nn.LeakyReLU()

##### 1. Single Standard Autoencoder

In [288]:
train_ds = Dataset(X_train)
val_ds = Dataset(X_train)
dls = {"train":torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True),"val":torch.utils.data.DataLoader(val_ds, batch_size=batch_size)}

h_lambda = 0.0 #disabling l1 sparsity constraint
encoder_layers = [50, 25, 10] #under-complete hidden layers
standard_ae = Autoencoder(input_dim, encoder_layers=encoder_layers, latent_dim=latent_dim, activation_fn = activation_fn)

model = Model(standard_ae)
model.compile(lr, h_lambda, loss_fn)
_ = model.fit(dls, num_epochs, verbose=False)

Training complete in 0m 43s
Best val Loss: 0.000004


In [289]:
recon_X_test_norm, h = model.net(X_test_norm)
recon_X_test_anomaly, h = model.net(X_test_anomaly)

normal_mse = nn.MSELoss(reduction="none")(recon_X_test_norm, X_test_norm).mean(axis=0).detach().numpy()
anomaly_mse = nn.MSELoss(reduction="none")(recon_X_test_anomaly, X_test_anomaly).mean(axis=0).detach().numpy()

delta = anomaly_mse - normal_mse
rank = len(delta) - (delta.argsort().argsort() + 1) + 1

print(normal_mse.mean(), anomaly_mse.mean())


1.0178026 1.083022


In [None]:
#2. Mixup augmentation

##### 2. Single Sparse Autoencoder

In [310]:
train_ds = Dataset(X_train)
val_ds = Dataset(X_train)
dls = {"train":torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True),"val":torch.utils.data.DataLoader(val_ds, batch_size=batch_size)}

h_lambda = 1e-2 #with l1 regularization
encoder_layers = [50, 25, 10] #under-complete hidden layers
sparse_ae = Autoencoder(input_dim, encoder_layers=encoder_layers, latent_dim=latent_dim, activation_fn = activation_fn)

model = Model(sparse_ae)
model.compile(lr, h_lambda, loss_fn)
_ = model.fit(dls, num_epochs, verbose=False)

Training complete in 0m 58s
Best val Loss: 1.155962


In [311]:
recon_X_test_norm, h = model.net(X_test_norm)
recon_X_test_anomaly, h = model.net(X_test_anomaly)

normal_mse = nn.MSELoss(reduction="none")(recon_X_test_norm, X_test_norm).mean(axis=0).detach().numpy()
anomaly_mse = nn.MSELoss(reduction="none")(recon_X_test_anomaly, X_test_anomaly).mean(axis=0).detach().numpy()

delta = anomaly_mse - normal_mse
rank = len(delta) - (delta.argsort().argsort() + 1) + 1

print(normal_mse.mean(), anomaly_mse.mean())


0.86655784 1.0333871


In [313]:
rank.argmin(), rank.argmax()

(53, 3)

##### 3. Ensemble Sparse Autoencoders 

In [335]:
ensemble_count = 100

num_epochs = 10_000
batch_size = 32
loss_fn = nn.MSELoss()

lr = 1e-3

latent_dim = 5

activation_fn = nn.LeakyReLU()

h_lambda = 1e-2 #with l1 regularization
encoder_layers = [50, 25, 10] #under-complete hidden layers
sparse_ae = Autoencoder(input_dim, encoder_layers=encoder_layers, latent_dim=latent_dim, activation_fn = activation_fn)

results_df = {**{"ae_id":[], "mse_mean":[]}, **{"mse_"+f:[] for f in feats}, **{"label":[]}}

In [None]:
for ae_id in range(ensemble_count):
    
    print("*"*50)
    print(f"Autoencoder ID: {ae_id+1}")

    scaler = StandardScaler()
    X = feats_df[feats].to_numpy()
    y = feats_df["label"].to_numpy()

    # X = scaler.fit_transform(X)

    X_norm, X_anomaly = norm_anomaly_split(X, y)

    X_norm = scaler.fit_transform(X_norm) #this works better than the alternative where you standardize the whole X
    X_anomaly = scaler.transform(X_anomaly)

    X_norm = torch.from_numpy(X_norm).float()
    X_anomaly = torch.from_numpy(X_anomaly).float()

    _, input_dim = X.shape

    # torch.manual_seed(0) #this is where each ensemble run essentially differ, 
    idx = torch.randperm(len(X_norm))

    X_train = X_norm[idx[:-len(X_anomaly)]]

    X_test_norm = X_norm[idx[-len(X_anomaly):]]
    X_test_anomaly = X_anomaly

    
    train_ds = Dataset(X_train)
    val_ds = Dataset(X_train)
    dls = {"train":torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True),"val":torch.utils.data.DataLoader(val_ds, batch_size=batch_size)}

    model = Model(sparse_ae)
    model.compile(lr, h_lambda, loss_fn)
    _ = model.fit(dls, num_epochs, verbose=False)
    
    recon_X_test_norm, h = model.net(X_test_norm)
    recon_X_test_anomaly, h = model.net(X_test_anomaly)

    normal_mse = nn.MSELoss(reduction="none")(recon_X_test_norm, X_test_norm).mean(axis=0).detach().numpy()
    anomaly_mse = nn.MSELoss(reduction="none")(recon_X_test_anomaly, X_test_anomaly).mean(axis=0).detach().numpy()
    
    
    results_df["ae_id"].append(ae_id+1)
    results_df["mse_mean"].append(normal_mse.mean())
    for f, f_normal_mse in zip(feats, normal_mse):
        results_df["mse_"+f].append(f_normal_mse)
    results_df["label"].append(0)
        
    results_df["ae_id"].append(ae_id+1)
    results_df["mse_mean"].append(anomaly_mse.mean())
    for f, f_anomaly_mse in zip(feats, anomaly_mse):
        results_df["mse_"+f].append(f_anomaly_mse)
    results_df["label"].append(1)
        
    print("normal_mse=", normal_mse.mean(), "anomaly_mse=", anomaly_mse.mean())
    
results_df = pd.DataFrame(results_df)

**************************************************
Autoencoder ID: 1
Training complete in 0m 43s
Best val Loss: 0.193056
normal_mse= 1.1711873 anomaly_mse= 1.3523108
**************************************************
Autoencoder ID: 2
Training complete in 0m 43s
Best val Loss: 0.142379
normal_mse= 11.432494 anomaly_mse= 1.3132628
**************************************************
Autoencoder ID: 3
Training complete in 0m 43s
Best val Loss: 0.113647
normal_mse= 1.1749511 anomaly_mse= 1.7340851
**************************************************
Autoencoder ID: 4


In [None]:
results_df.to_csv("results_df.csv", index=False)