In [501]:
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import scipy.sparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import prince

sns.set()
%config InlineBackend.figure_format = 'retina'
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import StratifiedKFold, KFold , train_test_split
from sklearn.neighbors import BallTree, KDTree, LocalOutlierFactor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve
from sklearn import metrics
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.feature_extraction import FeatureHasher
from sklearn.datasets import fetch_openml
from dfencoder import AutoEncoder #for encoding categorical data
#from scipy.spatial.distance import hamming

from torchvision import models
from torchsummary import summary


In [503]:
#Transform train and test data using One Hot encoding
def transform_one_hot_encoding(X_train, X_test):
    #print(X_train.shape, X_test.shape)
    encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')  # Initialize the OneHotEncoder
    encoder.fit(X_train)
    X_train_transformed = encoder.transform(X_train)
    X_test_transformed = encoder.transform(X_test)
    #print(X_train_transformed.shape, X_test_transformed.shape)
    X_train_transformed = pd.DataFrame(X_train_transformed)
    X_test_transformed = pd.DataFrame(X_test_transformed)
    return X_train_transformed, X_test_transformed
    

In [505]:
#Transform train and test data using Multiple Correspondence Analysis
def transform_mca(X_train, X_test):
    #First perform one-hot-encoding
    X_train_ohe, X_test_ohe =  transform_one_hot_encoding(X_train, X_test)
    #print("*",X_train.shape, X_test.shape)
    #print("#",X_train_ohe.shape, X_test_ohe.shape)
    #Perform MCA
    mca = prince.MCA(n_components=X_train.shape[1], n_iter=3, copy=True, check_input=True, engine='sklearn', one_hot=False)
    mca = mca.fit(pd.DataFrame(X_train_ohe))
    X_train_mca=mca.transform(pd.DataFrame(X_train_ohe))
    X_test_mca=mca.transform(pd.DataFrame(X_test_ohe))
    #print("@",X_train_mca.shape, X_test_mca.shape)
    X_train_mca=pd.DataFrame(X_train_mca)
    X_test_mca=pd.DataFrame(X_test_mca)
    return X_train_mca, X_test_mca

In [507]:
#Transform train and test data using Feature Hashing
def transform_feature_hashing(X_train, X_test):
    X_train_hash = X_train.copy()
    X_test_hash = X_test.copy()
    for c in X_train.columns:
        X_train_hash[c]=X_train[c].astype('str') 
        X_test_hash[c]=X_test[c].astype('str') 
    h = FeatureHasher(n_features=int(X_train.shape[1]), input_type='string')
    X_train_f = h.transform(X_train_hash.values).toarray()
    X_test_f = h.transform(X_test_hash.values).toarray()
    #print('dim=', X_train.shape[1],'train-->',X_train_f, '\ntest-->', X_test_f)
    X_train_f=pd.DataFrame(X_train_f)
    
    return X_train_f, X_test_f
    

In [509]:
#Transform data into feature space with euclidean distance
def transform_nearest_neighbor(X_train, X_test, size):
    tree = BallTree(X_train, leaf_size=2, metric='euclidean')
    # find the nearest neighbors of each training sample
    dist_o, ind = tree.query(X_train, k=X_train.shape[0]) 
    #print("\nDistances:", dist_o.shape)  # Output the distances
    dist = dist_o[:,1:size+1]
    # find the nearest neighbors of each test sample
    dist1_o, ind1 = tree.query(X_test, k=X_train.shape[0]) 
    dist1 = dist1_o[:,:size]
    dist = pd.DataFrame(dist)
    dist1 = pd.DataFrame(dist1)
    
    return dist, dist1

In [511]:
#Transform data into feature space with hamming distance
def transform_nearest_neighbor_cat(X_train, X_test, size):
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    f_train = np.empty((0, size)) 
    f_test = np.empty((0, size)) 
    for sample in X_train:
        # Find distances of training samples with all training data points
        distances = np.array([hamming_distance(sample, row) for row in X_train])
        # Find the first nearest neighbor of test sample
        ind = np.argsort(distances)[1:size+1]
        dist = distances[ind]
        #print("Distances:", dist, 'dist-shape', dist.shape,' ind=',ind)  # Output the distances
        f_train = np.vstack([f_train, dist])  
        #print(f_train, f_train.shape)
        
    for sample in X_test:
        # Find distances of testing samples with all training data points
        distances1 = np.array([hamming_distance(sample, row) for row in X_train])
        #print('d=', distances);
        # Find the first nearest neighbor of test sample
        ind1 = np.argsort(distances1)[:size]
        dist1 = distances1[ind1]
        #print("Distances:", dist, ', ind=',ind)  # Output the distances
        f_test = np.vstack([f_test, dist1])
        
    #print(f_test, f_test.shape)
    f_train = pd.DataFrame(f_train)
    f_test = pd.DataFrame(f_test)
    
    return f_train, f_test

In [513]:
#One class NN for categorical data (Canberra Distance)
def ocnn_canberra(X_train_transformed, X_test_transformed):
    tree = BallTree(X_train_transformed, leaf_size=2, metric='canberra')
    # find the first neighbor of test sample
    dist, ind = tree.query(X_test_transformed, k=1) 
    ind = ind.squeeze()  # Flatten if necessary
    ind = ind.tolist()   # Convert to a list if working with a DataFrame
    #print("Distances:", dist.shape);
    #print('test=',X_test_transformed.shape, 'ind=',ind)  # Output the distances
    #finds the first neighbor of this neighbor
    # Properly index into the DataFrame
    X_neighbors = X_train_transformed.iloc[ind]
    dist1, ind1 = tree.query(X_neighbors,k=2)
    #print('test1=', X_train_transformed[ind].shape, 'ind1=',ind1)
    #print("Distances1:", dist1, "Indices1:", ind1)  # Output the distances
    score = dist[:,0]/(dist1[:,1]+sys.float_info.epsilon);
    #print('score-shape=',score.shape)
    return score 
    

In [515]:
#One class NN for categorical data (Euclidean Distance)
def ocnn_euclidean(X_train_transformed, X_test_transformed):
    tree = BallTree(X_train_transformed, leaf_size=2, metric='euclidean')
    # find the first neighbor of test sample
    dist, ind = tree.query(X_test_transformed, k=1) 
    ind = ind.squeeze()  # Flatten if necessary
    ind = ind.tolist()   # Convert to a list if working with a DataFrame
    #print("Distances:", dist.shape, dist, ind.shape, ind)  # Output the distances
    #finds the first neighbor of this neighbor
    # Properly index into the DataFrame
    X_neighbors = X_train_transformed.iloc[ind]
    dist1, ind1 = tree.query(X_neighbors,k=2)
    #print('test1=', X_train_transformed[ind1], 'ind1=',ind1)
    #print("Distances1:", dist1, "Indices1:", ind1)  # Output the distances
    score = dist[:,0]/(dist1[:,1]+sys.float_info.epsilon);
    #print('score-shape=',score.shape)
    return score 


In [703]:
def occ_classifiers (X_train, X_test):       
        #OCNN Euclidean
        score1 = ocnn_euclidean(X_train, X_test)
        #OCC Canberra
        score2 = ocnn_canberra(X_train, X_test)
        # Isolation Forest
        clf = IsolationForest().fit(X_train)
        score3 = clf.decision_function(X_test)
        # Local Outlier Factor
        clf = LocalOutlierFactor(novelty=True).fit(X_train)
        score4 = clf.decision_function(X_test)
        # OSVM
        clf = OneClassSVM().fit(X_train)
        score5 = clf.decision_function(X_test)

        return score1, score2, score3, score4, score5

In [673]:
#OC-KNN without one-hot-encoding and directly computing hamming distance
def cat_ocknn(X_train, X_test, y_test):
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    #print('t=', type(X_train), 'train=', X_train.shape, 'test=', X_test.shape)
    # Find distances of test samples with all training data points
    score_occ_categorical_hamming = np.empty((0, 1)) 
    for sample in X_test:
        #print('sample=',sample, 'len=',len(sample))
        distances = np.array([hamming_distance(sample, row) for row in X_train])
        #print('d=', distances);
        # Find the first nearest neighbor of test sample
        ind = np.argsort(distances)[:1]
        dist = distances[ind]
        #print("Distances:", dist, ', ind=',ind)  # Output the distances
        n_sample = X_train[ind][0]
        #print("n_sample=", n_sample, 'len=',len(n_sample))
        
        
        #finds the first neighbor of this neighbor
        distances1 = np.array([hamming_distance(n_sample, row) for row in X_train])
        #print('d1=', distances1);
        # Find the first nearest neighbor of this nearest neighbor sample
        ind1 = np.argsort(distances1)[:2]
        #print("Distances1 shape:", distances1[ind1].shape, "Distances1:", distances1[ind1], ", ind1:", ind1)  # Output the distances
        dist1 = distances1[ind1[1]]
        #print("Distances1:", dist1, "ind1:", ind1[1])  # Output the distances
        score_occ_categorical_hamming = np.vstack([score_occ_categorical_hamming, dist/(dist1+sys.float_info.epsilon)]);

    return score_occ_categorical_hamming
    
    # fpr, tpr, thresholds = metrics.roc_curve(y_test, -1*score_occ_categorical_hamming, pos_label='e')
    # auc_roc=metrics.auc(fpr, tpr)
    # precision, recall, thresholds = precision_recall_curve(y_test, -1*score_occ_categorical_hamming, pos_label='e')
    # auc_pr=metrics.auc(recall, precision)
    # print('ascore_occ_categorical_hamming= %.4f' % auc_roc, ' auc_pr_score_occ_categorical_hamming= %.4f' % auc_pr)


In [675]:
def performance_metrics_distance (score, y_test,poslabel):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, -1*score, pos_label=poslabel)
    auc_roc=metrics.auc(fpr, tpr)
    precision, recall, thresholds = precision_recall_curve(y_test, -1*score, pos_label=poslabel)
    auc_pr=metrics.auc(recall, precision)
    return auc_roc, auc_pr
    #print('aocc_knn_euclidean= %.4f' % auc_roc, ' auc_pr_occ_knn_euclidean= %.4f' % auc_pr)


In [677]:
def performance_metrics (score, y_test,poslabel):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, score, pos_label=poslabel)
    auc_roc=metrics.auc(fpr, tpr)
    precision, recall, thresholds = precision_recall_curve(y_test, score, pos_label=poslabel)
    auc_pr=metrics.auc(recall, precision)
    return auc_roc, auc_pr
    #print('aocc_knn_euclidean= %.4f' % auc_roc, ' auc_pr_occ_knn_euclidean= %.4f' % auc_pr)


In [679]:
#To calculate hamming distance directly without one-hot-encoding
def hamming_distance(vector1, vector2):
    """Calculate Hamming distance between two categorical vectors."""
    return np.sum(vector1 != vector2) / len(vector1)


In [681]:
# Define a function to round to 4 significant digits
def round_to_4_sf(x):
    return round(x, 4 - int(f'{x:.0f}'.find('.') - 1))

In [683]:
def cat_Autoencoder(X_train, X_test, epochs, size):
    model = AutoEncoder(
    encoder_layers = [size], #model architecture
    decoder_layers = [], #decoder optional - you can create bottlenecks if you like
    activation='relu',
    swap_p=0.2, #noise parameter
    lr = 0.01,
    lr_decay=.99,
    batch_size=32,
    logger='ipynb', #special logging for jupyter notebooks
    verbose=False,
    optimizer='sgd',
    scaler='gauss_rank', #gauss rank scaling forces your numeric features into standard normal distributions
    min_cats=3 #Define cutoff for minority categories, default 10
    )
    model.fit(X_train, epochs=epochs)  
    z_train = model.get_deep_stack_features(X_train)
    z_test = model.get_deep_stack_features(X_test)
    scores = model.get_anomaly_score(X_test)

    #Convert tensors to panda dataframe
    z_train = tensor_to_dataframe(z_train)
    z_test = tensor_to_dataframe(z_test)
    #print('type-score=',type(scores))
    
    return scores, z_train, z_test

In [685]:
def tensor_to_dataframe(tensor):
   # Convert the tensor to a NumPy array
    numpy_array = tensor.detach().numpy()

    # Create a Pandas DataFrame from the NumPy array
    df = pd.DataFrame(numpy_array)

    return df


In [1038]:
#fetch dataset from csv files stored on disk
def fetch_dataset_csv():
    class_label='class'
    df = pd.read_csv("C:\OCC - Categorical Data\\diabetes_data_upload.csv")  

    # Separate features (X) and target variable (y)  
    X = df.iloc[:, :-1]  # All columns except the last one  
    y = df.iloc[:, -1].to_frame()   # Only the last column, preserving column name  
    poslabel='Positive'

    return X,y, poslabel, class_label
    

In [1040]:
def fetch_dataset_openml():
    #https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html
    class_label='binaryClass'
    X,y = fetch_openml(data_id=986, target_column=class_label, return_X_y=True) 
    #X = X.astype(int)  # Ensure it's integer type
    #X = X.replace({0: 'x', 1: 'y'})
    y = y.to_frame() # to retain the class column name
    #y = y.astype(int)  # Ensure it's integer type
    y = y.replace({0: 'x', 1: 'y'})
    poslabel='N'
    #y.value_counts()
    return X,y, poslabel, class_label

In [1042]:
# fetch dataset from uci
def fetch_dataset_uci():
    class_label='class'
    # fetch dataset 
    ucidata = fetch_ucirepo(id=101) 
  
    # data (as pandas dataframes) 
    X = ucidata.data.features 
    y = ucidata.data.targets  
    #X = X.replace({0: 'a', 1: 'b'})
    #y = y.astype(int)  # Ensure it's integer type
    y = y.replace({0: 'x', 1: 'y'})
    poslabel='positive'
    #print(y.value_counts(), '\nposlabel=', poslabel)
    
    return X,y, poslabel, class_label

In [1044]:
def main():
    X, y, poslabel, class_label = fetch_dataset_csv() #read data
    skf = StratifiedKFold(n_splits=10, shuffle=False)
    test_labels = np.empty((0,1))
    epochs = 100

    #Declaring variables 
    score_ae = np.empty((0,1))
    
    score_cat_hamming = np.empty((0, 1)) 
    #score_cat_euclidean = np.empty((0,1))
    score_cat_euclidean_nn = np.empty((0,1))
    #score_cat_canberra = np.empty((0,1))
    score_cat_canberra_nn = np.empty((0,1))
    #score_cat_isof = np.empty((0,1))
    score_cat_isof_nn = np.empty((0,1))
    #score_cat_lof = np.empty((0,1))
    score_cat_lof_nn = np.empty((0,1))
    score_cat_osvm_nn = np.empty((0,1))
    
    score_euclidean = np.empty((0, 1)) 
    score_euclidean_ae = np.empty((0, 1)) 
    score_euclidean_nn = np.empty((0, 1)) 
    score_euclidean_nn_ae = np.empty((0, 1)) 
    score_canberra = np.empty((0, 1)) 
    score_canberra_ae = np.empty((0, 1)) 
    score_canberra_nn = np.empty((0, 1)) 
    score_canberra_nn_ae = np.empty((0, 1)) 
    score_isof = np.empty((0, 1)) 
    score_isof_ae = np.empty((0, 1)) 
    score_isof_nn = np.empty((0, 1)) 
    score_isof_nn_ae = np.empty((0, 1)) 
    score_lof = np.empty((0, 1)) 
    score_lof_ae = np.empty((0, 1)) 
    score_lof_nn = np.empty((0, 1)) 
    score_lof_nn_ae = np.empty((0, 1)) 
    score_osvm = np.empty((0, 1)) 
    score_osvm_ae = np.empty((0, 1)) 
    score_osvm_nn = np.empty((0, 1)) 
    score_osvm_nn_ae = np.empty((0, 1)) 
    
    score_mca_euclidean = np.empty((0, 1)) 
    score_mca_euclidean_nn = np.empty((0, 1)) 
    score_mca_canberra = np.empty((0, 1)) 
    score_mca_canberra_nn = np.empty((0, 1)) 
    score_mca_isof = np.empty((0, 1)) 
    score_mca_isof_nn = np.empty((0, 1)) 
    score_mca_lof = np.empty((0, 1)) 
    score_mca_lof_nn = np.empty((0, 1)) 
    score_mca_osvm = np.empty((0, 1)) 
    score_mca_osvm_nn = np.empty((0, 1)) 
    
    score_fh_euclidean = np.empty((0, 1)) 
    score_fh_euclidean_nn = np.empty((0, 1)) 
    score_fh_canberra = np.empty((0, 1)) 
    score_fh_canberra_nn = np.empty((0, 1)) 
    score_fh_isof = np.empty((0, 1)) 
    score_fh_isof_nn = np.empty((0, 1)) 
    score_fh_lof = np.empty((0, 1)) 
    score_fh_lof_nn = np.empty((0, 1)) 
    score_fh_osvm = np.empty((0, 1)) 
    score_fh_osvm_nn = np.empty((0, 1)) 
    
    auc_roc = []
    auc_roc_cat =[]
    auc_pr = []
    auc_pr_cat = []
        
    i=1
    for train, test in skf.split(X, y):
        print('\nFold =', i)
        y_train = y.iloc[train] #indices of training labels
        #print(y_train.shape, y_train)
        y_test = y.iloc[test] #indices of testing labels
        test_labels = np.vstack([test_labels, y_test])
        #print('tl=',test_labels.shape)
        # Create pos_train to only contain values where y_train is 'e'
        y_train_t=y_train.reset_index(drop=True)
        pos_indices = y_train_t[y_train_t[class_label] == poslabel].index  # Filter y_train for only positive label
        #pos_indices = y_train[y_train['class'] == poslabel].index  # Filter y_train for only positive label
        #print(pos_indices.shape, pos_indices)  # Print the matching indices
        #X_train=X.iloc[y_train.index]
        #print('--before--',X_train.shape)
        X_train = X.iloc[pos_indices]
        #print('\n--after--',X_train.shape)
        X_test = X.iloc[test]
        #print('\n--xtest--',X_test.shape)
        #Randomly choose F features
        size = X_train.shape[1]
        #print('size=', size)
        
        # # OCC without One-Hot-Encoding
        print("\n***Without one-hot-encoding***")
        score = cat_ocknn(X_train, X_test,y_test)
        # # print('score-shape=', score.shape)
        score_cat_hamming = np.vstack([score_cat_hamming, score])

        # #Transform to NN features without One-Hot-Encoding
        print("Without one-hot-encoding - Transform to NN features (Top K Distances)")
        X_train_cat_nn, X_test_cat_nn = transform_nearest_neighbor_cat(X_train, X_test, size)
        # # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(X_train_cat_nn, X_test_cat_nn)
        score_cat_euclidean_nn = np.vstack([score_cat_euclidean_nn, score1.reshape(-1,1)]) 
        score_cat_canberra_nn = np.vstack([score_cat_canberra_nn, score2.reshape(-1,1)]) 
        score_cat_isof_nn = np.vstack([score_cat_isof_nn,score3.reshape(-1,1)])
        score_cat_lof_nn = np.vstack([score_cat_lof_nn, score4.reshape(-1,1)])
        score_cat_osvm_nn = np.vstack([score_cat_osvm_nn, score5.reshape(-1,1)])
        
        #Transform with One-Hot-encoding 
        print("\n***After One-hot-Encoding***")
        X_train_ohe, X_test_ohe = transform_one_hot_encoding(X_train, X_test)
        # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(X_train_ohe, X_test_ohe)
        score_euclidean = np.vstack([score_euclidean, score1.reshape(-1,1)]) 
        score_canberra = np.vstack([score_canberra, score2.reshape(-1,1)]) 
        score_isof = np.vstack([score_isof,score3.reshape(-1,1)])
        score_lof = np.vstack([score_lof, score4.reshape(-1,1)])
        score_osvm = np.vstack([score_osvm, score5.reshape(-1,1)])
        
        print("Autoencoder on latent dimension")
        # Call autoencoder
        score1, z_train, z_test = cat_Autoencoder(X_train_ohe, X_test_ohe, epochs, size)
        #plt.plot(score1)
        score_ae = np.vstack([score_ae, score1.reshape(-1,1)])
        # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(z_train, z_test)
        score_euclidean_ae = np.vstack([score_euclidean_ae, score1.reshape(-1,1)]) 
        score_canberra_ae = np.vstack([score_canberra_ae, score2.reshape(-1,1)]) 
        score_isof_ae = np.vstack([score_isof_ae,score3.reshape(-1,1)])
        score_lof_ae = np.vstack([score_lof_ae, score4.reshape(-1,1)])
        score_osvm_ae = np.vstack([score_osvm_ae, score5.reshape(-1,1)])
        
        #Transform to nearest neighbor feaures
        print("After One-hot-Encoding - Transform to NN features (Top K Distances)")
        X_train_ohe_nn, X_test_ohe_nn = transform_nearest_neighbor(X_train_ohe, X_test_ohe, size)
        # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(X_train_ohe_nn, X_test_ohe_nn)
        score_euclidean_nn = np.vstack([score_euclidean_nn, score1.reshape(-1,1)]) 
        score_canberra_nn = np.vstack([score_canberra_nn, score2.reshape(-1,1)]) 
        score_isof_nn = np.vstack([score_isof_nn,score3.reshape(-1,1)])
        score_lof_nn = np.vstack([score_lof_nn, score4.reshape(-1,1)])
        score_osvm_nn = np.vstack([score_osvm_nn, score5.reshape(-1,1)])        
        
        print("After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)")
        X_train_ohe_nn_ae, X_test_ohe_nn_ae = transform_nearest_neighbor(z_train, z_test, size)
        # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(X_train_ohe_nn_ae, X_test_ohe_nn_ae)
        score_euclidean_nn_ae = np.vstack([score_euclidean_nn_ae, score1.reshape(-1,1)]) 
        score_canberra_nn_ae = np.vstack([score_canberra_nn_ae, score2.reshape(-1,1)]) 
        score_isof_nn_ae = np.vstack([score_isof_nn_ae,score3.reshape(-1,1)])
        score_lof_nn_ae = np.vstack([score_lof_nn_ae, score4.reshape(-1,1)])
        score_osvm_nn_ae = np.vstack([score_osvm_nn_ae, score5.reshape(-1,1)])        

        #Transform with Multiple Correspondence Analysis
        print("\nAfter Multiple Correspondence Analysis")
        X_train_mca, X_test_mca = transform_mca(X_train, X_test)
        # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(X_train_mca, X_test_mca) 
        score_mca_euclidean = np.vstack([score_mca_euclidean, score1.reshape(-1,1)]) 
        score_mca_canberra = np.vstack([score_mca_canberra, score2.reshape(-1,1)]) 
        score_mca_isof = np.vstack([score_mca_isof, score3.reshape(-1,1)]) 
        score_mca_lof = np.vstack([score_mca_lof, score4.reshape(-1,1)]) 
        score_mca_osvm = np.vstack([score_mca_osvm, score5.reshape(-1,1)]) 
        
        #Transform to nearest neighbor feaures
        print("After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)")
        size = X_train.shape[1]
        X_train_mca_nn, X_test_mca_nn = transform_nearest_neighbor(X_train_mca, X_test_mca, size)
        # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(X_train_mca_nn, X_test_mca_nn) 
        score_mca_euclidean_nn = np.vstack([score_mca_euclidean_nn, score1.reshape(-1,1)]) 
        score_mca_canberra_nn = np.vstack([score_mca_canberra_nn, score2.reshape(-1,1)]) 
        score_mca_isof_nn = np.vstack([score_mca_isof_nn, score3.reshape(-1,1)]) 
        score_mca_lof_nn = np.vstack([score_mca_lof_nn, score4.reshape(-1,1)]) 
        score_mca_osvm_nn = np.vstack([score_mca_osvm_nn, score5.reshape(-1,1)]) 
        
        #Transform with Feature Hashing
        print("\nAfter Feature Hashing Transformation")
        X_train_fh, X_test_fh = transform_feature_hashing(X_train, X_test)
        # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(X_train_fh, X_test_fh) 
        score_fh_euclidean = np.vstack([score_fh_euclidean, score1.reshape(-1,1)]) 
        score_fh_canberra = np.vstack([score_fh_canberra, score2.reshape(-1,1)]) 
        score_fh_isof = np.vstack([score_fh_isof, score3.reshape(-1,1)]) 
        score_fh_lof = np.vstack([score_fh_lof, score4.reshape(-1,1)]) 
        score_fh_osvm = np.vstack([score_fh_osvm, score5.reshape(-1,1)]) 

        #Transform to nearest neighbor feaures
        print("After Feature Hashing Transformation - Transform to NN features (Top K Distances)")
        size = X_train.shape[1]
        X_train_fh_nn, X_test_fh_nn = transform_nearest_neighbor(X_train_fh, X_test_fh, size)
        # Call OCC classifiers
        score1, score2, score3, score4, score5 = occ_classifiers(X_train_fh_nn, X_test_fh_nn) 
        score_fh_euclidean_nn = np.vstack([score_fh_euclidean_nn, score1.reshape(-1,1)]) 
        score_fh_canberra_nn = np.vstack([score_fh_canberra_nn, score2.reshape(-1,1)]) 
        score_fh_isof_nn = np.vstack([score_fh_isof_nn, score3.reshape(-1,1)]) 
        score_fh_lof_nn = np.vstack([score_fh_lof_nn, score4.reshape(-1,1)]) 
        score_fh_osvm_nn = np.vstack([score_fh_osvm_nn, score5.reshape(-1,1)]) 

        i=i+1

    print('***AUC ROC and AUC PR***')
    print("For categorical-->")

    auc_roc_cat_hamming, auc_pr_cat_hamming = performance_metrics_distance(score_cat_hamming, test_labels, poslabel)
    
    roc1, pr1 = performance_metrics_distance(score_cat_euclidean_nn, test_labels, poslabel)
    roc2, pr2= performance_metrics_distance(score_cat_canberra_nn, test_labels, poslabel)
    roc3, pr3= performance_metrics(score_cat_isof_nn, test_labels, poslabel)
    roc4, pr4= performance_metrics(score_cat_lof_nn, test_labels, poslabel)
    roc5, pr5= performance_metrics(score_cat_osvm_nn, test_labels, poslabel)
    auc_roc_cat.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr_cat.append([pr1, pr2, pr3, pr4, pr5])

    #print(auc_roc_cat, auc_pr_cat)
    #print(auc_roc_cat_hamming, auc_pr_cat_hamming)
    
    # Print AUC ROC and PR for categorical classifiers
    df_auc_roc_cat = pd.DataFrame(auc_roc_cat)
    df_auc_roc_cat.columns= ['Euclidean', 'Canberra', 'ISOF', 'LOF', 'OSVM']
    df_auc_roc_cat.index= ['CAT-SF']
    df_auc_pr_cat = pd.DataFrame(auc_pr_cat)
    df_auc_pr_cat.columns= ['Euclidean', 'Canberra', 'ISOF', 'LOF', 'OSVM']
    df_auc_pr_cat.index= [ 'CAT-SF']
    #print(f"Data saved to {file_auc_roc_cat}  and {file_auc_pr_cat}")
    print("***auc_roc_cat_hamming=", "{:.4f}".format(auc_roc_cat_hamming), "auc_pr_cat_hamming=", "{:.4f}".format(auc_pr_cat_hamming))
    #Print AUC ROC and PR for numerical classifiers
    print("For numerical-->")
    auc_roc_ae, auc_pr_ae = performance_metrics_distance(score_ae, test_labels, poslabel)
    print('***auc_roc_ae=', "{:.4f}".format(auc_roc_ae), 'auc_pr_ae=', "{:.4f}".format(auc_pr_ae))
    
    roc1, pr1 = performance_metrics_distance(score_euclidean, test_labels, poslabel)
    roc2, pr2 = performance_metrics_distance(score_canberra, test_labels, poslabel)
    roc3, pr3 = performance_metrics(score_isof, test_labels, poslabel)
    roc4, pr4 = performance_metrics(score_lof, test_labels, poslabel)
    roc5, pr5 = performance_metrics(score_osvm, test_labels, poslabel)
    auc_roc.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr.append([pr1, pr2, pr3, pr4, pr5])

    roc1, pr1 = performance_metrics_distance(score_euclidean_nn, test_labels, poslabel)
    roc2, pr2 = performance_metrics_distance(score_canberra_nn, test_labels, poslabel)
    roc3, pr3 = performance_metrics(score_isof_nn, test_labels, poslabel)
    roc4, pr4 = performance_metrics(score_lof_nn, test_labels, poslabel)
    roc5, pr5 = performance_metrics(score_osvm_nn, test_labels, poslabel)
    auc_roc.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr.append([pr1, pr2, pr3, pr4, pr5])

    roc1, pr1 = performance_metrics_distance(score_euclidean_ae, test_labels, poslabel)
    roc2, pr2 = performance_metrics_distance(score_canberra_ae, test_labels, poslabel)
    roc3, pr3 = performance_metrics(score_isof_ae, test_labels, poslabel)
    roc4, pr4 = performance_metrics(score_lof_ae, test_labels, poslabel)
    roc5, pr5 = performance_metrics(score_osvm_ae, test_labels, poslabel)
    auc_roc.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr.append([pr1, pr2, pr3, pr4, pr5])

    roc1, pr1 = performance_metrics_distance(score_euclidean_nn_ae, test_labels, poslabel)
    roc2, pr2 = performance_metrics_distance(score_canberra_nn_ae, test_labels, poslabel)
    roc3, pr3 = performance_metrics(score_isof_nn_ae, test_labels, poslabel)
    roc4, pr4 = performance_metrics(score_lof_nn_ae, test_labels, poslabel)
    roc5, pr5 = performance_metrics(score_osvm_nn_ae, test_labels, poslabel)
    auc_roc.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr.append([pr1, pr2, pr3, pr4, pr5])

    roc1, pr1 = performance_metrics_distance(score_mca_euclidean, test_labels, poslabel)
    roc2, pr2 = performance_metrics_distance(score_mca_canberra, test_labels, poslabel)
    roc3, pr3 = performance_metrics(score_mca_isof, test_labels, poslabel)
    roc4, pr4 = performance_metrics(score_mca_lof, test_labels, poslabel)
    roc5, pr5 = performance_metrics(score_mca_osvm, test_labels, poslabel)
    auc_roc.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr.append([pr1, pr2, pr3, pr4, pr5])

    roc1, pr1 = performance_metrics_distance(score_mca_euclidean_nn, test_labels, poslabel)
    roc2, pr2 = performance_metrics_distance(score_mca_canberra_nn, test_labels, poslabel)
    roc3, pr3 = performance_metrics(score_mca_isof_nn, test_labels, poslabel)
    roc4, pr4 = performance_metrics(score_mca_lof_nn, test_labels, poslabel)
    roc5, pr5 = performance_metrics(score_mca_osvm_nn, test_labels, poslabel)
    auc_roc.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr.append([pr1, pr2, pr3, pr4, pr5])

    roc1, pr1 = performance_metrics_distance(score_fh_euclidean, test_labels, poslabel)
    roc2, pr2 = performance_metrics_distance(score_fh_canberra, test_labels, poslabel)
    roc3, pr3 = performance_metrics(score_fh_isof, test_labels, poslabel)
    roc4, pr4 = performance_metrics(score_fh_lof, test_labels, poslabel)
    roc5, pr5 = performance_metrics(score_fh_osvm, test_labels, poslabel)
    auc_roc.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr.append([pr1, pr2, pr3, pr4, pr5])

    roc1, pr1 = performance_metrics_distance(score_fh_euclidean_nn, test_labels, poslabel)
    roc2, pr2 = performance_metrics_distance(score_fh_canberra_nn, test_labels, poslabel)
    roc3, pr3 = performance_metrics(score_fh_isof_nn, test_labels, poslabel)
    roc4, pr4 = performance_metrics(score_fh_lof_nn, test_labels, poslabel)
    roc5, pr5 = performance_metrics(score_fh_osvm_nn, test_labels, poslabel)
    auc_roc.append([roc1, roc2, roc3, roc4, roc5])
    auc_pr.append([pr1, pr2, pr3, pr4, pr5])

    file_auc_roc = 'output_roc.csv'
    file_auc_pr = 'output_pr.csv'
   
    df_auc_roc = pd.DataFrame(auc_roc)
    df_auc_roc.columns= ['Euclidean', 'Canberra', 'ISOF', 'LOF', 'OSVM']
    df_auc_roc.index= ['OHE', 'OHE-SF', 'OHE-AE', 'OHE-AE-SF', 'OHE-MCA', 'OHE-MCA-SF','CAT-FH', 'CAT-FH-SF']
    df_auc_pr = pd.DataFrame(auc_pr)
    df_auc_pr.columns= ['Euclidean', 'Canberra', 'ISOF', 'LOF', 'OSVM']
    df_auc_pr.index= ['OHE', 'OHE-SF', 'OHE-AE', 'OHE-AE-SF', 'OHE-MCA', 'OHE-MCA-SF','CAT-FH', 'CAT-FH-SF']
    merged_auc_roc = pd.concat([df_auc_roc_cat, df_auc_roc], axis=0)
    merged_auc_pr = pd.concat([df_auc_pr_cat, df_auc_pr], axis=0)
    merged_auc_roc = merged_auc_roc.map(round_to_4_sf)
    merged_auc_pr = merged_auc_pr.map(round_to_4_sf)
    #print(merged_auc_roc)
    #print( merged_auc_pr)
    merged_auc_roc.to_csv(file_auc_roc, index=True)
    merged_auc_pr.to_csv(file_auc_pr, index=True)
    print(f"Data saved to {file_auc_roc}  and {file_auc_pr}")
    #Convert to Latex table format
    # Convert the DataFrame to LaTeX format
    latex_table_roc = merged_auc_roc.to_latex(index=True)
    latex_table_pr = merged_auc_pr.to_latex(index=True)
    # Save the LaTeX table to a text file
    with open('output_roc.tex', 'w') as f:
        f.write(latex_table_roc)
    with open('output_pr.tex', 'w') as f:
        f.write(latex_table_pr)

  

In [1046]:
# Call the main function
if __name__ == "__main__":
    main()


Fold = 1

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 43.34it/s]
100%|██████████| 9/9 [00:00<00:00, 60.11it/s]
100%|██████████| 9/9 [00:00<00:00, 67.83it/s]
100%|██████████| 9/9 [00:00<00:00, 69.82it/s]
100%|██████████| 9/9 [00:00<00:00, 37.17it/s]
100%|██████████| 9/9 [00:00<00:00, 66.13it/s]
100%|██████████| 9/9 [00:00<00:00, 26.89it/s]
100%|██████████| 9/9 [00:00<00:00, 65.15it/s]
100%|██████████| 9/9 [00:00<00:00, 62.63it/s]
100%|██████████| 9/9 [00:00<00:00, 65.64it/s]
100%|██████████| 9/9 [00:00<00:00, 68.46it/s]
100%|██████████| 9/9 [00:00<00:00, 64.42it/s]
100%|██████████| 9/9 [00:00<00:00, 64.42it/s]
100%|██████████| 9/9 [00:00<00:00, 20.55it/s]
100%|██████████| 9/9 [00:00<00:00, 37.76it/s]
100%|██████████| 9/9 [00:00<00:00, 62.46it/s]
100%|██████████| 9/9 [00:00<00:00, 25.76it/s]
100%|██████████| 9/9 [00:00<00:00, 42.15it/s]
100%|██████████| 9/9 [00:00<00:00, 41.71it/s]
100%|██████████| 9/9 [00:00<00:00, 30.14it/s]
100%|██████████| 9/9 [00:00<00:00, 41.86it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 2

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 73.70it/s]
100%|██████████| 9/9 [00:00<00:00, 75.61it/s]
100%|██████████| 9/9 [00:00<00:00, 72.91it/s]
100%|██████████| 9/9 [00:00<00:00, 72.72it/s]
100%|██████████| 9/9 [00:00<00:00, 73.69it/s]
100%|██████████| 9/9 [00:00<00:00, 74.44it/s]
100%|██████████| 9/9 [00:00<00:00, 75.49it/s]
100%|██████████| 9/9 [00:00<00:00, 74.59it/s]
100%|██████████| 9/9 [00:00<00:00, 69.16it/s]
100%|██████████| 9/9 [00:00<00:00, 65.16it/s]
100%|██████████| 9/9 [00:00<00:00, 42.72it/s]
100%|██████████| 9/9 [00:00<00:00, 67.52it/s]
100%|██████████| 9/9 [00:00<00:00, 74.95it/s]
100%|██████████| 9/9 [00:00<00:00, 73.89it/s]
100%|██████████| 9/9 [00:00<00:00, 74.32it/s]
100%|██████████| 9/9 [00:00<00:00, 74.01it/s]
100%|██████████| 9/9 [00:00<00:00, 69.44it/s]
100%|██████████| 9/9 [00:00<00:00, 72.31it/s]
100%|██████████| 9/9 [00:00<00:00, 75.70it/s]
100%|██████████| 9/9 [00:00<00:00, 72.83it/s]
100%|██████████| 9/9 [00:00<00:00, 76.84it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 3

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 69.12it/s]
100%|██████████| 9/9 [00:00<00:00, 66.30it/s]
100%|██████████| 9/9 [00:00<00:00, 64.93it/s]
100%|██████████| 9/9 [00:00<00:00, 72.82it/s]
100%|██████████| 9/9 [00:00<00:00, 74.82it/s]
100%|██████████| 9/9 [00:00<00:00, 74.75it/s]
100%|██████████| 9/9 [00:00<00:00, 74.46it/s]
100%|██████████| 9/9 [00:00<00:00, 74.84it/s]
100%|██████████| 9/9 [00:00<00:00, 72.82it/s]
100%|██████████| 9/9 [00:00<00:00, 74.87it/s]
100%|██████████| 9/9 [00:00<00:00, 73.11it/s]
100%|██████████| 9/9 [00:00<00:00, 75.54it/s]
100%|██████████| 9/9 [00:00<00:00, 69.00it/s]
100%|██████████| 9/9 [00:00<00:00, 68.92it/s]
100%|██████████| 9/9 [00:00<00:00, 70.84it/s]
100%|██████████| 9/9 [00:00<00:00, 73.86it/s]
100%|██████████| 9/9 [00:00<00:00, 69.94it/s]
100%|██████████| 9/9 [00:00<00:00, 67.10it/s]
100%|██████████| 9/9 [00:00<00:00, 68.12it/s]
100%|██████████| 9/9 [00:00<00:00, 67.59it/s]
100%|██████████| 9/9 [00:00<00:00, 70.14it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 4

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 63.88it/s]
100%|██████████| 9/9 [00:00<00:00, 60.45it/s]
100%|██████████| 9/9 [00:00<00:00, 65.28it/s]
100%|██████████| 9/9 [00:00<00:00, 66.58it/s]
100%|██████████| 9/9 [00:00<00:00, 62.92it/s]
100%|██████████| 9/9 [00:00<00:00, 34.07it/s]
100%|██████████| 9/9 [00:00<00:00, 62.66it/s]
100%|██████████| 9/9 [00:00<00:00, 61.09it/s]
100%|██████████| 9/9 [00:00<00:00, 11.33it/s]
100%|██████████| 9/9 [00:00<00:00, 56.27it/s]
100%|██████████| 9/9 [00:00<00:00, 61.04it/s]
100%|██████████| 9/9 [00:00<00:00, 57.45it/s]
100%|██████████| 9/9 [00:00<00:00, 73.45it/s]
100%|██████████| 9/9 [00:00<00:00, 72.79it/s]
100%|██████████| 9/9 [00:00<00:00, 72.53it/s]
100%|██████████| 9/9 [00:00<00:00, 68.78it/s]
100%|██████████| 9/9 [00:00<00:00, 71.69it/s]
100%|██████████| 9/9 [00:00<00:00, 65.13it/s]
100%|██████████| 9/9 [00:00<00:00, 72.18it/s]
100%|██████████| 9/9 [00:00<00:00, 71.28it/s]
100%|██████████| 9/9 [00:00<00:00, 40.61it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 5

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 64.57it/s]
100%|██████████| 9/9 [00:00<00:00, 63.35it/s]
100%|██████████| 9/9 [00:00<00:00, 66.34it/s]
100%|██████████| 9/9 [00:00<00:00, 68.29it/s]
100%|██████████| 9/9 [00:00<00:00, 69.30it/s]
100%|██████████| 9/9 [00:00<00:00, 67.98it/s]
100%|██████████| 9/9 [00:00<00:00, 67.62it/s]
100%|██████████| 9/9 [00:00<00:00, 69.32it/s]
100%|██████████| 9/9 [00:00<00:00, 68.59it/s]
100%|██████████| 9/9 [00:00<00:00, 69.70it/s]
100%|██████████| 9/9 [00:00<00:00, 70.55it/s]
100%|██████████| 9/9 [00:00<00:00, 67.64it/s]
100%|██████████| 9/9 [00:00<00:00, 68.65it/s]
100%|██████████| 9/9 [00:00<00:00, 69.97it/s]
100%|██████████| 9/9 [00:00<00:00, 34.61it/s]
100%|██████████| 9/9 [00:00<00:00, 66.31it/s]
100%|██████████| 9/9 [00:00<00:00, 69.48it/s]
100%|██████████| 9/9 [00:00<00:00, 65.63it/s]
100%|██████████| 9/9 [00:00<00:00, 64.10it/s]
100%|██████████| 9/9 [00:00<00:00, 62.28it/s]
100%|██████████| 9/9 [00:00<00:00, 62.25it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 6

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 59.80it/s]
100%|██████████| 9/9 [00:00<00:00, 51.81it/s]
100%|██████████| 9/9 [00:00<00:00, 65.40it/s]
100%|██████████| 9/9 [00:00<00:00, 66.37it/s]
100%|██████████| 9/9 [00:00<00:00, 62.46it/s]
100%|██████████| 9/9 [00:00<00:00, 57.94it/s]
100%|██████████| 9/9 [00:00<00:00, 58.32it/s]
100%|██████████| 9/9 [00:00<00:00, 67.25it/s]
100%|██████████| 9/9 [00:00<00:00, 68.39it/s]
100%|██████████| 9/9 [00:00<00:00, 66.93it/s]
100%|██████████| 9/9 [00:00<00:00, 66.79it/s]
100%|██████████| 9/9 [00:00<00:00, 66.56it/s]
100%|██████████| 9/9 [00:00<00:00, 68.25it/s]
100%|██████████| 9/9 [00:00<00:00, 65.04it/s]
100%|██████████| 9/9 [00:00<00:00, 63.53it/s]
100%|██████████| 9/9 [00:00<00:00, 19.11it/s]
100%|██████████| 9/9 [00:00<00:00, 66.68it/s]
100%|██████████| 9/9 [00:00<00:00, 69.14it/s]
100%|██████████| 9/9 [00:00<00:00, 64.68it/s]
100%|██████████| 9/9 [00:00<00:00, 65.87it/s]
100%|██████████| 9/9 [00:00<00:00, 67.92it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 7

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 69.42it/s]
100%|██████████| 9/9 [00:00<00:00, 72.57it/s]
100%|██████████| 9/9 [00:00<00:00, 69.48it/s]
100%|██████████| 9/9 [00:00<00:00, 72.18it/s]
100%|██████████| 9/9 [00:00<00:00, 72.80it/s]
100%|██████████| 9/9 [00:00<00:00, 64.66it/s]
100%|██████████| 9/9 [00:00<00:00, 65.00it/s]
100%|██████████| 9/9 [00:00<00:00, 71.27it/s]
100%|██████████| 9/9 [00:00<00:00, 71.65it/s]
100%|██████████| 9/9 [00:00<00:00, 70.58it/s]
100%|██████████| 9/9 [00:00<00:00, 67.70it/s]
100%|██████████| 9/9 [00:00<00:00, 70.56it/s]
100%|██████████| 9/9 [00:00<00:00, 68.16it/s]
100%|██████████| 9/9 [00:00<00:00, 63.35it/s]
100%|██████████| 9/9 [00:00<00:00, 70.48it/s]
100%|██████████| 9/9 [00:00<00:00, 70.50it/s]
100%|██████████| 9/9 [00:00<00:00, 66.57it/s]
100%|██████████| 9/9 [00:00<00:00, 18.06it/s]
100%|██████████| 9/9 [00:00<00:00, 68.03it/s]
100%|██████████| 9/9 [00:00<00:00, 70.54it/s]
100%|██████████| 9/9 [00:00<00:00, 69.36it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 8

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 67.02it/s]
100%|██████████| 9/9 [00:00<00:00, 65.82it/s]
100%|██████████| 9/9 [00:00<00:00, 61.61it/s]
100%|██████████| 9/9 [00:00<00:00, 11.05it/s]
100%|██████████| 9/9 [00:00<00:00, 72.56it/s]
100%|██████████| 9/9 [00:00<00:00, 65.40it/s]
100%|██████████| 9/9 [00:00<00:00, 67.19it/s]
100%|██████████| 9/9 [00:00<00:00, 62.98it/s]
100%|██████████| 9/9 [00:00<00:00, 63.50it/s]
100%|██████████| 9/9 [00:00<00:00, 63.52it/s]
100%|██████████| 9/9 [00:00<00:00, 64.85it/s]
100%|██████████| 9/9 [00:00<00:00, 68.66it/s]
100%|██████████| 9/9 [00:00<00:00, 67.89it/s]
100%|██████████| 9/9 [00:00<00:00, 71.61it/s]
100%|██████████| 9/9 [00:00<00:00, 66.07it/s]
100%|██████████| 9/9 [00:00<00:00, 64.30it/s]
100%|██████████| 9/9 [00:00<00:00, 69.33it/s]
100%|██████████| 9/9 [00:00<00:00, 64.86it/s]
100%|██████████| 9/9 [00:00<00:00, 65.83it/s]
100%|██████████| 9/9 [00:00<00:00, 66.12it/s]
100%|██████████| 9/9 [00:00<00:00, 63.12it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 9

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 57.42it/s]
100%|██████████| 9/9 [00:00<00:00, 23.09it/s]
100%|██████████| 9/9 [00:00<00:00, 46.07it/s]
100%|██████████| 9/9 [00:00<00:00, 66.13it/s]
100%|██████████| 9/9 [00:00<00:00, 72.82it/s]
100%|██████████| 9/9 [00:00<00:00, 70.13it/s]
100%|██████████| 9/9 [00:00<00:00, 72.21it/s]
100%|██████████| 9/9 [00:00<00:00,  9.22it/s]
100%|██████████| 9/9 [00:00<00:00, 72.57it/s]
100%|██████████| 9/9 [00:00<00:00, 72.46it/s]
100%|██████████| 9/9 [00:00<00:00, 73.69it/s]
100%|██████████| 9/9 [00:00<00:00, 53.53it/s]
100%|██████████| 9/9 [00:00<00:00, 65.13it/s]
100%|██████████| 9/9 [00:00<00:00, 68.86it/s]
100%|██████████| 9/9 [00:00<00:00, 68.69it/s]
100%|██████████| 9/9 [00:00<00:00, 69.44it/s]
100%|██████████| 9/9 [00:00<00:00, 72.13it/s]
100%|██████████| 9/9 [00:00<00:00, 68.09it/s]
100%|██████████| 9/9 [00:00<00:00, 71.07it/s]
100%|██████████| 9/9 [00:00<00:00, 67.31it/s]
100%|██████████| 9/9 [00:00<00:00, 69.61it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)

Fold = 10

***Without one-hot-encoding***
Without one-hot-encoding - Transform to NN features (Top K Distances)

***After One-hot-Encoding***
Autoencoder on latent dimension


100%|██████████| 9/9 [00:00<00:00, 32.18it/s]
100%|██████████| 9/9 [00:00<00:00, 44.00it/s]
100%|██████████| 9/9 [00:00<00:00, 26.20it/s]
100%|██████████| 9/9 [00:00<00:00, 31.48it/s]
100%|██████████| 9/9 [00:00<00:00, 48.80it/s]
100%|██████████| 9/9 [00:00<00:00, 57.82it/s]
100%|██████████| 9/9 [00:00<00:00, 52.48it/s]
100%|██████████| 9/9 [00:00<00:00, 55.30it/s]
100%|██████████| 9/9 [00:00<00:00, 62.37it/s]
100%|██████████| 9/9 [00:00<00:00, 43.04it/s]
100%|██████████| 9/9 [00:00<00:00, 59.13it/s]
100%|██████████| 9/9 [00:00<00:00, 70.09it/s]
100%|██████████| 9/9 [00:00<00:00, 70.85it/s]
100%|██████████| 9/9 [00:00<00:00, 70.54it/s]
100%|██████████| 9/9 [00:00<00:00, 72.23it/s]
100%|██████████| 9/9 [00:00<00:00, 69.95it/s]
100%|██████████| 9/9 [00:00<00:00, 60.71it/s]
100%|██████████| 9/9 [00:00<00:00, 68.87it/s]
100%|██████████| 9/9 [00:00<00:00, 69.22it/s]
100%|██████████| 9/9 [00:00<00:00, 66.90it/s]
100%|██████████| 9/9 [00:00<00:00, 57.27it/s]
100%|██████████| 9/9 [00:00<00:00,

After One-hot-Encoding - Transform to NN features (Top K Distances)
After One-hot-Encoding - AutoEncoder Latent Space - Transform to NN features (Top K Distances)

After Multiple Correspondence Analysis
After Multiple Correspondence Analysis - Transform to NN features (Top K Distances)

After Feature Hashing Transformation
After Feature Hashing Transformation - Transform to NN features (Top K Distances)
***AUC ROC and AUC PR***
For categorical-->
***auc_roc_cat_hamming= 0.6185 auc_pr_cat_hamming= 0.8326
For numerical-->
***auc_roc_ae= 0.3367 auc_pr_ae= 0.5037
Data saved to output_roc.csv  and output_pr.csv
