In [1]:
import os
import pandas as pd
import similarity_index as similarity_index

In [2]:
# Root directory that holds the per-method ranking CSVs and receives the
# aggregated stability table.
OUT_DIR = "outputs"
# Feature table loaded into `feats_df` (a CSV file, despite the "XL" name).
XL_PATH = "inputs/radiomicsFeatures.csv"

In [3]:
# Feature table handed to every top-k similarity function as `feats_df`.
feats_df = pd.read_csv(filepath_or_buffer=XL_PATH)

# Number of folds: one `rank_df{fold}.csv` per fold is compared pairwise below.
num_folds = 5

In [10]:
# Column-wise accumulator for every pairwise stability estimate; a later cell
# turns it into a DataFrame with one row per (method, fold pair, measure, k).
stability_df = {"fs_method": [], "similarity_measure": [], "top_k": [], "estimate": []}

# Sub-directories of OUT_DIR, each expected to hold rank_df0.csv .. rank_df{num_folds-1}.csv.
fs_methods = [
    "backwardSFS/LogisticRegression",
    "backwardSFS/RandomForestClassifier",
    "backwardSFS/SVC",
    "random",
    "oneDSAE",
    "bayesianDSAE",
    "ensembleDSAE",
]
# Top-k similarity measures, keyed by the label stored in `similarity_measure`.
similarity_methods = {
    "jaccard": similarity_index.jaccard,
    "dice": similarity_index.dice,
    "kuncheva": similarity_index.kuncheva,
    "mwm": similarity_index.mwm,
}
top_ks = [5, 10, 15, 20, 25]


def _append_estimate(columns, fs_method, similarity_measure, top_k, estimate):
    """Append one stability estimate to the column-wise accumulator `columns`."""
    columns["fs_method"].append(fs_method)
    columns["similarity_measure"].append(similarity_measure)
    columns["top_k"].append(top_k)
    columns["estimate"].append(estimate)


for fs_method in fs_methods:

    # Each fold's ranking is parsed at most once per method. Previously the
    # same CSV was re-read for every fold pair it took part in. The cache is
    # filled lazily so files are first touched in the same order as before.
    # NOTE(review): assumes the similarity functions do not modify the frames
    # they receive -- confirm in similarity_index.
    rank_df_cache = {}

    for i in range(num_folds):

        for j in range(i + 1, num_folds):

            for fold in (i, j):
                if fold not in rank_df_cache:
                    rank_df_cache[fold] = pd.read_csv(
                        os.path.join(OUT_DIR, fs_method, f"rank_df{fold}.csv")
                    )
            df1, df2 = rank_df_cache[i], rank_df_cache[j]

            for similarity_measure, similarity_fn in similarity_methods.items():

                for k in top_ks:

                    estimate = similarity_fn(df1=df1, df2=df2, k=k, feats_df=feats_df)
                    _append_estimate(stability_df, fs_method, similarity_measure, k, estimate)

            # global_spearman is called without a k, so top_k is recorded as
            # the placeholder string "NA" for these rows.
            estimate = similarity_index.global_spearman(df1, df2)
            _append_estimate(stability_df, fs_method, "global_spearman", "NA", estimate)

In [11]:
# Turn the column-wise accumulator (dict of equal-length lists) into a table;
# note that the name `stability_df` is rebound from a dict to a DataFrame here.
stability_df = pd.DataFrame(data=stability_df)

In [12]:
# Persist the raw (un-averaged) estimates next to the per-method outputs.
# Gotcha: the "NA" placeholder in `top_k` is parsed as NaN by pd.read_csv's
# default settings if this file is loaded back.
stability_csv_path = os.path.join(OUT_DIR, "stability_df.csv")
stability_df.to_csv(stability_csv_path, index=False)

In [13]:
# Average the estimates over all fold pairs for each
# (fs_method, similarity_measure, top_k) combination.
stability_group_keys = ["fs_method", "similarity_measure", "top_k"]
mean_stability_df = stability_df.groupby(by=stability_group_keys).mean()

In [15]:
# Mean stability estimates for the 'ensembleDSAE' method, indexed by (similarity_measure, top_k).
mean_stability_df.loc['ensembleDSAE']

Unnamed: 0_level_0,Unnamed: 1_level_0,estimate
similarity_measure,top_k,Unnamed: 2_level_1
dice,5.0,0.3
dice,10.0,0.54
dice,15.0,0.6
dice,20.0,0.64
dice,25.0,0.62
global_spearman,,0.597326
jaccard,5.0,0.187302
jaccard,10.0,0.382207
jaccard,15.0,0.433135
jaccard,20.0,0.474576


In [14]:
# Mean stability estimates for the 'bayesianDSAE' method, indexed by (similarity_measure, top_k).
mean_stability_df.loc['bayesianDSAE']

Unnamed: 0_level_0,Unnamed: 1_level_0,estimate
similarity_measure,top_k,Unnamed: 2_level_1
dice,5.0,0.32
dice,10.0,0.56
dice,15.0,0.633333
dice,20.0,0.665
dice,25.0,0.648
global_spearman,,0.632623
jaccard,5.0,0.209127
jaccard,10.0,0.391941
jaccard,15.0,0.467088
jaccard,20.0,0.501845


In [9]:
# Mean stability estimates for the 'oneDSAE' method, indexed by (similarity_measure, top_k).
mean_stability_df.loc['oneDSAE']

Unnamed: 0_level_0,Unnamed: 1_level_0,estimate
similarity_measure,top_k,Unnamed: 2_level_1
dice,5.0,0.44
dice,10.0,0.54
dice,15.0,0.633333
dice,20.0,0.665
dice,25.0,0.664
global_spearman,,0.636004
jaccard,5.0,0.309524
jaccard,10.0,0.374084
jaccard,15.0,0.467088
jaccard,20.0,0.501845


In [None]:
# Mean stability estimates for the 'random' method, indexed by (similarity_measure, top_k).
mean_stability_df.loc['random']

### Statistical Analysis

##### 1. Random vs. FS Methods: Wilcoxon Signed-Rank Test
- global, top_k = [5, 10, 15, 20, 25]

### Statistical Analysis (Mann–Whitney U test between the 4 stability estimates from conventional methods and the 3 stability estimates from autoencoder variants)