In [4]:
import sys
sys.path.append("..") #to access custom "utils" package

In [5]:
import os
import pandas as pd
import seaborn as sns
import utils.similarity_index as similarity_index
from scipy import stats
import statsmodels as sms

In [9]:
from tqdm.notebook import tqdm

In [6]:
# Both locations are resolved relative to the notebook's parent directory:
#   OUT_DIR  - one sub-folder per feature-selection method (rank_df{i}.csv, results_df.csv)
#   XL_PATH  - the feature table read into feats_df
OUT_DIR, XL_PATH = (
    os.path.join("..", relative)
    for relative in (r"scripts/outputs", r"inputs/radiomicsFeatures.csv")
)

In [7]:
# Number of repeated feature-selection runs; run i of a method is read from rank_df{i}.csv.
num_repeats = 100

# Feature table: passed to the top-k similarity functions and later used as the
# classification data (its 'label' column is read as the target).
feats_df = pd.read_csv(XL_PATH)

In [10]:
# Pairwise stability of every feature-selection method.
# For each method, every unordered pair of repeats (i < j) is scored with each
# top-k similarity measure at each k, and once with similarity_index.global_spearman.
# One row per (method, measure, k, pair) is accumulated in long format.
stability_df = {"fs_method":[], "similarity_measure":[], "top_k":[], "estimate":[]}

fs_methods = ["random", "oneDSAE", "bayesianDSAE", "ensembleDSAE", "backwardSFS/LogisticRegression", "backwardSFS/SVC", "backwardSFS/RandomForestClassifier", "backwardSFS/MLPClassifier"]
similarity_methods = {"jaccard":similarity_index.jaccard, "dice":similarity_index.dice, "kuncheva":similarity_index.kuncheva, "mwm":similarity_index.mwm}
top_ks = [5, 10, 15, 20, 25]


for fs_method in tqdm(fs_methods):

    # Load each repeat's ranking once per method. Reading inside the pair loop
    # costs num_repeats * (num_repeats - 1) CSV reads per method instead of num_repeats.
    # NOTE(review): assumes the similarity functions do not modify their inputs
    # in place -- confirm in utils.similarity_index.
    rank_dfs = [
        pd.read_csv(os.path.join(OUT_DIR, fs_method, f"rank_df{i}.csv"))
        for i in range(num_repeats)
    ]

    for i in range(num_repeats):

        df1 = rank_dfs[i]

        for j in range(i+1, num_repeats):

            df2 = rank_dfs[j]

            for similarity_measure, similarity_fn in similarity_methods.items():

                for k in top_ks:

                    estimate = similarity_fn(df1=df1, df2=df2, k=k, feats_df = feats_df)

                    stability_df["fs_method"].append(fs_method)
                    stability_df["similarity_measure"].append(similarity_measure)
                    stability_df["top_k"].append(k)
                    stability_df["estimate"].append(estimate)

            # The global measure takes no k, so top_k holds the string sentinel "NA".
            # Caution: pandas.read_csv parses "NA" as missing by default, so this
            # sentinel comes back as NaN if the frame is round-tripped through CSV.
            estimate = similarity_index.global_spearman(df1, df2)

            stability_df["fs_method"].append(fs_method)
            stability_df["similarity_measure"].append("global_spearman")
            stability_df["top_k"].append("NA")
            stability_df["estimate"].append(estimate)

  0%|          | 0/8 [00:00<?, ?it/s]

In [11]:
# Turn the accumulated column lists into a long-format DataFrame.
# The name stability_df is rebound here from a dict of lists to a DataFrame.
stability_df = pd.DataFrame(stability_df)

In [66]:
# Persist the pairwise estimates next to the notebook (the file is read back in a later cell).
# The "NA" strings in top_k are written verbatim; pandas.read_csv treats "NA" as missing by default.
stability_df.to_csv("stability_df.csv", index=False)

In [None]:
# Reload the cached pairwise estimates.
stability_df = pd.read_csv("stability_df.csv")

# read_csv parses the "NA" sentinel stored in top_k (global_spearman rows) as NaN and
# therefore loads the column as float. Restore the representation the frame had before
# it was saved -- Python ints plus the string "NA" -- so that filters written as
# `top_k == "NA"` or `top_k.isin(["NA", 5])` keep matching the global_spearman rows,
# and so that those rows are not dropped as a NaN group by the groupby below.
stability_df["top_k"] = ["NA" if pd.isna(k) else int(k) for k in stability_df["top_k"]]

# Mean estimate per (method, measure, k) over all pairs of repeats.
mean_stability_df = stability_df.groupby(by=["fs_method", "similarity_measure", "top_k"]).mean()

In [58]:
stability_df.fs_method.unique()

array(['random', 'oneDSAE', 'bayesianDSAE', 'ensembleDSAE',
       'backwardSFS/LogisticRegression', 'backwardSFS/SVC',
       'backwardSFS/RandomForestClassifier', 'backwardSFS/MLPClassifier'],
      dtype=object)

In [65]:
# Mean stability of the ensemble method for every measure / k.
# mean_stability_df is already aggregated per (fs_method, similarity_measure, top_k),
# so grouping it again by the same keys changes nothing; select the rows directly.
mean_stability_df.xs("ensembleDSAE")

Unnamed: 0_level_0,Unnamed: 1_level_0,estimate
similarity_measure,top_k,Unnamed: 2_level_1
dice,5.0,0.452404
dice,10.0,0.584384
dice,15.0,0.678478
dice,20.0,0.69398
dice,25.0,0.702966
global_spearman,,0.759545
jaccard,5.0,0.316288
jaccard,10.0,0.427787
jaccard,15.0,0.525472
jaccard,20.0,0.540514


In [None]:
# Figure 1

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Left-to-right order of the methods on the x axis.
custom_order = [
    "random",
    "backwardSFS/LogisticRegression",
    "backwardSFS/SVC",
    "backwardSFS/RandomForestClassifier",
    "backwardSFS/MLPClassifier",
    "oneDSAE",
    "bayesianDSAE",
    "ensembleDSAE"
]

# Display label for each method.
label_mapping = {
    "random": "random",
    "backwardSFS/LogisticRegression": "bSFS+LR",
    "backwardSFS/SVC": "bSFS+L-SVM",
    "backwardSFS/RandomForestClassifier": "bSFS+RF",
    "backwardSFS/MLPClassifier": "bSFS+MLP",
    "oneDSAE": "singleAE",
    "bayesianDSAE": "bayesianAE",
    "ensembleDSAE": "ensembleAE"
}

# Rows shown: the global_spearman estimates (which have no k) plus the kuncheva and
# mwm estimates at k = 5.
# global_spearman rows are selected by measure name, not by their top_k value: that
# value is the string "NA" while the frame is in memory but NaN once it has been
# reloaded from stability_df.csv, and `top_k.isin(["NA", 5])` does not match NaN,
# which silently removed the global_spearman line from the figure.
is_global = stability_df.similarity_measure == "global_spearman"
is_top5 = (
    stability_df.similarity_measure.isin(["kuncheva", "mwm"]) &
    (stability_df.top_k == 5)
)
plot_data = stability_df[is_global | is_top5].copy()

# Ordered categorical first, then relabel, so the x axis follows custom_order.
plot_data['fs_method'] = pd.Categorical(plot_data['fs_method'], categories=custom_order, ordered=True)
plot_data['fs_method'] = plot_data['fs_method'].map(label_mapping)

# Plotting: mean estimate per method with a standard-deviation band, one line per measure.
plt.figure(figsize=(12, 6))

sns.lineplot(
    data=plot_data,
    x='fs_method',
    y='estimate',
    errorbar='sd',
    marker='o',
    hue='similarity_measure',
    style='similarity_measure',
    markers={
        'kuncheva': 'o',           # Circle
        'mwm': 's',                # Square
        'global_spearman': 'D'     # Diamond
    },
    hue_order=["global_spearman", "kuncheva", "mwm"],
    style_order=["global_spearman", "kuncheva", "mwm"]
)

plt.legend()
plt.tight_layout()

plt.savefig("stability_plot.tif", format="tiff", dpi=600)

plt.show()

In [None]:
# Figure 2

# top-5 frequent features

In [None]:
# For every method: count how often each feature lands in a run's top-k (by "rank"),
# then keep the top_k most frequently selected features, sorted by name.
fs_methods = [
    "backwardSFS/LogisticRegression",
    "backwardSFS/SVC",
    "backwardSFS/RandomForestClassifier",
    "backwardSFS/MLPClassifier",
    "oneDSAE",
    "bayesianDSAE",
    "ensembleDSAE",
]
top_k = 5
freq_df = {}        # method -> DataFrame with columns "feature", "frequency"
freq_records = []   # one (method, most frequent features) pair per method

for fs_method in fs_methods:

    method_dir = os.path.join(OUT_DIR, fs_method)
    counts = {}

    for run in range(num_repeats):
        ranking = pd.read_csv(os.path.join(method_dir, f"rank_df{run}.csv"))
        best_of_run = ranking.sort_values(by="rank").head(top_k)
        for name in best_of_run.feature.to_list():
            counts[name] = counts.get(name, 0) + 1

    names, occurrences = zip(*counts.items())
    freq_df[fs_method] = pd.DataFrame({"feature": names, "frequency": occurrences})

    most_frequent = (
        freq_df[fs_method]
        .sort_values(by="frequency", ascending=False)
        .head(top_k)
        .feature
        .to_list()
    )
    most_frequent.sort()
    freq_records.append((fs_method, most_frequent))

freq_feats = pd.DataFrame(freq_records, columns=["fs_method", "freq_feats"])

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn look for this and all later figures.
sns.set(style="whitegrid", context="notebook", font_scale=1.1)

# One row per (method, frequent feature), then a method x feature count table.
df_exploded = freq_feats.explode("freq_feats")
pivot = pd.crosstab(df_exploded["fs_method"], df_exploded["freq_feats"])

# Columns ordered from the feature shared by most methods to the least shared.
top_features = pivot.sum().sort_values(ascending=False).index
pivot = pivot.loc[:, top_features]

fs_methods = [
    "backwardSFS/LogisticRegression", "backwardSFS/SVC",
    "backwardSFS/RandomForestClassifier", "backwardSFS/MLPClassifier",
    "oneDSAE", "bayesianDSAE", "ensembleDSAE"
]
label_mapping = {
    "random": "random",
    "backwardSFS/LogisticRegression": "bSFS+LR",
    "backwardSFS/SVC": "bSFS+L-SVM",
    "backwardSFS/RandomForestClassifier": "bSFS+RF",
    "backwardSFS/MLPClassifier": "bSFS+MLP",
    "oneDSAE": "singleAE",
    "bayesianDSAE": "bayesianAE",
    "ensembleDSAE": "ensembleAE"
}

# Rows follow fs_methods (crosstab orders them alphabetically).
pivot = pivot.reindex(fs_methods)

fig, ax = plt.subplots(figsize=(12, 6))

# An 'x' at (feature column, method row) wherever the table entry is truthy.
for row_pos, (method, row) in enumerate(pivot.iterrows()):
    for col_pos, flagged in enumerate(row):
        if flagged:
            ax.scatter(col_pos, row_pos, marker='x', color='black', s=60, linewidths=1.5)

# Tick positions and labels.
y_labels = []
for method in pivot.index:
    y_labels.append(label_mapping.get(method, method))

n_rows, n_cols = len(pivot.index), len(pivot.columns)
ax.set_yticks(range(n_rows))
ax.set_yticklabels(y_labels, fontsize=10)
ax.set_xticks(range(n_cols))
ax.set_xticklabels(pivot.columns, rotation=45, ha='right', fontsize=9)

# Axis titles and light styling.
ax.set_xlabel("Top Frequent Features", fontsize=12)
ax.set_ylabel("Feature Selection Method", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.3)
ax.tick_params(axis='both', which='major', length=0)

plt.tight_layout()

plt.savefig("freq_plot.tif", format="tiff", dpi=600)

plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [None]:
# Cross-validated ROC AUC / average precision obtained with each method's most
# frequent features, for four downstream classifiers.
#
# The train_test_split call that used to open this cell was removed: it read X and y
# before they were assigned (NameError on a fresh kernel) and none of its outputs
# (X_train, X_val, y_train, y_val) were used by this cell.

SEED = 0

# One stratified 5-fold split (a single repeat), shared by every method/classifier pair.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=SEED)

for fs_method in fs_methods:

    print('*'*25, fs_method, '*'*25)

    # Columns of `pivot` flagged 1 in this method's row.
    feats = pivot.columns[pivot.loc[fs_method]==1].to_list()

    X, y = feats_df[feats], feats_df['label']

    classifiers = [
        # penalty=None (scikit-learn >= 1.2) replaces the string 'none', which was
        # deprecated in 1.2 and is rejected by later releases.
        LogisticRegression(penalty=None, max_iter=10_000),
        # Seeds added so that repeated runs of this cell print the same scores.
        LinearSVC(max_iter=100_000, random_state=SEED),
        MLPClassifier(random_state=SEED),
        RandomForestClassifier(random_state=SEED),
    ]

    for clf in classifiers:

        # cross_validate fits a fresh clone of clf on every fold.
        results = cross_validate(clf, X, y, scoring=['roc_auc', 'average_precision'], cv=cv)

        print(clf.__class__.__name__,results["test_roc_auc"].mean(), results["test_average_precision"].mean())

In [None]:
# pivot = pd.crosstab(df_exploded["fs_method"], df_exploded["freq_feats"])
# # Limit to most common features (optional)
# top_features = pivot.sum().sort_values(ascending=False).index
# pivot = pivot[top_features]

# # Plotting
# plt.figure(figsize=(10, 6))
# for i, method in enumerate(pivot.index):
#     for j, feat in enumerate(pivot.columns):
#         if pivot.loc[method, feat]:
#             plt.scatter(j, i, marker='x', color='black')

# plt.yticks(range(len(pivot.index)), pivot.index)
# plt.xticks(range(len(pivot.columns)), pivot.columns, rotation=45, ha='right')
# plt.xlabel("Top Frequent Features")
# plt.ylabel("Feature Selection Method")
# # plt.title("Top Features Selected by Different Methods")
# plt.tight_layout()
# plt.show()

In [None]:
def _freq_lists(methods):
    """Return the freq_feats lists of the rows whose fs_method is in `methods`.

    The lists come back in freq_feats row order, not in the order of `methods`.
    """
    wanted = freq_feats.fs_method.isin(methods)
    return freq_feats[wanted].freq_feats.to_list()


_lconv_methods = ["backwardSFS/LogisticRegression", "backwardSFS/SVC"]
_nlconv_methods = ["backwardSFS/RandomForestClassifier", "backwardSFS/MLPClassifier"]

# Most-frequent-feature lists grouped by family of selector.
ae_freq_feats = _freq_lists(["oneDSAE", "bayesianDSAE", "ensembleDSAE"])

conv_freq_feats = _freq_lists(_lconv_methods + _nlconv_methods)

lconv_freq_feats = _freq_lists(_lconv_methods)
nlconv_freq_feats = _freq_lists(_nlconv_methods)

In [None]:
# Features present in every list of a group: start from the first list and intersect
# with all the others (raises IndexError if the group is empty).
overlap_ae = set(ae_freq_feats[0]).intersection(*ae_freq_feats[1:])
overlap_conv = set(conv_freq_feats[0]).intersection(*conv_freq_feats[1:])

In [None]:
overlap_ae, overlap_conv

In [None]:
# Same intersection, computed separately for the two backward-SFS sub-groups
# (LogisticRegression + SVC, and RandomForestClassifier + MLPClassifier).
overlap_lconv = set(lconv_freq_feats[0]).intersection(*lconv_freq_feats[1:])
overlap_nlconv = set(nlconv_freq_feats[0]).intersection(*nlconv_freq_feats[1:])

In [None]:
overlap_lconv

In [None]:
overlap_nlconv

In [None]:
# Figure 3

In [None]:
# Methods included in the complexity comparison (the "random" baseline is not listed).
fs_methods = [
    "backwardSFS/LogisticRegression", "backwardSFS/SVC", 
    "backwardSFS/RandomForestClassifier", "backwardSFS/MLPClassifier", 
    "oneDSAE", "bayesianDSAE", "ensembleDSAE"
]

In [None]:
# Per-method lists of execution times and memory readings, taken from each
# method's results_df.csv. One row per method; the cells hold Python lists.
complexity_df = {"estimator":[], "exe_time":[], "mem_usage":[]}

for fs_method in fs_methods:

    results_path = os.path.join(OUT_DIR, fs_method, "results_df.csv")

    results = pd.read_csv(results_path, index_col=0)

    if "DSAE" in fs_method:

        # bayesian: maximum over the rows sharing a value of "b";
        # ensemble: sum over the rows sharing a value of "b";
        # single (oneDSAE): rows are used as they are.
        # NOTE(review): the meaning of column "b" is not visible here -- confirm that
        # max / sum are the intended aggregations.
        if "bayesian" in fs_method:
            results = results.groupby(by="b").max()
        elif "ensemble" in fs_method:
            results = results.groupby(by="b").sum()
        # Autoencoder runs report CPU and GPU memory separately; use their sum.
        mem_usage = (results.cpu_mem + results.gpu_mem).to_list()
    else:
        # The SFS results file holds several estimators; keep the one named in the path.
        results = results[results.estimator==fs_method.split("/")[-1]]
        # Stored as a list, like the autoencoder branch (it used to be a Series, so the
        # column mixed lists and Series).
        mem_usage = results.mem_usage.to_list()

    exe_time = results.exe_time.to_list()

    complexity_df["estimator"].append(fs_method)
    complexity_df["exe_time"].append(exe_time)
    complexity_df["mem_usage"].append(mem_usage)

complexity_df = pd.DataFrame(complexity_df)
    

In [None]:
df = complexity_df.copy()
sns.set(style="whitegrid", palette="gray")

fs_methods = [
    "backwardSFS/LogisticRegression", "backwardSFS/SVC",
    "backwardSFS/RandomForestClassifier", "backwardSFS/MLPClassifier",
    "oneDSAE", "bayesianDSAE", "ensembleDSAE"
]
label_mapping = {
    "random": "random",
    "backwardSFS/LogisticRegression": "bSFS+LR",
    "backwardSFS/SVC": "bSFS+L-SVM",
    "backwardSFS/RandomForestClassifier": "bSFS+RF",
    "backwardSFS/MLPClassifier": "bSFS+MLP",
    "oneDSAE": "singleAE",
    "bayesianDSAE": "bayesianAE",
    "ensembleDSAE": "ensembleAE"
}

# Short display label for every method.
df['fs_method'] = df['estimator'].map(label_mapping)

# Coerce every memory reading to a Python float (no unit conversion happens on this line).
df['mem_usage'] = df['mem_usage'].apply(lambda readings: [float(value) for value in readings])

# Totals over all runs of a method.
df['cumulative_time'] = df['exe_time'].apply(lambda times: sum(times) / 3600)  # hours
df['cumulative_memory'] = df['mem_usage'].apply(lambda readings: sum(readings) / (1024 ** 3))  # GiB

# One row per run, for the box plots.
exploded_time = df[['fs_method', 'exe_time']].explode('exe_time')
exploded_time['exe_time'] = exploded_time['exe_time'].astype(float) / 60  # minutes

exploded_mem = df[['fs_method', 'mem_usage']].explode('mem_usage')
exploded_mem['mem_usage'] = exploded_mem['mem_usage'].astype(float) / (1024 ** 3)  # GiB


def _box_kwargs():
    """Fresh keyword arguments for the black-on-white box plots."""
    return dict(
        color='white', fliersize=3, linewidth=1, width=0.4,
        boxprops=dict(edgecolor='black'),
        whiskerprops=dict(color='black'),
        capprops=dict(color='black'),
        medianprops=dict(color='black'),
    )


def _bar_kwargs():
    """Fresh keyword arguments for the outlined bar plots."""
    return dict(color='white', edgecolor='black', width=0.4)


fig, axs = plt.subplots(2, 2, figsize=(14, 10))

# (axes, plotting function, x column, data, style factory, x label)
# Top row: per-run distributions. Bottom row: totals per method.
panels = (
    (axs[0, 0], sns.boxplot, 'exe_time', exploded_time, _box_kwargs, "exe_time (mins)"),
    (axs[0, 1], sns.boxplot, 'mem_usage', exploded_mem, _box_kwargs, "memory (GiB)"),
    (axs[1, 0], sns.barplot, 'cumulative_time', df, _bar_kwargs, "cumulative_exe_time (hrs)"),
    (axs[1, 1], sns.barplot, 'cumulative_memory', df, _bar_kwargs, "cumulative_memory (GiB)"),
)

for ax, draw, column, frame, make_kwargs, x_label in panels:
    draw(x=column, y='fs_method', data=frame, ax=ax, **make_kwargs())
    ax.set_xlabel(x_label)
    ax.set_ylabel("fs_method")
    ax.grid(False)

plt.tight_layout()
plt.savefig("complexity_plot.tif", format="tiff", dpi=600)
plt.show()

In [None]:
def _mean(values):
    """Arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


# Per-run averages, in the same units as the box plots.
df['avg_time'] = df['exe_time'].apply(lambda times: _mean(times) / 60)  # convert to mins

df['avg_memory'] = df['mem_usage'].apply(lambda readings: _mean(readings) / (1024 ** 3))  # convert to GiB

In [None]:
df

### Statistical Analysis (Wilcoxon Signed Rank Test)

##### <> 1. SFS+LR v/s Ensemble AE
- global, and Kuncheva Top-5

In [None]:
# Measure and k used by the comparison in the next cell.
# global_spearman rows carry the sentinel "NA" in top_k because the measure has no k.
similarity_measure="global_spearman"
top_k = "NA"

In [None]:
# SFS+LR vs. ensemble AE, for the similarity_measure / top_k chosen in the previous cell.

# top_k for global_spearman is the string "NA" in memory but NaN after the frame has
# been reloaded from CSV; accept both so the samples are not silently empty.
if top_k == "NA":
    top_k_mask = stability_df.top_k.isna() | (stability_df.top_k == "NA")
else:
    top_k_mask = stability_df.top_k == top_k

selected = stability_df[(stability_df.similarity_measure == similarity_measure) & top_k_mask]

x = selected[selected.fs_method == "backwardSFS/LogisticRegression"].estimate.to_list()
# This sample used to be filtered on fs_method == "random", which contradicted both the
# section title and the "ensembleAE" column label below.
y = selected[selected.fs_method == "ensembleDSAE"].estimate.to_list()

sns_data = pd.DataFrame({"LR":x, "ensembleAE":y})
sns.boxplot(data=sns_data)

# Both tests are paired: x[i] and y[i] are matched by position.
# NOTE(review): this presumes both methods list their pairs of repeats in the same order.
print("Wilcoxon Signed Rank Test: p-value = ", stats.wilcoxon(x, y).pvalue)

print("Paired T-Test: ", "normality, x-", stats.shapiro(x).pvalue, ",y-", stats.shapiro(y).pvalue, "ttest-", stats.ttest_rel(x, y).pvalue)

In [None]:
# Measure and k used by the comparison in the next cell: kuncheva at k = 5.
similarity_measure="kuncheva"
top_k = 5

In [None]:
# SFS+LR vs. ensemble AE, for the similarity_measure / top_k chosen in the previous cell.
subset = stability_df[
    (stability_df.similarity_measure == similarity_measure)
    & (stability_df.top_k == top_k)
]
x = subset.loc[subset.fs_method == "backwardSFS/LogisticRegression", "estimate"].to_list()
y = subset.loc[subset.fs_method == "ensembleDSAE", "estimate"].to_list()

sns_data = pd.DataFrame({"LR": x, "ensembleAE": y})
sns.boxplot(data=sns_data)

# Paired, non-parametric comparison.
wilcoxon_p = stats.wilcoxon(x, y).pvalue
print("Wilcoxon Signed Rank Test: p-value = ", wilcoxon_p)

# Shapiro-Wilk normality check on each sample, then the paired t-test.
normality_x = stats.shapiro(x).pvalue
normality_y = stats.shapiro(y).pvalue
ttest_p = stats.ttest_rel(x, y).pvalue
print("Paired T-Test: ", "normality, x-", normality_x, ",y-", normality_y, "ttest-", ttest_p)

##### <> 2. Bayesian AE v/s Ensemble AE
- global, and Kuncheva Top-5

In [None]:
# Measure and k used by the comparison in the next cell.
# global_spearman rows carry the sentinel "NA" in top_k because the measure has no k.
similarity_measure="global_spearman"
top_k = "NA"

In [None]:
# Bayesian AE vs. ensemble AE, for the similarity_measure / top_k chosen in the previous cell.

# top_k for global_spearman is the string "NA" in memory but NaN after the frame has
# been reloaded from CSV; accept both so the samples are not silently empty.
if top_k == "NA":
    top_k_mask = stability_df.top_k.isna() | (stability_df.top_k == "NA")
else:
    top_k_mask = stability_df.top_k == top_k

selected = stability_df[(stability_df.similarity_measure == similarity_measure) & top_k_mask]

x = selected[selected.fs_method == "bayesianDSAE"].estimate.to_list()
y = selected[selected.fs_method == "ensembleDSAE"].estimate.to_list()

sns_data = pd.DataFrame({"bayesianAE":x, "ensembleAE":y})
sns.boxplot(data=sns_data)

# Both tests are paired: x[i] and y[i] are matched by position.
# NOTE(review): this presumes both methods list their pairs of repeats in the same order.
print("Wilcoxon Signed Rank Test: p-value = ", stats.wilcoxon(x, y).pvalue)

print("Paired T-Test: ", "normality, x-", stats.shapiro(x).pvalue, ",y-", stats.shapiro(y).pvalue, "ttest-", stats.ttest_rel(x, y).pvalue)

In [None]:
# Measure and k used by the comparison in the next cell: kuncheva at k = 5.
similarity_measure="kuncheva"
top_k = 5

In [None]:
# Bayesian AE vs. ensemble AE, for the similarity_measure / top_k chosen in the previous cell.
x = stability_df[(stability_df.fs_method=="bayesianDSAE")&(stability_df.similarity_measure==similarity_measure)&(stability_df.top_k==top_k)].estimate.to_list()
y = stability_df[(stability_df.fs_method=="ensembleDSAE")&(stability_df.similarity_measure==similarity_measure)&(stability_df.top_k==top_k)].estimate.to_list()

# x holds the bayesianDSAE estimates; the box used to be labelled "LR", a leftover
# from the SFS+LR comparison.
sns_data = pd.DataFrame({"bayesianAE":x, "ensembleAE":y})
sns.boxplot(data=sns_data)

# Both tests are paired: x[i] and y[i] are matched by position.
print("Wilcoxon Signed Rank Test: p-value = ", stats.wilcoxon(x, y).pvalue)

print("Paired T-Test: ", "normality, x-", stats.shapiro(x).pvalue, ",y-", stats.shapiro(y).pvalue, "ttest-", stats.ttest_rel(x, y).pvalue)

### Complexity Analysis

In [None]:
# Folder holding one sub-folder of results per method, relative to the current working directory.
# NOTE(review): the stability and frequency cells read from OUT_DIR ("../scripts/outputs")
# instead -- confirm that both locations hold the same results.
DATA_DIR = r"outputs"
# Sub-folders of DATA_DIR whose results_df.csv files are inspected in this section.
FS_METHODS = ["backwardSFS", "oneDSAE", "bayesianDSAE", "ensembleDSAE"]

In [None]:
def _under_data_dir(*parts):
    """Join `parts` onto DATA_DIR."""
    return os.path.join(DATA_DIR, *parts)


# Raw result tables of the four method families.
# Only the SFS file is read with its first column as the index.
sfs_results_df = pd.read_csv(_under_data_dir("backwardSFS/results_df.csv"), index_col=0)
sAE_results_df = pd.read_csv(_under_data_dir("oneDSAE", "results_df.csv"))
bAE_results_df = pd.read_csv(_under_data_dir("bayesianDSAE", "results_df.csv"))
eAE_results_df = pd.read_csv(_under_data_dir("ensembleDSAE", "results_df.csv"))

In [None]:
# Long-format accumulator (method label, 1-based run number, execution time),
# filled by the following cells and converted to a DataFrame afterwards.
time_df = {"fs_method":[], "# runs":[], "exe_time":[]}

In [None]:
def _per_seed_frame(per_seed):
    """Build a ("# runs", "exe_time") frame from a table indexed by outer_seed."""
    return pd.DataFrame({
        "# runs": per_seed.index.to_numpy(),
        "exe_time": per_seed["exe_time"].to_numpy(),
    })


def _append_times(label, per_seed_times):
    """Append one method's rows to time_df; run numbers are outer_seed + 1."""
    time_df["fs_method"].extend([label] * len(per_seed_times))
    time_df["# runs"].extend(list(per_seed_times["# runs"].to_numpy() + 1))
    time_df["exe_time"].extend(list(per_seed_times["exe_time"].to_numpy()))


# Single and Bayesian AE: exe_time of the first row of every outer_seed.
sAE_time_df = _per_seed_frame(sAE_results_df.groupby(["outer_seed"]).first())
_append_times('singleAE', sAE_time_df)

bAE_time_df = _per_seed_frame(bAE_results_df.groupby(["outer_seed"]).first())
_append_times('bayesianAE', bAE_time_df)

# Ensemble AE: first row of every (outer_seed, b), summed over b within each outer_seed.
eAE_time_df = _per_seed_frame(
    eAE_results_df.groupby(["outer_seed", "b"]).first().groupby("outer_seed").sum()
)
_append_times('ensembleAE', eAE_time_df)

In [None]:
def acronym(estimator):
    """Return the short plot label of an SFS estimator class name.

    Unknown names print "Invalid estimator name" and yield an empty string.
    """
    labels = (
        ("LogisticRegression", "SFS+LR"),
        ("SVC", "SFS+L-SVM"),
        ("RandomForestClassifier", "SFS+RF"),
        ("MLPClassifier", "SFS+MLP"),
    )

    for class_name, short_label in labels:
        if estimator == class_name:
            return short_label

    print("Invalid estimator name")
    return ""

In [None]:


# Execution time per (estimator, outer_seed): the sum of exe_time over the rows of that pair.
# Only exe_time is aggregated, and the grouping is done once instead of once per estimator.
sfs_time_by_run = sfs_results_df.groupby(["estimator", "outer_seed"])["exe_time"].sum()

for estimator in ["LogisticRegression", "SVC", "RandomForestClassifier", "MLPClassifier"]:

    per_seed_time = sfs_time_by_run.loc[estimator]   # Series indexed by outer_seed

    # Run numbers are outer_seed + 1, the same convention as the autoencoder rows.
    # The offset used to be applied twice (once when building this frame and again when
    # appending), which shifted every SFS run number by one.
    sfs_time_df = pd.DataFrame({
        "# runs": per_seed_time.index.to_numpy() + 1,
        "exe_time": per_seed_time.to_numpy(),
    })

    time_df["fs_method"] += [acronym(estimator)] * len(sfs_time_df)
    time_df["# runs"] += list(sfs_time_df["# runs"].to_numpy())
    time_df["exe_time"] += list(sfs_time_df["exe_time"].to_numpy())


    

In [None]:
# Freeze the accumulator into a DataFrame and add, per method, the running total of
# exe_time in row order.
time_df = pd.DataFrame(time_df)
running_total = time_df.groupby('fs_method')['exe_time'].cumsum()
time_df = time_df.assign(cumulative_exe_time=running_total)

display(time_df)

In [None]:
# Minute-scaled copies of both time columns (original value divided by 60).
time_df = time_df.assign(**{
    'exe_time (mins)': time_df['exe_time'] / 60,
    'cumulative_exe_time (mins)': time_df['cumulative_exe_time'] / 60,
})

In [None]:
time_df

In [None]:
# Distribution of per-run execution time (minutes) for each method, drawn as unfilled boxes.
sns.boxplot(data=time_df, x='exe_time (mins)', y='fs_method', width=.2, fill=False, color=".1")

In [None]:
# Total execution time per method (minutes).
# The cumulative column is a running total over a method's runs, so its largest value
# is the method's overall total. With the default estimator (mean) the bar showed the
# average of the running totals instead, which understates the total.
# NOTE(review): relies on exe_time being non-negative, so that the running total never decreases.
sns.barplot(data=time_df, x='cumulative_exe_time (mins)', y='fs_method', width=.2, fill=False, color=".1", errorbar=None, estimator="max")

In [None]:
# Running total of execution time against the run number, one line per method.
# The call was left unfinished (`y=` had no value, a SyntaxError that stops the notebook).
# NOTE(review): the y column and the hue are a best guess at the intended plot -- adjust if needed.
sns.lineplot(data=time_df, x='# runs', y='cumulative_exe_time (mins)', hue='fs_method')

In [None]:
eAE_results_df.groupby(["outer_seed", "b"]).first().groupby("outer_seed").sum()["exe_time"]

In [None]:
eAE_results_df.groupby(["outer_seed"]).mean()

In [None]:
eAE_results_df.groupby(["outer_seed"]).sum()

In [None]:
eAE_results_df.groupby(["outer_seed","b", "permute_seed"]).sum()

In [None]:
eAE_results_df.groupby(["outer_seed", "permute_seed"]).sum()

In [None]:
bAE_results_df.groupby("outer_seed").max()

In [None]:
sAE_results_df.groupby("outer_seed").max()

In [None]:
# Show the first ten rows of every method's raw results table.
for fs_method in FS_METHODS:
    csv_path = os.path.join(DATA_DIR, fs_method, "results_df.csv")
    results_df = pd.read_csv(csv_path)
    preview = results_df.head(10)
    display(preview)
    