In [None]:
import numpy as np
import pandas as pd

import soft_label_learning.experiments.statistical_tests as stac
from soft_label_learning.config import path_output
from soft_label_learning.experiments.experiment_settings import (
    methods_with_threshold,
    methods_without_threshold,
    non_ens_methods,
    q1_settings,
)
from soft_label_learning.experiments.process_synthetic_data import (
    get_q1_result_dict,
    plot_heatmap,
    q1_statistics_result_dict,
    q1_statistics_table,
)

Obtaining the experiment base parameters

In [None]:
# TODO: replace the placeholder below with the timestamp of the result folder
# that should be analysed.
settings_dict = q1_settings
result_path = "date_hh_mm_ss"

Initialising settings dictionary

In [None]:
# Build a baseline configuration by picking the first option of every setting.
# NOTE(review): the "eval_set" and "metric" branches read
# fixed_settings["label_eval"], so "label_eval" must appear before them in
# settings_dict - confirm the key order in q1_settings.
fixed_settings = {}

for name, options in settings_dict.items():
    if name == "dataset":
        # Drop the last four characters of the dataset name
        # (presumably a file extension - TODO confirm).
        selected = options[0][:-4]
    elif name in ("eval_set", "metric"):
        # These options are grouped per label-evaluation mode.
        selected = options[fixed_settings["label_eval"]][0]
    elif name in ("ens_propagation", "smoothing"):
        # Taken over unchanged instead of selecting a first element.
        selected = options
    else:
        selected = options[0]
    fixed_settings[name] = selected

In [None]:
# Explicit overrides on top of the "first option" defaults chosen above.
fixed_settings.update({"ens_propagation": True, "gt": "rf"})

In [None]:
# Display the resolved baseline settings as a quick visual sanity check.
fixed_settings

In [None]:
# Switch between the two analyses: methods with a threshold (True) or the
# methods without one (False).
threshold = False

# base_row is forwarded to plot_heatmap further down.
if not threshold:
    methods, base_row = methods_without_threshold.copy(), 4
else:
    methods, base_row = ["PluralityBootstrapClf"] + methods_with_threshold, 0

# Dataset names without their last four characters
# (presumably the file extension - TODO confirm).
datasets = [name[:-4] for name in settings_dict["dataset"]]
classifiers = list(settings_dict["clf"])
metrics = ["TVD", "hard_soft_AUC"]

In [None]:
# Human-readable axis labels for the plots.
method_labels = [name.replace("_", " - ") for name in methods]
dataset_labels = [name.replace("_", " ") for name in datasets]

#### Loading the results

In [None]:
# One settings dictionary per ("gt", "mtvd") combination; every other entry is
# inherited from fixed_settings. Generation order: rf/1, rf/2, lr/1, lr/2.
rf_1, rf_2, lr_1, lr_2 = (
    {**fixed_settings, "gt": gt_value, "mtvd": mtvd_value}
    for gt_value in ("rf", "lr")
    for mtvd_value in ("1", "2")
)


def _load_q1_results(settings):
    """Load the Q1 results for one settings dictionary.

    All remaining arguments (methods, metrics, classifiers, datasets,
    non_ens_methods, result_path) are shared by every call and are read from
    the notebook namespace.
    """
    return get_q1_result_dict(
        settings,
        methods,
        metrics,
        classifiers,
        datasets,
        non_ens_methods,
        result_path,
    )


result_dict_rf_1 = _load_q1_results(rf_1)
result_dict_rf_2 = _load_q1_results(rf_2)
result_dict_lr_1 = _load_q1_results(lr_1)
result_dict_lr_2 = _load_q1_results(lr_2)

#### Generate the heatmaps

In [None]:
# One heatmap per (metric, classifier) pair, showing the element-wise mean of
# the four loaded result sets.
heatmap_dict = {}

all_result_dicts = (
    result_dict_rf_1,
    result_dict_rf_2,
    result_dict_lr_1,
    result_dict_lr_2,
)

for metric in result_dict_rf_1:
    for clf in result_dict_rf_1[metric]:
        per_setting = [res[metric][clf] for res in all_result_dicts]

        # Average over the four settings (axis 0 of the stacked results).
        heatmap_dict[(metric, clf)] = plot_heatmap(
            np.mean(per_setting, axis=0),
            xlabels=dataset_labels,
            ylabels=method_labels,
            base_row=base_row,
        )

In [None]:
# Save every heatmap to <path_output>/Q1. Threshold runs get their own file
# names so they do not overwrite the figures of the non-threshold analysis.
q1_output_dir = path_output / "Q1"
# Make sure the target directory exists before saving; on a fresh checkout the
# save would otherwise fail.
q1_output_dir.mkdir(parents=True, exist_ok=True)

for (metric, clf), fig in heatmap_dict.items():
    if threshold:
        file_name = f"q1_heatmap_threshold_{metric}_{clf}.png"
    else:
        file_name = f"q1_heatmap_{metric}_{clf}.png"

    fig.savefig(q1_output_dir / file_name, dpi=300, bbox_inches="tight")

#### Heatmap over all settings

In [None]:
# Start from the full settings dictionary and pin the entries below to a
# single value for the analysis over all settings.
specific_settings = settings_dict.copy()
specific_settings.update(
    {
        "ens_propagation": True,
        "noise_type": "noiseless",
        "noise": "0",
        "alpha": 1,
    }
)

metrics = ["TVD", "hard_soft_AUC"]

Get the results

In [None]:
# Collect the results used for the statistics heatmap. The helper receives both
# the pinned settings (specific_settings) and the full option grid
# (settings_dict); how it combines them is defined in process_synthetic_data.
result_dict = q1_statistics_result_dict(
    specific_settings,
    settings_dict,
    result_path,
    methods,
    metrics,
    non_ens_methods,
    classifiers,
    datasets,
)

Build the summary table and plot it as a heatmap

In [None]:
# Turn the collected results into the array that is plotted below (table_array)
# together with the matching x-axis labels (clf_metric).
table_array, clf_metric = q1_statistics_table(
    result_dict,
    settings_dict,
    methods,
    metrics,
    classifiers,
    datasets,
    threshold=threshold,
)

In [None]:
# Plot the statistics table as a single heatmap.
fig = plot_heatmap(
    table_array,
    xlabels=clf_metric,
    ylabels=method_labels,
    base_row=base_row,
    add_mean=False,
    figsize=(8, 8),
)

# Save the figure; threshold runs use a separate file name so both variants
# can coexist in the output folder.
q1_output_dir = path_output / "Q1"
# Make sure the target directory exists before saving; on a fresh checkout the
# save would otherwise fail.
q1_output_dir.mkdir(parents=True, exist_ok=True)

if threshold:
    statistics_file = q1_output_dir / "q1_statistics_threshold.png"
else:
    statistics_file = q1_output_dir / "q1_statistics.png"

fig.savefig(statistics_file, dpi=300, bbox_inches="tight")

### Statistical significance testing

In [None]:
# Column layout of the summary table: the Friedman p-value, one rank column per
# representative method, and the adjusted p-values of the control method
# against each of the other three methods.
df_columns = [
    "Friedman",
    "Hard",
    "Soft",
    "HardEns",
    "SoftEns",
    "vs Hard",
    "vs Soft",
    "vs HardEns",
]

# Order matters: it fixes the order of the samples passed to the Friedman test
# (and therefore of the rank columns), and the last entry is the control method
# of the post-hoc test.
representative_methods = [
    "PluralityClf",
    "DuplicateWeightsClf",
    "PluralityEnsClf",
    "BootstrapDupWeightsClf",
]
control_method = representative_methods[-1]

# Best performing of each group: PluralityClf, DuplicateWeightsClf,
# PluralityEnsClf, BootstrapDupWeightsClf.
# The values index the method axis (axis 1) of the stacked result arrays.
# NOTE(review): these indices are hard-coded; presumably they match the method
# order used when threshold is False - confirm before running this cell with
# threshold = True.
method_key_dict = {
    "PluralityClf": 1,
    "DuplicateWeightsClf": 3,
    "PluralityEnsClf": 6,
    "BootstrapDupWeightsClf": 10,
}

complete_stat_dict = {}

for metric in result_dict_rf_1.keys():
    for clf in result_dict_rf_1[metric].keys():
        results = [
            result_dict_rf_1[metric][clf],
            result_dict_rf_2[metric][clf],
            result_dict_lr_1[metric][clf],
            result_dict_lr_2[metric][clf],
        ]
        stacked_results = np.array(results)

        # One flat sample per representative method: all values of that method
        # over every loaded setting. reshape(-1) avoids hard-coding the number
        # of settings and datasets.
        samples = [
            stacked_results[:, method_key_dict[method], :].reshape(-1)
            for method in representative_methods
        ]

        friedman_result = stac.friedman_aligned_ranks_test(*samples)
        print(f"Friedman p-value: {friedman_result[1]}")

        if metric == "hard_soft_AUC":
            metric_adjusted = "AUC"
        else:
            metric_adjusted = r"$\overline{TVD}$"

        # Row layout follows df_columns: p-value, four ranks, three adjusted
        # p-values.
        stat_row = [friedman_result[1]]
        stat_row.extend(friedman_result[2])

        pivot_dict = {
            method: friedman_result[3][position]
            for position, method in enumerate(representative_methods)
        }

        finner_result = stac.finner_test(pivot_dict, control_method)

        print(finner_result[2])
        print(finner_result[3])

        # finner_result[0] holds the comparison labels; use them to pick the
        # matching entry of finner_result[3] for every non-control method.
        for method in representative_methods[:-1]:
            comparison = f"{control_method} vs {method}"
            stat_row.append(finner_result[3][finner_result[0].index(comparison)])

        complete_stat_dict[(metric_adjusted, clf)] = stat_row


stat_frame = pd.DataFrame.from_dict(
    complete_stat_dict, orient="index", columns=df_columns
)
stat_frame.index = pd.MultiIndex.from_tuples(stat_frame.index)

In [None]:
# Group the eight flat columns under three header rows for the LaTeX table.
formatted_df = stat_frame.copy()

top_level_labels = (
    ["$p$-value"] + ["Rank"] * 4 + ["adjusted $p$-value: SoftEns"] * 3
)
new_columns = pd.MultiIndex.from_tuples(
    list(zip(top_level_labels, formatted_df.columns[:8]))
)

formatted_df.columns = new_columns

# p-values in scientific notation, ranks with two decimals.
column_formats = ["{:.2e}"] + ["{:.2f}"] * 4 + ["{:.2e}"] * 3
latex_df = formatted_df.style.format(dict(zip(new_columns, column_formats)))

In [None]:
# Emit the LaTeX source of the table; values that format to exactly
# "0.00e+00" are shortened to "0e0".
print(latex_df.to_latex().replace("0.00e+00", "0e0"))