In [None]:
import datetime
import json

import matplotlib.pyplot as plt

from soft_label_learning.config import path_repository

This notebook determines how many features to hide in order to obtain the corresponding low- and high-uncertainty datasets used for running the
experiments on synthetic data generated with the SYNLABEL framework.

#### Creating different datasets

Two different classifiers are used as the ground truth model:
- Random Forest
- Logistic Regression

#### Process the results

Ensure the files run_mtvd_per_set_lr.py and run_mtvd_per_set_rf.py have been
executed, as their output is used here.

In [None]:
# Switches for this cell: whether to read the stored MTVD results from disk,
# and which ground-truth model's results to read ("lr" or "rf").
load_data = True
model_to_use = "lr"

if load_data:
    # folder inside the repository that holds the MTVD result files
    save_path = path_repository / "experiments" / "feature_hiding_mtvd"

    # TODO set to the datetime string of the result
    time_string_lr = "date_hh_mm_ss"
    time_string_rf = "date_hh_mm_ss"

    # file name of the result file for each supported model
    result_file_by_model = {
        "lr": f"{time_string_lr}_dataset_mtvd_lr.json",
        "rf": f"{time_string_rf}_dataset_mtvd_rf.json",
    }
    if model_to_use not in result_file_by_model:
        raise ValueError("Wrong model value")

    temp_path = result_file_by_model[model_to_use]
    complete_path = save_path / temp_path

    # parse the JSON result file into a dict
    with open(complete_path, "r") as file:
        loaded_dict = json.load(file)

In [None]:
def _closest_feature_count(mtvd_by_count, target):
    """Find the entry whose MTVD is nearest to ``target``.

    Args:
        mtvd_by_count: mapping from an int key (number of features excluded)
            to the MTVD value recorded for it. Must be non-empty.
        target: the MTVD value to approximate.

    Returns:
        Tuple ``(feature_count, distance)`` where ``distance`` is
        ``abs(target - mtvd_by_count[feature_count])``. On ties the entry that
        comes first in the mapping's iteration order is returned.
    """
    best_count = min(
        mtvd_by_count, key=lambda count: abs(target - mtvd_by_count[count])
    )
    return best_count, abs(target - mtvd_by_count[best_count])


# Per dataset: the two cutoffs (numbers of features to hide) and how far the
# MTVD at each cutoff lies from its target value.
feature_hide_dict = {}

for dataset, result_dict in loaded_dict.items():
    if not result_dict:
        raise ValueError(f"No MTVD results available for dataset {dataset!r}")

    feature_hide_dict[dataset] = {}

    # The loaded keys are converted to int; insertion order is kept so that
    # ties between equally close entries are resolved as before.
    mtvd_by_count = {int(key): value for key, value in result_dict.items()}
    int_keys = sorted(mtvd_by_count)

    # Targets at one third and two thirds of the observed MTVD range.
    min_mtvd = min(mtvd_by_count.values())
    max_mtvd = max(mtvd_by_count.values())
    delta_mtvd = max_mtvd - min_mtvd
    c1 = min_mtvd + ((1 / 3) * delta_mtvd)
    c2 = min_mtvd + ((2 / 3) * delta_mtvd)

    # determine the keys for which the MTVD is closest to c1 and c2
    cutoff_1, min_diff_c1 = _closest_feature_count(mtvd_by_count, c1)
    cutoff_2, min_diff_c2 = _closest_feature_count(mtvd_by_count, c2)

    if cutoff_1 == cutoff_2:
        # Both targets map to the same key: move cutoff_2 to the next larger
        # key and recompute its distance to c2 so the stored value matches
        # the cutoff that is actually reported.
        index_c1 = int_keys.index(cutoff_1)
        if index_c1 < len(int_keys) - 1:
            cutoff_2 = int_keys[index_c1 + 1]
            min_diff_c2 = abs(c2 - mtvd_by_count[cutoff_2])
        else:
            print("!!")
            print("couldn't find cutoff_2 for", dataset)

    feature_hide_dict[dataset]["cutoff_1"] = cutoff_1
    feature_hide_dict[dataset]["diff_c1"] = min_diff_c1
    feature_hide_dict[dataset]["cutoff_2"] = cutoff_2
    feature_hide_dict[dataset]["diff_c2"] = min_diff_c2

    fig, ax = plt.subplots()

    # dashed vertical lines mark the chosen cutoffs
    ax.vlines(cutoff_1, 0, 1, colors="gray", linestyles="dashed", linewidth=1)
    ax.vlines(cutoff_2, 0, 1, colors="gray", linestyles="dashed", linewidth=1)

    # horizontal lines mark the two target MTVD values
    for target in (c1, c2):
        ax.hlines(
            target,
            int_keys[0],
            int_keys[-1],
            colors="gray",
            linewidth=1,
            alpha=0.5,
        )

    # x and y are both taken in ascending key order so they stay paired
    ax.plot(int_keys, [mtvd_by_count[count] for count in int_keys], label=dataset)
    ax.set_title(dataset)
    ax.set_xlabel("Number of features excluded")
    ax.set_ylabel("MTVD (Y_G, Y_PG)")

    plt.show()
    # release the figure; one is created per dataset
    plt.close(fig)

# Average the stored "diff_c1" / "diff_c2" entries over all datasets.
avg_1 = 0
avg_2 = 0

for cutoff_info in feature_hide_dict.values():
    avg_1 = avg_1 + cutoff_info["diff_c1"]
    avg_2 = avg_2 + cutoff_info["diff_c2"]

n_datasets = len(feature_hide_dict)
avg_1 /= n_datasets
avg_2 /= n_datasets
print(f"avg_1: {avg_1}, avg_2: {avg_2}")

In [None]:
# Switch: write the computed cutoffs to a timestamped JSON file.
save_feature_hide_dict = True

if save_feature_hide_dict:
    # timestamp prefix so repeated runs do not overwrite each other
    time_string = datetime.datetime.now().strftime("%Y%m%d_%H_%M_%S")
    result_string = f"{time_string}_features_to_hide_{model_to_use}.json"

    # target folder inside the repository
    save_path = path_repository / "experiments" / "feature_hiding_mtvd"
    complete_path = save_path / result_string

    # serialise the cutoff dictionary as JSON
    with open(complete_path, "w") as outfile:
        json.dump(feature_hide_dict, outfile)