In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from conformal_data_cleaning.evaluation.utils import normalize_improvement, normalize_performance

In [None]:
# Input and output locations, given as relative paths (resolved against the working directory).
figures_path = Path("../../plots")
dataset_descriptions_file = Path("../../data", "dataset_descriptions.csv")
processed_path = Path("../../processed")
results_file = processed_path.joinpath("final-experiments", "results_cache.csv")

In [None]:
# Use font type 42 for text in saved PDF figures.
plt.rcParams.update({"pdf.fonttype": 42})

# Global seaborn look: white grid background, "paper" context with enlarged fonts.
sns.set(style="whitegrid")
sns.set_context(context="paper", font_scale=1.5)

# Garf failed on the following datasets because of wrong dtypes: 4135, 251, 1200, 218, 1046

In [None]:
# Dataset metadata with inferred nullable dtypes, indexed by `task_id`.
dataset_descriptions = (
    pd.read_csv(dataset_descriptions_file)
    .convert_dtypes()
    .set_index("task_id")
)
# Index labels (task ids) of the rows whose `tabular` flag is set.
tabular_task_id = dataset_descriptions.loc[dataset_descriptions["tabular"]].index

# Parse the results cache a single time; every method below is a filtered view of it.
all_results = pd.read_csv(results_file).convert_dtypes()


def _select_cleaner_results(frame, cleaner, method):
    """Return the rows of `frame` produced by one cleaner, prepared for plotting.

    Parameters
    ----------
    frame : pd.DataFrame
        The full results table; must contain the columns `cleaner_type` and
        `confidence_level`.
    cleaner : str
        Value of `cleaner_type` to keep.
    method : str
        Display name stored in the new constant `method` column.

    Returns
    -------
    pd.DataFrame
        A new frame without `cleaner_type`, with `confidence_level` renamed to
        `hyperparameter` and the `method` column added.
    """
    return (
        frame.query("cleaner_type == @cleaner")
        .drop(columns="cleaner_type")
        .rename(columns={"confidence_level": "hyperparameter"})
        .assign(method=method)
    )


results = (
    pd.concat(
        [
            _select_cleaner_results(all_results, "ConformalAutoGluon", "CDC (ours)"),
            # Garf rows get a fixed hyperparameter of 0.5, overwriting whatever
            # `confidence_level` held for them.
            _select_cleaner_results(all_results, "Garf", "Garf").assign(hyperparameter=0.5),
            _select_cleaner_results(all_results, "AutoGluon", "ML"),
        ],
        axis=0,
        ignore_index=True,
    )
    .assign(
        # Bin the actual error fraction into [0, 0.1], (0.1, 0.2], ..., (0.4, 0.5] and keep
        # the interval as a plain string so it can serve as a grouping key / x-axis label.
        # NOTE(review): values outside [0, 0.5] presumably end up as the string "nan" - confirm.
        error_bins=lambda df: pd.cut(
            df["actual_error_fraction"],
            [x / 10 for x in range(6)],
            include_lowest=True,
        ).astype(str),
        method=lambda df: df["method"].astype("category"),
        hyperparameter=lambda df: df["hyperparameter"].astype("category"),
        # Combined label, e.g. "<method>, <hyperparameter>".
        method_hyperparameter=lambda df: df["method"].astype(str) + ", " + df["hyperparameter"].astype(str),
    )
    # Join the per-task outputs of the project's normalisation helpers onto the results.
    .pipe(lambda df: df.join(df.groupby("task_id").apply(normalize_performance)))
    .pipe(lambda df: df.join(df.groupby("task_id").apply(normalize_improvement)))
)

# rel_set_size_quantiles[q][hyperparameter] is the q-quantile of
# `relative_average_set_size__mean` over all result rows with that hyperparameter.
rel_set_size_quantiles = {}
for q in (0.2, 0.8):
    quantile_per_hyperparameter = (
        results.groupby(["hyperparameter"])[["relative_average_set_size__mean"]]
        .agg(lambda x: x.quantile(q))
        .reset_index()
    )
    rel_set_size_quantiles[q] = {
        row["hyperparameter"]: row["relative_average_set_size__mean"]
        for row in quantile_per_hyperparameter.to_dict("records")
    }

def _set_size_group(rel_set_size, hyperparameter):
    """Classify one experiment by its relative set size within its hyperparameter.

    Missing set sizes are passed through unchanged. Otherwise the value is
    compared against the 0.2 and 0.8 quantiles stored in `rel_set_size_quantiles`
    for the given hyperparameter: below the 0.2 quantile is "top", below the 0.8
    quantile is "middle", everything else is "worst".
    """
    if pd.isna(rel_set_size):
        return rel_set_size
    if rel_set_size < rel_set_size_quantiles[0.2][hyperparameter]:
        return "top"
    if rel_set_size < rel_set_size_quantiles[0.8][hyperparameter]:
        return "middle"
    return "worst"


results = results.assign(
    set_size_groups=lambda df: [
        _set_size_group(rel_set_size, hyperparameter)
        for rel_set_size, hyperparameter in df[["relative_average_set_size__mean", "hyperparameter"]].to_numpy()
    ],
)

In [None]:
# Identifier columns (presumably one experiment setting per combination - confirm).
id_columns = [
    "task_id",
    "error_type",
    "error_fraction",
    "actual_error_fraction",
]

# Dataset Statistics

In [None]:
# Load the error statistics of the corrupted data and show summary statistics of its numeric columns.
error_statistics_file = Path("../../data") / "corrupted" / "error_statistics.csv"
error_statistics = pd.read_csv(error_statistics_file)
error_statistics.describe()

# Outlier Detection TPR vs. FPR

In [None]:
id_columns = ["error_bins"]
metric_columns = ["error_detection_fraction__mean", "error_wrong_detection_fraction__mean"]

# Facet titles for the two metrics; the dict order is used as the left-to-right facet order.
metric_labels = {
    "error_detection_fraction__mean": "Outlier Detection TPR",
    "error_wrong_detection_fraction__mean": "Outlier Detection FPR",
}

# Mean of both metrics per (error bin, hyperparameter, method), reshaped to long format
# so that each metric becomes one facet.
data = (
    results.groupby(id_columns + ["hyperparameter", "method"])[metric_columns]
    .mean()
    .reset_index()
    .melt(
        id_vars=id_columns + ["method", "hyperparameter"],
        value_vars=metric_columns,
    )
    .assign(
        # Relabel first and build the categorical afterwards: calling `replace` on
        # categorical columns is deprecated in pandas and will stop renaming categories.
        variable=lambda df: pd.Categorical(
            df["variable"].map(metric_labels),
            categories=list(metric_labels.values()),
        ),
    )
    .rename(columns={"hyperparameter": "Hyperparameter", "method": "Method", "value": "mean"})
)


plot: sns.FacetGrid = sns.relplot(
    x="error_bins",
    y="mean",
    hue="Hyperparameter",
    style="Method",
    col="variable",
    col_order=list(metric_labels.values()),
    kind="line",
    # Rows with a missing mean are not drawn.
    data=data.dropna(),
)

# Both metrics are fractions, so pin every facet to the full [0, 1] range.
plot.set(ylim=(0, 1))

plot.set_xticklabels(["[0-10]", "(10-20]", "(20-30]", "(30-40]", "(40-50]"])
plot.set_xlabels(r"Error Fraction ($\%$)")
plot.set_ylabels("")
plot.set_titles(col_template="{col_name}")

plot.tight_layout()
plot.savefig(figures_path / "outlier_detection_TPR_vs_FPR_plot.pdf", bbox_inches="tight")
plt.show()

# Cleaning Performance

In [None]:
id_columns = ["error_bins"]
metric_columns = [
    "cleaned_performance__mean_normalized",
    "improvement_in_percent__mean_normalized",
    "cleaned_performance__mean",
    "improvement_in_percent__mean",
]

# Median of every metric per (error bin, hyperparameter, method).
# `dropna()` looks at all four metric columns, so a row missing any of them is not plotted.
median_metrics = (
    results.groupby([*id_columns, "hyperparameter", "method"])[metric_columns]
    .median()
    .reset_index()
    .rename(columns={"hyperparameter": "Hyperparameter", "method": "Method"})
    .dropna()
)

plot = sns.relplot(
    data=median_metrics,
    x="error_bins",
    y="improvement_in_percent__mean",
    hue="Hyperparameter",
    style="Method",
    kind="line",
    facet_kws={"sharey": False, "sharex": False},
)

# Human-readable percentage ranges in place of the raw interval strings.
error_bin_labels = ["[0-10]", "(10-20]", "(20-30]", "(30-40]", "(40-50]"]
plot.set_xticklabels(error_bin_labels)
plot.set_xlabels(r"Error Fraction ($\%$)")
plot.set_ylabels(r"Downstream Improvement ($\%$)")

plot.tight_layout()
plot.savefig(figures_path / "performance_improvement_plot.pdf", bbox_inches="tight")
plt.show()

In [None]:
# All results except Garf's; the emptied "Garf" category is removed from `method`
# before the column is used for faceting.
box_plot_data = (
    results.query('method != "Garf"')
    .assign(method=lambda df: df["method"].cat.remove_unused_categories())
    .rename(columns={"hyperparameter": "Hyperparameter", "method": "Method"})
)

plot = sns.catplot(
    data=box_plot_data,
    x="error_bins",
    y="improvement_in_percent__mean",
    hue="Hyperparameter",
    col="Method",
    kind="box",
)

# Restrict the y-axis to [-100, 100]; anything outside is cut off from view.
plot.set(ylim=(-100, 100))
error_bin_labels = ["[0-10]", "(10-20]", "(20-30]", "(30-40]", "(40-50]"]
plot.set_xticklabels(error_bin_labels)
plot.set_xlabels(r"Error Fraction ($\%$)")
plot.set_ylabels(r"Downstream Improvement ($\%$)")

plot.tight_layout()
plot.savefig(figures_path / "performance_improvement_box_plot.pdf", bbox_inches="tight")
plt.show()

In [None]:
id_columns = ["error_bins"]
metric_columns = ["improvement_in_percent__mean"]

# Facet title for each set-size group; the dict order is the left-to-right facet order.
# Keeping the labels in one place guarantees `col_order` and the relabelling agree.
set_size_group_labels = {
    "top": r"$20\%$ Easiest Experiments",
    "middle": r"$20 - 80\%$ Moderately Difficult Experiments",
    "worst": r"$20\%$ Difficult Experiments",
}

plot = sns.relplot(
    x="error_bins",
    y="improvement_in_percent__mean",
    hue="Hyperparameter",
    col="set_size_groups",
    col_order=list(set_size_group_labels.values()),
    kind="line",
    facet_kws={"sharey": False, "sharex": False},
    data=(
        # Median improvement per (error bin, hyperparameter, method, set-size group).
        results.groupby(id_columns + ["hyperparameter", "method", "set_size_groups"])[metric_columns]
        .median()
        .reset_index()
        .rename(columns={"hyperparameter": "Hyperparameter", "method": "Method"})
        .replace(set_size_group_labels)
        .dropna()
    ),
)

plot.set_xticklabels(["[0-10]", "(10-20]", "(20-30]", "(30-40]", "(40-50]"])
plot.set_xlabels(r"Error Fraction ($\%$)")
plot.set_ylabels(r"Downstream Improvement ($\%$)")
plot.set_titles(col_template="{col_name}")

# Adjust the layout *before* saving so the PDF matches what is shown,
# in the same order as the other figure cells.
plot.tight_layout()
plot.savefig(figures_path / "confidence_set_size_performance_improvement_plot.pdf", bbox_inches="tight")
plt.show()