In [None]:
from pathlib import Path

import pandas as pd

from conformal_data_cleaning.data import _ID_TO_TASK_TYPE
from conformal_data_cleaning.evaluation.utils import calculate_percent_improvements

In [None]:
# Locations are relative to the notebook's working directory.
processed_path = Path("../../processed")
# CSV cache of the final experiment results, loaded in the next cell.
results_file = processed_path.joinpath("final-experiments", "results_cache.csv")

In [None]:
def _results_for_cleaner(all_results: pd.DataFrame, cleaner_type: str) -> pd.DataFrame:
    """Return the rows of ``all_results`` that belong to one cleaner.

    Args:
        all_results: Frame holding the results of every cleaner; must contain
            the columns ``cleaner_type`` and ``confidence_level``.
        cleaner_type: Value of the ``cleaner_type`` column to keep.

    Returns:
        A new frame with the matching rows, without the ``cleaner_type`` column
        and with ``confidence_level`` renamed to ``hyperparameter``.
    """
    return (
        all_results.query("cleaner_type == @cleaner_type")
        .copy()
        .drop(columns="cleaner_type")
        .rename(columns={"confidence_level": "hyperparameter"})
    )


# Read and dtype-convert the cache once instead of once per cleaner.
all_results: pd.DataFrame = pd.read_csv(results_file).convert_dtypes()

results: pd.DataFrame = _results_for_cleaner(all_results, "ConformalAutoGluon")
baseline_garf_results: pd.DataFrame = _results_for_cleaner(all_results, "Garf")
baseline_ml_results: pd.DataFrame = _results_for_cleaner(all_results, "AutoGluon")

# How often (in %) does cleaning improve the downstream performance?


## Additionally Group by Downstream Task

In [None]:
# Project helper applied to every (task type, hyperparameter) group.
# NOTE(review): presumably returns the percentage of runs in which cleaning
# improved downstream performance; confirm in conformal_data_cleaning.evaluation.utils.
apply_function = calculate_percent_improvements

# Cleaning methods in display order; each becomes one top-level column group.
results_by_method = {
    "baseline_garf": baseline_garf_results,
    "baseline_ml": baseline_ml_results,
    "conformal_cleaning": results,
}

# The outer groupby splits by task type, the inner one by hyperparameter;
# the per-method tables are then placed side by side.
improvements_per_task_type = pd.concat(
    {
        method: method_results.groupby("task_type").apply(
            lambda task_group: task_group.groupby("hyperparameter").apply(apply_function),
        )
        for method, method_results in results_by_method.items()
    },
    axis="columns",
)

# Highlight the largest value of every row.
improvements_per_task_type.style.highlight_max(axis=1, color="green")

## Group By Task ID

In [None]:
# Project helper applied to every (task ID, hyperparameter) group.
apply_function = calculate_percent_improvements

# Cleaning methods in display order; each becomes one top-level column group.
results_by_method = {
    "baseline_garf": baseline_garf_results,
    "baseline_ml": baseline_ml_results,
    "conformal_cleaning": results,
}

# The outer groupby splits by task ID, the inner one by hyperparameter;
# the per-method tables are then placed side by side.
improvements_per_task = pd.concat(
    {
        method: method_results.groupby("task_id").apply(
            lambda task_group: task_group.groupby("hyperparameter").apply(apply_function),
        )
        for method, method_results in results_by_method.items()
    },
    axis="columns",
)

(
    # Order the rows by the ML baseline's column for hyperparameter 0.5.
    improvements_per_task.sort_values(("baseline_ml", 0.5))
    # Look up each task's type from its ID (the row index) for reference.
    .assign(task_type=lambda df: [_ID_TO_TASK_TYPE[task_id].value for task_id in df.index])
    # Colour scale is pinned to 0..100 and applied row-wise.
    .style.background_gradient(cmap="RdYlGn", vmin=0, vmax=100, axis=1)
    .format(precision=2)
)

# Quantiles over 'Percentage Improvements per Dataset'

In general, a quantile states what fraction of the values is less than or equal to a given value ($\hat{y}$).
Here, it shows for what fraction of the datasets (the *quantile*) applying cleaning increases the downstream performance in at most $\hat{y}$% of the cases (error type and fraction combinations). Vice versa, in $1 - \text{quantile}$ of the datasets we can expect an improvement in at least $\hat{y}$% of the cases.

In [None]:
# Project helper applied to every (task ID, hyperparameter) group.
apply_function = calculate_percent_improvements

# Cleaning methods in display order; each becomes one top-level column group.
results_by_method = {
    "baseline_garf": baseline_garf_results,
    "baseline_ml": baseline_ml_results,
    "conformal_cleaning": results,
}

# The outer groupby splits by task ID, the inner one by hyperparameter;
# the per-method tables are then placed side by side.
improvements_per_task = pd.concat(
    {
        method: method_results.groupby("task_id").apply(
            lambda task_group: task_group.groupby("hyperparameter").apply(apply_function),
        )
        for method, method_results in results_by_method.items()
    },
    axis="columns",
)

# Quantiles do not depend on row order, so the rows are not sorted first.
# This also removes the need for a ("baseline_ml", 0.5) column to exist.
(
    improvements_per_task.quantile((0.25, 0.5, 0.75))
    # Highlight the largest value within each quantile row.
    .style.highlight_max(color="green", axis=1)
)