In [None]:
from pathlib import Path

import pandas as pd

from conformal_data_cleaning.evaluation.utils import (
    calculate_median_percent_error_detection,
    calculate_median_percent_error_wrong_detection,
)

In [None]:
# Directory with the processed artifacts, given relative to the working directory.
processed_path = Path("../..") / "processed"
# Results CSV stored in the `final-experiments` subdirectory of `processed_path`.
results_file = processed_path.joinpath("final-experiments", "results_cache.csv")

In [None]:
def _select_cleaner_results(all_results: pd.DataFrame, cleaner_name: str) -> pd.DataFrame:
    """Return the rows of ``all_results`` that belong to a single cleaner type.

    Parameters
    ----------
    all_results
        Frame containing at least the columns ``cleaner_type`` and
        ``confidence_level``.
    cleaner_name
        Value of the ``cleaner_type`` column to keep.

    Returns
    -------
    pd.DataFrame
        An independent copy of the matching rows, with the now-constant
        ``cleaner_type`` column dropped and ``confidence_level`` renamed to
        ``hyperparameter``. The original row index is preserved.
    """
    return (
        all_results.query("cleaner_type == @cleaner_name")
        .copy()
        .drop(columns="cleaner_type")
        .rename(columns={"confidence_level": "hyperparameter"})
    )


# Read and dtype-convert the results file once; the three per-cleaner frames
# below are all slices of this single table.
_all_results: pd.DataFrame = pd.read_csv(results_file).convert_dtypes()

results: pd.DataFrame = _select_cleaner_results(_all_results, "ConformalAutoGluon")
baseline_garf_results: pd.DataFrame = _select_cleaner_results(_all_results, "Garf")
baseline_ml_results: pd.DataFrame = _select_cleaner_results(_all_results, "AutoGluon")

# Error Detection - True Positive Rate (TPR)

Each value is the `median` over all experiments for a given model-hyperparameter-task combination.

In [None]:
apply_function = calculate_median_percent_error_detection

# Result frames keyed by the top-level column label they get in the combined table.
results_by_cleaner = {
    "baseline_garf": baseline_garf_results,
    "baseline_ml": baseline_ml_results,
    "conformal_cleaning": results,
}

# For every cleaner: group by task, then by hyperparameter inside each task, and
# reduce each group with `apply_function`. The per-cleaner tables are then placed
# side by side and the rows are sorted (ascending) by the conformal cleaner's
# 0.999 column.
median_percent_error_detection = pd.concat(
    {
        label: frame.groupby("task_id").apply(
            lambda task_frame: task_frame.groupby("hyperparameter").apply(apply_function),
        )
        for label, frame in results_by_cleaner.items()
    },
    axis="columns",
).sort_values(("conformal_cleaning", 0.999))

# Display with a red-to-green gradient whose colour scale is clamped to [50, 100].
(
    median_percent_error_detection.style
    .background_gradient(cmap="RdYlGn", vmin=50, vmax=100, axis=1)
    .format(precision=2)
)

# Error Detection - False Positive Rate (FPR)

Each value is the `median` over all experiments for a given model-hyperparameter-task combination.

In [None]:
apply_function = calculate_median_percent_error_wrong_detection

# Result frames keyed by the top-level column label they get in the combined table.
results_by_cleaner = {
    "baseline_garf": baseline_garf_results,
    "baseline_ml": baseline_ml_results,
    "conformal_cleaning": results,
}

# For every cleaner: group by task, then by hyperparameter inside each task, and
# reduce each group with `apply_function`. The per-cleaner tables are then placed
# side by side and the rows are sorted (descending) by the conformal cleaner's
# 0.999 column.
median_percent_error_wrong_detection = pd.concat(
    {
        label: frame.groupby("task_id").apply(
            lambda task_frame: task_frame.groupby("hyperparameter").apply(apply_function),
        )
        for label, frame in results_by_cleaner.items()
    },
    axis="columns",
).sort_values(("conformal_cleaning", 0.999), ascending=False)

# Display with a reversed (green-to-red) gradient whose colour scale is clamped to [0, 20].
(
    median_percent_error_wrong_detection.style
    .background_gradient(cmap="RdYlGn_r", vmin=0, vmax=20, axis=1)
    .format(precision=2)
)

In [None]:
# Collapse each per-task table to one value per column by taking the median
# over its rows, and give the resulting column the name of its source table.
detection_summary = median_percent_error_detection.median(axis=0).to_frame(
    "median_percent_error_detection",
)
wrong_detection_summary = median_percent_error_wrong_detection.median(axis=0).to_frame(
    "median_percent_error_wrong_detection",
)

# Put both summaries side by side, aligned on their shared index.
detection_overview = pd.concat([detection_summary, wrong_detection_summary], axis=1)

# Mark the largest detection value and the smallest wrong-detection value in green.
(
    detection_overview.style
    .highlight_max(color="green", axis=0, subset="median_percent_error_detection")
    .highlight_min(color="green", axis=0, subset="median_percent_error_wrong_detection")
)