In [None]:
import pandas as pd
from jenga.corruptions.generic import CategoricalShift, SwappedValues
from jenga.corruptions.numerical import GaussianNoise, Scaling

from conformal_data_cleaning.config import error_fractions
from conformal_data_cleaning.data import (
    _ID_TO_TASK_TYPE,
    AVAILABLE_DATASETS,
    fetch_and_save_dataset,
    get_X_y_paths,
    read_dataset,
)
from conformal_data_cleaning.jenga_extension import get_OpenMLTask

## Download and split datasets

This downloads the datasets and splits them into [`data/training`](../data/training/) and [`data/test`](../data/test/).

In [None]:
# Fetch and store every benchmark dataset. A falsy return value from
# `fetch_and_save_dataset` is treated as a failed download; it is only reported,
# so the remaining datasets are still attempted.
for task_id in AVAILABLE_DATASETS:
    download_succeeded = fetch_and_save_dataset(task_id=task_id)
    if download_succeeded:
        continue

    print(f"Downloading dataset with ID {task_id:>5} failed.")

## Corrupt the datasets

Use `jenga` to create the corruptions. We train on clean data and simulate how corrupted data impacts the downstream task performance. For this, we load the test sets from [`data/test`](../data/test/) again, corrupt them, and save the results to [`data/corrupted/<corruption type>/<fraction of errors>`](../data/corrupted/) according to their corruption type and fraction of errors.

In [None]:
# Corruption classes applied to the test sets; every class is tried on every column.
corruptions = [Scaling, GaussianNoise, SwappedValues, CategoricalShift]

# One record per (corruption, fraction, dataset):
# (dataset_id, dataset_type, fraction, error_type, percent_errors).
errors_list = []

# NOTE(review): no random seed is set in this cell. If the `jenga` corruptions pick
# cells randomly, the written files differ between runs -- confirm and seed if needed.
for corruption in corruptions:
    # `corruption` is the class itself, so its name does not depend on an instance
    # having been created inside the column loop (a dataset without columns would
    # otherwise leave `corruption_instance` undefined or stale).
    corruption_name = corruption.__name__

    for fraction in error_fractions:
        for dataset_id in AVAILABLE_DATASETS:
            X, y = read_dataset(dataset_id, training=False)
            X_corrupted = X.copy()

            # Apply the corruption to one column after the other on the same frame.
            for column in X_corrupted.columns:
                corruption_instance = corruption(column=column, fraction=fraction)
                X_corrupted = corruption_instance.transform(X_corrupted)

            # NaN never compares equal to NaN, so a plain `!=` would count cells that
            # are missing in both frames as errors. Exclude those untouched cells.
            missing_in_both = X_corrupted.isna() & X.isna()
            changed_cells = (X_corrupted != X) & ~missing_in_both
            percent_errors = changed_cells.sum().sum() / X_corrupted.size

            errors_list.append(
                (
                    dataset_id,
                    _ID_TO_TASK_TYPE[dataset_id].value,
                    fraction,
                    corruption_name,
                    percent_errors,
                ),
            )

            # NOTE(review): the positional `False` presumably is the `training` flag,
            # mirroring `read_dataset(..., training=False)` above -- confirm.
            X_path, y_path = get_X_y_paths(
                dataset_id,
                False,
                corruption=corruption_name,
                fraction=fraction,
            )
            X_corrupted.to_csv(X_path, index=False)
            y.to_csv(y_path, index=False)

            # `is_file()` is already False for non-existing paths.
            assert X_path.is_file()
            assert y_path.is_file()

# Persist the measured error rates; the "Error statistics" section reads this file.
errors = pd.DataFrame(errors_list, columns=["dataset_id", "dataset_type", "fraction", "error_type", "percent_errors"])
errors.to_csv("../data/corrupted/error_statistics.csv", index=False)

## Test downloaded datasets

Since `jenga` tests whether or not the loaded data correspond to the given type, we create every `OpenMLTask` object once.

This should finish without an error and report 30 available datasets. They are a subset of the datasets used in this benchmark paper: [https://www.frontiersin.org/articles/10.3389/fdata.2021.693674/full](https://www.frontiersin.org/articles/10.3389/fdata.2021.693674/full).

- min 50k cells
- fewer columns are better

### Clean Data

In [None]:
# Build one `OpenMLTask` per clean dataset and tally how many tasks of each
# class were created.
number_of_types = {}
for task_id in AVAILABLE_DATASETS:
    open_ml_task = get_OpenMLTask(task_id=task_id)
    task_type_name = type(open_ml_task).__name__

    number_of_types.setdefault(task_type_name, 0)
    number_of_types[task_type_name] += 1

# Report the tally per task class, followed by the overall number of datasets.
for type_, number in number_of_types.items():
    print(f"- {number} {type_}")

total_number_of_datasets = sum(number_of_types.values())
print(f"=> in total {total_number_of_datasets} datasets.")
print()

### Corrupted Data

In [None]:
# Build an `OpenMLTask` for every (corruption, fraction, dataset) combination and
# count, per dataset, how many corrupted versions could be created.
how_many_corrupted_versions = {}
for corruption in corruptions:
    for fraction in error_fractions:
        for task_id in AVAILABLE_DATASETS:
            # The task object itself is not needed; creating it is the check.
            open_ml_task = get_OpenMLTask(task_id=task_id, corruption=corruption.__name__, fraction=fraction)
            how_many_corrupted_versions[task_id] = how_many_corrupted_versions.get(task_id, 0) + 1

# Every dataset must exist once per corruption type and error fraction. The expected
# count is derived from the configuration so the message cannot drift from the check.
expected_versions = len(corruptions) * len(error_fractions)
assert set(how_many_corrupted_versions.values()) == {expected_versions}, (
    "Something went wrong during corruption of the data. "
    f"Expecting {expected_versions} versions (with corruptions) of each dataset."
)

In [None]:
# Compare the column dtypes of every corrupted task with those of its clean
# counterpart; one boolean per (dataset, corruption, fraction) combination.
equal_dtypes = []
for task_id in AVAILABLE_DATASETS:
    # The clean reference depends only on `task_id`, so load it once per dataset
    # instead of once per corruption/fraction combination.
    open_ml_task_orig = get_OpenMLTask(task_id=task_id)
    train_dtypes_orig = open_ml_task_orig.train_data.dtypes.to_dict()
    test_dtypes_orig = open_ml_task_orig.test_data.dtypes.to_dict()

    for corruption in corruptions:
        for fraction in error_fractions:
            open_ml_task = get_OpenMLTask(task_id=task_id, corruption=corruption.__name__, fraction=fraction)

            # Both the training and the test split have to keep their dtypes.
            equal_dtypes.append(
                open_ml_task.train_data.dtypes.to_dict() == train_dtypes_orig
                and open_ml_task.test_data.dtypes.to_dict() == test_dtypes_orig,
            )

assert all(equal_dtypes), "Corruptions should not change the data types as this can cause downstream issues!"

## Error statistics

In [None]:
# Load the per-dataset error rates written by the corruption step.
error_statistics = pd.read_csv("../data/corrupted/error_statistics.csv")

# Difference between the measured share of changed cells and the requested fraction.
error_statistics["delta_error"] = error_statistics["percent_errors"] - error_statistics["fraction"]

error_statistics_grouped = error_statistics.groupby(["fraction", "error_type"])
# Select the columns of interest *before* aggregating: averaging every column would
# also try to average `dataset_type`, which fails on recent pandas versions if that
# column is not numeric.
mean_errors = error_statistics_grouped[["percent_errors", "delta_error"]].mean()

In [None]:
# Display the mean `percent_errors` and `delta_error` per (fraction, error_type) group.
mean_errors

Creating 30% `GaussianNoise` only leads to 30% changed values if all columns are numerical. If this is not the case, we will see lower `percent_errors`. This is why we apply `GaussianNoise` to each column if possible. However, it can only be applied to numerical columns.
Other corruptions have similar constraints: `SwappedValues`, for example, needs at least two columns of the same `dtype`, and `CategoricalShift` needs categorical columns.