In [None]:
import pandas as pd

from conformal_data_cleaning.data import _ID_TO_TASK_TYPE, AVAILABLE_DATASETS
from conformal_data_cleaning.jenga_extension import get_OpenMLTask

In [None]:
# Column headers for the overview table. The "#" is written pre-escaped ("\#")
# because this frame is rendered to LaTeX further down in the notebook.
statistics_columns = ["ID", "Task Type", r"\#Cat.", r"\#Num.", r"\#Obs.", r"\#Cells"]

# One tuple per OpenML task, in the same order as `statistics_columns`.
statistics_rows = []

for task_id in AVAILABLE_DATASETS:
    task = get_OpenMLTask(task_id=task_id)

    # Stack train and test splits so the counts describe the whole dataset.
    all_rows = pd.concat([task.train_data, task.test_data])

    row = (
        task_id,
        _ID_TO_TASK_TYPE[task_id].value,
        len(task.categorical_columns),
        len(task.numerical_columns),
        len(all_rows),  # number of observations (rows)
        all_rows.size,  # number of cells (rows x columns)
    )
    statistics_rows.append(row)

unsorted_statistics = pd.DataFrame(statistics_rows, columns=statistics_columns)

# Group the table by task type, then order by OpenML id within each group.
dataset_statistics = unsorted_statistics.sort_values(["Task Type", "ID"])

## Removing Datasets

Only datasets that qualify as tabular are kept, following the definition in:
```
Grinsztajn, L., Oyallon, E., & Varoquaux, G. (2022). Why do tree-based models still outperform deep learning on tabular data? ArXiv, abs/2207.08815.
```

In [None]:
# Per-task metadata, indexed by task id. `convert_dtypes()` lets pandas pick
# its NA-aware extension dtypes for each column.
dataset_descriptions = (
    pd.read_csv("../data/dataset_descriptions.csv")
    .convert_dtypes()
    .set_index("task_id")
)

# `tabular` is used directly as a row mask -- assumes it is a boolean column; TODO confirm.
tabular_task_id = dataset_descriptions.loc[dataset_descriptions["tabular"]].index

# Task ids that get `Garf == True`.
# NOTE(review): the LaTeX formatter later renders True as \ding{55}, so these are
# presumably the datasets Garf could NOT clean -- confirm against the experiments.
garf_task_ids = [4135, 251, 1200, 218, 1046]

# Keep only the tabular tasks, then add the Garf flag as a new column.
tabular_statistics = dataset_statistics[dataset_statistics["ID"].isin(tabular_task_id)]
dataset_statistics = tabular_statistics.assign(
    Garf=tabular_statistics["ID"].isin(garf_task_ids),
)

In [None]:
# Table caption, split across lines for readability; the adjacent raw strings
# are concatenated into a single caption string.
caption = (
    r"Datasets overview. "
    r"\emph{ID} is the assigned OpenML id, "
    r"\emph{\#} means the number of, "
    r"\emph{Cat.} and \emph{Num.} stand for categorical and numerical columns, "
    r"and \emph{Obs.} means observations, i.e., the number of rows of the tabular dataset. "
    r"\emph{Garf} shows whether or not Garf was able to clean the dataset."
)


def _math_with_thousands(value):
    """Render a number in LaTeX math mode with comma thousands separators."""
    return f"${value:,}$"


def _math_plain(value):
    """Render a value in LaTeX math mode without any digit grouping."""
    return f"${value}$"


def _garf_symbol(flag):
    """Map the boolean Garf flag to a \\ding symbol.

    NOTE(review): True maps to \\ding{55} and False to \\ding{51}; with pifont these
    are presumably a cross and a check mark, i.e. True means Garf failed -- confirm.
    """
    if flag:
        return "\\ding{55}"
    return "\\ding{51}"


# Per-column cell formatters handed to the Styler.
formatters = {
    r"\#Obs.": _math_with_thousands,
    r"\#Cells": _math_with_thousands,
    r"\#Cat.": _math_plain,
    r"\#Num.": _math_plain,
    "ID": _math_plain,
    "Garf": _garf_symbol,
}

# Human-readable labels for the raw task-type values.
task_type_labels = {"multi_class": "Multi Class", "regression": "Regression", "binary": "Binary"}

labelled_statistics = dataset_statistics.replace(task_type_labels)
statistics_styler = labelled_statistics.style.format(formatters).hide(axis="index")
latex_table = statistics_styler.to_latex(
    caption=caption,
    label="tab:datasets",
    position="h",
)

# Printed (rather than displayed) so the raw LaTeX source appears in the output.
print(latex_table)