In [2]:
import pandas as pd
import numpy as np
from src.reproduce_utils import Results

In [3]:
def get_result(methods, metrics, dataset_names, result_df):
    headers = ["metric", "dataset"]
    indices = [
        "method",
        "seed",
    ]
    columns = pd.MultiIndex.from_product([metrics, dataset_names], names=headers)
    index = pd.MultiIndex.from_product([methods, [1]], names=indices)
    df = pd.DataFrame(columns=columns, index=index)
    df.sort_index(inplace=True)
    for index, row in result_df.iterrows():
        for method in methods:
            if int(row["dataset_id"]) not in dataset_names:
                continue
            if "logistic" in method:
                score_method = "linear"
            else:
                score_method = method
            row_id = (method, 1)
            col = ("acc", row["dataset_id"])
            df.loc[row_id, col] = row[f"score_{score_method}"]
    return Results(df=df)

In [4]:
def get_average_rank_table(results: Results):
    datasets = results.datasets
    metrics = sorted(results.metrics, reverse=True)
    # print(results.methods)
    df = results.df
    results_rank = {}
    results_score = {}
    for metric in metrics:
        if "time" in metric:
            continue
        metric_df = df[metric]
        dataset_rank_dfs = []
        dataset_mean_dfs = []
        for dataset in datasets:
            if dataset not in metric_df.columns:
                continue
            dataset_rank_df = metric_df[dataset].groupby('method').mean().rank(ascending=False)
            dataset_rank_dfs.append(dataset_rank_df)
            dataset_mean_dfs.append(metric_df[dataset])

        results_rank[metric] = pd.concat(dataset_rank_dfs).groupby("method").mean()
        
        results_score[metric] = pd.concat(dataset_mean_dfs).groupby("method").mean()
    score_df = pd.DataFrame(results_score).reset_index()
    rank_df = pd.DataFrame(results_rank).reset_index()
    final_table = rank_df.merge(score_df, on="method", suffixes=[" Mean Rank", " Mean Score"]).T
    final_table.columns = final_table.iloc[0]
    final_table = final_table.iloc[1:]
    return final_table

def pprint(df):
    for column in df:
        df[column] = df[column].astype('float').round(decimals=4)

    print(df.to_markdown())

In [5]:
methods = ["rf", "tree", "mlp", "hgbt", "logistic"]
metrics = ["acc"]



In [6]:
too_easy_result_df = pd.read_csv("too_easy_without_resnet.csv").drop(["Unnamed: 0"], axis=1)
too_easy_dataset_names = too_easy_result_df["dataset_id"].astype(int).copy().to_list()
too_easy_result = get_result(methods, metrics, too_easy_dataset_names, too_easy_result_df)
too_easy_all_datasets_ranks = get_average_rank_table(too_easy_result)

selected_result_df = pd.read_csv("benchmark_without_resnet.csv").drop(["Unnamed: 0"], axis=1)
selected_dataset_names = selected_result_df["dataset_id"].astype(int).copy().to_list()
selected_result = get_result(methods, metrics, selected_dataset_names, selected_result_df)
selected_all_datasets_ranks = get_average_rank_table(selected_result)

In [7]:
print(f"\n\n\nToo easy ({len(too_easy_dataset_names)} datasets from latest csv)")
pprint(too_easy_all_datasets_ranks)
print("\n\n\ngrinzstjan et al")
pprint(selected_all_datasets_ranks)




Too easy (65 datasets from latest csv)
|                |   hgbt |   logistic |    mlp |     rf |   tree |
|:---------------|-------:|-----------:|-------:|-------:|-------:|
| acc Mean Rank  | 2.1231 |     3.0769 | 2.6308 | 2.5154 | 4.6538 |
| acc Mean Score | 0.8312 |     0.8194 | 0.825  | 0.8256 | 0.7783 |



grinzstjan et al
|                |   hgbt |   logistic |    mlp |     rf |   tree |
|:---------------|-------:|-----------:|-------:|-------:|-------:|
| acc Mean Rank  | 1.5238 |     4.6667 | 3.1429 | 1.619  | 4.0476 |
| acc Mean Score | 0.8025 |     0.7167 | 0.7714 | 0.8045 | 0.7478 |


In [8]:
all_result = Results(pd.concat([too_easy_result.df, selected_result.df]))
all_result_ranks = get_average_rank_table(all_result)
print(f"\nToo easy ({len(too_easy_dataset_names)} datasets from latest csv) + grinzstjan et al")
pprint(all_result_ranks)


Too easy (65 datasets from latest csv) + grinzstjan et al
|                |   hgbt |   logistic |    mlp |     rf |   tree |
|:---------------|-------:|-----------:|-------:|-------:|-------:|
| acc Mean Rank  | 1.9767 |     3.4651 | 2.7558 | 2.2965 | 4.5058 |
| acc Mean Score | 0.8242 |     0.7943 | 0.8119 | 0.8205 | 0.7708 |


In [9]:
too_easy_result_df.to_csv("")

FileNotFoundError: [Errno 2] No such file or directory: ''

In [11]:
df = pd.read_csv("Datasets tabular data benchmark - Feuille 12.csv")

In [12]:
too_easy_result_df = pd.read_csv("too_easy_without_resnet.csv").drop(["Unnamed: 0"], axis=1)
too_easy_dataset_names = too_easy_result_df["dataset_id"].copy().to_list()
too_easy_result = get_result(methods, metrics, too_easy_dataset_names, too_easy_result_df)
too_easy_all_datasets_ranks = get_average_rank_table(too_easy_result)
too_easy_dids = []
for index, row in df.iterrows():
    try:
        if not pd.isnull(row["dataset_id"]) and row["Remove"] != 1 and (row["too_easy"] == 1 or args.all) and row["Redundant"] != 1:
            prefix_to_skip = ["BNG", "RandomRBF", "GTSRB", "CovPokElec", "PCam"]
            if not (np.any([row["dataset_name"].startswith(prefix) for prefix in
                            prefix_to_skip]) or "mnist" in row["dataset_name"].lower() or "image" in row[
                        "dataset_name"].lower() or "cifar" in row["dataset_name"].lower() or row["dataset_id"] == 1414):
                        too_easy_dids.append(int(row["dataset_id"]))
    except:
        continue
    

In [13]:
len(too_easy_dids)

65

In [14]:
len(too_easy_dataset_names)

65

In [10]:
full_datasets_info = pd.read_csv("Datasets tabular data benchmark - categorical_classif-2.csv").fillna(0)

real_too_easy_dataset_ids = []
for index, row in full_datasets_info.iterrows():
    if int(row["too_easy"]) and int(row["dataset_id"]) != 0:
        real_too_easy_dataset_ids.append(int(row["dataset_id"]))

In [11]:
real_too_easy_dataset_ids


[24,
 26,
 154,
 179,
 274,
 350,
 720,
 881,
 923,
 959,
 981,
 993,
 1110,
 1112,
 1113,
 1119,
 1169,
 1240,
 1461,
 1486,
 1503,
 1568,
 1590,
 4534,
 4541,
 40517,
 40672,
 40997,
 40998,
 41000,
 41002,
 41003,
 41006,
 41147,
 41162,
 41440,
 41672,
 42132,
 42192,
 42193,
 42206,
 42343,
 42344,
 42345,
 42477,
 42493,
 42732,
 42734,
 42742,
 42746,
 42750,
 43044,
 43439,
 43489,
 43607,
 43890,
 43892,
 43898,
 43903,
 43904,
 43920,
 43922,
 43923,
 43938]

In [12]:
import openml
#openml.config.apikey = 'FILL_IN_OPENML_API_KEY'  # set the OpenML Api Key
# SUITE_ID = 297 # Regression on numerical features
# SUITE_ID = 298 # Classification on numerical features
# SUITE_ID = 299 # Regression on numerical and categorical features
SUITE_ID = 304 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite
for task_id in benchmark_suite.tasks:  # iterate over all tasks
    task = openml.tasks.get_task(task_id, download_data=False)  # download the OpenML task
    # dataset = task.dataset_id(download_data=False)
    print(task.dataset_id)
    # X, y, categorical_indicator, attribute_names = dataset.get_data(
    #     dataset_format="dataframe", target=dataset.default_target_attribute
    # )

44156
44157
44159
44160
44161
44162
44186
