In [21]:
from src.reproduce_utils import Results
import pandas as pd

In [22]:
def get_result(methods, metrics, dataset_names, result_df):
    """Arrange per-dataset scores into a (method, seed) x (metric, dataset) table.

    Args:
        methods: Method names used as row labels. A name containing
            "logistic" reads its scores from the ``score_linear`` column;
            every other name reads ``score_<method>``.
        metrics: Metric names forming the first column level. Scores are
            always written under the "acc" metric.
        dataset_names: Dataset ids to keep; rows of ``result_df`` whose
            ``dataset_id`` is not listed are skipped.
        result_df: Frame with a ``dataset_id`` column and one
            ``score_<...>`` column per requested method.

    Returns:
        ``Results(df=...)`` wrapping the filled frame. Every row uses seed 1.

    Raises:
        KeyError: If a required score column is missing from ``result_df``.
    """
    columns = pd.MultiIndex.from_product([metrics, dataset_names], names=["metric", "dataset"])
    rows = pd.MultiIndex.from_product([methods, [1]], names=["method", "seed"])
    df = pd.DataFrame(columns=columns, index=rows)
    df.sort_index(inplace=True)

    # Resolve each method's score column once instead of once per row.
    score_columns = {
        method: "score_linear" if "logistic" in method else f"score_{method}"
        for method in methods
    }

    for _, row in result_df.iterrows():
        # iterrows() does not preserve dtypes (the id may arrive as a float or
        # a string), so normalise it once and use the same int both for the
        # membership test and for the column key.
        dataset_id = int(row["dataset_id"])
        if dataset_id not in dataset_names:
            continue
        for method, score_column in score_columns.items():
            df.loc[(method, 1), ("acc", dataset_id)] = row[score_column]
    return Results(df=df)

In [23]:
# autopytorch_cocktails_random_df = pd.read_csv('csv_files/cocktails_random_refit_results.csv', index_col=0)
# autopytorch_cocktails_random_df['dataset_id'] = autopytorch_cocktails_random_df['dataset_id'].astype(int).replace(dict(zip(final_benchmark_dataset_ids, selected_datasets)))
# autopytorch_cocktails_random_df = autopytorch_cocktails_random_df.set_index('dataset_id')
# all_my_results['score_cocktails_random'] = autopytorch_cocktails_random_df['test_score']


In [24]:
# Load the per-dataset scores of every method, indexed and sorted by dataset_id.
all_my_results = (
    pd.read_csv("csv_files/final_all_my_results.csv", index_col=None)
    .set_index("dataset_id")
    .sort_index()
)


In [25]:
# cocktails_default_df = pd.read_csv("csv_files/cocktails_default_refit_results.csv", index_col=None).set_index('dataset_id')
# all_my_results = pd.read_csv("csv_files/all_results_with_cocktail_random.csv", index_col="dataset_id").sort_index()
# all_my_results["score_cocktails_default"] = cocktails_default_df['test_score']
# all_my_results["score_autopytorch_default"] = scores_autopytorch_default['autopytorch_master_default']
# all_my_results["score_cocktails"] = cocktails_df['test_score']
# all_my_results.columns
# all_my_results = all_my_results.drop("score_autopytorch_default", axis=1)

In [26]:
def get_average_rank_table(results: Results):
    """Summarise each method's mean rank and mean score per metric.

    Reads ``results.df`` (rows indexed by a level named "method", columns
    keyed by metric then dataset), ``results.metrics`` and
    ``results.datasets``. Metrics whose name contains "time" are skipped, and
    datasets missing from a metric's columns are ignored.

    For every dataset the per-method mean is ranked in descending order
    (highest score gets rank 1); ranks and raw scores are then averaged per
    method across datasets.

    Returns:
        A frame with one column per method and, for each metric, the rows
        "<METRIC> Mean Rank" and "<METRIC> Mean Score".
    """
    frame = results.df
    rank_by_metric = {}
    score_by_metric = {}

    for metric_name in sorted(results.metrics, reverse=True):
        if "time" in metric_name:
            continue
        per_metric = frame[metric_name]
        present = [name for name in results.datasets if name in per_metric.columns]

        per_dataset_ranks = [
            per_metric[name].groupby('method').mean().rank(ascending=False)
            for name in present
        ]
        per_dataset_scores = [per_metric[name] for name in present]

        label = metric_name.upper()
        rank_by_metric[label] = pd.concat(per_dataset_ranks).groupby("method").mean()
        score_by_metric[label] = pd.concat(per_dataset_scores).groupby("method").mean()

    ranks = pd.DataFrame(rank_by_metric).reset_index()
    scores = pd.DataFrame(score_by_metric).reset_index()

    # Both frames share the metric column names, so the suffixes tell them apart.
    table = ranks.merge(scores, on="method", suffixes=[" Mean Rank", " Mean Score"]).T
    # After transposing, the first row holds the method names: promote it to the header.
    table.columns = table.iloc[0]
    return table.iloc[1:]

def pprint(df):
    """Print ``df`` as a markdown table with values rounded to 4 decimals.

    Note: the rounding is applied to ``df`` itself. Every column is replaced
    by its float-cast, rounded version, so the caller's frame is modified.
    """
    for name in df.columns:
        as_float = df[name].astype('float')
        df[name] = as_float.round(decimals=4)

    markdown = df.to_markdown()
    print(markdown)

In [27]:
def get_too_easy_select_acc_to_difference(df: pd.DataFrame, methods: list, stddev: float = 0.05):
    """Split datasets by how much the given methods' scores differ.

    Args:
        df: Frame with one ``score_<method>`` column per entry of ``methods``.
        methods: Method names whose scores are compared.
        stddev: Threshold on the row-wise standard deviation of the scores.

    Returns:
        ``(too_easy, select)``: index labels whose score standard deviation is
        below ``stddev``, and index labels of all remaining rows.
    """
    score_columns = [f"score_{method}" for method in methods]
    spread = df[score_columns].std(axis=1)
    is_too_easy = (spread < stddev).to_numpy()

    too_easy_ids = df.index[is_too_easy].to_list()
    selected_ids = df.index[~is_too_easy].to_list()
    return too_easy_ids, selected_ids


In [28]:
def get_too_easy_select_acc_to_criteria(df: pd.DataFrame, better_methods: list, worse_methods: list, margin: float = 1.05):
    """Split datasets by whether the "better" methods clearly beat the "worse" ones.

    A dataset counts as too easy when the best score among ``better_methods``
    is less than ``margin`` times the best score among ``worse_methods``.

    Args:
        df: Frame holding the score columns, one row per dataset.
        better_methods: Column names of the methods expected to win.
        worse_methods: Column names of the baseline methods.
        margin: Multiplier applied to the baselines' best score. The default
            of 1.05 is the value that was previously hard-coded.

    Returns:
        ``(too_easy, select)``: index labels meeting the too-easy criterion,
        and index labels of all remaining rows.
    """

    def _best_score(column_names):
        # A single column is used directly; several are reduced row-wise.
        if len(column_names) > 1:
            return df[column_names].max(axis=1)
        return df[column_names[0]]

    is_too_easy = _best_score(better_methods) < margin * _best_score(worse_methods)
    too_easy_on_selection_criteria = df.loc[is_too_easy].index.to_list()
    select_on_selection_criteria = df.loc[~is_too_easy].index.to_list()
    return too_easy_on_selection_criteria, select_on_selection_criteria

In [29]:
def _split_all_my_results(better_methods, worse_methods):
    """Apply the ratio-based too-easy/select split to `all_my_results`."""
    return get_too_easy_select_acc_to_criteria(
        all_my_results, better_methods=better_methods, worse_methods=worse_methods
    )


# HGBT together with default cocktails, against each baseline and against both.
too_easy_cocktails_default_hgbt_linear_dids, select_cocktails_default_hgbt_linear_dids = _split_all_my_results(
    ["score_hgbt", "score_cocktails_default"], ["score_linear"]
)
too_easy_cocktails_default_hgbt_tree_dids, select_cocktails_default_hgbt_tree_dids = _split_all_my_results(
    ["score_hgbt", "score_cocktails_default"], ["score_tree"]
)
too_easy_cocktails_default_hgbt_combined_dids, select_cocktails_default_hgbt_combined_dids = _split_all_my_results(
    ["score_hgbt", "score_cocktails_default"], ["score_tree", "score_linear"]
)

# HGBT alone.
too_easy_hgbt_tree_dids, select_hgbt_tree_dids = _split_all_my_results(
    ["score_hgbt"], ["score_tree"]
)
too_easy_hgbt_linear_dids, select_hgbt_linear_dids = _split_all_my_results(
    ["score_hgbt"], ["score_linear"]
)
too_easy_hgbt_combined_dids, select_hgbt_combined_dids = _split_all_my_results(
    ["score_hgbt"], ["score_tree", "score_linear"]
)

# Single neural methods against the tree baseline.
too_easy_resnet_tree_dids, select_resnet_tree_dids = _split_all_my_results(
    ["score_resnet"], ["score_tree"]
)
too_easy_mlp_tree_dids, select_mlp_tree_dids = _split_all_my_results(
    ["score_mlp"], ["score_tree"]
)
too_easy_cocktails_default_tree_dids, select_cocktails_default_tree_dids = _split_all_my_results(
    ["score_cocktails_default"], ["score_tree"]
)

# Same methods as the "combined" split above, with the baselines listed in the other order.
too_easy_autopytorch_default_cocktails_default_hgbt_combined_dids, select_autopytorch_default_cocktails_default_hgbt_combined_dids = _split_all_my_results(
    ["score_hgbt", "score_cocktails_default"], ["score_linear", "score_tree"]
)
too_easy_cocktails_default_linear_dids, select_cocktails_default_linear_dids = _split_all_my_results(
    ["score_cocktails_default"], ["score_linear"]
)


In [30]:


# Dataset selections to rank. Each label doubles as the printed table title.
# Only the "select" halves are ranked; the matching "too_easy" halves are kept
# here, disabled, for reference.
_selections = [
    # results vs logreg
    # ("too_easy_(HGBT,_Cocktails_default)_vs_(Logreg)_on_my", too_easy_cocktails_default_hgbt_linear_dids),
    ("select_(HGBT,_Cocktails_default)_vs_(Logreg)_on_my", select_cocktails_default_hgbt_linear_dids),

    # results vs tree
    # ("too_easy_(HGBT,_Cocktails_default)_vs_(tree)_on_my", too_easy_cocktails_default_hgbt_tree_dids),
    ("select_(HGBT,_Cocktails_default)_vs_(tree)_on_my", select_cocktails_default_hgbt_tree_dids),

    # results vs both
    # NOTE: an "all combined" entry used to repeat this exact key further
    # down, so it silently overwrote this one. Both entries carried the same
    # selection (same methods, baselines listed in a different order), so the
    # duplicate was dropped.
    # ("too_easy_(HGBT,_Cocktails_default)_vs_(tree,Logreg)_on_my", too_easy_cocktails_default_hgbt_combined_dids),
    ("select_(HGBT,_Cocktails_default)_vs_(tree,Logreg)_on_my", select_cocktails_default_hgbt_combined_dids),

    # results individual
    # vs tree
    # ("too_easy_(_Cocktails_default)_vs_(tree)_on_my", too_easy_cocktails_default_tree_dids),
    ("select_(_Cocktails_default)_vs_(tree)_on_my", select_cocktails_default_tree_dids),
    # vs logreg
    # ("too_easy_(_Cocktails_default)_vs_(Logreg)_on_my", too_easy_cocktails_default_linear_dids),
    ("select_(_Cocktails_default)_vs_(Logreg)_on_my", select_cocktails_default_linear_dids),

    # from previous tables
    # ("too_easy_(HGBT)_vs_(tree)_on_my", too_easy_hgbt_tree_dids),
    ("select_(HGBT)_vs_(tree)_on_my", select_hgbt_tree_dids),
    # ("too_easy_(HGBT)_vs_(Logreg)_on_my", too_easy_hgbt_linear_dids),
    ("select_(HGBT)_vs_(Logreg)_on_my", select_hgbt_linear_dids),
    # ("too_easy_(HGBT)_vs_(tree,Logreg)_on_my", too_easy_hgbt_combined_dids),
    ("select_(HGBT)_vs_(tree,Logreg)_on_my", select_hgbt_combined_dids),
]

# "ranks" is filled in later by the ranking cells; "dids" holds the dataset ids.
ranks_df = {label: {"ranks": None, "dids": dids} for label, dids in _selections}


In [31]:
# Rank hgbt, logistic and the two cocktails variants on every selection in `ranks_df`.
methods = ["hgbt", "logistic", "cocktails_random", "cocktails_default"]  # , "resnet", "rf", "tree", "mlp",]
metrics = ["acc"]
for key, entry in ranks_df.items():
    selected_dids = entry['dids']
    current_result = get_result(methods, metrics, selected_dids, all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry['ranks'] = current_ranks
    heading = key.upper().replace('_', ' ')
    print(f"\n\n{heading} ({len(selected_dids)})")
    pprint(current_ranks)



SELECT (HGBT, COCKTAILS DEFAULT) VS (LOGREG) ON MY (19)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |
|:---------------|--------------------:|-------------------:|-------:|-----------:|
| ACC Mean Rank  |              2.4211 |             2.2778 | 1.4211 |     3.7895 |
| ACC Mean Score |              0.7771 |             0.7788 | 0.7952 |     0.7042 |


SELECT (HGBT, COCKTAILS DEFAULT) VS (TREE) ON MY (44)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |
|:---------------|--------------------:|-------------------:|-------:|-----------:|
| ACC Mean Rank  |              2.7045 |             2.2927 | 1.8182 |     3.0682 |
| ACC Mean Score |              0.7668 |             0.7782 | 0.7781 |     0.7441 |


SELECT (HGBT, COCKTAILS DEFAULT) VS (TREE,LOGREG) ON MY (15)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |
|:---------------|--------------------:|-------------------:|------

In [32]:
# Same ranking as the previous cell, now across all eight methods.
methods = ["hgbt", "logistic", "resnet", "rf", "tree", "mlp", "cocktails_default", "cocktails_random"]
metrics = ["acc"]
for key, entry in ranks_df.items():
    selected_dids = entry['dids']
    current_result = get_result(methods, metrics, selected_dids, all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry['ranks'] = current_ranks
    heading = key.upper().replace('_', ' ')
    print(f"\n\n{heading} ({len(selected_dids)})")
    pprint(current_ranks)



SELECT (HGBT, COCKTAILS DEFAULT) VS (LOGREG) ON MY (19)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              4.4211 |             4.2222 | 1.9474 |     7.2632 | 4.1579 |   5.0526 | 2.2632 | 6.4737 |
| ACC Mean Score |              0.7771 |             0.7788 | 0.7952 |     0.7042 | 0.7666 |   0.7531 | 0.7909 | 0.7312 |


SELECT (HGBT, COCKTAILS DEFAULT) VS (TREE) ON MY (44)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              4.7273 |             3.878  | 2.75   |     5.3636 | 3.6591 |   4.4545 | 3.2727 | 7.6136 |
| ACC Mean Score |              

In [33]:
# Std-based split computed over all eight methods, cocktails_random included.
methods = [
    "hgbt",
    "linear",
    "resnet",
    "rf",
    "tree",
    "mlp",
    "cocktails_default",
    "cocktails_random",
]

std_split = get_too_easy_select_acc_to_difference(all_my_results, methods=methods)
too_easy_std_methods_dids, select_std_methods_dids = std_split
# subset_methods = [ "hgbt", "linear", "rf", "tree", "mlp","autopytorch_default", "cocktails_default"]

# too_easy_autopytorch_default_linear_dids, select_autopytorch_default_linear_dids = get_too_easy_select_acc_to_difference(all_my_results, methods=methods)


In [34]:
# Select datasets by score spread across every method except cocktails_random,
# then rank all eight methods on that selection.
methods = ["hgbt", "linear", "resnet", "rf", "tree", "mlp", "cocktails_default"]

std_split = get_too_easy_select_acc_to_difference(all_my_results, methods=methods)
too_easy_std_methods_dids, select_std_methods_dids = std_split

ranks_df = {
    "std_>_0.05,_methods=all_-_random_(select)": {"ranks": None, "dids": select_std_methods_dids},
}
methods = ["hgbt", "logistic", "resnet", "rf", "tree", "mlp", "cocktails_default", "cocktails_random"]
metrics = ["acc"]
for key, entry in ranks_df.items():
    selected_dids = entry['dids']
    current_result = get_result(methods, metrics, selected_dids, all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry['ranks'] = current_ranks
    heading = key.upper().replace('_', ' ')
    print(f"\n\n{heading} ({len(selected_dids)})")
    pprint(current_ranks)



STD > 0.05, METHODS=ALL - RANDOM (SELECT) (5)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              3      |             3.2    | 3      |     6.2    | 4.4    |   5.2    | 4.2    | 6.8    |
| ACC Mean Score |              0.8715 |             0.8391 | 0.8596 |     0.7252 | 0.8274 |   0.7868 | 0.8338 | 0.7475 |


In [35]:
# Select datasets by score spread across hgbt, linear, tree and default cocktails,
# then rank all eight methods on that selection.
methods = ["hgbt", "linear", "tree", "cocktails_default"]

std_split = get_too_easy_select_acc_to_difference(all_my_results, methods=methods)
too_easy_std_methods_dids, select_std_methods_dids = std_split

ranks_df = {
    "std_>_0.05,_methods=hgbt,linear,tree,cocktails_default_(select)": {"ranks": None, "dids": select_std_methods_dids},
}
methods = ["hgbt", "logistic", "resnet", "rf", "tree", "mlp", "cocktails_default", "cocktails_random"]
metrics = ["acc"]
for key, entry in ranks_df.items():
    selected_dids = entry['dids']
    current_result = get_result(methods, metrics, selected_dids, all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry['ranks'] = current_ranks
    heading = key.upper().replace('_', ' ')
    print(f"\n\n{heading} ({len(selected_dids)})")
    pprint(current_ranks)



STD > 0.05, METHODS=HGBT,LINEAR,TREE,COCKTAILS DEFAULT (SELECT) (9)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              3.7778 |             3.3333 | 3.1111 |     5.8889 | 4.3333 |   5      | 4      | 6.5556 |
| ACC Mean Score |              0.8739 |             0.8488 | 0.8743 |     0.7723 | 0.8457 |   0.8237 | 0.8567 | 0.7779 |


In [37]:
import openml

# Show the name of every dataset in the current std-based selection.
# download_data=False asks openml not to download the data itself.
selected_names = (
    openml.datasets.get_dataset(dataset_id=d_id, download_data=False).name
    for d_id in select_std_methods_dids
)
for name in selected_names:
    print(name)

electricity
covertype
poker
pol
elevators
phoneme
twonorm
Indian_pines
SantanderCustomerSatisfaction


In [38]:
# Rank all eight methods on a fixed, hand-listed set of dataset ids.
their_datasets = [
    151, 293, 722, 821, 993, 1044,
    1120, 1461, 1489, 41150, 41168, 42769,
]

ranks_df = {
    "select_their_benchmark": {"ranks": None, "dids": their_datasets},
}
methods = ["hgbt", "logistic", "resnet", "rf", "tree", "mlp", "cocktails_default", "cocktails_random"]
metrics = ["acc"]
for key, entry in ranks_df.items():
    selected_dids = entry['dids']
    current_result = get_result(methods, metrics, selected_dids, all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry['ranks'] = current_ranks
    heading = key.upper().replace('_', ' ')
    print(f"\n\n{heading} ({len(selected_dids)})")
    pprint(current_ranks)



SELECT THEIR BENCHMARK (12)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              4.9167 |             4      | 1.9167 |     7.25   | 4.1667 |   5.1667 | 2.25   | 6.3333 |
| ACC Mean Score |              0.8021 |             0.8037 | 0.8369 |     0.7405 | 0.8067 |   0.7843 | 0.8345 | 0.7757 |


In [39]:
# Select datasets by score spread across the six non-cocktails methods,
# then rank all eight methods on that selection.
methods = ["hgbt", "linear", "tree", "resnet", "mlp", "rf"]

std_split = get_too_easy_select_acc_to_difference(all_my_results, methods=methods)
too_easy_std_methods_dids, select_std_methods_dids = std_split

ranks_df = {
    "std_>_0.05,_methods=hgbt,linear,tree,resnet_mlp_rf_(select)": {"ranks": None, "dids": select_std_methods_dids},
}
methods = ["hgbt", "logistic", "resnet", "rf", "tree", "mlp", "cocktails_default", "cocktails_random"]
metrics = ["acc"]
for key, entry in ranks_df.items():
    selected_dids = entry['dids']
    current_result = get_result(methods, metrics, selected_dids, all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry['ranks'] = current_ranks
    heading = key.upper().replace('_', ' ')
    print(f"\n\n{heading} ({len(selected_dids)})")
    pprint(current_ranks)



STD > 0.05, METHODS=HGBT,LINEAR,TREE,RESNET MLP RF (SELECT) (7)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              3.5714 |             3.5714 | 3      |     5.7143 | 4.2857 |   5.1429 | 3.8571 | 6.8571 |
| ACC Mean Score |              0.8829 |             0.8486 | 0.8773 |     0.7629 | 0.8495 |   0.8216 | 0.8601 | 0.7732 |


In [40]:
import openml

# Show the name of every dataset in the current std-based selection.
# download_data=False asks openml not to download the data itself.
selected_names = (
    openml.datasets.get_dataset(dataset_id=d_id, download_data=False).name
    for d_id in select_std_methods_dids
)
for name in selected_names:
    print(name)

covertype
poker
pol
phoneme
twonorm
Indian_pines
SantanderCustomerSatisfaction


In [43]:
# Select datasets by score spread across hgbt, resnet and mlp,
# then rank all eight methods on that selection.
methods = ["hgbt", "resnet", "mlp"]

std_split = get_too_easy_select_acc_to_difference(all_my_results, methods=methods)
too_easy_std_methods_dids, select_std_methods_dids = std_split

# NOTE(review): the label below mentions "rf", but "rf" is not among the
# methods used for the selection above. Confirm which of the two is intended.
ranks_df = {
    "std_>_0.05,_methods=hgbt,resnet_mlp_rf_(select)": {"ranks": None, "dids": select_std_methods_dids},
}
methods = ["hgbt", "logistic", "resnet", "rf", "tree", "mlp", "cocktails_default", "cocktails_random"]
metrics = ["acc"]
for key, entry in ranks_df.items():
    selected_dids = entry['dids']
    current_result = get_result(methods, metrics, selected_dids, all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry['ranks'] = current_ranks
    heading = key.upper().replace('_', ' ')
    print(f"\n\n{heading} ({len(selected_dids)})")
    pprint(current_ranks)



STD > 0.05, METHODS=HGBT,RESNET MLP RF (SELECT) (3)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              5.3333 |             3.3333 | 1.6667 |     7.6667 | 4      |   6.6667 | 2.6667 | 4.6667 |
| ACC Mean Score |              0.7683 |             0.8567 | 0.837  |     0.7169 | 0.7875 |   0.6894 | 0.8291 | 0.7911 |
