In [56]:
from src.reproduce_utils import Results
import pandas as pd

In [57]:
def get_result(methods, metrics, dataset_names, result_df):
    """Reshape a flat per-dataset score table into a ``Results`` object.

    The frame that is built has one row per ``(method, seed)`` pair, with the
    seed fixed to ``1``, and one column per ``(metric, dataset)`` pair.

    Args:
        methods: Method names. Each one is read from the ``score_<method>``
            column of ``result_df``; any name containing ``"logistic"`` is
            read from ``score_linear`` instead.
        metrics: Metric names used for the first level of the column index.
        dataset_names: Dataset ids to keep; rows of ``result_df`` whose
            ``dataset_id`` is not in this collection are ignored.
        result_df: Frame with a ``dataset_id`` column and ``score_*`` columns.

    Returns:
        ``Results`` wrapping the filled frame.

    NOTE(review): scores are always written under the ``"acc"`` metric,
    whatever ``metrics`` contains — confirm that this is intended.
    """
    column_index = pd.MultiIndex.from_product(
        [metrics, dataset_names], names=["metric", "dataset"]
    )
    row_index = pd.MultiIndex.from_product([methods, [1]], names=["method", "seed"])
    table = pd.DataFrame(columns=column_index, index=row_index).sort_index()

    for _, record in result_df.iterrows():
        # The dataset filter does not depend on the method, so it is checked
        # once per row rather than once per (row, method) pair.
        if int(record["dataset_id"]) not in dataset_names:
            continue
        for method in methods:
            score_method = "linear" if "logistic" in method else method
            table.loc[(method, 1), ("acc", record["dataset_id"])] = record[
                f"score_{score_method}"
            ]
    return Results(df=table)

In [58]:
# autopytorch_cocktails_random_df = pd.read_csv('csv_files/cocktails_random_refit_results.csv', index_col=0)
# autopytorch_cocktails_random_df['dataset_id'] = autopytorch_cocktails_random_df['dataset_id'].astype(int).replace(dict(zip(final_benchmark_dataset_ids, selected_datasets)))
# autopytorch_cocktails_random_df = autopytorch_cocktails_random_df.set_index('dataset_id')
# all_my_results['score_cocktails_random'] = autopytorch_cocktails_random_df['test_score']


In [59]:
# Load the per-dataset scores and key them by dataset id, in ascending id order.
all_my_results = (
    pd.read_csv("csv_files/all_results_with_cocktail.csv", index_col=None)
    .set_index("dataset_id")
    .sort_index()
)


In [60]:
# cocktails_default_df = pd.read_csv("csv_files/cocktails_default_refit_results.csv", index_col=None).set_index('dataset_id')
# all_my_results = pd.read_csv("csv_files/all_results_with_cocktail_random.csv", index_col="dataset_id").sort_index()
# all_my_results["score_cocktails_default"] = cocktails_default_df['test_score']
# all_my_results["score_autopytorch_default"] = scores_autopytorch_default['autopytorch_master_default']
# all_my_results["score_cocktails"] = cocktails_df['test_score']
# all_my_results.columns
# all_my_results = all_my_results.drop("score_autopytorch_default", axis=1)

In [61]:
def get_average_rank_table(results: Results):
    """Summarise ``results`` as mean rank and mean score per method.

    For every metric whose name does not contain ``"time"``, methods are
    ranked within each dataset on their mean score (highest score gets rank
    1), and those ranks are then averaged over the datasets. The scores
    themselves are averaged over all datasets as well.

    Returns:
        A DataFrame with one column per method and, for each metric, a
        ``"<METRIC> Mean Rank"`` row and a ``"<METRIC> Mean Score"`` row.
    """
    datasets = results.datasets
    metric_names = sorted(results.metrics, reverse=True)
    frame = results.df

    rank_by_metric = {}
    score_by_metric = {}
    for metric in metric_names:
        if "time" in metric:
            continue
        per_metric = frame[metric]
        # Only datasets that actually have a column for this metric contribute.
        score_series = [per_metric[d] for d in datasets if d in per_metric.columns]
        rank_series = [
            scores.groupby('method').mean().rank(ascending=False)
            for scores in score_series
        ]
        label = metric.upper()
        rank_by_metric[label] = pd.concat(rank_series).groupby("method").mean()
        score_by_metric[label] = pd.concat(score_series).groupby("method").mean()

    scores_long = pd.DataFrame(score_by_metric).reset_index()
    ranks_long = pd.DataFrame(rank_by_metric).reset_index()
    merged = ranks_long.merge(
        scores_long, on="method", suffixes=[" Mean Rank", " Mean Score"]
    )
    # After transposing, the first row holds the method names: promote it to
    # the column header and drop it from the body.
    transposed = merged.T
    transposed.columns = transposed.iloc[0]
    return transposed.iloc[1:]

def pprint(df):
    """Print ``df`` as a Markdown table with values rounded to 4 decimals.

    Every column is cast to float before rounding. The cast and rounding are
    applied to a new frame, so the DataFrame passed in keeps its original
    values and dtypes (callers store the same table elsewhere before
    printing it).

    Args:
        df: DataFrame whose columns can all be converted to float.
    """
    rounded = df.astype('float').round(decimals=4)
    print(rounded.to_markdown())

In [62]:
def get_too_easy_select_acc_to_difference(df: pd.DataFrame, methods: list, stddev: float = 0.05):
    """Split the rows of ``df`` by how much the methods' scores differ.

    The row-wise standard deviation is taken over the ``score_<method>``
    columns named by ``methods``.

    Returns:
        ``(too_easy, select)``: two lists of index labels. ``too_easy`` holds
        the rows whose standard deviation is below ``stddev``; ``select``
        holds every other row, including rows where the comparison is false
        because the standard deviation is NaN.
    """
    score_columns = [f"score_{method}" for method in methods]
    spread = df[score_columns].std(axis=1)
    is_too_easy = spread < stddev
    too_easy_ids = df.loc[is_too_easy].index.to_list()
    selected_ids = df.loc[~is_too_easy].index.to_list()
    return too_easy_ids, selected_ids


In [63]:
def get_too_easy_select_acc_to_criteria(
    df: pd.DataFrame,
    better_methods: list,
    worse_methods: list,
    margin: float = 1.05,
):
    """Split the rows of ``df`` by whether the better methods clearly win.

    A row is "too easy" when the best score among ``better_methods`` is below
    ``margin`` times the best score among ``worse_methods``.

    Args:
        df: Frame holding one score column per method.
        better_methods: Column names of the methods expected to score higher.
        worse_methods: Column names of the baseline methods.
        margin: Multiplier applied to the baseline score. The default of
            ``1.05`` is the value that was previously hard-coded.

    Returns:
        ``(too_easy, select)``: two lists of index labels. ``select`` holds
        every row that is not too easy, including rows where the comparison
        is false because a score is NaN.
    """
    better_best = (
        df[better_methods].max(axis=1) if len(better_methods) > 1 else df[better_methods[0]]
    )
    worse_best = (
        df[worse_methods].max(axis=1) if len(worse_methods) > 1 else df[worse_methods[0]]
    )
    is_too_easy = better_best < margin * worse_best
    too_easy_on_selection_criteria = df.loc[is_too_easy].index.to_list()
    select_on_selection_criteria = df.loc[~is_too_easy].index.to_list()
    return too_easy_on_selection_criteria, select_on_selection_criteria

In [64]:
# (HGBT, cocktails_default) against logreg, tree, and both baselines together.
too_easy_cocktails_default_hgbt_linear_dids, select_cocktails_default_hgbt_linear_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_hgbt", "score_cocktails_default"],
    worse_methods=["score_linear"],
)
too_easy_cocktails_default_hgbt_tree_dids, select_cocktails_default_hgbt_tree_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_hgbt", "score_cocktails_default"],
    worse_methods=["score_tree"],
)
too_easy_cocktails_default_hgbt_combined_dids, select_cocktails_default_hgbt_combined_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_hgbt", "score_cocktails_default"],
    worse_methods=["score_tree", "score_linear"],
)

# HGBT alone against tree, logreg, and both.
too_easy_hgbt_tree_dids, select_hgbt_tree_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_hgbt"],
    worse_methods=["score_tree"],
)
too_easy_hgbt_linear_dids, select_hgbt_linear_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_hgbt"],
    worse_methods=["score_linear"],
)
too_easy_hgbt_combined_dids, select_hgbt_combined_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_hgbt"],
    worse_methods=["score_tree", "score_linear"],
)

# Single neural / cocktails methods against tree.
too_easy_resnet_tree_dids, select_resnet_tree_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_resnet"],
    worse_methods=["score_tree"],
)
too_easy_mlp_tree_dids, select_mlp_tree_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_mlp"],
    worse_methods=["score_tree"],
)
too_easy_cocktails_default_tree_dids, select_cocktails_default_tree_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_cocktails_default"],
    worse_methods=["score_tree"],
)

# NOTE(review): same better/worse methods as the "cocktails_default_hgbt_combined"
# call above (only the order of worse_methods differs); nothing here uses
# score_autopytorch_default despite the variable name.
too_easy_autopytorch_default_cocktails_default_hgbt_combined_dids, select_autopytorch_default_cocktails_default_hgbt_combined_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_hgbt", "score_cocktails_default"],
    worse_methods=["score_linear", "score_tree"],
)

# cocktails_default alone against logreg.
too_easy_cocktails_default_linear_dids, select_cocktails_default_linear_dids = get_too_easy_select_acc_to_criteria(
    all_my_results,
    better_methods=["score_cocktails_default"],
    worse_methods=["score_linear"],
)


In [65]:


# Selection label -> {"ranks": table, "dids": dataset ids}. "ranks" starts as
# None and is filled in by the ranking loops that iterate over this dict.
#
# The label "select_(HGBT,_Cocktails_default)_vs_(tree,Logreg)_on_my" used to
# appear twice in this literal. A dict keeps only the last value given for a
# repeated key, so the second entry silently replaced the first. Both id lists
# come from selection calls with the same better and worse methods, so a single
# entry is kept and the set of tables produced is unchanged.
#
# Only the "select" subsets are tabulated; use the matching `too_easy_*` lists
# here to tabulate the complementary subsets instead.
ranks_df = {
    label: {"ranks": None, "dids": dids}
    for label, dids in (
        # (HGBT, cocktails_default) vs logreg / tree / both
        ("select_(HGBT,_Cocktails_default)_vs_(Logreg)_on_my", select_cocktails_default_hgbt_linear_dids),
        ("select_(HGBT,_Cocktails_default)_vs_(tree)_on_my", select_cocktails_default_hgbt_tree_dids),
        ("select_(HGBT,_Cocktails_default)_vs_(tree,Logreg)_on_my", select_cocktails_default_hgbt_combined_dids),
        # cocktails_default alone vs tree / logreg
        ("select_(_Cocktails_default)_vs_(tree)_on_my", select_cocktails_default_tree_dids),
        ("select_(_Cocktails_default)_vs_(Logreg)_on_my", select_cocktails_default_linear_dids),
        # from previous tables: HGBT alone vs tree / logreg / both
        ("select_(HGBT)_vs_(tree)_on_my", select_hgbt_tree_dids),
        ("select_(HGBT)_vs_(Logreg)_on_my", select_hgbt_linear_dids),
        ("select_(HGBT)_vs_(tree,Logreg)_on_my", select_hgbt_combined_dids),
    )
}


In [66]:
# Reference subset of dataset ids; the same ids as `their_datasets`, which is
# defined in a later cell.
selected_datasets = [151, 293, 722, 821, 993, 1044, 1120, 1461, 1489, 41150, 41168, 42769, 44089, 44090, 44091]


In [67]:
# Ids picked by the HGBT-vs-logreg selection that are not in `selected_datasets`.
set(select_hgbt_linear_dids) - set(selected_datasets)

{354, 1222, 4541}

In [68]:
# Rank only four methods here; "resnet", "rf", "tree" and "mlp" are left out.
metrics = ["acc"]
methods = ["hgbt", "logistic", "cocktails_random", "cocktails_default"]
for key, entry in ranks_df.items():
    # Build the score table for this selection's datasets, then summarise it.
    current_result = get_result(methods, metrics, entry["dids"], all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry["ranks"] = current_ranks
    print(f"\n\n{key.upper().replace('_', ' ')} ({len(ranks_df[key]['dids'])})")
    pprint(current_ranks)



SELECT (HGBT, COCKTAILS DEFAULT) VS (LOGREG) ON MY (19)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |
|:---------------|--------------------:|-------------------:|-------:|-----------:|
| ACC Mean Rank  |              2.5263 |             2.1667 | 1.3684 |     3.8421 |
| ACC Mean Score |              0.7771 |             0.7788 | 0.7952 |     0.7042 |


SELECT (HGBT, COCKTAILS DEFAULT) VS (TREE) ON MY (44)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |
|:---------------|--------------------:|-------------------:|-------:|-----------:|
| ACC Mean Rank  |              2.7727 |             2.2791 | 1.7955 |     3.1136 |
| ACC Mean Score |              0.7668 |             0.7703 | 0.7781 |     0.7441 |


SELECT (HGBT, COCKTAILS DEFAULT) VS (TREE,LOGREG) ON MY (15)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |
|:---------------|--------------------:|-------------------:|------

In [69]:
# Same tables as the previous ranking cell, now over all eight methods.
metrics = ["acc"]
methods = [
    "hgbt",
    "logistic",
    "resnet",
    "rf",
    "tree",
    "mlp",
    "cocktails_default",
    "cocktails_random",
]
for key, entry in ranks_df.items():
    current_result = get_result(methods, metrics, entry["dids"], all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry["ranks"] = current_ranks
    print(f"\n\n{key.upper().replace('_', ' ')} ({len(ranks_df[key]['dids'])})")
    pprint(current_ranks)



SELECT (HGBT, COCKTAILS DEFAULT) VS (LOGREG) ON MY (19)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              4.5263 |             3.9444 | 1.8947 |     7.3158 | 4.2105 |   5.1579 | 2.1579 | 6.5789 |
| ACC Mean Score |              0.7771 |             0.7788 | 0.7952 |     0.7042 | 0.7666 |   0.7531 | 0.7909 | 0.7312 |


SELECT (HGBT, COCKTAILS DEFAULT) VS (TREE) ON MY (44)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              4.7955 |             3.907  | 2.7273 |     5.4091 | 3.6591 |   4.5    | 3.2273 | 7.6818 |
| ACC Mean Score |              

In [70]:
# Split the datasets by the spread of scores across all eight methods, using
# the function's default threshold.
methods = [
    "hgbt",
    "linear",
    "resnet",
    "rf",
    "tree",
    "mlp",
    "cocktails_default",
    "cocktails_random",
]

too_easy_std_methods_dids, select_std_methods_dids = get_too_easy_select_acc_to_difference(
    all_my_results,
    methods=methods,
)
# subset_methods = [ "hgbt", "linear", "rf", "tree", "mlp","autopytorch_default", "cocktails_default"]

# too_easy_autopytorch_default_linear_dids, select_autopytorch_default_linear_dids = get_too_easy_select_acc_to_difference(all_my_results, methods=methods)


In [81]:
# Filter datasets by score spread across every method except cocktails_random.
methods = ["hgbt", "linear", "resnet", "rf", "tree", "mlp", "cocktails_default"]
too_easy_std_methods_dids, select_std_methods_dids = get_too_easy_select_acc_to_difference(
    all_my_results,
    methods=methods,
)

ranks_df = {
    "std_>_0.05,_methods=all_-_random_(select)": dict(ranks=None, dids=select_std_methods_dids),
}

# Rank all eight methods on the datasets that passed the filter.
metrics = ["acc"]
methods = [
    "hgbt",
    "logistic",
    "resnet",
    "rf",
    "tree",
    "mlp",
    "cocktails_default",
    "cocktails_random",
]
for key, entry in ranks_df.items():
    current_result = get_result(methods, metrics, entry["dids"], all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry["ranks"] = current_ranks
    print(f"\n\n{key.upper().replace('_', ' ')} ({len(ranks_df[key]['dids'])})")
    pprint(current_ranks)



STD > 0.05, METHODS=ALL - RANDOM (SELECT) (5)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              3      |             3.2    | 3      |     6.2    | 4.4    |   5.2    | 4.2    | 6.8    |
| ACC Mean Score |              0.8715 |             0.8391 | 0.8596 |     0.7252 | 0.8274 |   0.7868 | 0.8338 | 0.7475 |


In [82]:
# Filter datasets by score spread across hgbt, linear, tree and cocktails_default.
methods = ["hgbt", "linear", "tree", "cocktails_default"]
too_easy_std_methods_dids, select_std_methods_dids = get_too_easy_select_acc_to_difference(
    all_my_results,
    methods=methods,
)

ranks_df = {
    "std_>_0.05,_methods=hgbt,linear,tree,cocktails_default_(select)": dict(
        ranks=None,
        dids=select_std_methods_dids,
    ),
}

# Rank all eight methods on the datasets that passed the filter.
metrics = ["acc"]
methods = [
    "hgbt",
    "logistic",
    "resnet",
    "rf",
    "tree",
    "mlp",
    "cocktails_default",
    "cocktails_random",
]
for key, entry in ranks_df.items():
    current_result = get_result(methods, metrics, entry["dids"], all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry["ranks"] = current_ranks
    print(f"\n\n{key.upper().replace('_', ' ')} ({len(ranks_df[key]['dids'])})")
    pprint(current_ranks)



STD > 0.05, METHODS=HGBT,LINEAR,TREE,COCKTAILS DEFAULT (SELECT) (9)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              3.8889 |             2.8889 | 3.1111 |     5.8889 | 4.4444 |   5.1111 | 4      | 6.6667 |
| ACC Mean Score |              0.8739 |             0.86   | 0.8743 |     0.7723 | 0.8457 |   0.8237 | 0.8567 | 0.7779 |


In [73]:
# Print the name of every dataset id currently held in `select_std_methods_dids`.
import openml
for d_id in select_std_methods_dids:
    # download_data=False: presumably only the dataset description is fetched,
    # not the data itself — confirm against the openml documentation.
    dataset = openml.datasets.get_dataset(dataset_id=d_id, download_data=False)
    print(dataset.name)

electricity
covertype
poker
pol
elevators
phoneme
twonorm
Indian_pines
SantanderCustomerSatisfaction


In [74]:
# Fixed list of benchmark dataset ids, ranked as a single selection.
their_datasets = [
    151, 293, 722, 821, 993, 1044, 1120, 1461, 1489,
    41150, 41168, 42769, 44089, 44090, 44091,
]

ranks_df = {
    "select_their_benchmark": dict(ranks=None, dids=their_datasets),
}

metrics = ["acc"]
methods = [
    "hgbt",
    "logistic",
    "resnet",
    "rf",
    "tree",
    "mlp",
    "cocktails_default",
    "cocktails_random",
]
for key, entry in ranks_df.items():
    current_result = get_result(methods, metrics, entry["dids"], all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry["ranks"] = current_ranks
    print(f"\n\n{key.upper().replace('_', ' ')} ({len(ranks_df[key]['dids'])})")
    pprint(current_ranks)



SELECT THEIR BENCHMARK (15)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              4.7333 |             4.0667 | 1.7333 |     7.4    | 4.2    |   5.2    | 2      | 6.6667 |
| ACC Mean Score |              0.8047 |             0.8011 | 0.8356 |     0.7459 | 0.8063 |   0.7878 | 0.8328 | 0.7724 |


In [75]:
# Display whatever `current_result` was last assigned by a ranking loop.
current_result

Results(df=metric                       acc                                          \
dataset                    151       293       722       821       993     
method            seed                                                     
cocktails_default 1     0.812899    0.8374  0.907769  0.854213  0.834297   
cocktails_random  1     0.818778  0.849633  0.909421  0.879911  0.841362   
hgbt              1     0.866601  0.818973   0.98519  0.881245  0.879512   
logistic          1     0.740939  0.616487  0.856595  0.823771  0.829159   
mlp               1     0.787723  0.784513  0.985785  0.874376  0.846371   
resnet            1     0.789512  0.824347  0.695339  0.871164  0.836994   
rf                1     0.862358   0.82788  0.981686  0.878181  0.882338   
tree              1     0.834464  0.764407  0.972231  0.813146  0.840976   

metric                                                                    \
dataset                    1044      1120      1461      1489      41150   


In [76]:
# Filter datasets by score spread across hgbt, linear, tree, resnet, mlp and rf.
methods = ["hgbt", "linear", "tree", "resnet", "mlp", "rf"]
too_easy_std_methods_dids, select_std_methods_dids = get_too_easy_select_acc_to_difference(
    all_my_results,
    methods=methods,
)

ranks_df = {
    "std_>_0.05,_methods=hgbt,linear,tree,resnet_mlp_rf_(select)": dict(
        ranks=None,
        dids=select_std_methods_dids,
    ),
}

# Rank all eight methods on the datasets that passed the filter.
metrics = ["acc"]
methods = [
    "hgbt",
    "logistic",
    "resnet",
    "rf",
    "tree",
    "mlp",
    "cocktails_default",
    "cocktails_random",
]
for key, entry in ranks_df.items():
    current_result = get_result(methods, metrics, entry["dids"], all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    entry["ranks"] = current_ranks
    print(f"\n\n{key.upper().replace('_', ' ')} ({len(ranks_df[key]['dids'])})")
    pprint(current_ranks)



STD > 0.05, METHODS=HGBT,LINEAR,TREE,RESNET MLP RF (SELECT) (7)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              3.7143 |             3      | 3      |     5.7143 | 4.4286 |   5.2857 | 3.8571 | 7      |
| ACC Mean Score |              0.8829 |             0.8631 | 0.8773 |     0.7629 | 0.8495 |   0.8216 | 0.8601 | 0.7732 |


In [77]:
# Print the name of every dataset id currently held in `select_std_methods_dids`.
# NOTE(review): `import openml` is repeated from an earlier cell.
import openml
for d_id in select_std_methods_dids:
    # download_data=False: presumably only the dataset description is fetched,
    # not the data itself — confirm against the openml documentation.
    dataset = openml.datasets.get_dataset(dataset_id=d_id, download_data=False)
    print(dataset.name)

covertype
poker
pol
phoneme
twonorm
Indian_pines
SantanderCustomerSatisfaction


In [78]:
# Filter datasets by score spread across hgbt, resnet and mlp only.
methods = ["hgbt", "resnet", "mlp"]

too_easy_std_methods_dids, select_std_methods_dids = get_too_easy_select_acc_to_difference(all_my_results, methods=methods)

ranks_df = {
    # The label used to read "hgbt,resnet_mlp_rf", but "rf" is not among the
    # methods the filter above is computed over; it now names exactly those
    # three methods so the printed table title matches the selection.
    "std_>_0.05,_methods=hgbt,resnet,mlp_(select)": {
        "ranks": None,
        "dids": select_std_methods_dids},
        }

# Rank all eight methods on the datasets that passed the filter.
methods = [ "hgbt", "logistic", "resnet", "rf", "tree", "mlp", "cocktails_default", "cocktails_random"]
metrics = ["acc"]
for key in ranks_df:
    current_result = get_result(methods, metrics, ranks_df[key]['dids'], all_my_results.reset_index())
    current_ranks = get_average_rank_table(current_result)
    ranks_df[key]['ranks'] = current_ranks
    print(f"\n\n{key.upper().replace('_', ' ')} ({len(ranks_df[key]['dids'])})")
    pprint(current_ranks)



STD > 0.05, METHODS=HGBT,RESNET MLP RF (SELECT) (1)
|                |   cocktails_default |   cocktails_random |   hgbt |   logistic |    mlp |   resnet |     rf |   tree |
|:---------------|--------------------:|-------------------:|-------:|-----------:|-------:|---------:|-------:|-------:|
| ACC Mean Rank  |              6      |             5      | 2      |     7      | 1      |   8      | 3      | 4      |
| ACC Mean Score |              0.9078 |             0.9094 | 0.9852 |     0.8566 | 0.9858 |   0.6953 | 0.9817 | 0.9722 |


In [79]:
# List the dataset ids that index `all_my_results`.
all_my_results.index

Int64Index([   44,    60,   151,   279,   293,   351,   354,   357,   720,
              722,   725,   734,   735,   737,   761,   803,   816,   819,
              821,   823,   833,   846,   847,   871,   976,   979,   993,
             1044,  1053,  1110,  1113,  1119,  1120,  1222,  1241,  1242,
             1461,  1476,  1477,  1478,  1486,  1489,  1503,  1507,  1526,
             1590,  4134,  4541, 23517, 40685, 40923, 41146, 41147, 41150,
            41162, 41163, 41164, 41166, 41168, 41169, 41671, 41972, 42206,
            42343, 42395, 42468, 42477, 42742, 42746, 42769, 43489, 44089,
            44090, 44091],
           dtype='int64', name='dataset_id')

In [80]:
# Two method groups whose mean ranks are compared in the cells that follow.
ml_models = ["hgbt", "logistic", "rf", "tree"]
dl_models = ["cocktails_default", "mlp", "resnet", "cocktails_random"]
methods = [*dl_models, *ml_models]

# Score table over every dataset id present in all_my_results.
current_result = get_result(
    methods,
    ["acc"],
    all_my_results.index,
    all_my_results.reset_index(),
)






In [37]:
# Average of the DL methods' mean ranks on the HGBT-vs-logreg selection.
(
    get_average_rank_table(
        Results(current_result.df.loc[:, ("acc", select_hgbt_linear_dids)])
    )
    .loc['ACC Mean Rank', dl_models]
    .mean()
)


4.541666666666666

In [83]:

# Hard-coded ids for the std-based selection over
# [ "hgbt", "linear", "tree", "resnet", "mlp", "rf"].
select_std_methods_dids = [293, 354, 722, 1489, 1507, 41972, 42395]

# One printed value per selection, in this order.
rank_gap_selections = [
    select_hgbt_linear_dids,  # select hgbt vs logreg (should be biased against DL)
    select_hgbt_tree_dids,  # select hgbt vs tree
    select_hgbt_combined_dids,  # select hgbt vs logreg, tree
    select_cocktails_default_linear_dids,  # select cocktails default vs logreg
    select_cocktails_default_tree_dids,  # select cocktails default vs tree (should be biased against tree based)
    select_std_methods_dids,  # select using STD (should not have a huge bias)
]
for dids in rank_gap_selections:
    # Build the rank table once per selection and reuse it for both groups,
    # instead of recomputing the identical table for each of them.
    rank_table = get_average_rank_table(Results(current_result.df.loc[:, ("acc", dids)]))
    dl_average_rank = rank_table.loc['ACC Mean Rank', dl_models].median()
    ml_average_rank = rank_table.loc['ACC Mean Rank', ml_models].median()
    # Median DL mean rank minus median ML mean rank for this selection.
    print(dl_average_rank - ml_average_rank)

  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_linear_dids)])).loc['ACC Mean Rank', dl_models].median()
  ml_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_linear_dids)])).loc['ACC Mean Rank', ml_models].median()
  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_tree_dids)])).loc['ACC Mean Rank', dl_models].median()
  ml_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_tree_dids)])).loc['ACC Mean Rank', ml_models].median()
  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_combined_dids)])).loc['ACC Mean Rank', dl_models].median()
  ml_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_combined_dids)])).loc['ACC Mean Rank', ml_models].median()
  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc"

0.20000000000000107
-1.166666666666667
-2.75
0.20000000000000107
-1.166666666666667
-0.7142857142857144


  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_cocktails_default_tree_dids)])).loc['ACC Mean Rank', dl_models].median()
  ml_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_cocktails_default_tree_dids)])).loc['ACC Mean Rank', ml_models].median()


In [84]:

# Hard-coded ids for the std-based selection over
# [ "hgbt", "linear", "tree", "resnet", "mlp", "rf"].
select_std_methods_dids = [293, 354, 722, 1489, 1507, 41972, 42395]

# One printed value per selection, in this order.
rank_gap_selections = [
    select_hgbt_linear_dids,  # select hgbt vs logreg (should be biased against DL)
    select_hgbt_tree_dids,  # select hgbt vs tree
    select_hgbt_combined_dids,  # select hgbt vs logreg, tree
    select_cocktails_default_linear_dids,  # select cocktails default vs logreg
    select_cocktails_default_tree_dids,  # select cocktails default vs tree (should be biased against tree based)
    select_std_methods_dids,  # select using STD (should not have a huge bias)
]
for dids in rank_gap_selections:
    # Build the rank table once per selection and reuse it for both groups,
    # instead of recomputing the identical table for each of them.
    rank_table = get_average_rank_table(Results(current_result.df.loc[:, ("acc", dids)]))
    dl_average_rank = rank_table.loc['ACC Mean Rank', dl_models].mean()
    ml_average_rank = rank_table.loc['ACC Mean Rank', ml_models].mean()
    # Average DL mean rank minus average ML mean rank for this selection.
    print(dl_average_rank - ml_average_rank)

  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_linear_dids)])).loc['ACC Mean Rank', dl_models].mean()
  ml_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_linear_dids)])).loc['ACC Mean Rank', ml_models].mean()
  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_tree_dids)])).loc['ACC Mean Rank', dl_models].mean()
  ml_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_tree_dids)])).loc['ACC Mean Rank', ml_models].mean()
  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_combined_dids)])).loc['ACC Mean Rank', dl_models].mean()
  ml_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_hgbt_combined_dids)])).loc['ACC Mean Rank', ml_models].mean()
  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_coc

-0.3999999999999986
-1.75
-2.5
-0.3999999999999986
-1.75
-0.7857142857142865


  dl_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_cocktails_default_tree_dids)])).loc['ACC Mean Rank', dl_models].mean()
  ml_average_rank = get_average_rank_table(Results(current_result.df.loc[:, ("acc", select_cocktails_default_tree_dids)])).loc['ACC Mean Rank', ml_models].mean()


In [55]:
# Show the id list used for the "select_their_benchmark" table.
their_datasets

[151,
 293,
 722,
 821,
 993,
 1044,
 1120,
 1461,
 1489,
 41150,
 41168,
 42769,
 44089,
 44090,
 44091]