In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re

In [2]:
scores = {}

In [3]:
scores['pbp-mv'] = {
    'rmse': {
        'bostonHousing': {
            'score': 3.11,
            'ste': 0.15,
        },
        'concrete':{
            'score': 5.08,
            'ste': 0.14,
        },
        'energy':{
            'score': 0.45,
            'ste': 0.01,
        },
        'kin8nm':{
            'score': 0.07,
            'ste': 0.00,
        },
        'naval-propulsion-plant':{
            'score': 0.00,
            'ste': 0.00,
        },
        'power-plant':{
            'score': 3.91,
            'ste': 0.04,
        },
        'protein-tertiary-structure':{
            'score': 3.94,
            'ste': 0.02,
        },
        'wine-quality-red':{
            'score': 0.64,
            'ste': 0.01,
        },
        'yacht':{
            'score': 0.81,
            'ste': 0.06,
        },
    },
    'nll': {
        'bostonHousing': {
            'score': 2.54,
            'ste': 0.08,
        },
        'concrete':{
            'score': 3.04,
            'ste': 0.03,
        },
        'energy':{
            'score': 1.01,
            'ste': 0.01,
        },
        'kin8nm':{
            'score': -1.28,
            'ste': 0.01,
        },
        'naval-propulsion-plant':{
            'score': -4.85,
            'ste': 0.06,
        },
        'power-plant':{
            'score': 2.78,
            'ste': 0.01,
        },
        'protein-tertiary-structure':{
            'score': 2.77,
            'ste': 0.01,
        },
        'wine-quality-red':{
            'score': 0.97,
            'ste': 0.01,
        },
        'yacht':{
            'score': 1.64,
            'ste': 0.02,
        },
    },
}

In [4]:
# parsing dropout scores
scores['dropout'] = {'rmse': {}, 'nll': {}}
for path in sorted(glob('DropoutUncertaintyExps/UCI_Datasets/*')):
    dataset_name = path.split('/')[-1]
    with open(path + "/results/log_100_xepochs_1_hidden_layers.txt") as f:
        logtext = f.read()
        (rmse_raw, rmse_ste_raw) = re.findall(r'errors (-?\d+\.\d+) .* \+- (\d+\.\d+) \(std error\)', logtext)[0]
        rmse = float(rmse_raw)
        rmse_ste = float(rmse_ste_raw)
        scores['dropout']['rmse'][dataset_name] = {
            'score': rmse,
            'ste': rmse_ste,
        }
        (ll_raw, ll_ste_raw) = re.findall(r'lls (-?\d+\.\d+) .* \+- (\d+\.\d+) \(std error\)', logtext)[0]
        nll = -float(ll_raw)
        ll_ste = float(ll_ste_raw)
        scores['dropout']['nll'][dataset_name] = {
            'score': nll,
            'ste': ll_ste,
        }

In [5]:
datasets_order = {
    'bostonHousing': 'boston',
    'concrete': 'concrete',
    'energy': 'energy',
    'kin8nm': 'kin8nm',
    'naval-propulsion-plant': 'naval',
    'power-plant': 'power',
    'protein-tertiary-structure': 'protein',
    'wine-quality-red': 'wine',
    'yacht': 'yacht',
}

In [6]:
csv_results = {
    'ensembles': 'results/ensembles/scores.csv',
    'ensembles_gelu': 'results/ensembles_gelu/scores.csv',
    'mdn': 'results/mdn/scores.csv',
    'mdn_gelu': 'results/mdn_gelu/scores.csv',
    'wcrps': 'results/wcrps/scores.csv',
    'mdn_bnn': 'results/mdn_bnn/scores.csv',
    'mdn_bnn_gelu': 'results/mdn_bnn_gelu/scores.csv',
    'wcrps_ens': 'results/wcrps_ens/scores.csv',
    'crps': 'results/crps/scores.csv',
    'wcrps_mh': 'results/wcrps_mh/scores.csv',
    'wcrps_mh_ml': 'results/wcrps_mh_ml/scores.csv',
    'wcrps_ens_mh': 'results/wcrps_ens_mh/scores.csv',
    'wcrps_ens_mh_ml': 'results/wcrps_ens_mh_ml/scores.csv',
}
for method_name, score_path in csv_results.items():
    scores[method_name] = {'rmse': {}, 'nll': {}}
    for dataset, split_scores in pd.read_csv(score_path).groupby('dataset'):
        scores[method_name]['rmse'][dataset] = {
            'score': split_scores['rmse'].mean(),
            'ste': split_scores['rmse'].std()/np.sqrt(split_scores.shape[0]),
        }
        scores[method_name]['nll'][dataset] = {
            'score': split_scores['nll'].mean(),
            'ste': split_scores['nll'].std()/np.sqrt(split_scores.shape[0]),
        }

In [7]:
base_best = {'nll': {}}
base_best['nll']['naval-propulsion-plant'] = -5.64
base_best['nll']['yacht'] = 1.1
for bm in ['dropout', 'pbp-mv', 'ensembles', 'mdn', 'wcrps', 'mdn_bnn', 'wcrps_ens']:
    for dataset in datasets_order:
        base_best['nll'][dataset] = min(scores[bm]['nll'][dataset]['score'], base_best['nll'].get(dataset, np.inf))

In [8]:
methods_order = dict(
    **{
        'dropout': 'Dropout',
        'pbp-mv': 'PBP-MV',
        'ensembles': 'Ensembles',
        'mdn_gelu': 'MDN',
        'mdn_bnn': '$\\textrm{MDN}_{\\textrm{bnn}}$',
        'wcrps': 'WCRPS',
        'wcrps_ens': '$\\textrm{WCRPS}_{\\textrm{e}}$',
    },
)

In [9]:
format_score = "nll"
rows = []
for dataset in datasets_order.keys():
    samples_num = pd.read_csv('DropoutUncertaintyExps/UCI_Datasets/'+dataset+'/data/data.txt', header=None).shape[0]
    row = ["\\small %s"%datasets_order[dataset], "\\small %.1fk"%(samples_num/1000)]
    base_best_sc = base_best['nll'][dataset]

    for method in methods_order.keys():
        score = scores[method][format_score][dataset]['score']
        row.append(
            (("\\textbf{\small %.2f}" if score<=base_best_sc else "\small %.2f") + "{\\tiny\color{gray}$\pm$%.2f}")%(
                scores[method][format_score][dataset]['score'], scores[method][format_score][dataset]['ste'])
            )
    rows.append(row)

nll_df = pd.DataFrame(rows, columns=['\\small dataset', '\multicolumn{1}{c}{\\small n}'] + ['\multicolumn{1}{c}{\\small %s}'%v for k,v in methods_order.items()])
tab = nll_df.to_latex(
    index=False,
    bold_rows=True,
    float_format="%.2f",
    column_format="lr|rrr|rrrr",
    formatters=['{0}'.format, '{0}'.format] + len(datasets_order)*['{0}'.format],
    caption='NLL results on the UCI Datasets benchmark (lower is better)}\centering {',
    label='tab:nll_uci'
)
print(re.sub(r'-([0-9]+(\.[0-9]+)?)', r'$\\mathllap{\\textrm{-}}$\1', tab))

\begin{table}
\caption{NLL results on the UCI Datasets benchmark (lower is better)}\centering {}
\label{tab:nll_uci}
\begin{tabular}{lr|rrr|rrrr}
\toprule
\small dataset & \multicolumn{1}{c}{\small n} & \multicolumn{1}{c}{\small Dropout} & \multicolumn{1}{c}{\small PBP-MV} & \multicolumn{1}{c}{\small Ensembles} & \multicolumn{1}{c}{\small MDN} & \multicolumn{1}{c}{\small $\textrm{MDN}_{\textrm{bnn}}$} & \multicolumn{1}{c}{\small WCRPS} & \multicolumn{1}{c}{\small $\textrm{WCRPS}_{\textrm{e}}$} \\
\midrule
\small boston & \small 0.5k & \small 2.40{\tiny\color{gray}$\pm$0.04} & \small 2.54{\tiny\color{gray}$\pm$0.08} & \small 2.44{\tiny\color{gray}$\pm$0.05} & \small 2.52{\tiny\color{gray}$\pm$0.04} & \small 2.36{\tiny\color{gray}$\pm$0.03} & \small 2.40{\tiny\color{gray}$\pm$0.05} & \textbf{\small 2.32}{\tiny\color{gray}$\pm$0.05} \\
\small concrete & \small 1.0k & \textbf{\small 2.94}{\tiny\color{gray}$\pm$0.02} & \small 3.04{\tiny\color{gray}$\pm$0.03} & \small 2.97{\tiny\color{gray}$

In [10]:
base_best['rmse'] = {}
for bm in ['dropout', 'pbp-mv', 'ensembles', 'mdn_gelu', 'wcrps', 'mdn_bnn', 'wcrps_ens']:
    for dataset in datasets_order:
        base_best['rmse'][dataset] = min(scores[bm]['rmse'][dataset]['score'], base_best['rmse'].get(dataset, np.inf))

In [11]:
format_score = "rmse"
rows = []
for dataset in datasets_order.keys():
    samples_num = pd.read_csv('DropoutUncertaintyExps/UCI_Datasets/'+dataset+'/data/data.txt', header=None).shape[0]
    row = ["\\small %s"%datasets_order[dataset], "\\small %.1fk"%(samples_num/1000)]

    base_best_sc = round(base_best['rmse'][dataset], 2)
    if dataset=="naval-propulsion-plant":
        base_best_sc = -1 # no winner
    for method in methods_order.keys():
        score = scores[method][format_score][dataset]['score']
        row.append(
            (("\\textbf{\\small %.2f}" if round(score, 2)<=round(base_best_sc, 2) else "\small %.2f") + "{\\tiny\color{gray}$\pm$%.2f}")%(
                scores[method][format_score][dataset]['score'], scores[method][format_score][dataset]['ste'])
            )
    rows.append(row)


nll_df = pd.DataFrame(rows, columns=['dataset', '\multicolumn{1}{c}{n}'] + ['\multicolumn{1}{c}{\\small %s}'%v for k,v in methods_order.items()])
print(nll_df.to_latex(
    index=False,
    bold_rows=True,
    float_format="%.2f",
    column_format="lr|rrr|rrrr",
    formatters=['{0}'.format, '{0}'.format] + len(datasets_order)*['{0}'.format],
    caption='RMSE results on the UCI Datasets benchmark (lower is better)}\centering {',
    label='tab:rmse_uci'
))

\begin{table}
\caption{RMSE results on the UCI Datasets benchmark (lower is better)}\centering {}
\label{tab:rmse_uci}
\begin{tabular}{lr|rrr|rrrr}
\toprule
dataset & \multicolumn{1}{c}{n} & \multicolumn{1}{c}{\small Dropout} & \multicolumn{1}{c}{\small PBP-MV} & \multicolumn{1}{c}{\small Ensembles} & \multicolumn{1}{c}{\small MDN} & \multicolumn{1}{c}{\small $\textrm{MDN}_{\textrm{bnn}}$} & \multicolumn{1}{c}{\small WCRPS} & \multicolumn{1}{c}{\small $\textrm{WCRPS}_{\textrm{e}}$} \\
\midrule
\small boston & \small 0.5k & \small 3.61{\tiny\color{gray}$\pm$0.23} & \small 3.11{\tiny\color{gray}$\pm$0.15} & \small 3.37{\tiny\color{gray}$\pm$0.17} & \small 3.73{\tiny\color{gray}$\pm$0.25} & \small 2.93{\tiny\color{gray}$\pm$0.20} & \small 3.07{\tiny\color{gray}$\pm$0.23} & \textbf{\small 2.91}{\tiny\color{gray}$\pm$0.18} \\
\small concrete & \small 1.0k & \small 5.45{\tiny\color{gray}$\pm$0.19} & \small 5.08{\tiny\color{gray}$\pm$0.14} & \small 5.19{\tiny\color{gray}$\pm$0.19} & \small 5.

In [12]:
methods_order = dict(
    **{
        'crps': 'CRPS',
        'wcrps': 'WCRPS',
        'wcrps_mh': '$\\textrm{WCRPS}^{\\textrm{mh}}$',
        'wcrps_mh_ml': '$\\textrm{WCRPS}^{\\textrm{lmh}}$',
        'wcrps_ens': '$\\textrm{WCRPS}_{\\textrm{e}}$',
        'wcrps_ens_mh': '$\\textrm{WCRPS}^{\\textrm{mh}}_{\\textrm{e}}$',
        'wcrps_ens_mh_ml': '$\\textrm{WCRPS}^{\\textrm{lmh}}_{\\textrm{e}}$',
    },
)

In [13]:
format_score = "nll"
rows = []
for dataset in datasets_order.keys():
    samples_num = pd.read_csv('DropoutUncertaintyExps/UCI_Datasets/'+dataset+'/data/data.txt', header=None).shape[0]
    row = ["\\small %s"%datasets_order[dataset]]
    base_best_sc = base_best['nll'][dataset]

    for method in methods_order.keys():
        score = scores[method][format_score][dataset]['score']
        row.append(
            (("\\textbf{\small %.2f}" if score<=base_best_sc else "\small %.2f") + "{\\tiny\color{gray}$\pm$%.2f}")%(
                scores[method][format_score][dataset]['score'], scores[method][format_score][dataset]['ste'])
            )
    rows.append(row)

nll_df = pd.DataFrame(rows, columns=['\\small dataset'] + ['\multicolumn{1}{c}{\\small %s}'%v for k,v in methods_order.items()])
tab = nll_df.to_latex(
    index=False,
    bold_rows=True,
    float_format="%.2f",
    column_format="l|r|rrr|rrr",
    formatters=['{0}'.format, '{0}'.format] + len(datasets_order)*['{0}'.format],
    caption='NLL results on the UCI Datasets benchmark (lower is better). Scores that perform at least on par when compared to the results in Table~\\ref{tab:nll_uci} are highlighted in \\textbf{bold}.}\centering {',
    label='tab:nll_uci_full'
)
print(re.sub(r'-([0-9]+(\.[0-9]+)?)', r'$\\mathllap{\\textrm{-}}$\1', tab))

\begin{table}
\caption{NLL results on the UCI Datasets benchmark (lower is better). Scores that perform at least on par when compared to the results in Table~\ref{tab:nll_uci} are highlighted in \textbf{bold}.}\centering {}
\label{tab:nll_uci_full}
\begin{tabular}{l|r|rrr|rrr}
\toprule
\small dataset & \multicolumn{1}{c}{\small CRPS} & \multicolumn{1}{c}{\small WCRPS} & \multicolumn{1}{c}{\small $\textrm{WCRPS}^{\textrm{mh}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}^{\textrm{lmh}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}_{\textrm{e}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}^{\textrm{mh}}_{\textrm{e}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}^{\textrm{lmh}}_{\textrm{e}}$} \\
\midrule
\small boston & \small 2.38{\tiny\color{gray}$\pm$0.06} & \small 2.40{\tiny\color{gray}$\pm$0.05} & \small 2.40{\tiny\color{gray}$\pm$0.04} & \small 2.40{\tiny\color{gray}$\pm$0.06} & \textbf{\small 2.32}{\tiny\color{gray}$\pm$0.05} & \textbf{\small 2.32}{\tiny\color{gray}$\pm$0.05} & \tex

In [14]:
format_score = "rmse"
rows = []
for dataset in datasets_order.keys():
    samples_num = pd.read_csv('DropoutUncertaintyExps/UCI_Datasets/'+dataset+'/data/data.txt', header=None).shape[0]
    row = ["\\small %s"%datasets_order[dataset]]

    base_best_sc = round(base_best['rmse'][dataset], 2)
    if dataset=="naval-propulsion-plant":
        base_best_sc = -1 # no winner
    for method in methods_order.keys():
        score = scores[method][format_score][dataset]['score']
        row.append(
            (("\\textbf{\\small %.2f}" if round(score, 2)<=round(base_best_sc, 2) else "\small %.2f") + "{\\tiny\color{gray}$\pm$%.2f}")%(
                scores[method][format_score][dataset]['score'], scores[method][format_score][dataset]['ste'])
            )
    rows.append(row)


nll_df = pd.DataFrame(rows, columns=['dataset'] + ['\multicolumn{1}{c}{\\small %s}'%v for k,v in methods_order.items()])
print(nll_df.to_latex(
    index=False,
    bold_rows=True,
    float_format="%.2f",
    column_format="l|r|rrr|rrr",
    formatters=['{0}'.format, '{0}'.format] + len(datasets_order)*['{0}'.format],
    caption='RMSE results on the UCI Datasets benchmark (lower is better).  Scores that perform at least on par when compared to the results in Table~\\ref{tab:rmse_uci} are highlighted in \\textbf{bold}.}\centering {',
    label='tab:rmse_uci_full'
))

\begin{table}
\caption{RMSE results on the UCI Datasets benchmark (lower is better).  Scores that perform at least on par when compared to the results in Table~\ref{tab:rmse_uci} are highlighted in \textbf{bold}.}\centering {}
\label{tab:rmse_uci_full}
\begin{tabular}{l|r|rrr|rrr}
\toprule
dataset & \multicolumn{1}{c}{\small CRPS} & \multicolumn{1}{c}{\small WCRPS} & \multicolumn{1}{c}{\small $\textrm{WCRPS}^{\textrm{mh}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}^{\textrm{lmh}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}_{\textrm{e}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}^{\textrm{mh}}_{\textrm{e}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}^{\textrm{lmh}}_{\textrm{e}}$} \\
\midrule
\small boston & \small 3.01{\tiny\color{gray}$\pm$0.21} & \small 3.07{\tiny\color{gray}$\pm$0.23} & \small 3.08{\tiny\color{gray}$\pm$0.20} & \small 3.12{\tiny\color{gray}$\pm$0.25} & \textbf{\small 2.91}{\tiny\color{gray}$\pm$0.18} & \small 2.98{\tiny\color{gray}$\pm$0.20} & \small 2.93{\tin

In [15]:
methods_order = dict(
    **{
        'ensembles': 'Ensembles',
        'ensembles_gelu': '$\\textrm{Ensembles}^{\\textrm{gelu}}$',
        'mdn': 'MDN',
        'mdn_gelu': '$\\textrm{MDN}^{\\textrm{gelu}}$',
        'mdn_bnn': '$\\textrm{MDN}_{\\textrm{bnn}}$',
        'mdn_bnn_gelu': '$\\textrm{MDN}^{\\textrm{gelu}}_{\\textrm{bnn}}$',
        'wcrps_ens': '$\\textrm{WCRPS}_{\\textrm{e}}$',
    },
)

In [16]:
pairwise_best = {}
for dataset in datasets_order:
    pairwise_best[dataset] = {}
    for bm1, bm2 in [['ensembles', 'ensembles_gelu'], ['mdn', 'mdn_gelu'], ['mdn_bnn', 'mdn_bnn_gelu']]:
        sc1 = round(scores[bm1]['nll'][dataset]['score'], 2)
        sc2 = round(scores[bm2]['nll'][dataset]['score'], 2)
        if sc1 <= sc2:
            pairwise_best[dataset][bm1] = True
        if sc2 <= sc1:
            pairwise_best[dataset][bm2] = True

In [17]:
format_score = "nll"
rows = []
for dataset in datasets_order.keys():
    samples_num = pd.read_csv('DropoutUncertaintyExps/UCI_Datasets/'+dataset+'/data/data.txt', header=None).shape[0]
    row = ["\\small %s"%datasets_order[dataset]]
    base_best_sc = base_best['nll'][dataset]

    for method in methods_order.keys():
        score = scores[method][format_score][dataset]['score']
        sc = ("\\textbf{\small %.2f}" if score<=base_best_sc else "\small %.2f")%scores[method][format_score][dataset]['score']
        sc = ("\\underline{%s}" if pairwise_best[dataset].get(method, False) else "%s")%sc
        sc += "{\\tiny\color{gray}$\pm$%.2f}"%scores[method][format_score][dataset]['ste']
        row.append(sc)
    rows.append(row)

nll_df = pd.DataFrame(rows, columns=['\\small dataset'] + ['\multicolumn{1}{c}{\\small %s}'%v for k,v in methods_order.items()])
tab = nll_df.to_latex(
    index=False,
    bold_rows=True,
    float_format="%.2f",
    column_format="l|rr|rr|rr|r",
    formatters=['{0}'.format, '{0}'.format] + len(datasets_order)*['{0}'.format],
    caption='Pairwise comparison of ReLU and GELU based baseline variants, evaluated in NLL. Pairwise best scores are highlighted with \\underline{underline}. Scores that perform at least on par when compared to the results in Table~\\ref{tab:nll_uci} are highlighted in \\textbf{bold}.}\centering {',
    label='tab:nll_uci_gelu'
)
print(re.sub(r'-([0-9]+(\.[0-9]+)?)', r'$\\mathllap{\\textrm{-}}$\1', tab))

\begin{table}
\caption{Pairwise comparison of ReLU and GELU based baseline variants, evaluated in NLL. Pairwise best scores are highlighted with \underline{underline}. Scores that perform at least on par when compared to the results in Table~\ref{tab:nll_uci} are highlighted in \textbf{bold}.}\centering {}
\label{tab:nll_uci_gelu}
\begin{tabular}{l|rr|rr|rr|r}
\toprule
\small dataset & \multicolumn{1}{c}{\small Ensembles} & \multicolumn{1}{c}{\small $\textrm{Ensembles}^{\textrm{gelu}}$} & \multicolumn{1}{c}{\small MDN} & \multicolumn{1}{c}{\small $\textrm{MDN}^{\textrm{gelu}}$} & \multicolumn{1}{c}{\small $\textrm{MDN}_{\textrm{bnn}}$} & \multicolumn{1}{c}{\small $\textrm{MDN}^{\textrm{gelu}}_{\textrm{bnn}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}_{\textrm{e}}$} \\
\midrule
\small boston & \small 2.44{\tiny\color{gray}$\pm$0.05} & \underline{\small 2.41}{\tiny\color{gray}$\pm$0.05} & \underline{\small 2.45}{\tiny\color{gray}$\pm$0.06} & \small 2.52{\tiny\color{gray}$\pm$0.04} & \s

In [18]:
pairwise_best = {}
for dataset in datasets_order:
    pairwise_best[dataset] = {}
    for bm1, bm2 in [['ensembles', 'ensembles_gelu'], ['mdn', 'mdn_gelu'], ['mdn_bnn', 'mdn_bnn_gelu']]:
        sc1 = round(scores[bm1]['rmse'][dataset]['score'], 2)
        sc2 = round(scores[bm2]['rmse'][dataset]['score'], 2)
        if sc1 <= sc2:
            pairwise_best[dataset][bm1] = True
        if sc2 <= sc1:
            pairwise_best[dataset][bm2] = True

In [19]:
format_score = "rmse"
rows = []
for dataset in datasets_order.keys():
    samples_num = pd.read_csv('DropoutUncertaintyExps/UCI_Datasets/'+dataset+'/data/data.txt', header=None).shape[0]
    row = ["\\small %s"%datasets_order[dataset]]
    base_best_sc = base_best['nll'][dataset]

    for method in methods_order.keys():
        score = scores[method][format_score][dataset]['score']
        sc = ("\\textbf{\small %.2f}" if score<=base_best_sc else "\small %.2f")%scores[method][format_score][dataset]['score']
        sc = ("\\underline{%s}" if pairwise_best[dataset].get(method, False) else "%s")%sc
        sc += "{\\tiny\color{gray}$\pm$%.2f}"%scores[method][format_score][dataset]['ste']
        row.append(sc)
    rows.append(row)

nll_df = pd.DataFrame(rows, columns=['\\small dataset'] + ['\multicolumn{1}{c}{\\small %s}'%v for k,v in methods_order.items()])
tab = nll_df.to_latex(
    index=False,
    bold_rows=True,
    float_format="%.2f",
    column_format="l|rr|rr|rr|r",
    formatters=['{0}'.format, '{0}'.format] + len(datasets_order)*['{0}'.format],
    caption='Pairwise comparison of ReLU and GELU based baseline variants, evaluated in RMSE. Pairwise best scores are highlighted with \\underline{underline}.}\centering {',
    label='tab:rmse_uci_gelu'
)
print(re.sub(r'-([0-9]+(\.[0-9]+)?)', r'$\\mathllap{\\textrm{-}}$\1', tab))

\begin{table}
\caption{Pairwise comparison of ReLU and GELU based baseline variants, evaluated in RMSE. Pairwise best scores are highlighted with \underline{underline}.}\centering {}
\label{tab:rmse_uci_gelu}
\begin{tabular}{l|rr|rr|rr|r}
\toprule
\small dataset & \multicolumn{1}{c}{\small Ensembles} & \multicolumn{1}{c}{\small $\textrm{Ensembles}^{\textrm{gelu}}$} & \multicolumn{1}{c}{\small MDN} & \multicolumn{1}{c}{\small $\textrm{MDN}^{\textrm{gelu}}$} & \multicolumn{1}{c}{\small $\textrm{MDN}_{\textrm{bnn}}$} & \multicolumn{1}{c}{\small $\textrm{MDN}^{\textrm{gelu}}_{\textrm{bnn}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}_{\textrm{e}}$} \\
\midrule
\small boston & \small 3.37{\tiny\color{gray}$\pm$0.17} & \underline{\small 3.27}{\tiny\color{gray}$\pm$0.24} & \underline{\small 3.54}{\tiny\color{gray}$\pm$0.27} & \small 3.73{\tiny\color{gray}$\pm$0.25} & \underline{\small 2.93}{\tiny\color{gray}$\pm$0.20} & \small 3.10{\tiny\color{gray}$\pm$0.22} & \small 2.91{\tiny\color{gray}$

In [20]:
format_score = "rmse"
rows = []
for dataset in datasets_order.keys():
    samples_num = pd.read_csv('DropoutUncertaintyExps/UCI_Datasets/'+dataset+'/data/data.txt', header=None).shape[0]
    row = ["\\small %s"%datasets_order[dataset], "\\small %.1fk"%(samples_num/1000)]

    base_best_sc = round(base_best['rmse'][dataset], 2)
    if dataset=="naval-propulsion-plant":
        base_best_sc = -1 # no winner
    for method in methods_order.keys():
        score = scores[method][format_score][dataset]['score']
        row.append(
            (("\\textbf{\\small %.2f}" if round(score, 2)<=round(base_best_sc, 2) else "\small %.2f") + "{\\tiny\color{gray}$\pm$%.2f}")%(
                scores[method][format_score][dataset]['score'], scores[method][format_score][dataset]['ste'])
            )
    rows.append(row)


nll_df = pd.DataFrame(rows, columns=['dataset', '\multicolumn{1}{c}{n}'] + ['\multicolumn{1}{c}{\\small %s}'%v for k,v in methods_order.items()])
print(nll_df.to_latex(
    index=False,
    bold_rows=True,
    float_format="%.2f",
    column_format="lr|rr|rr|rr",
    formatters=['{0}'.format, '{0}'.format] + len(datasets_order)*['{0}'.format],
    caption='RMSE results on the UCI Datasets benchmark (lower is better)}\centering {',
    label='tab:rmse_uci_gelu'
))

\begin{table}
\caption{RMSE results on the UCI Datasets benchmark (lower is better)}\centering {}
\label{tab:rmse_uci_gelu}
\begin{tabular}{lr|rr|rr|rr}
\toprule
dataset & \multicolumn{1}{c}{n} & \multicolumn{1}{c}{\small Ensembles} & \multicolumn{1}{c}{\small $\textrm{Ensembles}^{\textrm{gelu}}$} & \multicolumn{1}{c}{\small MDN} & \multicolumn{1}{c}{\small $\textrm{MDN}^{\textrm{gelu}}$} & \multicolumn{1}{c}{\small $\textrm{MDN}_{\textrm{bnn}}$} & \multicolumn{1}{c}{\small $\textrm{MDN}^{\textrm{gelu}}_{\textrm{bnn}}$} & \multicolumn{1}{c}{\small $\textrm{WCRPS}_{\textrm{e}}$} \\
\midrule
\small boston & \small 0.5k & \small 3.37{\tiny\color{gray}$\pm$0.17} & \small 3.27{\tiny\color{gray}$\pm$0.24} & \small 3.54{\tiny\color{gray}$\pm$0.27} & \small 3.73{\tiny\color{gray}$\pm$0.25} & \small 2.93{\tiny\color{gray}$\pm$0.20} & \small 3.10{\tiny\color{gray}$\pm$0.22} & \textbf{\small 2.91}{\tiny\color{gray}$\pm$0.18} \\
\small concrete & \small 1.0k & \small 5.19{\tiny\color{gray}$\pm$0.1

# value frequency stats

In [21]:
frequency = []
for dataset, dname in datasets_order.items():
    data = np.loadtxt('DropoutUncertaintyExps/UCI_Datasets/' + dataset + '/data/data.txt')
    index_target   = np.loadtxt('DropoutUncertaintyExps/UCI_Datasets/' + dataset + '/data/index_target.txt').astype(int)
    Y = data[ : , index_target.tolist() ]
    print(dname, data.shape[0], np.unique(Y).shape[0], pd.Series(Y).value_counts().iloc[0], np.unique(Y).shape[0]/data.shape[0])
    frequency.append((
        dname,
        data.shape[0],
        np.unique(Y).shape[0],
        (np.unique(Y).shape[0]/data.shape[0])*100,
        pd.Series(Y).value_counts().iloc[0],
        (pd.Series(Y).value_counts().iloc[0]/data.shape[0])*100,
    ))

boston 506 229 16 0.4525691699604743
concrete 1030 845 6 0.8203883495145631
energy 768 586 6 0.7630208333333334
kin8nm 8192 8191 2 0.9998779296875
naval 11934 51 234 0.004273504273504274
power 9568 4836 9 0.5054347826086957
protein 45730 15903 272 0.34775858298709816
wine 1599 6 681 0.00375234521575985
yacht 308 258 3 0.8376623376623377


In [22]:
frequency_df = pd.DataFrame(frequency, columns=['dataset', 'n', 'unique y', 'unique / n ratio', 'most frequent y', 'most frequent / n'])
frequency_df

Unnamed: 0,dataset,n,unique y,unique / n ratio,most frequent y,most frequent / n
0,boston,506,229,45.256917,16,3.162055
1,concrete,1030,845,82.038835,6,0.582524
2,energy,768,586,76.302083,6,0.78125
3,kin8nm,8192,8191,99.987793,2,0.024414
4,naval,11934,51,0.42735,234,1.960784
5,power,9568,4836,50.543478,9,0.094064
6,protein,45730,15903,34.775858,272,0.594796
7,wine,1599,6,0.375235,681,42.589118
8,yacht,308,258,83.766234,3,0.974026


In [23]:
print(frequency_df.to_latex(
    index=False,
    bold_rows=True,
    #float_format="%.2f",
    column_format="lrrrrr",
    formatters=['{0}'.format, '{0}'.format, '{0}'.format, '{:.2f}\%'.format, '{0}'.format, '{:.2f}\%'.format],
    caption='Value frequencies in the UCI benchmark datasets}\centering {',
    label='tab:frequency'
))

\begin{table}
\caption{Value frequencies in the UCI benchmark datasets}\centering {}
\label{tab:frequency}
\begin{tabular}{lrrrrr}
\toprule
dataset & n & unique y & unique / n ratio & most frequent y & most frequent / n \\
\midrule
boston & 506 & 229 & 45.26\% & 16 & 3.16\% \\
concrete & 1030 & 845 & 82.04\% & 6 & 0.58\% \\
energy & 768 & 586 & 76.30\% & 6 & 0.78\% \\
kin8nm & 8192 & 8191 & 99.99\% & 2 & 0.02\% \\
naval & 11934 & 51 & 0.43\% & 234 & 1.96\% \\
power & 9568 & 4836 & 50.54\% & 9 & 0.09\% \\
protein & 45730 & 15903 & 34.78\% & 272 & 0.59\% \\
wine & 1599 & 6 & 0.38\% & 681 & 42.59\% \\
yacht & 308 & 258 & 83.77\% & 3 & 0.97\% \\
\bottomrule
\end{tabular}
\end{table}

