In [1]:
%%javascript

IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

In [2]:
import sys
sys.path.append('../../')
import sys
sys.path.remove('/anaconda3/lib/python3.7/site-packages/bayesian_benchmarks-alpha-py3.7.egg')
from matplotlib import pyplot
import numpy as np
import pandas
from matplotlib import pyplot as plt
from scipy.stats import rankdata
from IPython.display import display, HTML

from bayesian_benchmarks.database_utils import Database
from bayesian_benchmarks.data import  _ALL_REGRESSION_DATATSETS
ALL_DATATSETS = {}
ALL_DATATSETS.update(_ALL_REGRESSION_DATATSETS)
from bayesian_benchmarks.data import regression_datasets


In [3]:
def rankarray(A):
    ranks = []
    for a in A:
        ranks.append(rankdata(a))
    return np.array(ranks)


def read_regression_classification(fs, models_names, datasets, task,file='results.db'):
    if task == 'classification':
        fields = ['dataset', 'N', 'D', 'K'] + [m[1] for m in models_names]
    else:
        fields = ['dataset', 'N', 'D'] + [m[1] for m in models_names]

    results = {}
    for f in fs:
        results[f] = {'table':{f:[] for f in fields}, 'vals':[]}

    with Database('../results/'+file) as db:

        for dataset in datasets:
            for f in fs:
                results[f]['table']['dataset'].append(dataset[:10])
                results[f]['table']['N'].append(ALL_DATATSETS[dataset].N)
                results[f]['table']['D'].append(ALL_DATATSETS[dataset].D)
                if task == 'classification':
                    results[f]['table']['K'].append(ALL_DATATSETS[dataset].K)

            row = {f:[] for f in fs}
            for model, name in models_names:
                res = db.read(task, fs, {'model':model, 
                                         'dataset':dataset})
                print(dataset,model,len(res))
                    
                if len(res) == 0:
                    for f in fs:
                        results[f]['table'][name].append('')
                        row[f].append(np.nan)
                else:
                    for i, f in enumerate(fs):
                        L = [float(l[i]) for l in res]
                        m = np.average(L)
                        std = np.std(L) if len(L) > 1 else np.nan
                        if m < 1000 and m > -1000:
                            r = '{:.3f}({:.3f})'.format(m, std)
                            row[f].append(m)
                        else:
                            r = 'nan'
                            row[f].append(np.nan)

                        results[f]['table'][name].append(r)

            #             stderr = np.std(L)/float(len(L))**0.5
            #             r = '{:.3f} ({:.3f})'.format(m, stderr)
            for f in fs:   
                results[f]['vals'].append(row[f])


    for f in fs:
        if 'unnormalized' not in f:
            vals = np.array(results[f]['vals'])

            avgs = np.nanmean(vals, 0)
            meds = np.nanmedian(vals, 0)
            rks = np.nanmean(rankarray(vals), 0)

            for s, n in [[avgs, 'avg'], [meds, 'median'], [rks, 'avg rank']]:
                results[f]['table']['dataset'].append(n)
                results[f]['table']['N'].append('')
                results[f]['table']['D'].append('')
                if task == 'classification':
                    results[f]['table']['K'].append('')
                for ss, name in zip(s, [m[1] for m in models_names]):
                    results[f]['table'][name].append('{:.3f}'.format(ss))
    
    return results, fields

In [6]:
models_names = [
               
                ['gPoE_50_100_clustering_variance', 'gPoE_50_100_var'],
               ['bar_50_100_clustering_variance', 'bar_50_100_var'],
               ['PoE_1_100_clustering_no_weights', 'PoE_100'],
               ['BCM_1_100_clustering_no_weights', 'BCM_100'],
               ['gp', 'gp'],
               ['linear', 'linear']]
                

fs = 'test_loglik', 'test_rmse', 'test_loglik_unnormalized', 'test_rmse_unnormalized'

results, fields = read_regression_classification(fs, models_names, regression_datasets, 'regression')


concrete gPoE_50_100_clustering_variance 1
concrete bar_50_100_clustering_variance 1
concrete PoE_1_100_clustering_no_weights 1
concrete BCM_1_100_clustering_no_weights 1
concrete gp 3
concrete linear 10
power gPoE_50_100_clustering_variance 3
power bar_50_100_clustering_variance 3
power PoE_1_100_clustering_no_weights 3
power BCM_1_100_clustering_no_weights 3
power gp 0
power linear 10
protein gPoE_50_100_clustering_variance 0
protein bar_50_100_clustering_variance 0
protein PoE_1_100_clustering_no_weights 0
protein BCM_1_100_clustering_no_weights 0
protein gp 0
protein linear 10
wilson_airfoil gPoE_50_100_clustering_variance 3
wilson_airfoil bar_50_100_clustering_variance 3
wilson_airfoil PoE_1_100_clustering_no_weights 3
wilson_airfoil BCM_1_100_clustering_no_weights 3
wilson_airfoil gp 1
wilson_airfoil linear 10
wilson_kin40k gPoE_50_100_clustering_variance 1
wilson_kin40k bar_50_100_clustering_variance 1
wilson_kin40k PoE_1_100_clustering_no_weights 1
wilson_kin40k BCM_1_100_clust

In [7]:

print('normalised test loglikelihood')
display(HTML(pandas.DataFrame(results['test_loglik']['table'], columns=fields).to_html(index=False)))


print('normalised test rmse')
display(HTML(pandas.DataFrame(results['test_rmse']['table'], columns=fields).to_html(index=False)))


normalised test loglikelihood


dataset,N,D,gPoE_50_100_var,bar_50_100_var,PoE_100,BCM_100,gp,linear
concrete,1030.0,8.0,-0.333(nan),-0.333(nan),-0.420(nan),-0.359(nan),-0.251(0.046),-0.953(0.052)
power,9568.0,4.0,0.035(0.022),0.014(0.011),-0.462(0.018),0.016(0.017),,-0.098(0.031)
protein,45730.0,9.0,,,,,,-1.257(0.005)
wilson_air,1503.0,5.0,-0.437(0.129),-0.437(0.129),-0.754(0.093),-0.420(0.130),-0.353(nan),-1.096(0.070)
wilson_kin,40000.0,8.0,0.353(nan),0.355(nan),-60.728(nan),-7.825(nan),,-1.419(0.013)
wilson_par,5875.0,20.0,-0.137(0.039),-0.134(0.037),-10.880(0.957),-0.674(0.253),,-1.282(0.025)
avg,,,-0.104,-0.107,-14.649,-1.853,-0.302,-1.018
median,,,-0.137,-0.134,-0.754,-0.420,-0.302,-1.176
avg rank,,,4.000,3.833,1.833,3.667,6.000,1.667


normalised test rmse


dataset,N,D,gPoE_50_100_var,bar_50_100_var,PoE_100,BCM_100,gp,linear
concrete,1030.0,8.0,0.374(nan),0.374(nan),0.384(nan),0.392(nan),0.329(0.018),0.626(0.031)
power,9568.0,4.0,0.232(0.004),0.238(0.003),0.319(0.003),0.236(0.003),,0.267(0.008)
protein,45730.0,9.0,,,,,,0.850(0.004)
wilson_air,1503.0,5.0,0.350(0.044),0.350(0.044),0.494(0.025),0.341(0.045),0.330(nan),0.721(0.047)
wilson_kin,40000.0,8.0,0.178(nan),0.172(nan),0.699(nan),0.398(nan),,1.000(0.013)
wilson_par,5875.0,20.0,0.351(0.023),0.350(0.023),0.725(0.018),0.388(0.012),,0.871(0.021)
avg,,,0.297,0.297,0.524,0.351,0.330,0.723
median,,,0.350,0.350,0.494,0.388,0.330,0.785
avg rank,,,2.167,2.333,4.333,3.333,4.333,4.500
