## Post-Hoc Analysis

In [12]:
import os
import json
import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare

# Record the pandas version in use when this notebook was executed.
print(pd.__version__)

1.2.4


### Loading the results

In [13]:
# Note: the results are collated only once the runs are complete for all datasets.
# Source file for each dataset: main_results_data_frame.csv
#
# One of the method labels in the results is the literal string "None".
# pandas >= 2.0 treats "None" as a missing value by default, which would turn
# that label into NaN and silently drop the method from any pivot on `method`.
# The NA markers are therefore spelled out explicitly (the pandas 1.x default
# set) so the file parses identically on every pandas version.
df = pd.read_csv(
    "results.csv",
    keep_default_na=False,
    na_values=[
        '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan',
        'null',
    ],
)

### Friedman test

In [14]:
# Methods compared by the Friedman test in display_table.
methods = ['None', 'var_bayes', 'sigmoid', 'isotonic', 'beta']
# Presumably the column layout of results.csv; not referenced by display_table.
columns = ['dataset', 'method', 'acc', 'loss', 'brier']


def _report_friedman(table, alpha):
    """Run the Friedman test on the `methods` columns of `table` and print the verdict.

    `table` is the dataset x method pivot built by display_table; its last
    column level holds the method labels.
    """
    # Select the compared methods by name. The pivot sorts its columns
    # alphabetically, so slicing the first len(methods) columns by position
    # would pick the wrong ones as soon as the data holds any other method.
    method_labels = table.columns.get_level_values(-1)
    tested = table.loc[:, method_labels.isin(methods)]

    # The test needs one measurement per method for every dataset. A missing
    # cell would make the statistic NaN, so incomplete datasets are left out
    # of the test (they still appear in the displayed table).
    complete = tested.dropna(axis=0, how='any')
    skipped = tested.index.difference(complete.index)
    if len(skipped) > 0:
        print(f'Skipping {len(skipped)} dataset(s) with missing results: {list(skipped)}')

    measurements = list(complete.to_numpy().T)  # one 1-D array per method
    stat, p_value = friedmanchisquare(*measurements)
    print(f'Statistic = {stat:.6f}, p-value = {p_value:.6f}')
    if np.isnan(p_value):
        # NaN compares False against alpha and would otherwise be reported
        # as a rejection of the null hypothesis.
        print('Test undefined (statistic is NaN)')
    elif p_value > alpha:
        print('Same distributions (fail to reject the null hypothesis)')
    else:
        print('Different distributions (reject null hypothesis)')


def display_table(df, metric, alpha=0.05, highlight=True, friedman_test=True):
    """Build the dataset x method table of mean `metric` values.

    1. Optionally performs the Friedman test across the methods listed in
       `methods` and prints the statistic, p-value and verdict.
    2. Optionally highlights the best value per dataset (row): the maximum
       for 'acc', the minimum for every other metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format results with 'dataset' and 'method' columns and a column
        named by `metric`.
    metric : str
        Name of the column to aggregate.
    alpha : float
        Significance level the p-value is compared against.
    highlight : bool
        If True return a pandas Styler with the best value per row
        highlighted; otherwise return the plain DataFrame.
    friedman_test : bool
        If True run and print the Friedman test.

    Returns
    -------
    pandas.io.formats.style.Styler or pandas.DataFrame

    Raises
    ------
    ValueError
        Raised by scipy when fewer than three methods are available for the
        Friedman test.
    """
    print(f"Metric = {metric}")
    # The string 'mean' yields the same 'mean' column label as np.mean.
    df_loss = df.pivot_table(index=['dataset'], columns=['method'],
                             values=[metric], aggfunc=['mean'])

    if friedman_test:
        _report_friedman(df_loss, alpha)

    if not highlight:
        return df_loss
    if metric != 'acc':
        # Lower is better for every metric except accuracy.
        return df_loss.style.highlight_min(color='yellow', axis=1)
    return df_loss.style.highlight_max(color='yellow', axis=1)

In [9]:
# Lower loss is better, so display_table highlights the minimum in each row.
display_table(
    df,
    metric='loss',
    alpha=0.05,
    highlight=True,
    friedman_test=True,
)

Metric = loss
Statistic = 23.137255, p-value = 0.000119
Different distributions (reject null hypothesis)


Unnamed: 0_level_0,mean,mean,mean,mean,mean
Unnamed: 0_level_1,loss,loss,loss,loss,loss
method,None,beta,isotonic,sigmoid,var_bayes
dataset,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
abalone,0.59789,0.598148,0.691808,0.602227,0.615603
balance-scale,0.098068,0.080771,0.092153,0.089347,0.069333
credit-approval,0.491384,0.522863,0.897678,0.437511,0.457618
german,0.525405,0.517368,0.532982,0.517461,0.565519
ionosphere,0.385311,0.310725,0.334547,0.308752,0.330256
landsat-satellite,0.549391,0.549391,0.549391,0.549391,0.065981
letter,0.015198,0.015066,0.015491,0.022324,0.012077
mfeat-karhunen,0.056259,0.052115,0.165397,0.121016,0.067857
mfeat-morphological,0.325083,0.325096,0.325096,0.325096,0.000206
mfeat-zernike,0.092498,0.070683,0.166497,0.048465,0.073144


In [10]:
# Same table without styling, printed as LaTeX source.
print(
    display_table(
        df, metric='loss', alpha=0.05, highlight=False, friedman_test=True
    ).to_latex()
)

Metric = loss
Statistic = 23.137255, p-value = 0.000119
Different distributions (reject null hypothesis)
\begin{tabular}{lrrrrr}
\toprule
{} & \multicolumn{5}{l}{mean} \\
{} & \multicolumn{5}{l}{loss} \\
method &      None &      beta &  isotonic &   sigmoid & var\_bayes \\
dataset             &           &           &           &           &           \\
\midrule
abalone             &  0.597890 &  0.598148 &  0.691808 &  0.602227 &  0.615603 \\
balance-scale       &  0.098068 &  0.080771 &  0.092153 &  0.089347 &  0.069333 \\
credit-approval     &  0.491384 &  0.522863 &  0.897678 &  0.437511 &  0.457618 \\
german              &  0.525405 &  0.517368 &  0.532982 &  0.517461 &  0.565519 \\
ionosphere          &  0.385311 &  0.310725 &  0.334547 &  0.308752 &  0.330256 \\
landsat-satellite   &  0.549391 &  0.549391 &  0.549391 &  0.549391 &  0.065981 \\
letter              &  0.015198 &  0.015066 &  0.015491 &  0.022324 &  0.012077 \\
mfeat-karhunen      &  0.056259 &  0.052115 &  0.165

### Critical Difference Diagrams

We use the following MATLAB implementation to compute the average ranks and plot the critical difference (CD) diagram from `results.csv`: https://github.com/aciditeam/matlab-ts/blob/master/criticaldifference.m

EOF