# Benchmark results reporting

## Setup

### Prerequisites
This notebook requires a kernel running Python 3.6+ (the plotting cells use f-strings).
You can skip this section if the kernel is already configured.

In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
#!pip install jupyter_contrib_nbextensions
#!jupyter contrib nbextension install --user
#!jupyter nbextension enable python-markdown/main
#!pip install jupyter_nbextensions_configurator
#!jupyter nbextensions_configurator enable --user

### Imports and selection of the results directory

In [None]:
from IPython import display as idisplay
import functools as ft
import os

import pandas as pd
import numpy as np
import matplotlib as mp
import scipy as sp
import seaborn as sb
import warnings

# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib/seaborn.
warnings.filterwarnings('ignore')

## Results

##### Parameters

In [None]:
# Number of folds expected per (task, framework); used to detect missing folds.
nfolds = 10
# printf-style float format used for on-screen tables and CSV exports.
ff = '%.6g'
# Default matplotlib colormap name used by the parallel-coordinates and strip plots.
colormap = 'tab10'
# colormap = 'Set2'
# colormap = 'Dark2'

# Value replacements applied to the loaded results (old value -> new value).
renamings = dict(
    constantpredictor_enc='constantpredictor'
)
# Frameworks removed from the results right after loading.
excluded_frameworks = ['oboe']
# Metric names shown in plot titles and axis labels.
binary_score_label = 'AUC'
multiclass_score_label = 'logloss'

# Framework whose per-fold value replaces a missing result/score.
# impute_missing_with = 'constantpredictor'
impute_missing_with = 'randomforest'
# Frameworks whose scores are mapped to 0 and 1 respectively by `norm_score`.
zero_one_refs = ('constantpredictor', 'tunedrandomforest')

# Result CSV files, grouped by benchmark run.
all_results_files = {
    'old': [
        "results_valid_ref.csv", "results_valid.csv",
        "results_small-2c1h_ref.csv", "results_small-2c1h.csv",
        "results_medium-4c1h_ref.csv", "results_medium-4c1h.csv",
        "results_medium-4c4h_ref.csv", "results_medium-4c4h.csv",
    ],
    '1h': [
        "results_small-8c1h_ref.csv", "results_small-8c1h.csv",
        "results_medium-8c1h_ref.csv", "results_medium-8c1h.csv",
    ],
    '4h': [
        "results_small-8c4h_ref.csv", "results_small-8c4h.csv",
        "results_medium-8c4h_ref.csv", "results_medium-8c4h.csv",
        "results_large-8c4h_ref.csv", "results_large-8c4h.csv",
    ],
    '8h': [
        "results_large-8c8h_ref.csv", "results_large-8c8h.csv",
    ]
}

# Group to report on; also used as the sub-directory name for exported tables and graphics.
results_group = '4h'
results_files = all_results_files[results_group]

#### Loading results, formatting and adding columns
- `result` is the raw result metric computed from predictions at the end of the benchmark.
    For classification problems, it is usually `auc` for binomial classification and `logloss` for multinomial classification.
- `score` ensures a standard comparison between tasks: **higher is always better**.
- `norm_score` is a normalization of `score` on a `[0, 1]` scale, with `{{zero_one_refs[0]}}` score as `0` and `{{zero_one_refs[1]}}` score as `1`.
- `imp_result` and `imp_score` for imputed results/scores. Given a task and a framework:
    - if **all folds results/scores are missing**, then no imputation occurs, and the result is `nan` for each fold.
    - if **only some folds results/scores are missing**, then the missing result is imputed by the `{{impute_missing_with}}` result for this fold.

In [None]:
def load_results(files=results_files):
    """Read every CSV listed in `files` and stack them into a single frame.

    The returned frame gets a fresh 0..n-1 index (the per-file indexes are dropped).
    """
    frames = []
    for csv_path in files:
        frames.append(pd.read_csv(csv_path))
    return pd.concat(frames, ignore_index=True)

def create_file(*path_tokens):
    """Join `path_tokens` into a real path and make sure it exists.

    Missing parent directories are created and, when the path names a file,
    an empty file is created. An already existing path is left untouched.

    Returns:
        The resolved (real) path.
    """
    path = os.path.realpath(os.path.join(*path_tokens))
    if os.path.exists(path):
        return path

    parent, leaf = os.path.split(path)
    if not os.path.exists(parent):
        os.makedirs(parent, exist_ok=True)
    if leaf:
        # append mode: creates the file without truncating anything
        with open(path, 'a'):
            pass
    return path

def display(fr, pretty=True, float_format=ff):
    """Show a Series/DataFrame in full (no row truncation) with a fixed float format.

    Args:
        fr: the object to show; a Series is first converted to a one-column frame.
        pretty: when True and `fr` is a DataFrame, render it as an HTML table,
            otherwise fall back to `print`.
        float_format: printf-style format applied to every float.
    """
    # isinstance (rather than an exact type check) so that subclasses are handled too
    if isinstance(fr, pd.Series):
        fr = fr.to_frame()
    with pd.option_context(
        'display.max_rows', len(fr),
        'display.float_format', lambda f: float_format % f
        ):
        if pretty and isinstance(fr, pd.DataFrame):
            # The previous `fr.style.set_properties(...)` call was dropped: its result
            # was discarded, so it never affected the HTML produced by `to_html()`.
            idisplay.display(idisplay.HTML(fr.to_html()))
        else:
            print(fr)
        
        
def build_classification_type_map(results_df):
    """Map each task to 'binary' or 'multiclass'.

    The type is derived from the fold-0 rows of the 'constantpredictor' framework:
    a task is 'binary' when that row has a non-missing `auc`, 'multiclass' otherwise.

    Args:
        results_df: frame with at least `framework`, `fold`, `task` and `auc` columns.

    Returns:
        dict of task name -> type; empty when no matching row exists.
    """
    cp = results_df.loc[(results_df.framework=='constantpredictor')&(results_df.fold==0)]
    # Plain iteration instead of a row-wise `apply`: cheaper, and an empty selection
    # yields {} rather than failing on a missing 'task' column.
    return {task: ('binary' if pd.notna(auc) else 'multiclass')
            for task, auc in zip(cp['task'], cp['auc'])}
    

def classification_type(row, type_map):
    """Return the classification type registered for the row's task, or None if unknown."""
    task_name = row.task
    return type_map.get(task_name)


def impute_result(row, results_df, res_col='result', ref_framework=impute_missing_with):
    """Return the row's `res_col` value, imputing it from a reference framework if missing.

    - a non-missing value is returned unchanged;
    - if every fold of the row's (task, framework) is missing, NaN is returned;
    - otherwise the value of `ref_framework` for the same task and fold is returned
      (`.item()` raises unless exactly one such reference row exists).
    """
    own_value = row[res_col]
    if pd.notna(own_value):
        return own_value

    same_task = results_df.task == row.task
    # if all folds are failed or missing, don't impute
    framework_values = results_df.loc[same_task & (results_df.framework == row.framework), res_col]
    if framework_values.isna().all():
        return np.nan

    # impute with ref framework corresponding value
    reference = results_df.loc[(results_df.framework == ref_framework)
                               & same_task
                               & (results_df.fold == row.fold), res_col]
    return reference.item()


def imputed(row):
    """Tell whether the row's result was imputed: `result` missing but `imp_result` present."""
    if pd.notna(row.result):
        return False
    return pd.notna(row.imp_result)
    

def score(row, res_col='result'):
    """Turn a raw result into a "higher is better" score.

    The value is kept as is when it equals the row's `auc` or `acc`,
    and negated otherwise.
    """
    value = row[res_col]
    higher_is_better = value in [row.auc, row.acc]
    if higher_is_better:
        return value
    return - value


def norm_score(row, results_df, score_col='score', zero_one_refs=zero_one_refs):
    """Rescale the row's score so that the two reference frameworks score 0 and 1.

    `zero_one_refs` must hold exactly two framework names; for each, the score of
    the row's task and fold is looked up (`.item()` raises unless exactly one row matches).
    """
    same_task_and_fold = (results_df.task == row.task) & (results_df.fold == row.fold)

    def reference_score(framework):
        matching = results_df.loc[(results_df.framework == framework) & same_task_and_fold,
                                  score_col]
        return matching.item()

    zero, one = map(reference_score, zero_one_refs)
    return (row[score_col] - zero) / (one - zero)
 
    
def sorted_ints(arr):
    """Drop the NaN entries of a numeric array and return the rest as a sorted list of ints."""
    kept = arr[~np.isnan(arr)]
    as_ints = [int(value) for value in kept]
    as_ints.sort()
    return as_ints

# Load the result files of the selected group.
all_results = load_results()
# Normalise identifiers first: `renamings` and `excluded_frameworks` hold lower-case
# names, so they must be applied after lower-casing to match whatever casing the CSV uses.
all_results.task = all_results.task.str.lower()
all_results.framework = all_results.framework.str.lower()
all_results.fold = all_results.fold.apply(int)
all_results = all_results.replace(renamings)
# .copy(): later column assignments then act on an independent frame, not on a slice.
all_results = all_results.loc[~all_results.framework.isin(excluded_frameworks)].copy()

# Sorted distinct frameworks/tasks, and the distinct folds present in the data.
all_frameworks = np.sort(all_results.framework.unique())
all_tasks = np.sort(all_results.task.unique())
all_folds = all_results.fold.unique()
class_type_map = build_classification_type_map(all_results)


# Results indexed by (task, fold, framework); each combination must appear at most once.
all_done = all_results.set_index(['task', 'fold', 'framework'])
if not all_done.index.is_unique:
    # Show the offending rows before the assertion below stops the notebook.
    print("Duplicate entries:")
    display(all_done[all_done.index.duplicated(keep=False)].sort_values(by=all_done.index.names), 
            pretty=False)
assert all_done.index.is_unique
# One placeholder row (info='missing') for every expected (task, fold, framework)
# combination that has no entry in the loaded results.
all_missing = pd.DataFrame([(task, fold, framework, 'missing') 
                            for task in all_tasks 
                            for fold in range(nfolds)
                            for framework in all_frameworks 
                            if (task, fold, framework) not in all_done.index],
                           columns=[*all_done.index.names, 'info'])\
                          .set_index(all_done.index.names)
assert all_missing.index.is_unique
# Rows carrying a non-null `info` value are reported as failures.
all_failed = all_results.loc[pd.notna(all_results['info'])]\
                        .set_index(all_done.index.names)
assert all_failed.index.is_unique

# extending the data frame 
# extending the data frame
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like append,
# it keeps the original index labels of both frames.
all_results = pd.concat([all_results, all_missing.reset_index()], sort=False)
all_results['type'] = [classification_type(row, class_type_map) for _, row in all_results.iterrows()]
all_results['score'] = [score(row) for _, row in all_results.iterrows()]

# Imputed columns are computed row by row against the whole frame.
all_results['imp_result'] = [impute_result(row, all_results) for _, row in all_results.iterrows()]
all_results['imp_score'] = [impute_result(row, all_results, 'score') for _, row in all_results.iterrows()]
all_results['norm_score'] = [norm_score(row, all_results, 'imp_score') for _, row in all_results.iterrows()]

# Export the extended results under tables/<results_group>/.
all_results.to_csv(create_file("tables", results_group, "all_results.csv"), 
                   index=False, 
                   float_format=ff)

### Tasks list

In [None]:
# For each (task, type) pair, keep the first distinct `id` value found in the results.
tasks = (all_results.groupby(['task', 'type'])['id']
                    .unique()
                    .map(lambda id: id[0]))
display(tasks)

### Completed tasks/folds

In [None]:
# Distinct folds present in the loaded results, per (task, framework).
done = (all_done.reset_index()
                .groupby(['task', 'framework'])['fold']
                .unique())
display(done, pretty=False)

### Missing or crashed/aborted tasks/folds

In [None]:
# not_done = pd.DataFrame([(task, framework) for task in all_tasks 
#                                            for framework in all_frameworks 
#                                            if (task, framework) not in done.index],
#                         columns=['task', 'framework'])
# missing = all_results.append(not_done)\
#                      .groupby(['task', 'framework'])['fold']\
#                      .unique()\
#                      .map(sorted_ints)\
#                      .map(lambda arr: sorted(list(set(range(0, nfolds)) - set(arr))))\
#                      .where(lambda values: values.map(lambda arr: len(arr) > 0))\
#                      .dropna()

# Folds with no entry at all in the result files, per (task, framework).
missing = (all_missing.reset_index()
                      .groupby(['task', 'framework'])['fold']
                      .unique())
display(missing, pretty=False)

### Failing tasks/folds

In [None]:
# failed = all_results.where(np.isnan(all_results.result))\
#                     .groupby(['task', 'framework'])['fold']\
#                     .unique()\
#                     .map(sorted_ints)

# Folds whose result row carries a non-null `info` value, per (task, framework).
failed = (all_failed.reset_index()
                    .groupby(['task', 'framework'])['fold']
                    .unique())
display(failed, pretty=False)

### Results anomalies

In [None]:
def list_outliers(col, results=all_results, z_threshold=3):
    """List the folds whose `col` value deviates strongly from the other folds.

    For each (type, task, framework), the absolute z-score of every fold is computed
    against the mean and standard deviation across folds. Only z-scores above
    `z_threshold` are kept (others become NaN), and rows without any are dropped.
    """
    per_fold = results.pivot_table(index=['type','task', 'framework'], columns='fold', values=col)
    fold_mean = per_fold.mean(axis=1)
    fold_std = per_fold.std(axis=1)
    centered = per_fold.sub(fold_mean, axis=0)
    z_score = centered.div(fold_std, axis=0).abs()
    flagged = z_score.where(z_score > z_threshold)
    return flagged.dropna(axis=0, how='all')
    
# Show the raw results lying more than 2.5 standard deviations away from their fold mean.
display(list_outliers('result', 
                      z_threshold=2.5,
#                       results=all_results.loc[all_results.framework=='h2oautoml']
                     ))

## Data reports

### Results summary

Averaging using arithmetic mean over fold `result` or `score`.
In following summaries, if not mentioned otherwise, the means are computed over imputed results/scores.
Given a task and a framework:
- if **all folds results/scores are missing**, then no imputation occurred, and the mean result is `nan`.
- if **only some folds results/scores are missing**, then the number of imputed results that contributed to the mean is displayed in parentheses.

In [None]:
def add_imputed_mark(values, imp, val_type=float, val_format=None):
    """Format each value as text, appending " (n)" when n imputed results contributed to it.

    Args:
        values: Series of values to format.
        imp: Series (aligned with `values`) holding the number of imputed results.
        val_type: one of float, int or str; selects the built-in format when
            `val_format` is not given.
        val_format: optional callable `(value, suffix) -> str` overriding the built-in format.

    Returns:
        A Series of strings. No mark is added when the count is 0 or missing.
    """
    # "{:d}{}" (not "{0:d}{}"): mixing manual and automatic field numbering raises ValueError.
    formats = dict(float="{:,.6g}{}", int="{:d}{}", str="{}{}")
    format_value = (val_format if val_format is not None
                               else lambda *val: formats[val_type.__name__].format(*val))

    def imputation_suffix(count):
        # NaN is truthy, so it must be tested explicitly to avoid a " (nan)" mark.
        if pd.isna(count) or not count:
            return ''
        # integer formatting: ".0g" would render counts >= 10 as "1e+01"
        return " ({:d})".format(int(count))

    return (values.astype(object)
                  .combine(imp,
                           lambda val, count: format_value(val, imputation_suffix(count))))

def render_summary(col, results=all_results, show_imputations=True, filename=None, float_format=ff):
    """Display (and optionally export) the mean of `col` per task and framework.

    Args:
        col: column to average over the rows of each (type, task, framework) group.
        results: results frame to summarise.
        show_imputations: when True, append to each mean the number of imputed
            results found in its group.
        filename: when given, the table is also written to tables/<results_group>/<filename>.
        float_format: printf-style format for the displayed/exported floats.
    """
    res_group = results.groupby(['type', 'task', 'framework'])
    df = res_group[col].mean().unstack()
    if show_imputations:
        # A list selects several columns; the bare-tuple form was removed in pandas 2.0.
        imputed_df = (res_group[['result', 'imp_result']]
                          .apply(lambda df: sum(imputed(row) for _, row in df.iterrows()))
                          .unstack())
        df = df.combine(imputed_df, ft.partial(add_imputed_mark, 
                                               val_format=lambda *v: (float_format+"%s") % tuple(v)))
    display(df, float_format=float_format)
    if filename is not None:
        df.to_csv(create_file("tables", results_group, filename), float_format=float_format)


# Results used by the summary tables below (currently all of them).
summary_results = all_results

#### Number of models trained

When available, displays the average number of models trained by the framework for each dataset.

This number should be interpreted differently for each framework.
For example, with *RandomForest*, this number corresponds to the number of trees.

In [None]:
# Mean of the `models` column per task/framework, shown without decimals and exported to CSV.
render_summary('models', 
               results=summary_results, 
               filename="models_summary.csv", 
               float_format="%.f")

#### Results mean

In [None]:
# Mean of the raw (non-imputed) results; displayed only, not exported.
render_summary('result', 
               results=summary_results)

In [None]:
# Mean of the imputed results, exported to result_summary.csv.
render_summary('imp_result', 
               results=summary_results,
               filename="result_summary.csv")

#### Score mean

In [None]:
# Mean of the imputed scores, exported to score_summary.csv.
render_summary('imp_score', 
               results=summary_results,
               filename="score_summary.csv")

In [None]:
# Mean of the normalized scores, exported to norm_score_summary.csv.
render_summary('norm_score', 
               results=summary_results,
               filename="norm_score_summary.csv")

### Tasks leaderboard

In [None]:
def rank(scores):
    """Rank scores from best (1) to worst, the highest score being the best.

    Equal scores share the same rank and ranks are consecutive (dense ranking).
    Missing scores get a NaN rank.

    Args:
        scores: Series of numeric scores.

    Returns:
        A float Series with the same index as `scores`.
    """
    # Series.rank replaces the manual lookup in the sorted unique values: same ranks,
    # without building a dtype-less empty Series or using exceptions for NaN handling.
    return scores.rank(method='dense', ascending=False)

def render_leaderboard(col, results=all_results, aggregate=False, show_imputations=False, filename=None):
    """Display (and optionally export) the rank of each framework based on `col`.

    Args:
        col: score column used for ranking (higher is better, see `rank`).
        results: results frame to rank.
        aggregate: when True, frameworks are ranked per task on the mean of `col`;
            otherwise they are ranked per (task, fold).
        show_imputations: when True, append to each rank the number of imputed results
            of the (type, task, framework) group.
            NOTE(review): these counts are indexed by (type, task) only, so this
            presumably only lines up with the ranks when `aggregate` is True - confirm.
        filename: when given, the table is also written to tables/<results_group>/<filename>.
    """
    res_group = results.groupby(['type', 'task', 'framework'])
    df = (res_group[col].mean().unstack() if aggregate 
          else results.pivot_table(index=['type','task', 'fold'], columns='framework', values=col))
    # rank the frameworks within each row
    df = (df.apply(rank, axis=1, result_type='broadcast')
            .astype(object)) 
    if show_imputations:
        # A list selects several columns; the bare-tuple form was removed in pandas 2.0.
        imputed_df = (res_group[['result', 'imp_result']]
                          .apply(lambda df: sum(imputed(row) for _, row in df.iterrows()))
                          .unstack())
        df = df.combine(imputed_df, add_imputed_mark)
    display(df)
    if filename is not None:
        df.to_csv(create_file("tables", results_group, filename), float_format='%.f')
     
    
# The two baseline frameworks are left out of the leaderboard.
leaderboard_results = all_results.loc[~all_results.framework.isin(['constantpredictor', 'randomforest'])]

In [None]:
# Per-task ranking on the mean imputed score, with imputation counts, exported to CSV.
render_leaderboard('imp_score', 
                   results=leaderboard_results,
                   aggregate=True, 
                   show_imputations=True, 
                   filename="tasks_leaderboard.csv")

### Folds leaderboard

In [None]:
# Per-fold ranking on the non-imputed score.
# NOTE(review): this uses the default `results` (all frameworks, baselines included),
# unlike the tasks leaderboard which uses `leaderboard_results` - confirm this is intended.
render_leaderboard('score', filename="folds_leaderboard.csv");

## Visualizations

In [None]:
def savefig(fig, path):
    """Write `fig` to `path`, cropping the saved area tightly around the drawn content."""
    save_options = dict(bbox_inches='tight')
    fig.savefig(path, **save_options)
    

def task_labels(index):
    """Build axis labels from an index: drop its 'type' level and shorten long names.

    Names longer than 16 characters are cut to 15 characters followed by an ellipsis.
    Returns the labels as an array.
    """
    max_length = 16

    def shorten(name):
        if len(name) > max_length:
            return u'{}…'.format(name[:max_length-1])
        return name

    without_type = index.droplevel('type')
    return without_type.map(shorten).values

def set_labels(axes, 
               title=None,
               xlabel=None, ylabel=None,
               x_labels=None, y_labels=None, 
               legend_title=None):
    """Apply title, axis labels, tick labels and legend title to `axes` with large fonts.

    Tick labels are only replaced when `x_labels` / `y_labels` are given. When the axes
    have a legend, its title is set to `legend_title` (or kept as is if not given) and
    all its texts are enlarged.
    """
    large = 'x-large'
    axes.set_title(title, fontsize='xx-large')
    axes.set_xlabel(xlabel, fontsize=large)
    axes.set_ylabel(ylabel, fontsize=large)
    axes.tick_params(labelsize=large)

    if x_labels is not None:
        axes.set_xticklabels(x_labels)
    if y_labels is not None:
        axes.set_yticklabels(y_labels)

    legend = axes.get_legend()
    if legend is None:
        return
    shown_title = legend_title or legend.get_title().get_text()
    legend.set_title(shown_title, prop=dict(size=large))
    for entry in legend.get_texts():
        entry.set_fontsize(large)
            
def set_scales(axes, xscale=None, yscale=None):
    """Set the x and/or y scale of `axes`.

    Each scale is either a scale name (e.g. 'linear', 'symlog'), or a tuple
    `(name, options)` whose options are forwarded as keyword arguments.
    Any other value (e.g. None) leaves the corresponding axis untouched.

    Options may use the legacy axis-suffixed names (e.g. `linthreshx`, `linthreshy`):
    if the installed Matplotlib rejects them, the call is retried with the suffix removed.
    """
    def generic_name(option, axis):
        # legacy 'linthreshx' -> 'linthresh'; legacy 'nonposx' was renamed 'nonpositive'
        if not option.endswith(axis):
            return option
        stem = option[:-1]
        return 'nonpositive' if stem == 'nonpos' else stem

    def apply_scale(setter, axis, scale):
        if isinstance(scale, str):
            setter(scale)
        elif isinstance(scale, tuple):
            name, options = scale[0], scale[1]
            try:
                setter(name, **options)
            except TypeError:
                renamed = {generic_name(key, axis): value for key, value in options.items()}
                if renamed == options:
                    # nothing to translate: the error is not about legacy names
                    raise
                setter(name, **renamed)

    apply_scale(axes.set_xscale, 'x', xscale)
    apply_scale(axes.set_yscale, 'y', yscale)


### Heatmaps

In [None]:
def draw_heatmap(df, 
                 x_labels=True, y_labels=True, 
                 title=None, xlabel=None, ylabel=None,
                 **kwargs):
    """Draw `df` as an annotated red-yellow-green heatmap and return its figure.

    Args:
        df: frame whose cells are drawn; its number of rows drives the figure height.
        x_labels, y_labels: forwarded to seaborn as `xticklabels` / `yticklabels`.
        title, xlabel, ylabel: forwarded to `set_labels`.
        **kwargs: extra keyword arguments forwarded to `sb.heatmap` (e.g. `center`).

    Returns:
        The figure holding the heatmap axes.
    """
    with sb.axes_style('white'), sb.plotting_context('paper'):
#         print(sb.axes_style())
#         print(sb.plotting_context())
        axes = sb.heatmap(df, xticklabels=x_labels, yticklabels=y_labels,
                          annot=True, cmap='RdYlGn', robust=True,
                          **kwargs)
        # keep the row labels horizontal
        axes.tick_params(axis='y', labelrotation=0) 
        set_labels(axes, title=title, xlabel=xlabel, ylabel=ylabel)
        fig = axes.get_figure()
        # fixed width, half an inch of height per row
        fig.set_size_inches(10, df.shape[0]/2)
        fig.set_dpi(120)
        return fig

def draw_score_heatmap(col, results=all_results, type_filter='all', filename=None, **kwargs):
    """Draw a heatmap of the mean of `col` per task (rows) and framework (columns).

    Args:
        col: column to average over each (type, task, framework) group.
        results: results frame to plot.
        type_filter: 'all', or a value of the `type` index level to restrict the rows to.
        filename: when given, the figure is also saved to graphics/<results_group>/<filename>.
        **kwargs: forwarded to `draw_heatmap`.

    Returns:
        The heatmap figure.
    """
    mean_by_group = results.groupby(['type', 'task', 'framework'])[col].mean()
    df = mean_by_group.unstack()
    if type_filter != 'all':
        # keep only the rows of the requested classification type
        df = df[df.index.get_loc(type_filter)]

    row_labels = task_labels(df.index)
    fig = draw_heatmap(df, y_labels=row_labels, **kwargs)
    if filename is not None:
        target = create_file("graphics", results_group, filename)
        savefig(fig, target)
    return fig


# Results drawn in the heatmaps: everything except the constant predictor baseline.
# heatmap_results = all_results.loc[~all_results.framework.isin(['constantpredictor', 'randomforest'])]
heatmap_results = all_results.loc[~all_results.framework.isin(['constantpredictor'])]

In [None]:
# Mean imputed scores on binary tasks; the color scale is centered on 0.5.
draw_score_heatmap('imp_score',
                   results=heatmap_results,
                   type_filter='binary', 
                   title=f"Scores ({binary_score_label}) on {results_group} binary classification problems",
                   filename="binary_score_heat.png",
                   center=0.5);

In [None]:
# Mean imputed scores on multi-class tasks; the color scale is centered on 0.
draw_score_heatmap('imp_score', 
                   results=heatmap_results,
                   type_filter='multiclass', 
                   title=f"Scores ({multiclass_score_label}) on {results_group} multi-class classification problems",
                   filename="multiclass_score_heat.png",
                   center=0);

In [None]:
# Mean normalized scores on binary tasks; the color scale is centered on 0.
draw_score_heatmap('norm_score', 
                   results=heatmap_results,
                   type_filter='binary', 
                   title=f"Normalized scores on {results_group} binary classification problems",
                   filename="binary_norm_score_heat.png",
                   center=0);

In [None]:
# Mean normalized scores on multi-class tasks; the color scale is centered on 0.
draw_score_heatmap('norm_score',
                   results=heatmap_results,
                   type_filter='multiclass', 
                   title=f"Normalized scores on {results_group} multi-class classification problems",
                   filename="multiclass_norm_score_heat.png",
                   center=0);

### Linear plots

In [None]:
def draw_parallel_coord(df, class_column, 
                        x_labels=True, yscale='linear', 
                        title=None, xlabel=None, ylabel=None,
                        legend_loc='best', legend_title=None, colormap=colormap):
    """Draw `df` as a parallel-coordinates plot, one line per row, and return the figure.

    Args:
        df: frame with one row per line to draw; its number of rows drives the figure height.
        class_column: column identifying (and coloring) each line.
        x_labels: tick labels for the x axis, forwarded to `set_labels`.
        yscale: scale of the y axis, see `set_scales`.
        title, xlabel, ylabel: forwarded to `set_labels`.
        legend_loc, legend_title: location and title of the legend.
        colormap: name of a colormap exposing a `colors` list (e.g. 'tab10').

    Returns:
        The newly created figure.
    """
    with sb.axes_style('ticks', rc={'grid.linestyle': 'dotted'}), sb.plotting_context('paper'):
        parallel_fig = mp.pyplot.figure(dpi=120, figsize=(10, df.shape[0]))
        # select the first colors from the colormap to ensure we use the same colors as in the stripplot later
        # (pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9)
        colors = mp.pyplot.get_cmap(colormap).colors[:len(df[class_column].unique())]
        axes = pd.plotting.parallel_coordinates(df, 
                                                class_column=class_column, 
                                                colors=colors,
                                                axvlines=False,
                                               )
        # vertical tick labels on the x axis
        axes.tick_params(axis='x', labelrotation=90) 
        set_scales(axes, yscale=yscale)
        # rebuild the legend to apply the requested location and title
        handles, labels = axes.get_legend_handles_labels()
        axes.legend(handles, labels, loc=legend_loc, title=legend_title)
        set_labels(axes, title=title, xlabel=xlabel, ylabel=ylabel, x_labels=x_labels)
        return parallel_fig


def draw_score_parallel_coord(col, results=all_results, type_filter='all', 
                              ylabel=None, filename=None, **kwargs):
    """Draw the mean of `col` per task as parallel coordinates, one line per framework.

    Args:
        col: column to average over each (type, task, framework) group.
        results: results frame to plot.
        type_filter: 'all', or a value of the `type` column level to restrict the tasks to.
        ylabel: label of the y axis ("Score" when not given).
        filename: when given, the figure is also saved to graphics/<results_group>/<filename>.
        **kwargs: forwarded to `draw_parallel_coord`.

    Returns:
        The figure.
    """
    res_group = results.groupby(['type', 'task', 'framework'])
    # rows: frameworks; columns: (type, task) pairs
    df = res_group[col].mean().unstack(['type', 'task'])
    df = df if type_filter == 'all' \
            else df.iloc[:, df.columns.get_loc(type_filter)]
    # turn the framework index into a regular column, used below as the class column
    df.reset_index(inplace=True)
    fig = draw_parallel_coord(df, 
                              'framework',
                              x_labels=task_labels(df.columns.drop('framework')),
#                               xlabel="Task",
                              ylabel=ylabel or "Score",
                              legend_title="Framework",
                              **kwargs) 
    if filename is not None:
        savefig(fig, create_file("graphics", results_group, filename))
    return fig


# Results drawn in the parallel-coordinates plots (currently all of them).
# parallel_coord_results = all_results.loc[~all_results.framework.isin(['randomforest'])]
parallel_coord_results = all_results

In [None]:
# Mean imputed scores on binary tasks, one line per framework.
draw_score_parallel_coord('imp_score',
                          results=parallel_coord_results,
                          type_filter='binary', 
                          title=f"Scores ({binary_score_label}) on {results_group} binary classification problems",
                          ylabel=binary_score_label,
                          legend_loc='lower left',
                          filename="binary_score_parallel_ccord.png");


In [None]:
# Mean imputed scores on multi-class tasks, drawn on a symmetric log y axis.
# NOTE(review): recent Matplotlib releases name this option `linthresh` rather than
# `linthreshy` - confirm against the installed version.
draw_score_parallel_coord('imp_score',
                          results=parallel_coord_results,
                          type_filter='multiclass',
                          title=f"Scores ({multiclass_score_label}) on {results_group} multi-class classification problems",
                          ylabel=multiclass_score_label,
                          yscale=('symlog', dict(linthreshy=0.5)),
                          legend_loc='lower left',
                          filename="multiclass_score_parallel_ccord.png");


In [None]:
# Mean normalized scores on binary tasks, one line per framework.
draw_score_parallel_coord('norm_score', 
                          results=parallel_coord_results,
                          type_filter='binary', 
                          title=f"Normalized scores on {results_group} binary classification problems",
                          filename="binary_norm_score_parallel_ccord.png");


In [None]:
# Mean normalized scores on multi-class tasks, drawn on a symmetric log y axis.
draw_score_parallel_coord('norm_score', 
                          results=parallel_coord_results,
                          type_filter='multiclass',
                          title=f"Normalized scores on {results_group} multi-class classification problems",
                          filename="multiclass_norm_score_parallel_ccord.png", 
                          yscale='symlog',
                         );


### Scatterplots

In [None]:
def draw_stripplot(df, x, y, hue, 
                   xscale='linear', xbound=None, 
                   xlabel=None, ylabel=None, y_labels=None, title=None,
                   legend_title=None, legend_loc='best', colormap=colormap):
    """Draw every observation of `df` as a strip plot, overlaid with per-group mean markers.

    Args:
        df: observations to plot; the number of distinct index values drives the figure height.
        x, y, hue: forwarded to seaborn's `stripplot` and `pointplot`.
        xscale: scale of the x axis, see `set_scales`.
        xbound: optional (lower, upper) bounds of the x axis; disables x autoscaling.
        xlabel, ylabel, y_labels, title: forwarded to `set_labels`.
        legend_title: legend title (defaults to `hue`).
        legend_loc: legend location.
        colormap: palette used by both plots.

    Returns:
        The newly created figure.
    """
    with sb.axes_style('whitegrid', rc={'grid.linestyle': 'dotted'}), sb.plotting_context('paper'):
#         print(sb.axes_style())
        # Initialize the figure
        strip_fig, axes = mp.pyplot.subplots(dpi=120, figsize=(10, len(df.index.unique())))
        set_scales(axes, xscale=xscale)
        if xbound is not None:   
            axes.set_autoscalex_on(False)
            axes.set_xbound(*xbound)
#             axes.invert_xaxis()
        sb.despine(bottom=True, left=True)

        # Show each observation with a scatterplot
        sb.stripplot(x=x, y=y, hue=hue,
                     data=df, dodge=True, jitter=True, palette=colormap,
                     alpha=.25, zorder=1)

        # Show the conditional means
        # NOTE(review): `join`, `scale` and `ci` may not be accepted by recent seaborn
        # releases - confirm against the installed version.
        sb.pointplot(x=x, y=y, hue=hue,
                     data=df, dodge=.5, join=False, palette=colormap,
                     markers='d', scale=.75, ci=None)

        # Improve the legend 
        handles, labels = axes.get_legend_handles_labels()
        # keep only the second half of the legend entries
        # (presumably those added by the pointplot - verify)
        dist = int(len(labels)/2)
        axes.legend(handles[dist:], labels[dist:], title=legend_title or hue,
                    handletextpad=0, columnspacing=1,
                    loc=legend_loc, ncol=1, frameon=True)
        set_labels(axes, title=title, xlabel=xlabel, ylabel=ylabel, y_labels=y_labels)
        return strip_fig


def draw_score_stripplot(col, results=all_results, type_filter='all', filename=None, **kwargs):
    """Draw the per-fold values of `col` for each task as a strip plot, colored by framework.

    Args:
        col: column plotted on the x axis.
        results: results frame to plot.
        type_filter: 'all', or a value of the `type` index level to restrict the rows to.
        filename: when given, the figure is also saved to graphics/<results_group>/<filename>.
        **kwargs: forwarded to `draw_stripplot`.

    Returns:
        The figure.
    """
    # index by (type, task) and sort so that rows of the same type are contiguous
    scatterplot_df = results.set_index(['type', 'task']).sort_index()
    df = scatterplot_df if type_filter == 'all' \
                        else scatterplot_df[scatterplot_df.index.get_loc(type_filter)]
    fig = draw_stripplot(
        df,
        x=col,
        y=df.index,
        hue='framework',
#         ylabel='Task',
        y_labels=task_labels(df.index.unique()),
        legend_title="Framework",
        **kwargs
    )
    if filename is not None:
        savefig(fig, create_file("graphics", results_group, filename))
    return fig


# Results drawn in the strip plots: all of them, ordered by framework name.
# scatterplot_results = (all_results.loc[~all_results.framework.isin(['randomforest'])]
#                                   .sort_values(by=['framework']))  # sorting for colors consistency
scatterplot_results = all_results.sort_values(by=['framework'])  # sorting for colors consistency

In [None]:
# Per-fold imputed results on binary tasks.
draw_score_stripplot('imp_result', 
                     results=scatterplot_results,
                     type_filter='binary', 
                     title=f"Scores on {results_group} binary classification problems",
                     xlabel=binary_score_label,
                     filename="binary_results_stripplot.png");

In [None]:
# Per-fold imputed results on multi-class tasks, drawn on a symmetric log x axis.
# NOTE(review): recent Matplotlib releases name this option `linthresh` rather than
# `linthreshx` - confirm against the installed version.
draw_score_stripplot('imp_result',
                     results=scatterplot_results,
                     type_filter='multiclass', 
#                      xbound=(0,10),
                     xscale=('symlog', dict(linthreshx=0.5)),
                     title=f"Scores on {results_group} multi-class classification problems",
                     xlabel=multiclass_score_label, 
                     filename="multiclass_results_stripplot.png");

In [None]:
# Per-fold normalized scores on binary tasks; the x axis is bounded to [-0.2, 2].
draw_score_stripplot('norm_score', 
                     results=scatterplot_results,
                     type_filter='binary', 
                     xbound=(-0.2, 2),
                     xscale='linear',
                     title=f"Normalized scores on {results_group} binary classification problems",
                     filename="binary_norm_score_stripplot.png");

In [None]:
# Per-fold normalized scores on multi-class tasks; the x axis is bounded to [-0.2, 2.5].
draw_score_stripplot('norm_score', 
                     results=scatterplot_results,
                     type_filter='multiclass', 
                     xbound=(-0.2, 2.5),
                     xscale='linear',
                     title=f"Normalized scores on {results_group} multi-class classification problems",
                     filename="multiclass_norm_score_stripplot.png");

## Playground

In [None]:
# Scratch query: tunedrandomforest rows of the tasks whose name contains 'jungle'
# (the trailing ';' suppresses the output).
all_results.loc[(all_results.task.str.contains('jungle'))&(all_results.framework=='tunedrandomforest')];

In [None]:
# Scratch query: sorted completed folds for three selected frameworks (output suppressed).
done.iloc[done.index.get_level_values('framework').isin(['autosklearn', 'h2oautoml', 'tpot'])]\
    .apply(sorted_ints);

In [None]:
# Distinct `info` values of the failed runs, per (task, fold, framework).
failures = all_failed.groupby(['task', 'fold', 'framework'])['info']\
                     .unique()
#display(failures)