# Collating results from `comparison_runs.py`

Going to use this for my paper. Might copy some portions from `collate_csvs.py`, which this notebook supersedes.

In [None]:
%matplotlib inline
import collections
import colorsys
import glob
import os
import re
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Temporarily put the current directory on the import path so that the
# sibling script `comparison_runs.py` can be imported, then remove the entry
# again so the path change does not leak into the rest of the session.
sys.path.append('.')
from comparison_runs import EnvName
del sys.path[-1]

# plot styling applied to every figure produced below
sns.set(context='paper', style='darkgrid')

In [None]:
# glob pattern for the evaluation CSVs that get loaded and concatenated below
CSV_PATTERN = '../scratch/full-runs-2020-05-26/run*/eval*.csv'

In [None]:
# expand "~" (if any) and collect every CSV matching the pattern
csv_paths = glob.glob(os.path.expanduser(CSV_PATTERN))
print(f"Found {len(csv_paths)} CSV files")
# one DataFrame per file, stacked row-wise into a single frame
# (each file's original row index is kept, so index values may repeat)
loaded_csvs = [pd.read_csv(c) for c in csv_paths]
frame = pd.concat(loaded_csvs)
# bare expression so that the notebook displays the combined frame
frame

Now going to add some extra data:

- Human-readable test variant names.
- String-formatted mean and median for the LaTeX tables.

In [None]:
def strip_preproc(env_name):
    """Rebuild an environment name from its prefix, demo/test spec and
    version suffix only.

    The name is parsed with `EnvName` (from `comparison_runs`); any other
    component that `EnvName` extracts (presumably the preprocessor part,
    going by this function's name -- TODO confirm) is left out.
    """
    parsed = EnvName(env_name)
    rebuilt = parsed.name_prefix + parsed.demo_test_spec
    rebuilt = rebuilt + parsed.version_suffix
    return rebuilt

def to_variant(env_name):
    """Return the demo/test spec of `env_name` (as parsed by `EnvName`) with
    any leading/trailing dashes removed."""
    spec = EnvName(env_name).demo_test_spec
    return spec.strip('-')

def to_prefix(env_name):
    """Return the name prefix of `env_name` (as parsed by `EnvName`) with any
    leading/trailing dashes removed."""
    raw_prefix = EnvName(env_name).name_prefix
    return raw_prefix.strip('-')

# reformat env names
# (order matters: both env columns are first rewritten with strip_preproc,
# and only then are 'variant' and 'env_prefix' derived from the rewritten
# test/demo env names respectively)
frame['test_env'] = frame['test_env'].map(strip_preproc)
frame['demo_env'] = frame['demo_env'].map(strip_preproc)
frame['variant'] = frame['test_env'].map(to_variant)
frame['env_prefix'] = frame['demo_env'].map(to_prefix)

In [None]:
# Substrings of `run_id` that mark runs to be excluded from all tables below.
# Commented-out entries are candidates that are currently NOT excluded.
blacklist_names = [
    "bc-mt-ft",
    # "bc-mt-aug",
    # "bc-mt-allo",
    # "gail-mt-aug",
    # "gail-mt-allo",
    "gail-trans",
]
# True for rows whose run_id contains none of the blacklisted substrings
keep_mask = frame['run_id'].map(lambda name: not any(bl in name for bl in blacklist_names))
print(f"Stripping out {(~keep_mask).sum()}/{len(keep_mask)} entries")
frame = frame[keep_mask]

## Separate tables for each task

Here I'm going to create separate LaTeX tables for each task. Each row will be a method, and each column will be a variant, with cells showing both the mean and the standard deviation of performance. Every task's table should have the same set of columns, so that the tables can be stacked on top of one another.

In [None]:
# Maps each variant name (as it appears in the 'variant' column) to the short
# label used in table headers. Insertion order is the column order of the
# generated tables.
VARIANT_SHORT_NAMES = collections.OrderedDict((
    ("Demo", "Demo"),
    ("TestJitter", "Jitter"),
    ("TestLayout", "Layout"),
    ("TestColour", "Colour"),
    ("TestShape", "Shape"),
    ("TestCountPlus", "CountPlus"),
    ("TestDynamics", "Dynamics"),
    ("TestAll", "All"),
))
# Position of each variant name within VARIANT_SHORT_NAMES.
VARIANT_ORDER = {
    variant: position
    for position, variant in enumerate(VARIANT_SHORT_NAMES)
}
# Task name prefixes, in the order in which per-task tables are emitted.
PROBLEM_PREFIX_ORDER = [
    "MoveToCorner",
    "MoveToRegion",
    "MatchRegions",
    "MakeLine",
    "FindDupe",
    "FixColour",
    "ClusterColour",
    "ClusterShape",
]

Going to do some sanity checks on the variants to make sure that none are missing from `VARIANT_SHORT_NAMES` or `VARIANT_ORDER`.

In [None]:
# Sanity check: every variant that occurs in the data should have an entry in
# VARIANT_ORDER (which is derived from VARIANT_SHORT_NAMES).
all_variants = set(frame['variant'].unique())
missing = all_variants - VARIANT_ORDER.keys()
if missing:
    # Report instead of aborting: the hard `assert` that used to live here was
    # disabled, but silently discarding `missing` hides variants that will
    # never show up in the tables below.
    print(f"{len(missing)} variants are missing:")
    print(sorted(missing, key=str))

In [None]:
def lighten(rgba):
    """Fold the alpha channel of an RGBA colour into its lightness.

    The colour is converted to HLS, its lightness is moved towards 1.0
    (white) in proportion to `1 - alpha`, and the result is converted back.
    Returns an `(r, g, b)` tuple. Hacky, but sufficient for table shading.
    """
    # TODO: if you have time, design a custom colour space by just
    # walking through a range of nice CIELAB colours (e.g. all the
    # CIELAB blues).
    assert len(rgba) == 4, len(rgba)
    red, green, blue = rgba[:3]
    alpha = rgba[3]
    hue, lightness, saturation = colorsys.rgb_to_hls(red, green, blue)
    # alpha == 1 keeps the original lightness, alpha == 0 yields white
    blended = 1 - (1 - lightness) * alpha
    return colorsys.hls_to_rgb(hue, blended, saturation)

def assign_colours(values, legend=False):
    r"""Create the LaTeX \cellcolor commands necessary for a series of values.

    Values are assumed to be in [0,1]; anything outside that range is clipped.
    The input is never modified: clipping is done on a fresh float array, so
    passing a pandas Series / ndarray (including a read-only one) is safe.

    Args:
        values: scalar or 1-D array-like of numbers.
        legend: if True, return a single string containing a crude colour
            gradient made of `\crule{r,g,b}` commands joined by `\!`.

    Returns:
        A list with one `\cellcolor[rgb]{r,g,b}` string per value, or (when
        `legend` is True) the gradient string described above.
    """
    # (requires \usepackage[table]{xcolor}, per
    # https://tex.stackexchange.com/a/50351)
    # Convert to float so that integer inputs are interpreted as fractions of
    # the colour map rather than as raw lookup-table indices; np.clip returns
    # a new array, so the caller's data is left untouched.
    clipped = np.clip(np.asarray(values, dtype=float), 0.0, 1.0)
    cmap = plt.get_cmap('Blues')
    cols_rgba = cmap(clipped, alpha=0.2)
    if isinstance(cols_rgba, tuple) and len(cols_rgba) == 4:
        # annoying: for a scalar input cmap() returns a single RGBA tuple
        # instead of an ndarray of rows
        cols_rgba = np.asarray([cols_rgba])
    cols_rgb = [tuple(lighten(rgba)) for rgba in cols_rgba]
    if legend:
        # a crude LaTeX colour gradient
        return r'\!'.join(
            r'\crule{%.2f,%.2f,%.2f}' % spec for spec in cols_rgb)
    return [r'\cellcolor[rgb]{%.2f,%.2f,%.2f}' % spec for spec in cols_rgb]


def assign_colours_improved(values, legend=False):
    """Placeholder for a CIELab-based replacement of `assign_colours`.

    Not implemented: always raises NotImplementedError, whatever the
    arguments are.
    """
    # this is the colour for a value of 1.0; we lighten it in CIELab to get other values
    base_colour_rgb = (180/255.0, 208/255.0, 248/255.0)
    reason = (
        "doing this properly (with right colour space in LaTeX etc.) "
        "seems like too much work, so skipping it for now"
    )
    raise NotImplementedError(reason)

def compute_cell_contents(tab):
    """Render one group of pivot columns as coloured LaTeX cell strings.

    `tab` must have two-level columns whose first level contains
    'mean_score__reduced_mean', 'std_score__reduced_mean',
    'mean_score__reduced_std' and 'std_score__reduced_std', each with exactly
    one second-level entry (it is used as the name of the result).

    Returns a Series indexed like `tab` whose entries look like
    '<cellcolor command> <mean>$\\pm$<std>', where mean/std are the reduced
    mean and reduced std of `mean_score`, and the colour encodes the mean.
    """
    first_entry, *_ = tab.columns.to_flat_index()
    _, group_name = first_entry
    score_mean = tab['mean_score__reduced_mean'].squeeze(axis=1)
    spread_mean = tab['std_score__reduced_mean'].squeeze(axis=1)
    score_std = tab['mean_score__reduced_std'].squeeze(axis=1)
    spread_std = tab['std_score__reduced_std'].squeeze(axis=1)
    colour_cmds = assign_colours(score_mean)
    cells = []
    # the std_score statistics are carried along but not shown (an older
    # format string also printed them in parentheses)
    for fields in zip(colour_cmds, score_mean, score_std,
                      spread_mean, spread_std):
        cells.append(r'{0} {1:.2f}$\pm${2:.2f}'.format(*fields))
    # result keeps the row index of 'tab' and is named after the group
    return pd.Series(data=cells, index=tab.index, name=group_name)

def flatten_top_n_levels(columns, n=2):
    """Merge the first `n` levels of a column MultiIndex into one level.

    Works like `MultiIndex.to_flat_index()`, but only the leading `n` entries
    of each column tuple are joined (with '__'); the remaining levels are
    kept as they are. Handy for collapsing the top two levels of a
    three-level index produced during aggregation.
    """
    merged = [
        ('__'.join(entry[:n]), ) + entry[n:]
        for entry in columns.to_flat_index()
    ]
    return pd.MultiIndex.from_tuples(merged)

def reduced_mean(vals):
    """Return `np.mean(vals)`.

    Exists only so that the pivot-table aggfunc has a descriptive name.
    """
    mean_value = np.mean(vals)
    return mean_value

def reduced_std(vals):
    """Return `np.std(vals)` (numpy's default settings).

    Exists only so that the pivot-table aggfunc has a descriptive name.
    """
    std_value = np.std(vals)
    return std_value

In [None]:
# Lazily built by format_alg_name(): a tuple of (compiled regex, top_levels,
# augmentations, preprocs, variants). Re-running this cell resets it, so edits
# to the tables below always take effect.
_fan_parser = None


def _build_fan_parser():
    """Build the regex and display-name lookup tables for `format_alg_name`."""
    # for parsing the environment at the end of the name
    env_names = [
        'move-to-corner', 'move-to-region', 'match-regions', 'make-line',
        'find-dupe', 'fix-colour', 'cluster-shape', 'cluster-colour'
    ]
    env_re_s = f'(?P<env>{"|".join(env_names)})'
    # for parsing the "top-level" algorithm that will be displayed in the table
    top_levels = collections.OrderedDict([
        # lack of closing parens is intentional! We'll insert them at the end,
        # after adding ego/allo designator, augmentation info, etc.
        ('bc-st-', 'BC (ST'),
        ('bc-mt-', 'BC (MT'),
        ('bc-mt-ft-', 'BC (FT-MT'),
        ('gail-st-', 'GAIL (ST'),
        ('gail-mt-', 'GAIL (MT'),
        # this is the catch-all
        ('gail-', 'GAIL (ST'),
    ])
    top_level_re_s = f'(?P<top_level>{"|".join(top_levels)})'
    # for parsing any augmentation ablations
    augmentations = {
        'aug-no-col-': 'no col. aug.',
        'aug-no-trans-': 'no trans./rot. aug.',
        'aug-none-': 'no aug.',
    }
    aug_re_s = f'(?P<aug>{"|".join(augmentations)})'
    preprocs = {'allo-': 'allo.'}
    preproc_re_s = f'(?P<preproc>{"|".join(preprocs)})'
    variants = {
        'demo': 'trans. to Demo',
        'jitter': 'trans. to Jitter',
        'layout': 'trans. to Layout',
        'colour': 'trans. to Colour',
        'shape': 'trans. to Shape',
        'countplus': 'trans. to CountPlus',
        'dynamics': 'trans. to Dynamics',
        'all': 'trans. to All',
    }
    trans_re_s = f'(trans-(?P<trans_env>{"|".join(variants)})-)'
    full_re_s = f'{top_level_re_s}{preproc_re_s}?{aug_re_s}?{trans_re_s}?on-{env_re_s}'
    return re.compile(full_re_s), top_levels, augmentations, preprocs, variants


def format_alg_name(alg_name):
    """Turn a raw algorithm/run name into a short label for the tables.

    Names of the form
    `<top-level>[allo-][aug-...-][trans-<variant>-]on-<env>...` become e.g.
    'BC (MT, allo., no aug.)'. The match is anchored at the start of the name
    only, so trailing text after the environment is ignored.

    Args:
        alg_name: raw name string.

    Returns:
        The formatted label, or `alg_name` unchanged if it does not match the
        expected pattern.
    """
    global _fan_parser  # cached parser, built on first use

    if _fan_parser is None:
        # The regex and its lookup tables are cached *together*: the tables
        # are needed on every call, not just when the regex is compiled.
        _fan_parser = _build_fan_parser()
    full_re, top_levels, augmentations, preprocs, variants = _fan_parser

    match = full_re.match(alg_name)
    if not match:
        # couldn't process :(
        return alg_name

    # can also access match.group('env') if needed
    # optional groups that did not participate yield None, which .get() maps
    # to None and which is then dropped below
    designators = (
        preprocs.get(match.group('preproc')),
        augmentations.get(match.group('aug')),
        variants.get(match.group('trans_env')),
    )
    present = [d for d in designators if d]
    # the top level includes an opening paren, but not a closing paren
    label = top_levels[match.group('top_level')]
    if present:
        label += ', ' + ', '.join(present)
    # TODO: when you actually go to use this, double-check that it doesn't
    # accidentally alias unrelated things
    return label + ')'

Here's the code to produce a full table of **all** results. This is pretty dense; in the next cell, I'm going to summarise it by averaging results for each variant over all tasks (this is only meaningful for some variants).

In [None]:
# Emit one LaTeX table per task (demo environment), then print them stacked
# `tabs_per_out` at a time inside a single tabular environment.
print('Colour scale:', assign_colours(np.linspace(0, 1, 8), legend=True))
print('')

# order tasks by where their name prefix (text before the first '-') sits in
# PROBLEM_PREFIX_ORDER; an unknown prefix raises ValueError here
demo_envs = sorted(
    frame['demo_env'].unique(),
    key=lambda s: PROBLEM_PREFIX_ORDER.index(s.split('-')[0]))
tables_as_latex = []
for demo_env in demo_envs:
    prefix = to_prefix(demo_env)
    subset = frame[frame['demo_env'] == demo_env]
    # rows: methods; columns: (score column, aggregate, variant)
    pivot = pd.pivot_table(
        subset, index='latex_alg_name', columns='variant',
        values=('mean_score', 'std_score'),
        aggfunc=(reduced_mean, reduced_std),
        dropna=False)
    # collapse (score column, aggregate) into one level, e.g.
    # 'mean_score__reduced_mean', leaving the variant as the second level
    pivot.columns = flatten_top_n_levels(pivot.columns, 2)
    # one formatted string column per variant
    # NOTE(review): groupby(axis=1) is deprecated in recent pandas releases --
    # confirm the pandas version this notebook is run with
    renamed_pivot = pivot.groupby(axis=1, level=1).apply(compute_cell_contents)
    # variants this task has no data for get a '-' placeholder column so that
    # every table ends up with the same columns
    for col_name in VARIANT_SHORT_NAMES:
        if col_name not in renamed_pivot.columns:
            renamed_pivot[col_name] = '-'
    sorted_pivot = renamed_pivot[list(VARIANT_SHORT_NAMES)]
    sorted_cols = [VARIANT_SHORT_NAMES.get(c, c) for c in sorted_pivot.columns]
    # two-level header: bold task name on top, short variant names below
    sorted_pivot.columns = pd.MultiIndex.from_product([(r'\textbf{%s}' % prefix, ), sorted_cols], names=('Task', 'Variant'))
    sorted_pivot.index = sorted_pivot.index.map(format_alg_name).rename("Method")
    sorted_pivot.sort_index(axis=0, inplace=True)
    with pd.option_context("max_colwidth", 1000):
        # max_colwidth is dealing with weird pandas bug:
        # https://github.com/pandas-dev/pandas/issues/6491
        latex_formatted = sorted_pivot.to_latex(
            # label=f'tab:res-{prefix.lower()}',
            column_format='l' + 'c' * len(sorted_cols),
            bold_rows=False,
            escape=False,
        ).replace('{l}', '{c}')

    latex_lines = []
    for line in latex_formatted.splitlines():
        # throw out useless "method" line
        if line.startswith('Method  '):
            continue
        # blank out the 'Task'/'Variant' level names at the start of header
        # lines, padding with spaces so the line length is unchanged
        to_strip = ('Task ', 'Variant ')
        for t in to_strip:
            if line.startswith(t):
                line = ' ' * len(t) + line[len(t):]
        latex_lines.append(line)
    tables_as_latex.append(latex_lines)
    
# stack consecutive tables: the first two lines are kept only from the first
# table of a group, the last two lines only from the last one, and a \midrule
# separates the tables in between
tabs_per_out = 2
for tab_start in range(0, len(tables_as_latex), tabs_per_out):
    sub_tabs = tables_as_latex[tab_start:tab_start + tabs_per_out]
    out_lines = []
    for tab_num, latex_lines in enumerate(sub_tabs, start=1):
        if not out_lines:
            # keep first few lines (including the \begin{tabular})
            out_lines.extend(latex_lines[:2])
        out_lines.extend(latex_lines[2:-2])
        if tab_num == len(sub_tabs):
            out_lines.extend(latex_lines[-2:])
        else:
            out_lines.extend([r'\midrule'])
    print('\n'.join(out_lines))
    print('\n' * 5)

Now here is a version of the tables above where we just summarise over each variant. In the colour/shape case, this is a little misleading, since some tasks benefit from randomisation of those attributes, while others do not.

In [None]:
# Summary table: same pivot as the per-task tables, but computed over the
# whole frame (no per-demo_env filtering), so each cell aggregates all tasks.
variant_frame = frame.copy()
# format method names up front so they become the pivot's row labels
variant_frame['latex_alg_name'] = variant_frame['latex_alg_name'].map(format_alg_name)
variant_pivot = pd.pivot_table(
    variant_frame, index='latex_alg_name', columns='variant',
    values=('mean_score', 'std_score'),
    aggfunc=(reduced_mean, reduced_std),
    dropna=False)
variant_pivot.columns = flatten_top_n_levels(variant_pivot.columns, 2)
renamed_variant_pivot = variant_pivot.groupby(axis=1, level=1).apply(compute_cell_contents)
# select/reorder columns; raises KeyError if a variant has no data at all
renamed_variant_pivot = renamed_variant_pivot[list(VARIANT_SHORT_NAMES)]
renamed_variant_pivot.index = renamed_variant_pivot.index.rename("Method")
# bold short names for the header row
variant_sorted_cols = [r'\textbf{' + VARIANT_SHORT_NAMES.get(c, c) + '}' for c in renamed_variant_pivot.columns]
renamed_variant_pivot.columns = pd.CategoricalIndex(variant_sorted_cols)
with pd.option_context("max_colwidth", 1000):
    # max_colwidth is dealing with weird pandas bug:
    # https://github.com/pandas-dev/pandas/issues/6491
    variant_latex_formatted = renamed_variant_pivot.to_latex(
        # label=f'tab:res-{prefix.lower()}',
        column_format='l' + 'c' * len(variant_sorted_cols),
        bold_rows=True,
        escape=False,
    ).replace('{l}', '{c}')
print(variant_latex_formatted)

In [None]:
# filtered version of the above for final paper
# Same aggregation over the whole frame as the summary table above, but only
# the columns listed in `variant_shorter_names` are kept.
variant_frame_reduced = frame.copy()
variant_frame_reduced['latex_alg_name'] = variant_frame_reduced['latex_alg_name'].map(format_alg_name)
variant_pivot_reduced = pd.pivot_table(
    variant_frame_reduced, index='latex_alg_name', columns='variant',
    values=('mean_score', 'std_score'),
    aggfunc=(reduced_mean, reduced_std),
    dropna=False)
variant_pivot_reduced.columns = flatten_top_n_levels(variant_pivot_reduced.columns, 2)
renamed_variant_pivot_reduced = variant_pivot_reduced.groupby(axis=1, level=1).apply(compute_cell_contents)
# subset of VARIANT_SHORT_NAMES to show; commented-out entries are the
# variants left out of this table
variant_shorter_names = collections.OrderedDict([
  ("Demo", "Demo"),
  ("TestJitter", "Jitter"),
  ("TestLayout", "Layout"),
  ("TestColour", "Colour"),
  ("TestShape", "Shape"),
  # ("TestCountPlus", "CountPlus"),
  # ("TestDynamics", "Dynamics"),
  # ("TestAll", "All"),
])
renamed_variant_pivot_reduced = renamed_variant_pivot_reduced[list(variant_shorter_names)]
renamed_variant_pivot_reduced.index = renamed_variant_pivot_reduced.index.rename("Method")
# short names are looked up in VARIANT_SHORT_NAMES (which has the same
# labels for these keys as variant_shorter_names)
variant_sorted_cols_reduced = [VARIANT_SHORT_NAMES.get(c, c) for c in renamed_variant_pivot_reduced.columns]
renamed_variant_pivot_reduced.columns = pd.CategoricalIndex(variant_sorted_cols_reduced)
with pd.option_context("max_colwidth", 1000):
    # max_colwidth is dealing with weird pandas bug:
    # https://github.com/pandas-dev/pandas/issues/6491
    variant_latex_formatted_reduced = renamed_variant_pivot_reduced.to_latex(
        # label=f'tab:res-{prefix.lower()}',
        column_format='l' + 'c' * len(variant_sorted_cols_reduced),
        bold_rows=True,
        escape=False,
    ).replace('{l}', '{c}')
print(variant_latex_formatted_reduced)

In [None]:
# do the same thing again, but with Markdown for the README

def compute_cell_contents(tab):
    """Render one group of pivot columns as plain 'mean±std' strings.

    Markdown counterpart of the LaTeX cell formatter defined earlier in this
    notebook. NOTE: it has the same name and therefore replaces that earlier
    definition for any cell executed after this one.

    `tab` must have two-level columns whose first level contains
    'mean_score__reduced_mean' and 'mean_score__reduced_std', each with
    exactly one second-level entry (used as the name of the result).

    Returns a Series indexed like `tab` with entries such as '0.75±0.10'.
    """
    (_, col_name), *_ = tab.columns.to_flat_index()
    mean_series_means = tab['mean_score__reduced_mean'].squeeze(axis=1)
    mean_series_stds = tab['mean_score__reduced_std'].squeeze(axis=1)
    # No cell colours are needed for Markdown, so no colour commands are
    # computed here; the numbers are formatted exactly as aggregated.
    result_list = [
        '{:.2f}±{:.2f}'.format(mu, sigma)
        for mu, sigma in zip(mean_series_means, mean_series_stds)
    ]
    # result keeps the row index of 'tab' and is named after the group
    contents = pd.Series(data=result_list, index=tab.index, name=col_name)
    return contents

# Rebuild the reduced summary table with the cell formatter defined just
# above in this cell, and render it as Markdown instead of LaTeX.
# (reuses `variant_shorter_names` from the reduced LaTeX cell, so that cell
# must have been run first)
variant_frame_reduced = frame.copy()
variant_frame_reduced['latex_alg_name'] = variant_frame_reduced['latex_alg_name'].map(format_alg_name)
variant_pivot_reduced = pd.pivot_table(
    variant_frame_reduced, index='latex_alg_name', columns='variant',
    values=('mean_score', 'std_score'),
    aggfunc=(reduced_mean, reduced_std),
    dropna=False)
variant_pivot_reduced.columns = flatten_top_n_levels(variant_pivot_reduced.columns, 2)
renamed_variant_pivot_reduced = variant_pivot_reduced.groupby(axis=1, level=1).apply(compute_cell_contents)
renamed_variant_pivot_reduced = renamed_variant_pivot_reduced[list(variant_shorter_names)]
renamed_variant_pivot_reduced.index = renamed_variant_pivot_reduced.index.rename("Method")
variant_sorted_cols_reduced = [VARIANT_SHORT_NAMES.get(c, c) for c in renamed_variant_pivot_reduced.columns]
renamed_variant_pivot_reduced.columns = pd.CategoricalIndex(variant_sorted_cols_reduced)
with pd.option_context("max_colwidth", 1000):
    variant_markdown_formatted_reduced = renamed_variant_pivot_reduced.to_markdown()
print(variant_markdown_formatted_reduced)

## Figuring out average running time

GAIL is easy because the logger records cumulative time by default. BC is much harder; I need to kind of triangulate it from debug.log :(

In [None]:
# Average GAIL running time: each run's progress.csv has a cumulative-time
# column, so a run's duration is simply that column's maximum.
PROGRESS_CSV_PATTERN = '../scratch/full-runs-2020-05-26/run_gail*/progress.csv'
progress_paths = glob.glob(os.path.expanduser(PROGRESS_CSV_PATTERN))
print(f"Found {len(progress_paths)} progress.csv files")
times = []
for progress_path in progress_paths:
    time_csv = pd.read_csv(progress_path)
    try:
        times.append(time_csv['CumTime (s)'].max())
    except KeyError as ex:
        # re-raise with the offending path, keeping the original as the cause
        raise KeyError(
            f"exception when processing '{progress_path}': {ex}") from ex
if times:
    # seconds -> hours
    print(f'Mean time (h): {np.mean(times) / (60*60)}')
else:
    # avoid np.mean([]) (nan plus a RuntimeWarning) when nothing matched
    print('Mean time (h): n/a (no progress.csv files found)')

In [None]:
import dateparser
import datetime

# Estimate BC running times from debug.log: the text before the first '|' on
# each non-blank line is parsed as a timestamp, and a run's duration is the
# difference between the last and the first such timestamp.
DEBUG_LOG_PATTERN = '../scratch/full-runs-2020-05-26/run_bc*/debug.log'
debug_log_paths = glob.glob(os.path.expanduser(DEBUG_LOG_PATTERN))
print(f"Found {len(debug_log_paths)} debug.log files")
bc_durations = []
hour = datetime.timedelta(hours=1)
for debug_log_path in debug_log_paths:
    first_line = None
    last_line = None
    with open(debug_log_path, 'r') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            # only the first and last timestamp prefixes are needed, so just
            # remember them while streaming through the file
            last_line = line.split('|', 1)[0]
            if first_line is None:
                first_line = last_line
    # dateparser.parse() returns None when it cannot interpret its input;
    # treat that the same way as an empty log instead of crashing on
    # `None - None` below
    first_time = dateparser.parse(first_line) if first_line is not None else None
    last_time = dateparser.parse(last_line) if last_line is not None else None
    if first_time is None or last_time is None:
        print(f"Couldn't parse times from {debug_log_path} (??)")
        continue
    # timedelta / timedelta -> float number of hours
    bc_durations.append((last_time - first_time) / hour)

In [None]:
# bc_durations holds per-run durations in hours (timedelta divided by a
# one-hour timedelta), so the mean is in hours as well
print(f'Mean running time: {np.mean(bc_durations)}h')