# Adaptive CMA-ES configurations - Pre-processing

This Python Notebook covers the pre-processing of data for the adaptive CMA-ES research.

The input data consists of raw BBOB logging files (a few GB in total).

As output, we store a CSV with the required runtime in terms of evaluations to reach each specified target value for all runs, separated into files for each function/dimensionality pair.

> Sander van Rijn<br>
> s.j.van.rijn@liacs.leidenuniv.nl<br>
> LIACS<br>
> 2018-03-19

In [1]:
%matplotlib inline

from __future__ import division, print_function

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
from collections import Counter

In [2]:
# Some utility functions for dealing with the representations

# Hard-coded description of the modular CMA-ES configuration space
max_length = 11                              # number of module positions in a representation
num_options_per_module = [2] * 9 + [3] * 2   # nine binary modules followed by two ternary ones
# Mixed-radix place values: each entry equals the product of the option counts
# of all later positions (..., 18, 9, 3, 1)
factors = [2304, 1152, 576, 288, 144, 72, 36, 18, 9, 3, 1]

def list_all_representations():
    """ Enumerate every possible representation for the modular CMA-ES.

        Modules are grouped by their number of options (taken from
        <num_options_per_module>), groups are ordered by ascending option count,
        and the cartesian product over all groups is expanded.
        Each returned representation is a flat list of integers, where an entry
        belonging to a module with 'n' options lies in {0, 1, ..., n-1}.
    """
    # how many modules offer a choice between x options
    option_counts = Counter(num_options_per_module)
    per_group_choices = [
        product(range(num_options), repeat=option_counts[num_options])
        for num_options in sorted(option_counts)
    ]
    # every combination is a tuple of per-group tuples: flatten it into one list
    return [list(sum(combination, ())) for combination in product(*per_group_choices)]


def reprToString(representation):
    """ Convert the structure parameters of an ES-structure representation to a string.
        Only the first <max_length> entries are used; each one is rendered with str().

        >>> reprToString([0,0,0,0,0,1,0,1,0,1,0])
        '00000101010'
    """
    structure_part = representation[:max_length]
    return ''.join(str(option) for option in structure_part)

In [3]:
# --- Where the raw data lives and where the summaries are written ---
main_location = '/media/rijnsjvan/Data/SurfDrive/Research Data/Adaptive ES/'
data_location = '{}test_results/'.format(main_location)
output_location = '{}anytime_convergence/data/'.format(main_location)

# --- File naming scheme of the raw log files, relative to data_location ---
repetition_format = '-{rep:02d}'
file_name = '{config}/{D}d-f{f}/data_f{f}/bbobexp{rep}_f{f}_DIM{D}.dat'

# --- Experimental setup ---
num_repetitions = 5
instances = [0, 1, 2, 3, 4]
ndims = [5, 20]
fids = [1, 10, 15, 20]

# --- Targets: 51 values from 10^2 down to 10^-8, in exponent steps of 0.2 ---
num_steps = 51
exponents = np.round(np.linspace(2, -8, num_steps), decimals=1)
target_values = np.power([10] * num_steps, exponents)

all_configurations = list_all_representations()

In [4]:
# A utility function for loading a full result file
def loadfile(fname):
    """ Load a full result file into a structured numpy array.

        The first line is skipped as a header. Every following line is split on
        single spaces and parsed as an (integer, float) pair.
    """
    # The np.int / np.float aliases were removed in NumPy 1.24; they were plain
    # aliases of the builtins, so using int / float directly is equivalent.
    return np.genfromtxt(fname, delimiter=' ', skip_header=1, dtype=[int, float])


def indexbasedToStepwise(data, max_budget):
    """ Expand (index, value) pairs into a stepwise array in which every value is held
        until the next index is reached: [5, 5, 5, 5, 2, 2, 2, 1, 1, 1, ...]
        The result is cut off after <max_budget> entries.
    """
    index_column, value_column = zip(*data)
    indices = np.array(index_column)
    values = np.array(value_column)

    # Number of entries for which each value stays valid; the last value is held
    # up to and including index <max_budget>
    next_indices = np.append(indices[1:], [max_budget + 1])
    hold_lengths = next_indices - indices

    # A final index beyond the budget would give a negative count: clamp it
    hold_lengths[-1] = max(hold_lengths[-1], 0)

    return np.repeat(values, hold_lengths)[:max_budget]


def indexbasedToLinear(data, max_budget):
    """ Inflates the data to a full-length interpolated decreasing array: [5, 4.25, 3.5, 2.75, 2, 1.67, 1.33, 1, ...]

        :param data:        Sequence of (index, value) pairs, ordered by increasing index
        :param max_budget:  Maximum length of the returned array
        :return:            1-D array in which consecutive values are linearly interpolated
                            over the gap between their indices. The final value is repeated
                            up to index <max_budget> and the result is cut off after
                            <max_budget> entries, matching indexbasedToStepwise.
    """
    indices, values = map(np.array, zip(*data))
    repetitions = np.append(indices[1:], [max_budget + 1]) - indices

    # A final index beyond the budget would give a negative count: clamp it
    repetitions[-1] = max(repetitions[-1], 0)

    # endpoint=False: the end value of one segment is the start of the next one
    interpolated = [
        np.linspace(start, end, reps, endpoint=False)
        for start, end, reps in zip(values[:-1], values[1:], repetitions[:-1])
    ]
    # Previously the final value was appended exactly once and the budget was
    # ignored; hold it for the remaining evaluations instead
    interpolated.append(np.full(repetitions[-1], values[-1]))

    return np.concatenate(interpolated)[:max_budget]

In [5]:
def determineTimesToTargets(data, targets):
    """ Given the entire run-data of an algorithm, determine the required run-time in evaluations
        to reach each of the specified targets.

        :param data:     1-D numpy array with the value obtained at every step of the run
        :param targets:  Target values; the search stops at the first target that is never
                         reached, so they are assumed to be ordered from easy to hard
                         (i.e. decreasing)
        :return:         Float array with, per target, the first position in <data> whose
                         value is strictly below the target, or NaN if it is not reached
    """
    # np.NaN was removed in NumPy 2.0 and np.float in NumPy 1.24: use np.nan / float
    times_to_targets = np.full(len(targets), np.nan, dtype=float)
    for idx, target in enumerate(targets):
        below_target = data < target
        if not np.any(below_target):
            # All remaining targets keep their NaN entry
            break
        # argmax on a boolean array gives the position of the first True
        times_to_targets[idx] = np.argmax(below_target)

    return times_to_targets

# Simplifying: loads of data to manageable CSVs
So far, this has all been basic setup stuff. Now we're going to actually summarize our data.

Rather than working with the data of all complete runs, we will summarize to what we are actually interested in: the required number of evaluations for each algorithm to reach various targets during the optimization process.

In [6]:
def createRuntimeRecord(representation, ndim, fid, iid, rep, *, budget_factor=1e4):
    """ Build one record holding the runtimes needed to reach every value in <target_values>
        for a single run: algorithm {representation} on {ndim}D f{fid}, instance {iid}
        repetition {rep}.

        The record is a tuple (representation, ndim, fid, iid, rep, runtime_1, runtime_2, ...).
        The evaluation budget of the run is int(ndim * budget_factor).
    """
    budget = int(ndim * budget_factor)

    # Runs are numbered consecutively over all instances and repetitions;
    # the file of the very first run carries no number suffix
    run_index = iid * num_repetitions + rep
    suffix = repetition_format.format(rep=run_index) if run_index != 0 else ''

    path = file_name.format(config=reprToString(representation), f=fid, D=ndim, rep=suffix)
    convergence = indexbasedToLinear(loadfile(path), budget)

    runtimes = determineTimesToTargets(convergence, target_values)
    return (representation, ndim, fid, iid, rep) + tuple(runtimes)

# Column names for the records produced by createRuntimeRecord, used when the
# records are loaded into a pandas dataframe: five identifying columns followed
# by one column per target exponent
identifier_labels = ['Representation', 'ndim', 'function ID', 'instance ID', 'repetition']
record_labels = identifier_labels + ['10e{}'.format(exponent) for exponent in exponents]

In [7]:
# Smoke test: build and show the record of a single run of the first configuration
# (the raw file names are relative to data_location, hence the chdir)
os.chdir(data_location)
record = createRuntimeRecord(list_all_representations()[0], ndim=5, fid=1, iid=1, rep=1)
print(record)

([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 5, 1, 1, 1, 2.0, 5.0, 8.0, 10.0, 12.0, 213.0, 285.0, 338.0, 346.0, 351.0, 354.0, 356.0, 371.0, 404.0, 439.0, 443.0, 450.0, 486.0, 493.0, 496.0, 516.0, 528.0, 536.0, 541.0, 544.0, 640.0, 645.0, 651.0, 679.0, 685.0, 689.0, 699.0, 704.0, 720.0, 758.0, 775.0, 786.0, 807.0, 811.0, 828.0, 840.0, 851.0, 871.0, 878.0, 882.0, 884.0, 886.0, 887.0, 887.0, 887.0, 961.0)


In [8]:
# defining a progress bar (https://github.com/alexanderkuk/log-progress)
def log_progress(sequence, every=None, size=None, name='Items'):
    """ Generator that yields the items of <sequence> while showing a progress widget.

        :param sequence:  Iterable to walk through
        :param every:     Refresh the widget on the first item and on every <every>-th item.
                          Defaults to 1 for up to 200 items and to 0.5% of <size> otherwise;
                          must be given when the size is unknown
        :param size:      Number of items; defaults to len(sequence) when that is available
        :param name:      Text shown in front of the counter
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Without an explicit size we try len(); if that fails the total is unknown
    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if is_iterator:
        # Unknown total: show a full bar in 'info' style instead of a fraction
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    def refresh(position):
        if is_iterator:
            label.value = '{name}: {index} / ?'.format(name=name, index=position)
        else:
            progress.value = position
            label.value = u'{name}: {index} / {size}'.format(name=name, index=position, size=size)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                refresh(index)
            yield record
    except BaseException:
        # Anything that interrupts the iteration turns the bar red and is passed on
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))

**WARNING**

The following block of code does the heavy lifting. It is parallelized into one thread per ndim/fid combination. In a single thread, it takes 30-60 seconds to create every 1,000 records.

Runtime duration on an i7-6700 CPU @ 3.40GHz (4 cores, 8 threads), parallelized for 8 experiments: 2-3 hours (900,000 records).

_You have been warned..._

In [9]:
def createsummarycsv(ndim, fid, cases):
    """ Create the runtime records of all <cases> for one {ndim}D f{fid} experiment
        and write them to a single CSV file in <output_location>.

        :param ndim:   Dimensionality of the experiment
        :param fid:    Function ID of the experiment
        :param cases:  Iterable of (configuration, instance ID, repetition) triples
    """
    progress_name = '{}D F{}'.format(ndim, fid)
    collected = []
    for configuration, iid, rep in log_progress(cases, every=100, name=progress_name):
        try:
            collected.append(createRuntimeRecord(configuration, ndim, fid, iid, rep))
        except OSError:
            # Runs whose file is missing or unreadable are skipped;
            # FileNotFoundError is a subclass of OSError, so it is covered here too
            continue

    csv_path = output_location + 'interpolated_ART_data_{}D-f{}.csv'.format(ndim, fid)
    pd.DataFrame.from_records(collected, columns=record_labels).to_csv(csv_path)


# The raw file names are relative to data_location
os.chdir(data_location)

# Every (configuration, instance, repetition) triple; the same list is handed to each job
cases = list(product(all_configurations, instances, range(num_repetitions)))
num_cases = len(cases) * len(ndims) * len(fids)
print('Found {} cases to process. This may take a while...'.format(num_cases))

# Parallel execution: one background job per (ndim, fid) pair
from IPython.lib import backgroundjobs as bg
jobs = bg.BackgroundJobManager()
for ndim, fid in product(ndims, fids):
    jobs.new(createsummarycsv, ndim, fid, cases)

Found 921600 cases to process. This may take a while...
Starting job # 0 in a separate thread.
Starting job # 2 in a separate thread.
Starting job # 3 in a separate thread.
Starting job # 4 in a separate thread.
Starting job # 5 in a separate thread.
Starting job # 6 in a separate thread.
Starting job # 7 in a separate thread.
Starting job # 8 in a separate thread.


And we're done. Now we have the pre-processed CSV files to work with instead, which we will do in another Notebook for clarity's sake.

Of course, if the data in the CSVs has to be changed, this script has to be run again.

# Aggregating

Now that we have summarized the data, we can clean it up further by using the bootstrapping procedure described in https://arxiv.org/abs/1605.03560.

In [24]:
# output_fname = 'steepness_data_{}D-f{}.csv'
output_fname = 'interpolated_ART_data_{}D-f{}.csv'


def get_data(ndim, fid):
    """ Read the summary CSV of the {ndim}D f{fid} experiment from <output_location>
        into a pandas dataframe, using the first CSV column as index.
    """
    csv_path = output_location + output_fname.format(ndim, fid)
    return pd.read_csv(csv_path, index_col=0)

## ERT

As it requires no further simulation, we create aggregations based on the ERT first.

In [27]:
def aggregateByERT(df, max_budget):
    """ Aggregate the per-run runtimes into one ERT value per target for every
        (Representation, ndim, function ID) group.

        Per target column: ERT = (sum of runtimes, where runs that did not reach the
        target count as <max_budget>) / (number of runs that did reach the target).

        :param df:          Dataframe of runtime records; NaN marks a target that was not reached
        :param max_budget:  Runtime charged to a run for every target it did not reach
        :return:            Dataframe with one row per group; the ERT is NaN for targets
                            that no run in the group reached
    """
    group_keys = ['Representation', 'ndim', 'function ID']
    df = df.drop(columns=['instance ID', 'repetition'])

    sums = df.fillna(max_budget).groupby(by=group_keys).sum()
    counts = df.groupby(by=group_keys).count()    # count() ignores NaN entries

    ERTs = sums / counts
    # Division by a zero count yields inf: report that as 'not reached'.
    # (np.NaN was removed in NumPy 2.0, hence np.nan)
    ERTs = ERTs.replace(np.inf, np.nan).reset_index()
    return ERTs


# df = get_data(5, 1)
# new_df = aggregateByERT(df, 5*10e4)
# print(new_df)
# print(new_df.shape)

In [29]:
for ndim, fid in product(ndims, fids):
    df = get_data(ndim=ndim, fid=fid)
    # Runs that never reach a target are charged the full evaluation budget, which
    # createRuntimeRecord sets to ndim * 1e4 by default. The literal 10e4 that was
    # used here before equals 1e5, i.e. ten times that budget.
    df = aggregateByERT(df, ndim*1e4)
    df.to_csv(output_location + 'ERT_data_{}D-f{}.csv'.format(ndim, fid))
#     df.to_csv(output_location + 'stepwise_ERT_data_{}D-f{}.csv'.format(ndim, fid))

## Bootstrapping aRT

Now for the slightly more difficult/time-consuming option: actually bootstrapping the aRT.

In [None]:
num_samples = 100


def bootstrapART(df, max_budget):
    """ Placeholder for the bootstrapped aRT aggregation.

        Not implemented yet: the per-run identifier columns are dropped from a
        local copy and nothing is returned.
    """
    #TODO: implement bootstrapping
    df = df.drop(columns=['instance ID', 'repetition'])
    return None
#     sums = df.fillna(max_budget).groupby(by=['Representation', 'ndim', 'function ID']).sum()
#     counts = df.groupby(by=['Representation', 'ndim', 'function ID']).count()

#     ERTs = sums/counts
#     ERTs = ERTs.reset_index()
#     return ERTs