# Benchmarks for improving the run time of the MINLP solver as problems scale up

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to the project package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [80]:
import itertools
import numpy as np
from collections import namedtuple
np.random.seed(42)

# Benchmark inputs: fixed, randomly generated test sets. The grid is meant to grow
# in size, dimension and scale (a larger scale means a lower density).
sizes = [100]
dimensions = [2, 4]
scales = [1]

Dataset = namedtuple("Dataset", ["data", "size", "dimension", "scale", "source"])


def _random_dataset(size, dimension, scale):
    """Draw ``size`` uniform points in ``dimension`` dimensions, stretched by ``scale``."""
    points = np.random.rand(size, dimension) * scale
    return Dataset(points, size, dimension, scale, "random")


# Keyed by (size, dimension, scale); generated in itertools.product order so the
# random draws are reproducible under the seed set above.
datasets = {key: _random_dataset(*key) for key in itertools.product(sizes, dimensions, scales)}

Note: Below we set values for `min_volume` and the absolute errors based on the fact that here we know the underlying distribution to be uniform, which means that the true underlying $f(h)$ value for all patterns $h$ is actually $1$. We really only care about computational effects, but the hope is that these are roughly the kinds of ranges that are relevant when dealing with actual anomaly detection problems and other distributions.

In [81]:
from typing import Callable
import time
from rare_pattern_detect.minlp_based import MINLPModel

Parameter = namedtuple("Parameter", ["name", "value"])


def _parameter_grid(name, values):
    """Wrap every candidate value of one solver option in a named ``Parameter``."""
    return [Parameter(name, value) for value in values]


# Solver options to sweep; each list holds the candidate values of one option.
bound_included = _parameter_grid("bound included", [0.05, 1])
initial_patterns = _parameter_grid("initial pattern", ["minimal", "maximal"])
min_volumes = _parameter_grid("min volume", [0.05, 0.1])
# Thinking this through: whatever absolute error we set can presumably just be
# added to the epsilon of the PAC performance.
absolute_errors = _parameter_grid("absolute error", [1e-3, 0.1])
# The relative error becomes useful especially if epsilon is set relative to an estimate of f.
relative_errors = _parameter_grid("relative error", [1e-3, 0.1])
# Ideas not swept yet: number of parallel threads (2, 4, 6) and different solvers.

# One list per option; the experiment loop takes the cartesian product of these lists.
parameters = [
    bound_included,
    initial_patterns,
    min_volumes,
    absolute_errors,
    relative_errors,
]

def generate_solver_settings(in_dict):
    """Translate benchmark option names into solver tolerance settings.

    Args:
        in_dict: mapping of option name to value; only the keys
            "absolute error" and "relative error" are looked at.

    Returns:
        dict: "absolute_bound_tolerance" and/or "relative_bound_tolerance",
        each present only when the corresponding option is in ``in_dict``.
    """
    option_to_setting = (
        ("absolute error", "absolute_bound_tolerance"),
        ("relative error", "relative_bound_tolerance"),
    )
    return {
        setting: in_dict[option]
        for option, setting in option_to_setting
        if option in in_dict
    }

def run_on_testset(data, testdata, **kwargs):
    """Solve for the minimal f_hat at every point of ``testdata``.

    A fresh ``MINLPModel`` is built on ``data`` for each test point, and
    ``find_min_f_hat`` is called on it with the tolerances that
    ``generate_solver_settings`` derives from ``kwargs``.

    Args:
        data: training data handed to ``MINLPModel``.
        testdata: iterable of points to evaluate.
        **kwargs: benchmark options; forwarded unchanged to ``MINLPModel`` and
            also used to derive the solver settings.

    Returns:
        tuple: ``(solutions, solver_settings)`` where ``solutions`` has one
        entry per test point and ``solver_settings`` is taken from the last
        model built. For an empty ``testdata`` this is ``([], {})``.
    """
    solutions = []
    model = None
    for point in testdata:
        # NOTE(review): min_volume receives the literal string "kwargs" while the
        # numeric value travels in kwargs["min volume"]; confirm MINLPModel
        # resolves this as intended.
        model = MINLPModel(data, min_volume="kwargs", **kwargs)
        solutions.append(model.find_min_f_hat(point, solver_settings=generate_solver_settings(kwargs)))
    if model is None:
        # No test points: no model was built, so there are no settings to report.
        # Previously this path failed with UnboundLocalError on `model`.
        return solutions, {}
    return solutions, model.solver_settings

def run_on_whole_dataset(data, **kwargs):
    """Evaluate every point of ``data`` against a model trained on ``data`` itself.

    Args:
        data: dataset used both as training data and as test points.
        **kwargs: benchmark options forwarded to ``run_on_testset``, matching
            the other ``run_on_*`` helpers (previously they could not be passed).

    Returns:
        Whatever ``run_on_testset`` returns: ``(solutions, solver_settings)``.
    """
    return run_on_testset(data, data, **kwargs)

def run_on_fixed_size_sample(data, size, **kwargs):
    """Evaluate a reproducible random sample of ``size`` points drawn from ``data``.

    The global NumPy seed is reset to 0 on every call, so the same indices are
    drawn each time. At least one point is always sampled, and indices are drawn
    with ``np.random.choice`` defaults.

    Args:
        data: dataset, indexable by an integer index array.
        size: requested number of test points; must not exceed ``len(data)``.
        **kwargs: benchmark options forwarded to ``run_on_testset``.

    Returns:
        Whatever ``run_on_testset`` returns for the sampled points.
    """
    np.random.seed(0)
    n_points = len(data)
    assert size <= n_points, "size larger than dataset"
    n_draws = max(1, size)
    chosen = np.random.choice(n_points, n_draws)
    test_points = data[chosen]
    return run_on_testset(data, test_points, **kwargs)

def run_on_fraction(data, fraction, **kwargs):
    """Evaluate a random sample covering roughly ``fraction`` of ``data``.

    The sample size is ``len(data) * fraction`` rounded with the built-in
    ``round``; sampling itself is delegated to ``run_on_fixed_size_sample``.
    """
    sample_size = round(len(data) * fraction)
    return run_on_fixed_size_sample(data, sample_size, **kwargs)

def benchmark_and_store_result(expression: Callable):
    """Run ``expression`` once and append the elapsed time to its result.

    Timing uses ``time.perf_counter``, a monotonic high-resolution clock, so the
    measurement cannot be distorted by system clock adjustments.

    Args:
        expression: zero-argument callable returning an iterable (e.g. a tuple).

    Returns:
        tuple: the items of the callable's result followed by the elapsed
        time in seconds as a float.
    """
    start = time.perf_counter()
    res = expression()
    elapsed = time.perf_counter() - start
    return (*res, elapsed)

In [83]:
import mlflow

# Run one MLflow-tracked experiment per (parameter combination, dataset) pair.
for parameter_combo in itertools.product(*parameters):
    for dataset in datasets.values():
        print(f"Running experiment for dataset {(dataset.size, dataset.dimension)} with parameters {[(p.name, p.value) for p in parameter_combo]}")
        with mlflow.start_run():
            # dataset parameters (the dataset's scale is logged under the name "density")
            mlflow.log_param("size", dataset.size)
            mlflow.log_param("dimension", dataset.dimension)
            mlflow.log_param("density", dataset.scale)
            mlflow.log_param("source", dataset.source)

            # option parameters; logged before the min volume is rescaled below,
            # so the logged "min volume" is the unscaled fraction
            for parameter in parameter_combo:
                mlflow.log_param(parameter.name, parameter.value)

            parameter_dict = {parameter.name : parameter.value for parameter in parameter_combo}

            # min_volume is expressed as fraction of scale
            parameter_dict["min volume"] *= dataset.scale

            # timing results: a single sampled point is solved per run
            f_hats, solver_settings, time_passed = benchmark_and_store_result(lambda: run_on_fixed_size_sample(dataset.data,1, **parameter_dict))
            performance = np.mean(f_hats)
            mlflow.log_metric("time", time_passed)
            mlflow.log_metric("average f_hat", performance)
            # report the measured duration (the original interpolated the `time` module here)
            print(f"Results: f_hat: {performance}, time: {time_passed}")

            # record the settings reported by the model alongside the run
            for k,v in solver_settings.items():
                mlflow.log_param(k,v)

Running experiment for dataset (100, 2) with parameters [('bound included', 0.05), ('initial pattern', 'minimal'), ('min volume', 0.05), ('absolute error', 0.001), ('relative error', 0.001)]
Running experiment for dataset (100, 4) with parameters [('bound included', 0.05), ('initial pattern', 'minimal'), ('min volume', 0.05), ('absolute error', 0.001), ('relative error', 0.001)]
Running experiment for dataset (100, 2) with parameters [('bound included', 0.05), ('initial pattern', 'minimal'), ('min volume', 0.05), ('absolute error', 0.001), ('relative error', 0.1)]
Running experiment for dataset (100, 4) with parameters [('bound included', 0.05), ('initial pattern', 'minimal'), ('min volume', 0.05), ('absolute error', 0.001), ('relative error', 0.1)]
Running experiment for dataset (100, 2) with parameters [('bound included', 0.05), ('initial pattern', 'minimal'), ('min volume', 0.05), ('absolute error', 0.1), ('relative error', 0.001)]
Running experiment for dataset (100, 4) with parame

In [None]:
# Start the MLflow tracking UI to browse the runs logged by the experiment loop.
!mlflow ui

[2023-01-27 11:27:24 +0100] [25367] [INFO] Starting gunicorn 20.1.0
[2023-01-27 11:27:24 +0100] [25367] [INFO] Listening at: http://127.0.0.1:5000 (25367)
[2023-01-27 11:27:24 +0100] [25367] [INFO] Using worker: sync
[2023-01-27 11:27:24 +0100] [25369] [INFO] Booting worker with pid: 25369
[2023-01-27 11:27:24 +0100] [25370] [INFO] Booting worker with pid: 25370
[2023-01-27 11:27:24 +0100] [25371] [INFO] Booting worker with pid: 25371
[2023-01-27 11:27:24 +0100] [25372] [INFO] Booting worker with pid: 25372
