# Benchmarks for improving the run time of the MINLP solver as problems scale up

In [1]:
%load_ext autoreload
%autoreload 2

In [47]:
import itertools
import numpy as np
from collections import namedtuple
np.random.seed(42)

# Benchmark inputs: fixed, randomly generated test sets that grow in size,
# dimension and scale (a larger scale means a lower point density).
sizes = [10, 100]
dimensions = [1, 2, 3, 10]
scales = [1, 10]

Dataset = namedtuple("Dataset", "data size dimension scale source")

# One uniformly sampled dataset per (size, dimension, scale) grid point, keyed by that triple.
# The grid is walked in itertools.product order, which fixes the order of the random draws.
datasets = {
    (n_points, n_dims, scale): Dataset(
        data=np.random.rand(n_points, n_dims) * scale,
        size=n_points,
        dimension=n_dims,
        scale=scale,
        source="random",
    )
    for n_points, n_dims, scale in itertools.product(sizes, dimensions, scales)
}

Some considerations on how to set the min_volume, the absolute_error and the relative_error below:

As we're sampling from a uniform distribution, we know that the true value of f for every pattern is 1. In this sense, for this benchmark the ground truth is trivial. But we care about the computational effect of tweaks and settings, so this is fine (we'll also benchmark other datasets).

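Nevertheless, we do need to "simulate" an anomaly detection experiment, because we want to set ranges for the min_volume and the errors that would be relevant if … *(TODO: this sentence is unfinished — state the condition under which these ranges are relevant.)*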

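The f_hat samples should then be normally distributed around 1, with a standard deviation that decreases proportionally to $1/\sqrt{N}$. To remind ourselves of the epsilon bounds: if we want to achieve certain delta and epsilon values, then we have to choose … *(TODO: this sentence is unfinished — state the resulting choice of parameters.)*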

In [78]:
from typing import Callable
import time
from rare_pattern_detect.minlp_based import MINLPModel

Parameter = namedtuple("Parameter", "name value")

# Solver parameter sweeps: each list holds every value to try for one named option.
bound_included = [Parameter(name="bound included", value=bound) for bound in (0.05, 1)]
initial_patterns = [Parameter(name="initial pattern", value=pattern) for pattern in ("minimal", "maximal")]
min_volumes = [Parameter(name="min volume", value=volume) for volume in (0.01, 0.05, 0.1)]
# Thinking this through: whatever absolute error we set can presumably simply be added
# to the epsilon of the PAC performance.
absolute_errors = [Parameter(name="absolute error", value=error) for error in (1e-3, 0.1, 0.2)]
# The relative error becomes useful especially if epsilon is set relative to an estimate of f.
relative_errors = [Parameter(name="relative error", value=error) for error in (1e-3, 0.1, 0.2)]
# Sweeps that are not enabled (yet):
# use_parallel_threads = [2, 4, 6]
# use_different_solvers = [False, ]

# All sweeps, in the order in which itertools.product will combine them.
parameters = [
    bound_included,
    initial_patterns,
    min_volumes,
    absolute_errors,
    relative_errors,
]

def generate_solver_settings(in_dict):
    """Translate benchmark option names into solver setting names.

    Looks up "absolute error" and "relative error" in ``in_dict`` and returns a new
    dict that maps them to "absolute_bound_tolerance" and "relative_bound_tolerance"
    respectively. Options that are missing from ``in_dict`` are left out of the
    result; every other key in ``in_dict`` is ignored.
    """
    option_to_setting = (
        ("absolute error", "absolute_bound_tolerance"),
        ("relative error", "relative_bound_tolerance"),
    )
    return {
        setting: in_dict[option]
        for option, setting in option_to_setting
        if option in in_dict
    }

def run_on_testset(data, testdata, **kwargs):
    """Build a fresh MINLPModel on ``data`` for each test point and collect its min f_hat.

    Args:
        data: training data handed to ``MINLPModel``.
        testdata: iterable of points; ``find_min_f_hat`` is called once per point.
        **kwargs: benchmark options keyed by their sweep names (e.g. "min volume",
            "absolute error"). All of them are forwarded to ``MINLPModel`` unchanged.
            In addition, the "min volume" entry is passed as the ``min_volume`` argument,
            and the error entries are translated by ``generate_solver_settings``.
            A ``min_volume`` keyword given directly is forwarded as-is.

    Returns:
        A tuple ``(solutions, solver_settings)``: the list of ``find_min_f_hat`` results
        in ``testdata`` order, and the ``solver_settings`` attribute of the last model
        that was built (an empty dict when ``testdata`` is empty).

    Note:
        When neither "min volume" nor ``min_volume`` is supplied, no ``min_volume`` is
        passed and ``MINLPModel`` decides what happens (NOTE(review): confirm whether it
        has a default).
    """
    # Previously the literal string "kwargs" was passed as min_volume, so the swept
    # "min volume" value never reached the model.
    model_kwargs = dict(kwargs)
    if "min volume" in kwargs:
        model_kwargs["min_volume"] = kwargs["min volume"]

    solutions = []
    model = None
    for point in testdata:
        # A new model per point, as before; the settings dict is rebuilt per call so
        # the solver never sees a dict shared between points.
        model = MINLPModel(data, **model_kwargs)
        solutions.append(model.find_min_f_hat(point, solver_settings=generate_solver_settings(kwargs)))

    if model is None:
        # Empty test set: no model was built, so there are no settings to report.
        return solutions, {}
    return solutions, model.solver_settings

def run_on_whole_dataset(data, **kwargs):
    """Evaluate every point of ``data`` against a model built on that same ``data``.

    Args:
        data: dataset used both as training data and as the test set.
        **kwargs: benchmark options, forwarded to ``run_on_testset`` (accepted here so
            this runner can be configured the same way as the sampling runners).

    Returns:
        Whatever ``run_on_testset`` returns for ``(data, data)``.
    """
    return run_on_testset(data, data, **kwargs)

def run_on_fixed_size_sample(data, size, **kwargs):
    """Run the benchmark on a reproducible random sample of ``size`` points from ``data``.

    Args:
        data: dataset used as training data; it is indexed with an integer index array
            to build the test set, so it must support that (e.g. a NumPy array).
        size: number of test points to draw; values below 1 are raised to 1.
        **kwargs: benchmark options, forwarded to ``run_on_testset``.

    Returns:
        Whatever ``run_on_testset`` returns for the sampled test set.

    Raises:
        AssertionError: if ``size`` exceeds the number of points in ``data``.
    """
    N = len(data)
    assert size <= N, "size larger than dataset"
    # A private generator with a fixed seed keeps the sample reproducible without
    # resetting the global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(0)
    # Draw distinct points: the size <= N guard only makes sense without replacement,
    # and duplicates would just repeat identical solver runs.
    sample_indices = rng.choice(N, max(1, size), replace=False)
    return run_on_testset(data, data[sample_indices], **kwargs)

def run_on_fraction(data, fraction, **kwargs):
    """Run the benchmark on a sample whose size is ``fraction`` of the dataset.

    The sample size is ``len(data) * fraction`` rounded with the built-in ``round``;
    the sampling itself, and any handling of ``kwargs``, is delegated to
    ``run_on_fixed_size_sample``.
    """
    sample_size = round(len(data) * fraction)
    return run_on_fixed_size_sample(data, sample_size, **kwargs)

def benchmark_and_store_result(expression: Callable):
    """Call ``expression`` once and append the elapsed time to its result.

    Args:
        expression: zero-argument callable that returns an iterable (typically a tuple).

    Returns:
        A tuple made of the unpacked items returned by ``expression`` followed by the
        elapsed time in seconds as the last item.
    """
    # perf_counter is monotonic and high-resolution, so the measurement is not
    # affected by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    res = expression()
    elapsed = time.perf_counter() - start
    return *res, elapsed

In [None]:
import mlflow

# One MLflow run per (parameter combination, dataset) pair.
for parameter_combo in itertools.product(*parameters):
    # Runner options for this combination, keyed by each parameter's display name.
    parameter_dict = {parameter.name: parameter.value for parameter in parameter_combo}

    for dataset in datasets.values():
        with mlflow.start_run():
            # dataset parameters (the dataset's scale factor is logged under the key "density")
            dataset_params = (
                ("size", dataset.size),
                ("dimension", dataset.dimension),
                ("density", dataset.scale),
                ("source", dataset.source),
            )
            for param_name, param_value in dataset_params:
                mlflow.log_param(param_name, param_value)

            # option parameters
            for parameter in parameter_combo:
                mlflow.log_param(parameter.name, parameter.value)

            # timing results: solve for a single sampled point and time the whole call
            f_hats, solver_settings, time_passed = benchmark_and_store_result(
                lambda: run_on_fixed_size_sample(dataset.data, 1, **parameter_dict)
            )
            mlflow.log_metric("time", time_passed)
            mlflow.log_metric("average f_hat", np.mean(f_hats))

            # settings reported back by the model, logged as further run parameters
            for k, v in solver_settings.items():
                mlflow.log_param(k, v)

main MILP was unbounded. Resolving with arbitrary bound values of (-1e+15, 1e+15) on the objective. You can change this bound with the option obj_bound.
infeasibility detected in deactivate_trivial_constraints
Feasibility subproblem infeasible. This should never happen.
main MILP was unbounded. Resolving with arbitrary bound values of (-1e+15, 1e+15) on the objective. You can change this bound with the option obj_bound.
infeasibility detected in deactivate_trivial_constraints
Feasibility subproblem infeasible. This should never happen.
main MILP was unbounded. Resolving with arbitrary bound values of (-1e+15, 1e+15) on the objective. You can change this bound with the option obj_bound.
infeasibility detected in deactivate_trivial_constraints
Feasibility subproblem infeasible. This should never happen.
main MILP was unbounded. Resolving with arbitrary bound values of (-1e+15, 1e+15) on the objective. You can change this bound with the option obj_bound.
infeasibility detected in deactiva