In [None]:
%matplotlib inline

import functools
import os

import matplotlib.pyplot as plt
import numpy as np

np.set_printoptions(linewidth=1000, edgeitems=30)

# Illustrative proof of concept

Given some function, we want to create a sample of points in such a way that they are likely to be around the minimum of that function, and unlikely to be at the maximum of that function.

To illustrate, let's look at a simple function $f(x) = x^2$

In [None]:
# Create the output directory up front: `savefig` raises if 'plots/' is missing.
os.makedirs('plots', exist_ok=True)

# Running 1-D example: f(x) = x^2 evaluated on 201 evenly spaced points in [-5, 5].
x = np.linspace(start=-5, stop=5, endpoint=True, num=201)
y = x**2
plt.plot(x, y)
plt.savefig('plots/example.pdf')
plt.show()

To create our function-specific sample, we first take a sample uniformly at random along the x-axis. These points are potential candidates for our final sample.

In [None]:
# Seed the global NumPy RNG so this draw (and every later `np.random` call in the
# notebook) is reproducible under Restart Kernel -> Run All.
np.random.seed(42)

# Candidate points: 30 x-positions drawn uniformly at random from the plotted range.
sample = np.random.uniform(low=-5, high=5, size=30)

plt.plot(x, y)
plt.scatter(sample, sample**2)
plt.savefig('plots/example-raw.pdf')
plt.show()

So how do we decide whether or not to keep each of these points?

We draw another uniform sample, one for each point from our previous sample. These values are scaled to the y-range we are looking at, so [0, 25] in this case.
Then we compare the function value $f(x)$ of each sampled point to its matched value from this second sample. We accept each point whose function value is lower than (or equal to) its matched value, and ignore each point whose function value is higher than its matched value.

In [None]:
# One uniform threshold per candidate point, drawn from the y-range [0, 25].
# Sizing by `sample.shape` keeps the two arrays paired even if the number of
# candidates is changed in the cell that creates `sample`.
matched_values = np.random.uniform(low=0, high=25, size=sample.shape)

plt.plot(x, y)
plt.scatter(sample, sample**2)
# Thin bars show each candidate's threshold at the candidate's x-position.
plt.bar(x=sample, height=matched_values, width=.05, color='C0')
plt.savefig('plots/example-matched.pdf')
plt.show()

In [None]:
# Rejection step: a candidate is kept when its threshold is at least f(x) = x^2,
# every other candidate is ignored.
accept = matched_values >= sample**2
ignore = ~accept

plt.plot(x, y, zorder=1)

# Kept candidates: opaque green threshold bars and markers.
plt.bar(
    x=sample[accept],
    height=matched_values[accept],
    width=.05,
    color='green',
    zorder=2,
)
plt.scatter(sample[accept], sample[accept]**2, color='green', zorder=2)

# Ignored candidates: semi-transparent red, drawn on top.
plt.bar(
    x=sample[ignore],
    height=matched_values[ignore],
    width=.05,
    color='red',
    alpha=.5,
    zorder=3,
)
plt.scatter(sample[ignore], sample[ignore]**2, color='red', alpha=.5, zorder=3)

plt.savefig(f'plots/example-included-excluded.pdf')
plt.show()

# For 2d functions
## Some initial setup

In [None]:
def row_vectorize(func):
    """Decorator that lets a single-point function also accept a collection of points.

    The wrapped function is first applied to every element obtained by iterating
    over ``X`` and the results are collected in a NumPy array. If that raises a
    ``TypeError`` (e.g. because ``X`` is not iterable, or its elements are
    scalars that ``func`` cannot iterate/unpack), ``func`` is called once on
    ``X`` itself and that result is returned unchanged.

    :param func: callable taking one point (e.g. a sequence of coordinates)
    :return: wrapper with the same name/docstring as ``func``
    """
    # Preserve __name__/__doc__ of the decorated function on the wrapper.
    @functools.wraps(func)
    def new_func(X):
        try:
            return np.array([func(row) for row in X])
        except TypeError:
            # X was a single point rather than a collection of points.
            return func(X)
    return new_func


@row_vectorize
def hm(xx):
    """
    Himmelblau function.

    Computes (x1^2 + x2 - 11)^2 + (x2^2 + x1 - 7)^2 for a single point.

    :param xx: the point as a pair [x1, x2]
    :return: function value at that point
    """
    first, second = xx

    return (first**2 + second - 11)**2 + (second**2 + first - 7)**2


@row_vectorize
def sphere(xx):
    """
    Sphere function: the sum of the squared coordinates of a single point.

    :param xx: iterable of coordinates
    :return: sum of x**2 over all coordinates
    """
    # Keep the explicit iteration: a scalar input must fail here with a
    # TypeError rather than being squared as-is.
    squared_terms = [coordinate**2 for coordinate in xx]
    return np.sum(squared_terms)

# Metadata attached directly to the (decorated) function objects:
#   name    - used to build output file names for the plots
#   min_val - lower end of the value range used to scale function values when sampling
#   max_val - upper end of that range
hm.name = 'hm'
hm.min_val = 0
hm.max_val = 450  # NOTE(review): hm exceeds this on [-5, 5]^2 (e.g. hm([5, 5]) == 890); presumably a deliberate cap — confirm

sphere.name = 'sphere'
sphere.min_val = 0
sphere.max_val = 50


# Function used by the cells that follow.
fit_func = sphere

## Determining the high/low range of values for this function

In [None]:
upper_lim, lower_lim = 5, -5
step_size = 0.1

# The grid spans [lower_lim - step_size, upper_lim + step_size] with spacing
# `step_size`: (range / step_size) + 1 points for the search range itself plus
# one extra point on each side. True division + round is used because float
# floor division under-counts for steps like 0.1 (10 // 0.1 == 99.0).
num_steps = int(round((upper_lim - lower_lim) / step_size)) + 3
linespec = np.linspace(lower_lim-step_size, upper_lim+step_size, num_steps)

X, Y = np.meshgrid(linespec, linespec)
# Evaluate every grid point in a single call: one [x, y] row per point, in the
# same flattened order as X and Y.
Z = fit_func(np.array([X.flatten(), Y.flatten()]).T)

In [None]:
# Heat map of the function over the evaluation grid; Z is stored flat, so it is
# brought back to the grid's 2-D shape for plotting.
plt.pcolor(X, Y, Z.reshape(X.shape), cmap='viridis_r')
plt.colorbar()
plt.tight_layout()
plt.savefig(f'plots/{fit_func.name}.pdf')
plt.show()

In [None]:
percentage = .95

min_val, max_val = Z.min(), Z.max()
print(f'min: {min_val}, max: {max_val}')

# Value below which a fraction `percentage` of the grid values fall. This is the
# x-position where the cumulative histogram below crosses the horizontal line.
print(f'{int(np.round(percentage*100))}th percentile: {np.quantile(Z, percentage)}')

plt.hist(Z, cumulative=True, density=True, bins=100)
plt.axhline(y=percentage, color='black', alpha=.5)
plt.show()

For the Himmelblau function (`fit_func = hm`), the highest value on this grid is roughly 900, but only 5% of all values lie above ~450, so a slightly lower 'maximum value' should be used when determining the sampling range.

## Function-dependent sampling

In [None]:
# Sampling parameters.
n_samples = 1000
ndim = 2
oversampling_factor = 1.25  # draw 25% more raw points than n_samples
min_probability = 0.0
min_val, max_val = fit_func.min_val, fit_func.max_val

# Raw candidates: uniform in [lower_lim, upper_lim] along each of the `ndim` axes, one point per row.
raw_sample = np.random.uniform(high=upper_lim, low=lower_lim, size=(int(n_samples*oversampling_factor), ndim))
f_values = fit_func(raw_sample)
# Scale function values linearly so that min_val maps to 0 and max_val maps to 1 ...
f_probabilities = (f_values - min_val)/(max_val - min_val)
# ... then compress that range to [min_probability, 1].
# NOTE(review): the later selection keeps points where `f_probabilities < check_values`,
# so this quantity acts as a rejection probability there — confirm that flooring it
# with min_probability is the intended effect.
f_probabilities = (1-min_probability) * f_probabilities + min_probability

# One uniform [0, 1) threshold per candidate.
check_values = np.random.uniform(size=f_probabilities.shape)

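A quick comparison of the uniform `check_values` distribution (orange) and the scaled function values `f_probabilities` (blue) computed above. Because we are minimizing, a point is kept when its `f_probabilities` value is *below* its `check_values` value, so `f_probabilities` is the probability of rejecting a uniformly sampled point.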

In [None]:
plt.hist(f_probabilities, label='function')
plt.hist(check_values, alpha=0.5, label='uniform')
plt.legend(loc=0)

# Number of kept points out of all raw candidates. Uses the same
# `f_probabilities < check_values` test that selects `new_samples`.
print(f"{sum(f_probabilities < check_values)}/{f_probabilities.size}")

In [None]:
scatter_size = 12

# Function landscape as background, with all raw uniform candidates on top.
plt.pcolor(X, Y, Z.reshape(X.shape), cmap='viridis_r')
plt.colorbar()
plt.scatter(
    raw_sample[:, 0],
    raw_sample[:, 1],
    s=scatter_size,
    color='orange',
    label='uniform sample',
)
plt.legend(loc=4)
plt.tight_layout()
plt.savefig(f'plots/{fit_func.name}-raw.pdf')
plt.show()

In [None]:
# Split the raw candidates: a point is kept when its scaled function value is
# below its uniform threshold, all remaining points are dropped.
new_samples = raw_sample[f_probabilities < check_values]
non_samples = raw_sample[~(f_probabilities < check_values)]

scatter_size = 12

# Landscape in the background, kept points in orange, dropped points in red.
plt.pcolor(X, Y, Z.reshape(X.shape), cmap='viridis_r')
plt.colorbar()
plt.scatter(
    new_samples[:, 0],
    new_samples[:, 1],
    s=scatter_size,
    color='orange',
    label='included in sample',
)
plt.scatter(
    non_samples[:, 0],
    non_samples[:, 1],
    s=scatter_size,
    color='red',
    label='excluded from sample',
)
plt.legend(loc=4)
plt.tight_layout()
plt.savefig(f'plots/{fit_func.name}-included-excluded.pdf')
plt.show()

## Now as a function

In [None]:
def sample_by_function(func, ndim, num_samples, min_val, max_val, upper_bound, lower_bound, *,
                       target='minimization', min_probability=0.0, oversampling_factor=2.5):
    """Draw points whose density follows the value of ``func`` (rejection sampling).

    Batches of points are drawn uniformly from ``[lower_bound, upper_bound]`` in
    every dimension. Each point's function value is scaled linearly so that
    ``min_val`` maps to 0 and ``max_val`` maps to 1, and the point is compared
    against its own uniform ``[0, 1)`` threshold. Batches are drawn until at
    least ``num_samples`` points have been accepted.

    :param func: callable taking an array of shape (n, ndim) and returning n values
    :param ndim: number of dimensions of each point
    :param num_samples: number of points to return
    :param min_val: function value that is scaled to 0
    :param max_val: function value that is scaled to 1; must be greater than ``min_val``
    :param upper_bound: upper bound of the uniform draw
    :param lower_bound: lower bound of the uniform draw
    :param target: 'minimization' favours low function values, 'maximization'
                   favours high ones (case-insensitive)
    :param min_probability: lower limit, in [0, 1], on the probability of
                            accepting any point, for either target
    :param oversampling_factor: each batch contains ``num_samples * oversampling_factor``
                                raw points (at least one)
    :return: array of shape (num_samples, ndim) with the accepted points
    :raises ValueError: for an unknown ``target``, ``max_val <= min_val``, or
                        ``min_probability`` outside [0, 1]
    """
    target = target.lower()
    if target not in ['minimization', 'maximization']:
        raise ValueError(f"Invalid optimization target '{target}', please choose 'minimization' or 'maximization' instead.")
    # A degenerate range would give inf/NaN probabilities, for which no point is
    # ever accepted and the loop below would never finish.
    if not max_val > min_val:
        raise ValueError(f"max_val ({max_val}) must be greater than min_val ({min_val}).")
    if not 0.0 <= min_probability <= 1.0:
        raise ValueError(f"min_probability must lie in [0, 1], got {min_probability}.")

    batch_size = max(1, int(num_samples*oversampling_factor))
    # Start with an empty (0, ndim) block so the result has the right shape even
    # when no batch is drawn (num_samples <= 0).
    accepted_batches = [np.empty((0, ndim))]
    num_accepted = 0

    while num_accepted < num_samples:
        raw_sample = np.random.uniform(high=upper_bound, low=lower_bound, size=(batch_size, ndim))
        f_values = func(raw_sample)
        scaled = (f_values - min_val)/(max_val - min_val)
        check_values = np.random.uniform(size=scaled.shape)

        if target == 'minimization':
            # Here the compared quantity is the rejection probability; capping it at
            # (1 - min_probability) leaves an acceptance probability of at least
            # min_probability.
            sample_filter = (1-min_probability) * scaled < check_values
        else:
            # Here the compared quantity is the acceptance probability, floored at
            # min_probability.
            sample_filter = (1-min_probability) * scaled + min_probability > check_values

        accepted = raw_sample[sample_filter]
        accepted_batches.append(accepted)
        num_accepted += len(accepted)

    return np.vstack(accepted_batches)[:num_samples]

In [None]:
# Draw 100 two-dimensional points with the packaged sampler and show them on
# top of the function landscape.
samples = sample_by_function(
    func=fit_func,
    ndim=2,
    num_samples=100,
    min_val=fit_func.min_val,
    max_val=fit_func.max_val,
    upper_bound=5,
    lower_bound=-5,
)

plt.pcolor(X, Y, Z.reshape(X.shape), cmap='viridis_r')
plt.colorbar()
plt.scatter(
    samples[:, 0],
    samples[:, 1],
    s=scatter_size,
    label='sample',
    color='orange',
)
plt.legend(loc=4)
plt.tight_layout()
plt.savefig(f'plots/{fit_func.name}-simple-sample.pdf')
plt.show()