In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import sys
from pathlib import Path

# Make the repository root (two levels up from this notebook) importable.
# sys.path holds plain strings, so the membership test must compare the
# resolved string form; comparing the Path object itself never matches and
# would append a duplicate entry every time this cell is re-run.
module_path = Path('../..')
if str(module_path.resolve()) not in sys.path:
    sys.path.append(str(module_path.resolve()))

import mf2
import multiLevelCoSurrogates as mlcs
from sklearn.gaussian_process import GaussianProcessRegressor, kernels

# Seed NumPy's global RNG so repeated runs of the notebook are reproducible
np.random.seed(20160501)
# Compact, wide array printing without scientific notation
np.set_printoptions(precision=4, suppress=True, linewidth=200, edgeitems=10)

# The multi-fidelity benchmark used throughout; its .high/.low are called below
OD = mf2.forrester

# Output/input locations, relative to this notebook
plot_dir = Path('../../plots/')
data_dir = Path('../../files/')

# Recreating the example plot in [Forrester2007 (Multi-fidelity optimization via surrogate modelling)](https://royalsocietypublishing.org/doi/full/10.1098/rspa.2007.1900)

<img src="https://royalsocietypublishing.org/cms/attachment/efa57e07-5384-4503-8b2b-ccbe632ffe87/3251fig1.jpg" alt="Forrester2007 example plot" width="400"/>

## Step by step construction

The function in question:

In [None]:
# Dense grid on [0, 1] used for drawing curves, as an (n, 1) column
plot_x = np.linspace(0, 1, 501)[:, np.newaxis]

# 11 evenly spaced low-fidelity sample locations, as an (n, 1) column
low_x = np.linspace(0, 1, 11)[:, np.newaxis]
# High-fidelity samples are the subset of low_x at rows 0, 4, 6 and 10
high_x = np.take(low_x, [0, 4, 6, 10], axis=0)

In [None]:
# Evaluate both fidelities on the dense plotting grid
plot_high = OD.high(plot_x)
plot_low = OD.low(plot_x)

# Draw one labelled curve per fidelity
for curve_y, curve_label in ((plot_high, 'high'), (plot_low, 'low')):
    plt.plot(plot_x, curve_y, label=curve_label)
plt.legend(loc=1)
plt.show()

Showing the datapoints selected by the paper.

In [None]:
# Function values at the selected sample locations of each fidelity
high_y = OD.high(high_x)
low_y = OD.low(low_x)

# For each fidelity: draw the curve, then its samples in the same colour
for curve_y, pts_x, pts_y, curve_label in (
    (plot_high, high_x, high_y, 'high'),
    (plot_low, low_x, low_y, 'low'),
):
    line, = plt.plot(plot_x, curve_y, label=curve_label)
    plt.scatter(pts_x, pts_y, color=line.get_color())
plt.legend(loc=1)
plt.show()

In [None]:
# Shared default kernel with tunable hyperparameters:
# a constant amplitude multiplied by an RBF whose length scale is bounded
kernel = (
    kernels.ConstantKernel(constant_value=1.0)
    * kernels.RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0))
)

Training a separate Gaussian Process model for each fidelity, using only that fidelity's own data. The low-fidelity model is a good fit; the high-fidelity model is not.

In [None]:
# One independent GP per fidelity, each trained only on its own samples
gp_direct = GaussianProcessRegressor(kernel=kernel).fit(high_x, high_y)
gp_low = GaussianProcessRegressor(kernel=kernel).fit(low_x, low_y)

# True curves with their sample points in matching colours
for curve_y, pts_x, pts_y, curve_label in (
    (plot_high, high_x, high_y, 'high'),
    (plot_low, low_x, low_y, 'low'),
):
    line, = plt.plot(plot_x, curve_y, label=curve_label)
    plt.scatter(pts_x, pts_y, color=line.get_color())

# Model predictions over the dense grid
for model, curve_label in ((gp_direct, 'high-fit GP'), (gp_low, 'low-fit GP')):
    plt.plot(plot_x, model.predict(plot_x), label=curve_label)
plt.legend(loc=1)
plt.show()

Co-Kriging formulation is $\hat{f}_h(x) = \rho * f_l(x) + \delta(x)$. <br>
$\hat{f}_h(x)$ is the high-fidelity prediction at $x$<br>
$\rho$ is a scaling factor<br>
$f_l(x)$ is a low-fidelity information input (either actual or another model) at $x$<br>
$\delta(x)$ is a prediction for the difference between $f_h(x)$ and $\rho * f_l(x)$<br>

$\rho$ is calculated as $\rho = \left(\frac{1}{n}\sum_{i=1}^{n} \frac{f_h(x_i)}{f_l(x_i)}\right)^{-1}$, i.e. `1/mean(f_high(x_high) / f_low(x_high))` with `x_high` being all inputs for which we have high-fidelity outcomes.

Here we start by plotting just the parts of this equation.<br>
In this example, there is an explicit scaling factor of __2__ between high and low fidelity that is seen to be easily captured by the difference model $\delta(x)$, i.e. `gp_diff`

In [None]:
# Low-fidelity responses at the high-fidelity sample locations (kept as a column)
low_at_high = np.asarray(OD.low(high_x)).reshape(-1, 1)
# rho = 1 / mean(f_h(x_i) / f_l(x_i)). Both operands are flattened first so the
# ratio is taken element-wise per sample; dividing a 1-D array by a column
# vector would silently broadcast to an n-by-n matrix of cross-ratios.
scale = 1 / np.mean(np.ravel(high_y) / np.ravel(low_at_high))

# Difference model delta(x): fitted on f_h(x) - rho * f_l(x) at the
# high-fidelity locations; targets are kept as an (n, 1) column.
diff_x = high_x
diff_y = (np.ravel(OD.high(diff_x)) - scale*np.ravel(OD.low(diff_x))).reshape(-1, 1)
gp_diff = GaussianProcessRegressor(kernel=kernel)
gp_diff.fit(diff_x, diff_y)

line, = plt.plot(plot_x, plot_high, label='high')
plt.scatter(high_x, high_y, color=line.get_color())
line, = plt.plot(plot_x, plot_low, label='low')
plt.scatter(low_x, low_y, color=line.get_color())
plt.plot(plot_x, gp_direct.predict(plot_x), label='high-fit GP')
plt.plot(plot_x, gp_low.predict(plot_x), label='low-fit GP')
# Unscaled difference between the true curves, shown for reference
plt.plot(plot_x, plot_high - plot_low, label='diff')
plt.plot(plot_x, gp_diff.predict(plot_x), label='scaled diff-fit GP')
plt.legend(loc=1)
plt.show()

In [None]:
# Display the estimated scaling factor rho
scale

The `scale` parameter here is an estimate based on the datapoints we have. For this example with only four high-fidelity points, this is a reasonable, but not exact fit. The actual value according to the function definition should be 2, and the value stated by the paper to match best in the x-range [0,1] is 1.87.

And now with the actual co-kriging prediction plotted.

In [None]:
def co_y(x):
    """Co-kriging prediction: scale * low-fidelity GP + difference GP.

    Both GP outputs are flattened before being combined so that a 1-D
    prediction added to a column-shaped prediction cannot broadcast into
    an n-by-n matrix.

    :param x: input locations, passed unchanged to both GPs' ``predict``
    :return:  1-D array with one predicted high-fidelity value per row of x
    """
    return scale * np.ravel(gp_low.predict(x)) + np.ravel(gp_diff.predict(x))


line, = plt.plot(plot_x, plot_high, label='high')
plt.scatter(high_x, high_y, color=line.get_color())
line, = plt.plot(plot_x, plot_low, label='low')
plt.scatter(low_x, low_y, color=line.get_color())
plt.plot(plot_x, gp_direct.predict(plot_x), label='high-fit GP')
plt.plot(plot_x, gp_low.predict(plot_x), label='low-fit GP')
plt.plot(plot_x, co_y(plot_x), label='co-kriging')
plt.legend(loc=1)
plt.show()

## Direct construction with (Hierarchical)Surrogate

Recreating the same plot as above using our own (Hierarchical)Surrogate interface.

In [None]:
# Archive only has to be created once...
archive = mlcs.CandidateArchive(ndim=1, fidelities=['high', 'low', 'high-low'])
# Register the samples of each fidelity: low first, then high
for fid_name, fid_x, fid_y in (('low', low_x, low_y), ('high', high_x, high_y)):
    archive.addcandidates(fid_x, fid_y, fidelity=fid_name)

### Without normalization by Surrogate

In [None]:
# Direct Kriging surrogates per fidelity, plus the hierarchical (co-kriging)
# surrogate built on top of the low-fidelity one -- all without normalization
surr_high = mlcs.Surrogate.fromname('Kriging', archive, fidelity='high', normalized=False)
surr_low = mlcs.Surrogate.fromname('Kriging', archive, fidelity='low', normalized=False)
surr_hier = mlcs.HierarchicalSurrogate('Kriging', surr_low, archive, ['high', 'low'], normalized=False)

for surrogate in (surr_high, surr_low, surr_hier):
    surrogate.train()

# Plotting
for predictor, curve_label in (
    (OD.high, 'high'),
    (OD.low, 'low'),
    (surr_high.predict, 'high-fit GP'),
    (surr_low.predict, 'low-fit GP'),
    (surr_hier.predict, 'co-kriging'),
):
    plt.plot(plot_x, predictor(plot_x), label=curve_label)
plt.legend(loc=0)
plt.tight_layout()
plt.show()

### With normalization by Surrogate

Just to show that the normalization is correctly implemented.<br>
Because of the values in this example, it's not really needed, but if the results at least don't get worse in this case, it's probably correct.

In [None]:
# Same three surrogates as in the unnormalized case, now with normalized=True
surr_high = mlcs.Surrogate.fromname('Kriging', archive, fidelity='high', normalized=True)
surr_low = mlcs.Surrogate.fromname('Kriging', archive, fidelity='low', normalized=True)
surr_hier = mlcs.HierarchicalSurrogate('Kriging', surr_low, archive, ['high', 'low'], normalized=True)

for surrogate in (surr_high, surr_low, surr_hier):
    surrogate.train()

# Plotting
for predictor, curve_label in (
    (OD.high, 'high'),
    (OD.low, 'low'),
    (surr_high.predict, 'high-fit GP'),
    (surr_low.predict, 'low-fit GP'),
    (surr_hier.predict, 'co-kriging'),
):
    plt.plot(plot_x, predictor(plot_x), label=curve_label)
plt.legend(loc=0)
plt.tight_layout()
plt.show()

## Direct construction with MultiFidelityBO

Recreating the same plot again with the MultiFidelityBO (Bayesian Optimization) interface.<br>
This interface automatically creates a full set of hierarchical models for any number of fidelities.

In [None]:
# Let MultiFidelityBO build the direct and hierarchical models from the archive
mfbo = mlcs.MultiFidelityBO(OD, archive)

# Plotting
plt.plot(plot_x, OD.high(plot_x), label='high')
plt.plot(plot_x, OD.low(plot_x), label='low')
plt.plot(plot_x, mfbo.direct_models['high'].predict(plot_x), label='high-fit GP')
plt.plot(plot_x, mfbo.models['low'].predict(plot_x), label='low-fit GP')
plt.plot(plot_x, mfbo.models['high'].predict(plot_x), label='co-kriging')
plt.legend(loc=0)
plt.tight_layout()
# Join with the Path operator: str(plot_dir) has no trailing separator, so
# f'{plot_dir}name.pdf' would write '../../plotsname.pdf' beside the folder.
plot_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_dir / 'forrester2007_recreated.pdf')
plt.show()

## Making the match exact

We make two changes to the procedure to really recreate the plot:
 1. Using $f_l(x)$ directly rather than model $\hat{f}_l(x)$
 2. Using better scaling values. `1.87` gives the match seen in the original picture, while `2` gives a perfect match 

The first change should actually be used too. If predicting some $\hat{f}_h(x)$ value for a completely new point $x$, then obviously the lower-fidelity models are the only available source of information. But when selecting which point to evaluate in higher fidelity, the exact lower fidelity information is usually available and can therefore be used.

The value `1.87` comes from taking the mean over the entire range (based on 100 samples) rather than just the 4 common datapoints we have, while the value `2` is derived from the function definition.

In [None]:
# Tabulate rho = 1 / mean(f_h / f_l) for increasingly dense sample grids.
print('   n  | Mean ratio f_h / f_l')
print('------+----------------------')
for n in [50, 100, 200, 300, 400, 500, 750, 1000]:
    # Pass the samples as an (n+1, 1) column, like every other call in this
    # notebook, so each value is evaluated as its own 1-D input point.
    sample_x = np.linspace(0, 1, n+1).reshape(-1, 1)
    high = np.ravel(OD.high(sample_x))
    low = np.ravel(OD.low(sample_x))
    rho = 1/np.mean(high/low)
    print(f'{n:>5} |       {rho:<.6}  {"<---" if n==100 else ""}')

In [None]:
# Difference GPs for the two fixed scaling factors (2 and 1.87), fitted on
# f_h(x) - rho * f_l(x) at the high-fidelity sample locations
gp_diff_20 = GaussianProcessRegressor(kernel=kernel).fit(diff_x, np.array([(OD.high(x) - 2*OD.low(x)) for x in diff_x]))
gp_diff_187 = GaussianProcessRegressor(kernel=kernel).fit(diff_x, np.array([(OD.high(x) - 1.87*OD.low(x)) for x in diff_x]))


def cokriging_y_20(x):
    """Co-kriging prediction with rho = 2, using the exact low-fidelity function.

    Both terms are flattened so their sum is always a 1-D array.
    """
    return 2*np.ravel(OD.low(x)) + np.ravel(gp_diff_20.predict(x))


def cokriging_y_187(x):
    """Co-kriging prediction with rho = 1.87, using the exact low-fidelity function.

    Both terms are flattened so their sum is always a 1-D array.
    """
    return 1.87*np.ravel(OD.low(x)) + np.ravel(gp_diff_187.predict(x))


line, = plt.plot(plot_x, plot_high, label='high')
plt.scatter(high_x, high_y, color=line.get_color())
line, = plt.plot(plot_x, plot_low, label='low')
plt.scatter(low_x, low_y, color=line.get_color())
plt.plot(plot_x, gp_direct.predict(plot_x), label='high-fit GP')
plt.plot(plot_x, gp_low.predict(plot_x), label='low-fit GP')
plt.plot(plot_x, cokriging_y_20(plot_x), label='co-kriging (2)')
plt.plot(plot_x, cokriging_y_187(plot_x), label='co-kriging (1.87)')
plt.legend(loc=0)
plt.tight_layout()
# Join with the Path operator: str(plot_dir) has no trailing separator, so
# f'{plot_dir}name' would write next to the plots folder instead of into it.
plot_dir.mkdir(parents=True, exist_ok=True)
# The PNG is the file embedded by the side-by-side comparison in this notebook
plt.savefig(plot_dir / 'accurate_forrester2007.png')
plt.savefig(plot_dir / 'accurate_forrester2007.pdf')
plt.show()

### Side by side comparison
<img src="https://royalsocietypublishing.org/cms/attachment/efa57e07-5384-4503-8b2b-ccbe632ffe87/3251fig1.jpg" alt="Forrester2007 example plot" width="362"/><img src="../../plots/accurate_forrester2007.png" alt="Recreated Forrester2007 example plot"/>