In [None]:
%matplotlib inline
from collections import namedtuple
from pprint import pprint
from matplotlib import colors
from mpl_toolkits.axes_grid1 import make_axes_locatable   
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

import sys
from pathlib import Path

# Add the directory two levels up to the import search path.
# sys.path holds strings, so the membership test must use the resolved string;
# comparing the Path object itself never matches and re-appends on every run.
module_path = Path('../..')
module_path_str = str(module_path.resolve())
if module_path_str not in sys.path:
    sys.path.append(module_path_str)
    
import multifidelityfunctions as mff
import multiLevelCoSurrogates as mlcs
from function_defs import *

# Fixed seed so that random draws are repeatable between runs.
np.random.seed(20160501)

# Wide, compact formatting for printed arrays.
np.set_printoptions(precision=4, suppress=True, linewidth=200, edgeitems=10)

# Directory figures are written to, and directory stored results are read from.
plot_dir = Path('../../plots/')
data_dir = Path('../../files/')

print(f'Python version {sys.version}')

In [None]:
# Scatter column 0 against column 1 of the stored test sample.
sample = np.load(data_dir/'2d_test_sample.npy')
plt.scatter(sample[:,0], sample[:,1])
plt.tight_layout()
# Join with `/`: str(plot_dir) has no trailing separator, so formatting it
# into an f-string produced a file next to the plots folder, not inside it.
plt.savefig(plot_dir/'2d_sample_distribution.pdf')
plt.show()

# MSE errors per sample size combination

In [None]:
# Show the entries of the data directory in sorted order.
sorted(data_dir.iterdir())

## LHS

In [None]:
def display_paired_differences(data, title, vmax=5, num_colors=5, save_as=None):
    """Show a heatmap of absolute paired-difference t-scores.

    The difference between the 'high_hier' and 'high' entries of the `model`
    dimension is reduced over the `rep` dimension to |mean / standard error|,
    and the result is drawn with a discrete colormap.

    :param data:       xarray.DataArray with (at least) `model` and `rep` dimensions
    :param title:      Text appended to the axes title
    :param vmax:       Upper end of the color scale; larger values are clipped
    :param num_colors: Number of discrete colors taken from 'viridis'
    :param save_as:    Optional path; when given, the figure is saved there
    """
    paired_differences = data.sel(model='high_hier') - data.sel(model='high')
    mean_paired_diff = paired_differences.mean(dim='rep')
    std_paired_diff = paired_differences.std(dim='rep', ddof=1)
    # Use the number of valid repetitions, looked up by dimension name:
    # xarray reductions skip NaN for float data by default, and a positional
    # data.shape[2] silently breaks if `rep` is not the third dimension.
    num_valid = paired_differences.count(dim='rep')
    se_paired_diff = std_paired_diff / np.sqrt(num_valid)
    t_scores = abs(mean_paired_diff / se_paired_diff)

    norm = colors.Normalize(vmin=0, vmax=vmax, clip=True)
    discrete_cmap = plt.get_cmap('viridis', num_colors)

    fig, ax = plt.subplots(figsize=(9,3.5))
    img = ax.imshow(t_scores, cmap=discrete_cmap, norm=norm, origin='lower')
    fig.colorbar(img, ax=ax, orientation='vertical')
    ax.set_title(f"Paired difference t-scores - {title}")

    plt.tight_layout()
    if save_as:
        plt.savefig(save_as)
    plt.show()

In [None]:
def plot_extracts(data, title, save_as=None, show=False):
    """Plot median 'high_hier' curves for a selection of n_high values.

    Every tenth value between the smallest and largest `n_high` coordinate is
    selected; its median over `rep` is drawn on a linear panel (left) and on
    a log-scaled panel (right).

    :param data:    xarray.DataArray with `n_high`, `model` and `rep` dimensions
    :param title:   Title of the left panel; the right one gets ' log-scale' appended
    :param save_as: Optional path; when given, the figure is saved there
    :param show:    Whether to display the figure before it is closed
    """
    fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(9,3.5))

    high_counts = data.coords['n_high'].values
    lowest, highest = np.min(high_counts), np.max(high_counts)
    for num_high in range(lowest, highest+1, 10):
        median_curve = data.sel(n_high=num_high, model='high_hier').median(dim='rep')
        for panel in (ax_linear, ax_log):
            panel.plot(median_curve, label=num_high)

    ax_linear.set_title(title)
    ax_log.set_title(title + ' log-scale')
    ax_log.set_yscale('log')

    # pyplot-level calls act on the current axes/figure
    plt.legend(loc=0)
    plt.tight_layout()
    if save_as:
        plt.savefig(save_as)
    if show:
        plt.show()
    plt.close()

In [None]:
# Generic flow: one Case per benchmark function that should be processed.
Case = namedtuple('Case', ['name', 'ndim', 'vmin', 'vmax', 'max_diff'])

cases = [
    Case(name='Forrester', ndim=1, vmin=None, vmax=None, max_diff=100),
#     Case('Forrester',        2,  None,    None,  100),
#     Case('Forrester',        4,  None,    None,   10),
#     Case('Forrester',        6,  None,    None,   10),
#     Case('Forrester',        8,  None,    None,   10),
#     Case('Bohachevsky',      2,   500,   2_000,  200),
#     Case('Booth',            2,   1e5,     5e6, 5000),
    Case(name='Branin', ndim=2, vmin=10, vmax=1e4, max_diff=None),
#     Case('Currin',           2,   .01,      10,   50),
#     Case('Himmelblau',       2,  None,    None, 1000),
#     Case('SixHumpCamelBack', 2,  None,    None,  100),
#     Case('Park91a',          4,  None,    None,    1),
#     Case('Park91b',          4,  None,    None,    1),
    Case(name='Hartmann6', ndim=6, vmin=8e-3, vmax=5e-1, max_diff=1),
    Case(name='Borehole', ndim=8, vmin=10, vmax=3000, max_diff=1e4),
]

In [None]:
# For each case: load the stored MSEs, print the upper percentiles of the
# per-cell medians, then write the overview, difference, significance and
# extract figures.
for c in cases:
    print(c.name, c.ndim)
    nc_file = data_dir/f'Matern_{c.ndim}d_{c.name}.nc'
    with xr.open_dataset(nc_file) as ds:
        # .load() reads the values into memory before the file is closed
        mses = ds['mses'].load()

    print(mses.coords)
    print('median')
    median_over_reps = mses.median(dim='rep')
    pprint([
        (f'{pct}%-ile', np.nanpercentile(median_over_reps, pct))
        for pct in range(95, 101)
    ])

    title = f'{c.name} ({c.ndim}D)'
    plot_name = f'{c.ndim}d-{c.name}-high-low-samples-linear'

    plot_high_vs_low_num_samples(
        mses, title, vmin=c.vmin, vmax=c.vmax,
        save_as=plot_dir/f'{plot_name}.pdf',
    )
    plot_high_vs_low_num_samples_diff(
        mses, title, max_diff=c.max_diff,
        save_as=plot_dir/f'{plot_name}_diff.pdf',
    )

    display_paired_differences(
        mses, title=title,
        save_as=plot_dir/f'{plot_name}_significance.pdf',
    )
    plot_extracts(
        mses, title,
        save_as=plot_dir/f'{plot_name}_extracts.pdf', show=True,
    )

Plotting histograms of the MSE distributions along vertical slices

In [None]:
# Cases for which per-slice histograms are generated.
cases = [
    Case('Forrester', 1,  None,    None,  100),
#     Case('Forrester', 2,  None,    None,  100),
#     Case('Forrester', 4,  None,    None,   10),
#     Case('Branin',    2, .0001,   1_000,    1),
#     Case('Currin',    2,   .01,      10,   50),
#     Case('Park91a',   4,  None,    None,  100),
#     Case('Hartmann6', 6,  None,    None, 1e15),
#     Case('Borehole',  8, 1_000,   3_000,   10),
]

hist_dir = plot_dir/'histograms/'
mlcs.Utils.guaranteeFolderExists(hist_dir)

for c in cases:
    print(c.name, c.ndim)
    with xr.open_dataset(data_dir/f'Matern_{c.ndim}d_{c.name}.nc') as ds:
        mses = ds['mses'].load()
    slice_indices = range(20, 121, 20)

    # Signed t-score of the paired difference (high-only minus hierarchical).
    paired_differences = mses.sel(model='high') - mses.sel(model='high_hier')
    mean_paired_diff = paired_differences.mean(dim='rep')
    std_paired_diff = paired_differences.std(dim='rep', ddof=1)
    # Use the number of valid repetitions, looked up by dimension name:
    # xarray reductions skip NaN for float data by default, and a positional
    # mses.shape[2] silently breaks if `rep` is not the third dimension.
    num_valid = paired_differences.count(dim='rep')
    se_paired_diff = std_paired_diff / np.sqrt(num_valid)
    t_scores = mean_paired_diff / se_paired_diff

    # Logarithmically spaced bin edges: num_sections bins per decade.
    x_min, x_max = -8, 3
    num_sections = 4
    bins = 10**np.linspace(x_min, x_max, (x_max-x_min)*num_sections + 1)

    for slice_idx in slice_indices:
        sub_mses = mses.sel(n_low=slice_idx)

        for idx in sub_mses.coords['n_high'].values:
            mse = sub_mses.sel(n_high=idx)
            # Nothing to plot for combinations without any stored value
            if np.all(np.isnan(mse)):
                continue

            mse_high = mse.sel(model='high')
            mse_hier = mse.sel(model='high_hier')
            mse_diff = mse_high - mse_hier

            plot_name = f'{c.ndim}d-{c.name}-{slice_idx}l-{idx:02d}h-histogram'

            plt.figure(figsize=(10,4))

            # Left panel: both MSE distributions, with their means as vertical lines
            plt.subplot(121)
            plt.hist(mse_hier, bins=bins, label='hierarchical', alpha=.5)
            plt.axvline(np.mean(mse_hier), color='C0')
            plt.hist(mse_high, bins=bins, label='high-only', alpha=.5)
            plt.axvline(np.mean(mse_high), color='C1')
            plt.title('Original distributions (log-histogram)')
            plt.legend(loc=1)
            plt.ylim([0,50])
            plt.xscale('log')
            plt.xlim([10**x_min, 10**x_max])

            # Right panel: distribution of the paired differences
            plt.subplot(122)
            plt.hist(mse_diff, color='C2', label='high - hierarchical', alpha=.5)
            plt.axvline(np.mean(mse_diff), color='C2')
            plt.title('Histogram of differences')
            plt.legend(loc=1)

            plt.suptitle(f'{c.name} ({c.ndim}D) - {idx}:{slice_idx} samples - t-score: {t_scores.sel(n_high=idx, n_low=slice_idx).values:.2f}')
            plt.tight_layout()
            # Leave headroom for the figure-level title
            plt.subplots_adjust(top=0.85)
            plt.savefig(hist_dir/f'{plot_name}.png')
            plt.close('all')

In [None]:
def plot_paired_differences(data, title=None, vmax=5, save_as=None):
    """Plot summary statistics of paired MSE differences in a 4x2 panel grid.

    `data` is indexed as a 4-D array. Axis 2 is reduced, and indices 0 and 1
    of the last axis are used as the hierarchical and high-fidelity-only
    values respectively. Each panel contains one line per column index
    20, 40, ..., 120 of axis 1, so that axis needs at least 121 entries.

    :param data:    4-D numpy array of MSE values
    :param title:   Optional figure-level title
    :param vmax:    Unused; accepted so existing call signatures keep working
    :param save_as: Optional path; when given, the figure is saved there
    """
    mean_high = np.mean(data[:,:,:,1], axis=2)
    mean_hier = np.mean(data[:,:,:,0], axis=2)

    paired_differences = data[:,:,:,1] - data[:,:,:,0]
    mean_paired_diff = np.mean(paired_differences, axis=2)
    std_paired_diff = np.std(paired_differences, axis=2, ddof=1)
    se_paired_diff = std_paired_diff / np.sqrt(data.shape[2])
    t_scores = abs(mean_paired_diff / se_paired_diff)

    # One (panel title, 2-D quantity) pair per subplot, in row-major order
    panels = [
        ('Mean Pairwise Difference (MPD)', mean_paired_diff),
        ('Std Pairwise Difference (SPD)', std_paired_diff),
        ('MPD/Mean High-fidelity only MSE', mean_paired_diff / mean_high),
        ('MPD/Mean Hierarchical MSE', mean_paired_diff / mean_hier),
        ('SPD/Mean High-fidelity only MSE', std_paired_diff / mean_high),
        ('SPD/Mean Hierarchical MSE', std_paired_diff / mean_hier),
        ('T-score for $H_0$: MPD = 0', t_scores),
        ('Mean High-fidelity only MSE/Mean Hierarchical MSE', mean_high / mean_hier),
    ]

    fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(16,9))
    axes = axes.flatten()

    for ax, (panel_title, values) in zip(axes, panels):
        for i, nlow in enumerate(range(20,125,20)):
            ax.plot(values[:,nlow], color=f'C{i}', label=nlow)
        ax.set_title(panel_title)

    # A single legend next to the top-right panel serves all panels, since
    # every panel uses the same color per column index.
    axes[1].legend(bbox_to_anchor=(1.04,0.5), loc='center left', borderaxespad=0)

    if title is not None:
        fig.suptitle(title)
    plt.tight_layout()
    if title is not None:
        # Leave headroom for the figure-level title
        fig.subplots_adjust(top=0.92)
    if save_as:
        plt.savefig(save_as)
    plt.show()

In [None]:
for c in cases:
    print(c.name, c.ndim)
    # Join with `/`: str(data_dir) has no trailing separator, so formatting it
    # into an f-string pointed at a file next to the data folder.
    lin_mse_tracking = np.load(data_dir/f'Matern_{c.ndim}d_{c.name}_lin_mse_tracking.npy')
    plot_paired_differences(lin_mse_tracking)

In [None]:
# Show the current working directory. `os` is not imported in this notebook,
# so use the already-imported pathlib.Path instead.
str(Path.cwd())

### Forrester function

In [None]:
# `data_dir.iterdir` without a call is a bound method, so the former `in` test
# raised a TypeError. Check for the stored file directly instead.
forrester_file = data_dir/'Matern_1d_forrester_lin_mse_tracking.npy'
if forrester_file.exists():
    lin_mse_tracking = np.load(forrester_file)
else:
    # Regeneration is disabled (see the commented lines below); say so
    # explicitly rather than continuing silently.
    print(f'{forrester_file} not found: lin_mse_tracking was not loaded')
#     lin_mse_tracking = create_mse_tracking(TD_inv, low_lhs_sample)
#     np.save(forrester_file, lin_mse_tracking)

In [None]:
# Upper percentiles (95 through 100) of the NaN-ignoring median along axis 2.
print('median')
median_values = np.nanmedian(lin_mse_tracking, axis=2).flatten()
pprint([
    (f'{pct}%-ile', np.nanpercentile(median_values, pct))
    for pct in range(95, 101)
])

In [None]:
name = '2d-high-low-samples-linear'
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_high_vs_low_num_samples(lin_mse_tracking, name, vmin=100, vmax=100_000, save_as=plot_dir/f'{name}.pdf')

In [None]:
name = '2d-high-low-samples-linear'
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_high_vs_low_num_samples_diff(lin_mse_tracking, name, max_diff=100, save_as=plot_dir/f'{name}_diff.pdf')

In [None]:
# Element-wise difference between index 1 and index 0 of the last axis.
paired_differences = lin_mse_tracking[:,:,:,1] - lin_mse_tracking[:,:,:,0]

In [None]:
# Reduce the paired differences along axis 2 to |mean / standard error|.
num_reps = lin_mse_tracking.shape[2]
mean_paired_diff = paired_differences.mean(axis=2)
std_paired_diff = paired_differences.std(axis=2, ddof=1)
se_paired_diff = std_paired_diff / np.sqrt(num_reps)
t_scores = np.abs(mean_paired_diff / se_paired_diff)

In [None]:
# Heatmap of the t-scores using five discrete colors; values above 5 are clipped.
fig, ax = plt.subplots(figsize=(9,3.5))

discrete_cmap = plt.get_cmap('viridis', 5)
norm = colors.Normalize(vmin=0, vmax=5, clip=True)

img = ax.imshow(t_scores, origin='lower', norm=norm, cmap=discrete_cmap)
fig.colorbar(img, ax=ax, orientation='vertical')
ax.set_title("Paired difference t-scores")

In [None]:
# NOTE(review): the only definition of display_paired_differences above this
# cell requires a `title` argument and calls `.sel` on its input, so this call
# appears to rely on the array-based redefinition further down having been run
# first -- confirm and reorder the cells if so.
display_paired_differences(lin_mse_tracking)

fidelities: `high_hierarchical, high, low`

In [None]:
# Inspect the dimensions of the loaded array.
lin_mse_tracking.shape

In [None]:
# Element-wise difference between index 1 and index 0 of the last axis.
paired_differences = lin_mse_tracking[:,:,:,1] - lin_mse_tracking[:,:,:,0]

In [None]:
# Reduce the paired differences along axis 2 to |mean / standard error|.
num_reps = lin_mse_tracking.shape[2]
mean_paired_diff = paired_differences.mean(axis=2)
std_paired_diff = paired_differences.std(axis=2, ddof=1)
se_paired_diff = std_paired_diff / np.sqrt(num_reps)
t_scores = np.abs(mean_paired_diff / se_paired_diff)

In [None]:
# Heatmap of the t-scores using five discrete colors; values above 5 are clipped.
fig, ax = plt.subplots(figsize=(12,5))

discrete_cmap = plt.get_cmap('viridis', 5)
norm = colors.Normalize(vmin=0, vmax=5, clip=True)

img = ax.imshow(t_scores, origin='lower', norm=norm, cmap=discrete_cmap)
fig.colorbar(img, ax=ax, orientation='vertical')
ax.set_title("Paired difference t-scores")

As a function:

In [None]:
def display_paired_differences(data, vmax=5, num_colors=5):
    """Show a heatmap of absolute paired-difference t-scores for a 4-D array.

    Indices 0 and 1 of the last axis are subtracted element-wise, and the
    result is reduced along axis 2 to |mean / standard error|.

    :param data:       4-D numpy array
    :param vmax:       Upper end of the color scale; larger values are clipped
    :param num_colors: Number of discrete colors taken from 'viridis'
    """
    diffs = data[:,:,:,0] - data[:,:,:,1]
    num_reps = data.shape[2]
    standard_error = diffs.std(axis=2, ddof=1) / np.sqrt(num_reps)
    t_scores = abs(diffs.mean(axis=2) / standard_error)

    fig, ax = plt.subplots(figsize=(12,5))
    img = ax.imshow(
        t_scores,
        origin='lower',
        cmap=plt.get_cmap('viridis', num_colors),
        norm=colors.Normalize(vmin=0, vmax=vmax, clip=True),
    )
    fig.colorbar(img, ax=ax, orientation='vertical')
    ax.set_title("Paired difference t-scores")

In [None]:
# Draw the t-score heatmap for the Forrester array.
display_paired_differences(lin_mse_tracking)

### Branin function

In [None]:
# `data_dir.iterdir` without a call is a bound method, so the former `in` test
# raised a TypeError. Check for the stored file directly instead.
branin_file = data_dir/'Matern_2d_branin_lin_mse_tracking.npy'
if branin_file.exists():
    branin_lin_mse_tracking = np.load(branin_file)
else:
    # NOTE(review): TD_inv and low_lhs_sample are not defined in the visible
    # part of this notebook -- confirm where they come from.
    branin_lin_mse_tracking = create_mse_tracking(TD_inv, low_lhs_sample)
    # Store the array that was just computed under the name that is checked
    # above; previously a different array was written to an unrelated file.
    np.save(branin_file, branin_lin_mse_tracking)

In [None]:
# Upper percentiles (95 through 100) of the NaN-ignoring median along axis 2.
print('median')
median_values = np.nanmedian(branin_lin_mse_tracking, axis=2).flatten()
pprint([
    (f'{pct}%-ile', np.nanpercentile(median_values, pct))
    for pct in range(95, 101)
])

In [None]:
name = 'Matern-2d-branin-high-low-samples-linear'
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_high_vs_low_num_samples(branin_lin_mse_tracking, name, vmin=.0001, vmax=1_000, save_as=plot_dir/f'{name}.pdf')

In [None]:
name = 'Matern-2d-branin-high-low-samples-linear'
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_high_vs_low_num_samples_diff(branin_lin_mse_tracking, name, max_diff=10, save_as=plot_dir/f'{name}_diff.pdf')

In [None]:
# Draw the t-score heatmap for the Branin array.
display_paired_differences(branin_lin_mse_tracking)

### Currin Function

In [None]:
# `data_dir.iterdir` without a call is a bound method, so the former `in` test
# raised a TypeError. Check for the stored file directly instead.
currin_file = data_dir/'Matern_2d_currin_lin_mse_tracking.npy'
if currin_file.exists():
    currin_lin_mse_tracking = np.load(currin_file)
else:
    # NOTE(review): TD_inv and low_lhs_sample are not defined in the visible
    # part of this notebook -- confirm where they come from.
    currin_lin_mse_tracking = create_mse_tracking(TD_inv, low_lhs_sample)
    # Store the array that was just computed under the name that is checked
    # above; previously a different array was written to an unrelated file.
    np.save(currin_file, currin_lin_mse_tracking)

In [None]:
# Upper percentiles (95 through 100) of the NaN-ignoring median along axis 2.
print('median')
median_values = np.nanmedian(currin_lin_mse_tracking, axis=2).flatten()
pprint([
    (f'{pct}%-ile', np.nanpercentile(median_values, pct))
    for pct in range(95, 101)
])

In [None]:
name = 'Matern-2d-currin-high-low-samples-linear'
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_high_vs_low_num_samples(currin_lin_mse_tracking, name, vmin=.01, vmax=10, save_as=plot_dir/f'{name}.pdf')

In [None]:
name = 'Matern-2d-currin-high-low-samples-linear'
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_high_vs_low_num_samples_diff(currin_lin_mse_tracking, name, max_diff=50, save_as=plot_dir/f'{name}_diff.pdf')

In [None]:
# Draw the t-score heatmap for the Currin array.
display_paired_differences(currin_lin_mse_tracking)

### Borehole Function

In [None]:
# `data_dir.iterdir` without a call is a bound method, so the former `in` test
# raised a TypeError. Check for the stored file directly instead.
borehole_file = data_dir/'Matern_8d_borehole_lin_mse_tracking.npy'
if borehole_file.exists():
    borehole_lin_mse_tracking = np.load(borehole_file)
else:
    # NOTE(review): TD_inv and low_lhs_sample are not defined in the visible
    # part of this notebook -- confirm where they come from.
    borehole_lin_mse_tracking = create_mse_tracking(TD_inv, low_lhs_sample)
    # Store the array that was just computed under the name that is checked
    # above; previously a different array was written to an unrelated file.
    np.save(borehole_file, borehole_lin_mse_tracking)

In [None]:
# Upper percentiles (95 through 100) of the NaN-ignoring median along axis 2.
print('median')
median_values = np.nanmedian(borehole_lin_mse_tracking, axis=2).flatten()
pprint([
    (f'{pct}%-ile', np.nanpercentile(median_values, pct))
    for pct in range(95, 101)
])

In [None]:
# NOTE(review): the name says '2d' while the data file loaded for this section
# is 'Matern_8d_borehole_...' -- confirm whether the output name should be 8d.
name = 'Matern-2d-borehole-high-low-samples-linear'
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_high_vs_low_num_samples(borehole_lin_mse_tracking, name, vmin=100, vmax=10_000, save_as=plot_dir/f'{name}.pdf')

In [None]:
# NOTE(review): the name says '2d' while the data file loaded for this section
# is 'Matern_8d_borehole_...' -- confirm whether the output name should be 8d.
name = 'Matern-2d-borehole-high-low-samples-linear'
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_high_vs_low_num_samples_diff(borehole_lin_mse_tracking, name, max_diff=10, save_as=plot_dir/f'{name}_diff.pdf')

In [None]:
# Draw the t-score heatmap for the Borehole array.
display_paired_differences(borehole_lin_mse_tracking)

## Difference in error between linear and random sample

In [None]:
name = "2D, random - LHS"
# NOTE(review): mse_tracking is not defined in the visible part of this
# notebook -- confirm where it is loaded.
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plot_inter_method_diff(mse_tracking, lin_mse_tracking, name, save_as=plot_dir/f'{name}.pdf')

## Maximum found error per experiment

In [None]:
# Join with `/`: str(data_dir) has no trailing separator, so an f-string
# concatenation pointed at a file next to the data folder.
errors = np.load(data_dir/'2d_error_tracking.npy')
# Largest value over axes 2 and 4 for every remaining index combination.
max_errors = np.max(errors, axis=(2,4))

In [None]:
name = '2d_max_error'

# Shared logarithmic color scale so the three panels are directly comparable.
norm = colors.LogNorm(vmin=1, vmax=np.max(max_errors), clip=True)
fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(20,4))
plt.suptitle(name)

# One panel per index of the last axis of max_errors.
for i, ax in enumerate(axes):
    img = ax.imshow(max_errors[:,:,i], cmap='viridis', norm=norm)

fig.colorbar(img, ax=axes[-1], orientation='vertical')
plt.tight_layout()
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plt.savefig(plot_dir/f'{name}.pdf')

In [None]:
# Overlaid histograms of the maximum errors, one per index of the last axis,
# on decade-wide bins from 1e-3 to 1e14 plus a leading bin starting at 0.
bins = [0] + [10**e for e in range(-3, 15)]
for i, m in enumerate(['hierarchical', 'high-only', 'low-only']):
    plt.hist(max_errors[:,:,i].flatten(), alpha=.3, bins=bins, label=m)
plt.xscale('log')
plt.legend()
plt.tight_layout()
# Save under its own name inside plot_dir: the previous target was the same
# file name the heatmap cell writes, so this figure overwrote the heatmap, and
# the f-string concatenation dropped the path separator.
plt.savefig(plot_dir/'2d_max_error_hist.pdf')

In [None]:
# Join with `/`: str(data_dir) has no trailing separator, so an f-string
# concatenation pointed at a file next to the data folder.
lin_errors = np.load(data_dir/'2d_lin_error_tracking.npy')
# Largest value over axes 2 and 4 for every remaining index combination.
max_lin_errors = np.max(lin_errors, axis=(2,4))

In [None]:
name = '2d_lin_max_error'

# Shared logarithmic color scale so the three panels are directly comparable.
norm = colors.LogNorm(vmin=1, vmax=np.max(max_lin_errors), clip=True)
fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(20,4))
plt.suptitle(name)

# One panel per index of the last axis of max_lin_errors.
for i, ax in enumerate(axes):
    img = ax.imshow(max_lin_errors[:,:,i], cmap='viridis', norm=norm)

fig.colorbar(img, ax=axes[-1], orientation='vertical')
plt.tight_layout()
# Join with `/`: str(plot_dir) has no trailing separator, so an f-string
# concatenation would save next to the plots folder instead of inside it.
plt.savefig(plot_dir/'2d_max_lin_error.pdf')

In [None]:
# Overlaid histograms of the maximum errors, one per index of the last axis,
# on decade-wide bins from 1e-3 to 1e14 plus a leading bin starting at 0.
bins = [0, *(10**e for e in range(-3, 15))]
model_labels = ['hierarchical', 'high-only', 'low-only']
for i, m in enumerate(model_labels):
    flat_errors = max_lin_errors[:,:,i].flatten()
    plt.hist(flat_errors, bins=bins, alpha=.3, label=m)
plt.xscale('log')
plt.legend()