In [None]:
# Defaults
# Notebook-wide settings; the cells below read these names directly.

# System settings
systmpfs = '/tmp'  # parent directory for temporary working dirs and the optimizer log

# SA/Optimizer settings
num_workers = 2           # size of the process pools; also given to the optimizer and the Sobol analysis
budget = 128              # budget handed to the Nevergrad optimizer
SASampleN = 2             # N of the Saltelli sampler: Nsamples = N x (D + 2)
algorithm = 'OnePlusOne'  # key looked up in ng.optimizers.registry
timeout = 30              # forwarded to the model as run_args['timeout'] (presumably seconds -- confirm)
monitor_interval = 2      # argument of common.monitor (presumably seconds -- confirm)
target = 'soil_moisture_content_50'  # observed quantity the loss and the plots refer to

In [None]:
import warnings

# Silence every warning category for the rest of the session (keeps the
# rendered slides free of library warnings).
warnings.simplefilter('ignore')

In [None]:
from subprocess import CalledProcessError, TimeoutExpired
from tempfile import TemporaryDirectory, NamedTemporaryFile
from concurrent.futures import ProcessPoolExecutor

from IPython.display import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nevergrad as ng
import hiplot as hip
from SALib.sample import saltelli
from SALib.analyze import sobol

from common import monitor, observations, GEOtopRun

# High-Performance Derivative-Free Optimization for the GE🌍top Model Calibration

### Stefano Campanella

#### Giacomo Bertoldi, Alberto Sartori, Emanuele Cordano

## GEOtop Calibration

GEOtop is a physical model that simulates the heat and water budgets at and below the soil surface. 

Some inputs are unknown or uncertain: calibration looks for the values for which the outputs best reproduce the experimental data.

Case study: LTSER sites in Val di Mazia, Alto Adige

## Challenges

1. Many parameters 
2. No good prior
3. Time consuming simulations

<b class=fragment>Parallel Derivative-free heuristic algorithms are the only option</b>

Some points to keep in mind:

* Community of scientists, not HPC experts nor programmers
* Wide range of use cases and applications
* Calibration is CPU-bound
* For these algorithms, <span class="fragment highlight-red">scaling can be a tricky subject</span>

Consider genetic algorithms/PSO.

<span class=fragment>Increasing the number of phenotypes/particles increases the coverage of the parameters space in each generation/iteration, but the principle of operation of these algorithms lies in the correlation between one generation/iteration and the next.</span>

<span class=fragment>Hence, scaling must take into account that cutting the time to solution by increasing the number of processing units and decreasing the number of iterations can lead to worse results.</span>

## The Goals

1. Write a reusable calibration tool, with a simple yet general enough interface
2. Make it easily deployable on HPC systems
3. Perform the optimization and analysis on the case study
4. Benchmark algorithms, objective functions, and hyperparameters

## Constraints

1. 1D simulations
2. Only scalar parameters (however, the framework allows more complex scenarios) 
3. No multi-objective (multiple targets must be squashed into a single one)

## Approach/Design Choices

* Python: simple language, good tooling and wide adoption in the scientific computing community.
* Do not reinvent the wheel (use standard libraries as much as possible, otherwise third party libraries).
* Modularity, encapsulation and referential transparency.
* Emphasis on documentation and reproducibility.

## What has been done

<ol>
    <li class=fragment>Preliminary analysis of the case study and visualization</li>
    <li class=fragment><b>GEOtoPy</b>: A wrapper based on IO and keywords</li>
    <li class=fragment>Prototype of calibration with Nevergrad and SA with SALib</li>
    <li class=fragment>Deployment on Ulysses, now moving to VSC-4</li>
</ol>

## GEOtop IO Scheme

GEOtop works on text files, both for input and output.

The main one is `geotop.inpts`, which contains settings and comments; other CSV data files may also be provided.

Comments begin with a `!`, settings match `<keyword> = <value>`.

The values can be booleans, numbers, arrays or strings. 

Keywords are hardwired in the GEOtop source code and each has a definite type.

## GEOtoPy

GEOtoPy exports the base, abstract class `GEOtop`, which can be subclassed to solve particular use cases.

Objects of this class can parse, store and print the settings contained in `geotop.inpts`.

They can also evaluate the model _without side effects_.

## GEOtoPy Evaluation Model

In [None]:
# Display the evaluation-model diagram loaded from the relative path below.
Image(filename="assets/geotopy.png")

In [None]:
# Create the resource monitor from `common`; `stats` is plotted in the
# "CPU and Memory Usage" section at the end of the notebook.
# NOTE(review): presumably samples usage every `monitor_interval` seconds in
# the background -- confirm against common.monitor.
stats = monitor(monitor_interval)

In [None]:
class GEOtopRunLogVars(GEOtopRun):
    """GEOtopRun variant that accepts some parameters as base-10 exponents.

    Keyword arguments whose entry in ``variables.type`` equals ``'log'`` are
    replaced by ``10 ** value`` before being handed to
    ``GEOtopRun.preprocess``; every other argument passes through untouched.
    ``variables`` is the module-level table and is looked up at call time.
    """

    def preprocess(self, working_dir, *args, **kwargs):
        """Undo the log scaling of the inputs, then defer to the parent class."""
        rescaled = {
            key: 10 ** value if variables.type[key] == 'log' else value
            for key, value in kwargs.items()
        }
        super().preprocess(working_dir, *args, **rescaled)

In [None]:
# Calibration variables, indexed by name. Later cells read the columns
# `type`, `lower`, `upper` and `suggested`.
variables = pd.read_csv('inputs/Matsch_B2/variables.csv', index_col='name')

# Experimental data the simulations are scored against.
obs = observations('inputs/Matsch_B2/obs.csv')

# Model wrapper for the Matsch B2 inputs. `run_args` carries the options for
# launching the executable (presumably forwarded to subprocess.run -- confirm
# in GEOtopRun).
model = GEOtopRunLogVars(
    'inputs/Matsch_B2/run',
    exe='../geotop/build/geotop',
    run_args=dict(check=True, capture_output=True, timeout=timeout),
)

In [None]:
# Baseline: run the model once with no parameter overrides, report the loss
# and plot the simulation against the observations.
with TemporaryDirectory(dir=systmpfs) as tmpdir:
    sim = model.eval(tmpdir)
    initial_loss = obs.metric(target, sim)
    print(f"Before optimization loss is {initial_loss}")
    fig = obs.compare(target, sim, rel=True, name="Soil moisture content @ 50 mm")
    plt.show()

## Example of Sensitivity Analysis

Right now, we are using the loss $\frac{\langle \left(y_\text{obs} - y_\text{sim} \right)^2 \rangle}{\langle y_\text{obs}^2 \rangle}$

In [None]:
def loss_function(*args, **kwargs):
    """Run the model in a scratch directory and score it against the observations.

    All arguments are forwarded to ``model.eval`` after the working directory.
    Returns ``obs.metric(target, sim)``, i.e.
    < (y_obs - y_sim)^2 > / < y_obs^2 >, or ``np.nan`` when the run raises
    ``CalledProcessError`` or ``TimeoutExpired``.
    """
    run_failures = (CalledProcessError, TimeoutExpired)

    with TemporaryDirectory(dir=systmpfs) as tmpdir:
        try:
            sim = model.eval(tmpdir, *args, **kwargs)
        except run_failures:
            # A failed or timed-out simulation yields NaN instead of aborting the caller.
            return np.nan

    # The metric is computed after the scratch directory has been removed.
    return obs.metric(target, sim)

In [None]:
def SA_loss(xs):
    """Evaluate the loss for one sample given as a flat sequence of values.

    The entries of ``xs`` are paired positionally with ``variables.index``
    and forwarded to ``loss_function`` as keyword arguments.
    """
    named_values = {name: x for name, x in zip(variables.index, xs)}
    return loss_function(**named_values)

# Problem description for SALib: one dimension per row of `variables`,
# bounded by its `lower`/`upper` columns.
problem = dict(
    num_vars=len(variables),
    names=variables.index,
    bounds=list(zip(variables.lower, variables.upper)),
)

# Saltelli sampling without second-order terms.
samples = saltelli.sample(problem, SASampleN, calc_second_order=False)

# Evaluate every sample in a pool of worker processes; results keep the
# order of `samples`.
with ProcessPoolExecutor(max_workers=num_workers) as executor:
    losses = np.fromiter(executor.map(SA_loss, samples), dtype=float)

In [None]:
# Sobol indices of the loss with respect to each variable (no second-order
# terms), tabulated per variable and shown sorted by the magnitude of S1.
sobol_indices = sobol.analyze(problem, losses,
                              calc_second_order=False,
                              parallel=True,
                              n_processors=num_workers)
SA = pd.DataFrame(sobol_indices, index=problem['names'])
SA.sort_values('S1', key=np.abs, ascending=False)

## Example of Calibration

In [None]:
# One bounded scalar per calibration variable, initialised at its suggested value.
kwargs = {}
for name, value in variables.iterrows():
    kwargs[name] = ng.p.Scalar(init=value.suggested,
                               lower=value.lower,
                               upper=value.upper)

# Look the optimizer up by name so `algorithm` can be changed in the defaults cell.
optimizer_factory = ng.optimizers.registry[algorithm]
optimizer = optimizer_factory(parametrization=ng.p.Instrumentation(**kwargs),
                              budget=budget,
                              num_workers=num_workers)

# Record every `tell` in a temporary file under `systmpfs`; the file exists
# for as long as `logfile` stays open.
logfile = NamedTemporaryFile(dir=systmpfs)
logger = ng.callbacks.ParametersLogger(logfile.name)
optimizer.register_callback("tell", logger)

In [None]:
# Feed the evaluations already computed for the sensitivity analysis to the
# optimizer: suggest each sample, ask for it back as a candidate, and tell
# the optimizer its loss.
for sample, loss in zip(samples, losses):
    settings = {name: x for name, x in zip(variables.index, sample)}
    optimizer.suggest(**settings)
    candidate = optimizer.ask()
    optimizer.tell(candidate, loss)

# Run the optimization itself, evaluating candidates in worker processes.
with ProcessPoolExecutor(max_workers=optimizer.num_workers) as executor:
    recommendation = optimizer.minimize(loss_function, executor=executor, batch_mode=False)

In [None]:
# Attach the recommended value of each parameter as a new `best` column
# (aligned on the variable names) and display the table.
# Note: this modifies `variables` in place. The values are the ones the
# optimizer works with, so for 'log'-type variables they are the exponents
# that GEOtopRunLogVars.preprocess turns into 10 ** value.
variables['best'] = pd.Series(recommendation.kwargs)
variables

In [None]:
# Re-run the model with the recommended parameters and compare the result
# against the observations. The scratch directory is created under
# `systmpfs`, consistently with every other model run in this notebook.
with TemporaryDirectory(dir=systmpfs) as tmpdir:
    print(f"After optimization loss is {recommendation.loss}")
    sim = model.eval(tmpdir, **recommendation.kwargs)
    fig = obs.compare(target, sim, name="Soil moisture content @ 50mm", rel=True)
    plt.show()

## Parallel Coordinates Visualization with HiPlot

In [None]:
# Load the records written by the "tell" callback and convert them into a
# HiPlot experiment for the parallel-coordinates view.
logger.load()
experiment = logger.to_hiplot_experiment()

In [None]:
# Bookkeeping columns of the optimizer log that are hidden in both displays.
hidden_columns = [
    'uid',
    'from_uid',
    '#parametrization',
    '#optimizer',
    '#optimizer#noise_handling',
    '#optimizer#mutation',
    '#optimizer#crossover',
    '#session',
    '#lineage',
    '#meta-sigma',
]

# Hide the per-variable sigma columns as well.
hidden_columns += [name + suffix
                   for name in variables.index
                   for suffix in ('#sigma', '#sigma#sigma')]

# Table view: sorted by the order in which the evaluations were told.
table = experiment.display_data(hip.Displays.TABLE)
table.update({'hide': hidden_columns,
              'order_by': [['#num-tell', 'asc']]})

# Parallel plot: axes run from the generation, through the variables sorted
# by the magnitude of their ST index, to the loss.
axes_by_sensitivity = list(SA.sort_values('ST', key=np.abs).index)
plot = experiment.display_data(hip.Displays.PARALLEL_PLOT)
plot.update({'hide': hidden_columns + ['#num-tell'],
             'order': ['#generation'] + axes_by_sensitivity + ['#loss']})

In [None]:
# Render the HiPlot experiment (table and parallel plot) in the notebook.
experiment.display()

## CPU and Memory Usage

In [None]:
# Plot what the monitor created at the top of the notebook has collected.
# NOTE(review): the exact quantities come from common.monitor -- confirm there.
stats.plot()
plt.show()

## Deployment

1. SPACK
2. PyPI
3. Pipenv
4. Papermill

## To Do

1. Assemble the calibration package
2. Benchmarks

## Further developments

1. Ray Tune + Nevergrad (multinode)?
2. Multinode SA with Ray?
3. Inoculate information from SA into optimizer?
4. IO optimizations?
5. BFGS after global optimization?
6. Parameters clustering?

## Acknowledgments

The research reported in this work is supported by OGS, Eurac and CINECA under the HPC-TRES program, award number 2019-33.