### Selected samples Ordinary Kriging validation

**Author:** Jakub Walczak, PhD

This notebook contains validation of the OK method.

In [None]:
import csv
import shutil
from functools import partial
from pathlib import Path
from typing import Any, Callable

import xarray as xr
from bayes_opt import BayesianOptimization
from rich.console import Console

import climatrix as cm

%load_ext rich

In [None]:
# Shared rich console used for all output printed by this notebook.
console = Console()

# Score returned by compute_criterion when a reconstruction raises; the
# optimizer maximizes, so a large negative value marks the trial as failed.
INF_LOSS = -1e4

# NOTE(review): only printed in the cells visible here — confirm where the
# policy is actually consumed.
NAN_POLICY = "resample"
console.print("[bold green]Using NaN policy: [/bold green]", NAN_POLICY)

# Seed passed to cm.seed_all and to the Bayesian optimizer.
SEED = 1
console.print("[bold green]Using seed: [/bold green]", SEED)

# "data" directory two levels above the path held in `__session__`.
# NOTE(review): assumes the kernel injects `__session__` (notebook path) —
# confirm for the environment this is run in.
DSET_PATH = Path(__session__).parent.parent.joinpath("data")
console.print("[bold green]Using dataset path: [/bold green]", DSET_PATH)

# Bounding box of the dense target grid (presumably degrees — TODO confirm).
EUROPE_BOUNDS = {"north": 71, "south": 36, "west": -24, "east": 35}
# Dense lat/lon domain spanning the bounding box with a 0.1 step.
EUROPE_DOMAIN = cm.Domain.from_lat_lon(
    lat=slice(EUROPE_BOUNDS["south"], EUROPE_BOUNDS["north"], 0.1),
    lon=slice(EUROPE_BOUNDS["west"], EUROPE_BOUNDS["east"], 0.1),
    kind="dense",
)
cm.seed_all(SEED)

In [None]:
def get_all_dataset_idx() -> list[str]:
    """
    List the sample identifiers available in ``DSET_PATH``.

    Returns
    -------
    list[str]
        Sorted, de-duplicated identifiers, where an identifier is the part
        of a ``*.nc`` file stem that follows its last underscore (the whole
        stem when it has no underscore).
    """
    unique_ids = set()
    for nc_file in DSET_PATH.glob("*.nc"):
        # rpartition keeps the whole stem when there is no underscore
        unique_ids.add(nc_file.stem.rpartition("_")[2])
    return sorted(unique_ids)

In [None]:
def run_single_method(
    d: str, i: int, method: str, reconstruct_dense: bool = True, **params
):
    """
    Reconstruct a single sample with the given method.

    Parameters
    ----------
    d : str
        Dataset identifier used to build the train/validation file names.
    i : int
        Not used by this function; kept so existing callers keep working.
    method : str
        Name of the reconstruction method passed to ``reconstruct``.
    reconstruct_dense : bool, optional
        Whether to additionally reconstruct on ``EUROPE_DOMAIN``,
        by default True.
    **params
        Extra keyword arguments forwarded to ``reconstruct``.

    Returns
    -------
    tuple
        ``(val_dset, reconstructed_dset, reconstructed_dense)``. The last
        item is ``None`` when ``reconstruct_dense`` is False.
    """
    cm.seed_all(SEED)
    train_dset = xr.open_dataset(
        DSET_PATH / f"ecad_obs_europe_train_{d}.nc"
    ).cm
    val_dset = xr.open_dataset(DSET_PATH / f"ecad_obs_europe_val_{d}.nc").cm
    reconstructed_dset = train_dset.reconstruct(
        val_dset.domain,
        method=method,
        **params,
    )
    # Bind a default so the return below cannot hit an unbound local when
    # the dense reconstruction is skipped.
    reconstructed_dense = None
    if reconstruct_dense:
        reconstructed_dense = train_dset.reconstruct(
            EUROPE_DOMAIN, method=method, **params
        )
    return val_dset, reconstructed_dset, reconstructed_dense

In [None]:
# Collect the sample identifiers found under DSET_PATH and report the count.
dset_idx = get_all_dataset_idx()
console.print(
    f"[bold green]There is [bold yellow]{len(dset_idx)}[/bold yellow] samples available [/bold green]"
)

In [None]:
# Position in dset_idx of the sample examined in the cells below.
IDX = 0

In [None]:
# Baseline run: method "ok" with no extra parameters passed, so the
# reconstruction uses whatever defaults the library applies.
ok_val_dset, ok_reconstructed_dset, ok_reconstructed_dense = run_single_method(
    dset_idx[IDX],
    IDX,
    "ok",
)

In [None]:
# Comparison report between the validation data and the baseline reconstruction.
cm.Comparison(ok_val_dset, ok_reconstructed_dset).compute_report()

### After optimising hyperparameters

In [None]:
# Search space handed to the Bayesian optimizer (passed as `pbounds`).
# NOTE(review): the two *_code entries are tuples of strings, not numeric
# (min, max) pairs; compute_criterion casts the sampled value with int().
# If the optimizer treats string tuples as categorical choices, only "1" and
# "5" are ever tried for variogram_model_code — confirm this is intended.
BOUNDS = {
    "nlags": (2, 50),
    "anisotropy_scaling": (1e-5, 5.0),
    "coordinates_type_code": ("1", "2"),
    "variogram_model_code": ("1", "5"),
}
console.print("[bold green]Hyperparameter bounds: [/bold green]", BOUNDS)

# Number of initial points, passed as `n_init_points` to find_hyperparameters.
OPTIM_INIT_POINTS: int = 50
console.print(
    "[bold green]Using nbr initial points for optimization: [/bold green]",
    OPTIM_INIT_POINTS,
)

# Number of optimization iterations, passed as `n_iter` to find_hyperparameters.
OPTIM_N_ITERS: int = 100
console.print(
    "[bold green]Using iterations for optimization[/bold green]", OPTIM_N_ITERS
)

In [None]:
def compute_criterion(
    train_dset: cm.BaseClimatrixDataset,
    val_dset: cm.BaseClimatrixDataset,
    **hparams,
) -> float:
    """
    Score one set of hyperparameters for the "ok" reconstruction.

    Parameters
    ----------
    train_dset : cm.BaseClimatrixDataset
        Dataset the reconstruction is built from.
    val_dset : cm.BaseClimatrixDataset
        Dataset whose domain is reconstructed and that the result is
        compared against.
    **hparams
        Must provide ``coordinates_type_code``, ``variogram_model_code``,
        ``nlags`` and ``anisotropy_scaling``.

    Returns
    -------
    float
        Negated MAE of the reconstruction, or ``INF_LOSS`` when the
        reconstruction raises.
    """
    coord_code = int(hparams["coordinates_type_code"])
    vario_code = int(hparams["variogram_model_code"])
    ok_kwargs = {
        "nlags": int(hparams["nlags"]),
        "anisotropy_scaling": float(hparams["anisotropy_scaling"]),
        "coordinates_type": coordinates_type_mapping[coord_code],
        "variogram_model": variogram_model_mapping[vario_code],
        "backend": "vectorized",
    }

    try:
        recon_dset = train_dset.reconstruct(
            val_dset.domain, method="ok", **ok_kwargs
        )
    except Exception as err:
        # A failed trial must not abort the search: report it and hand the
        # optimizer the failure score instead.
        console.print(
            f"[yellow]Error during reconstruction with parameters: "
            f"{hparams}[/yellow]"
        )
        console.print(f"[yellow]{err}[/yellow]")
        return INF_LOSS

    comparison = cm.Comparison(recon_dset, val_dset, map_nan_from_source=False)
    report = comparison.compute_report()
    # NOTE: the optimizer maximizes, hence the sign flip on the error
    return -report["MAE"]


def find_hyperparameters(
    train_dset: cm.BaseClimatrixDataset,
    val_dset: cm.BaseClimatrixDataset,
    func: Callable[..., float],
    bounds: dict[str, tuple],
    n_init_points: int = 30,
    n_iter: int = 200,
    seed: int = 0,
    verbose: int = 2,
) -> tuple[float, tuple[int, float, Any, Any]]:
    """
    Find hyperparameters using Bayesian Optimization.

    Parameters
    ----------
    train_dset : cm.BaseClimatrixDataset
        Training dataset.
    val_dset : cm.BaseClimatrixDataset
        Validation dataset.
    func : Callable
        Function to optimize.
        It is called with ``train_dset`` and ``val_dset`` bound as keyword
        arguments plus one keyword argument per entry of ``bounds``,
        and should return a float score (higher is better).
    bounds : dict[str, tuple]
        Dictionary of hyperparameter bounds, passed to the optimizer as
        ``pbounds``. It must contain the keys ``nlags``,
        ``anisotropy_scaling``, ``coordinates_type_code`` and
        ``variogram_model_code``, which are read back from the best result.
    n_init_points : int, optional
        Number of initial random points to sample, by default 30.
    n_iter : int, optional
        Number of iterations for optimization, by default 200.
    seed : int, optional
        Random seed for reproducibility, by default 0.
    verbose : int, optional
        Verbosity level of the optimizer, by default 2.

    Returns
    -------
    tuple[float, tuple[int, float, Any, Any]]
        Best score and the best hyperparameters as the tuple
        ``(nlags, anisotropy_scaling, coordinates_type, variogram_model)``,
        where the last two are the values looked up in
        ``coordinates_type_mapping`` and ``variogram_model_mapping``.
    """
    # Bind the datasets so the optimizer only supplies the hyperparameters.
    objective = partial(func, train_dset=train_dset, val_dset=val_dset)
    optimizer = BayesianOptimization(
        f=objective, pbounds=bounds, random_state=seed, verbose=verbose
    )
    optimizer.maximize(
        init_points=n_init_points,
        n_iter=n_iter,
    )
    # Read the best result once instead of re-querying it per field.
    best = optimizer.max
    best_params = best["params"]
    return best["target"], (
        int(best_params["nlags"]),
        float(best_params["anisotropy_scaling"]),
        coordinates_type_mapping[int(best_params["coordinates_type_code"])],
        variogram_model_mapping[int(best_params["variogram_model_code"])],
    )


def run_single_experiment(d: str):
    """
    Tune the "ok" reconstruction on one sample and evaluate the tuned setup.

    Parameters
    ----------
    d : str
        Dataset identifier used to build the train/validation file names.

    Returns
    -------
    tuple
        ``(metrics, hyperparams)``: the comparison report of the final
        reconstruction, and a dict with the dataset id, the selected
        hyperparameters and the best optimizer score (``opt_loss``).
    """
    cm.seed_all(SEED)
    train_path = DSET_PATH / f"ecad_obs_europe_train_{d}.nc"
    val_path = DSET_PATH / f"ecad_obs_europe_val_{d}.nc"
    train_dset = xr.open_dataset(train_path).cm
    val_dset = xr.open_dataset(val_path).cm

    best_loss, tuned = find_hyperparameters(
        train_dset,
        val_dset,
        compute_criterion,
        BOUNDS,
        n_init_points=OPTIM_INIT_POINTS,
        n_iter=OPTIM_N_ITERS,
        seed=SEED,
        verbose=2,
    )
    nlags, anisotropy_scaling, coordinates_type, variogram_model = tuned

    console.print("[bold yellow]Optimized parameters:[/bold yellow]")
    summary = (
        ("[yellow]Number of lags:[/yellow]", nlags),
        ("[yellow]Anisotropy scaling factor:[/yellow]", anisotropy_scaling),
        ("[yellow]Coordinates type:[/yellow]", coordinates_type),
        ("[yellow]Variogram model:[/yellow]", variogram_model),
        ("[yellow]Best loss:[/yellow]", best_loss),
    )
    for label, value in summary:
        console.print(label, value)

    # NOTE(review): pseudo_inv=True is passed here but not inside
    # compute_criterion, so the final metrics may differ from the optimizer's
    # best score — confirm this is intended.
    reconstructed_dset = train_dset.reconstruct(
        val_dset.domain,
        method="ok",
        nlags=nlags,
        anisotropy_scaling=anisotropy_scaling,
        coordinates_type=coordinates_type,
        variogram_model=variogram_model,
        backend="vectorized",
        pseudo_inv=True,
    )
    metrics = cm.Comparison(reconstructed_dset, val_dset).compute_report()
    hyperparams = {
        "dataset_id": d,
        "nlags": nlags,
        "anisotropy_scaling": anisotropy_scaling,
        "coordinates_type": coordinates_type,
        "variogram_model": variogram_model,
        "opt_loss": best_loss,
    }
    return (metrics, hyperparams)

In [None]:
# Optimize the hyperparameters for the selected sample and evaluate the result.
metrics, hyperparams = run_single_experiment(dset_idx[IDX])

In [None]:
# Baseline (non-optimized) report again, for comparison with `metrics` below.
# NOTE(review): the argument order here (validation, reconstruction) is the
# reverse of the one used in run_single_experiment — confirm that any signed
# metrics in the two reports are comparable.
cm.Comparison(ok_val_dset, ok_reconstructed_dset).compute_report()

In [None]:
# Display the report returned by run_single_experiment (optimized run).
metrics